import datetime
import os
import re
import subprocess
import time

import pandas as pd

### assumes you have already run, per state:
# cd path/mobility-report-data-extractor
# python ./mobility.py download -p US-foo
# python ./mobility.py proc svgs/US-foo.svg ./output
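### a sketch of batching those two steps over several states; it assumes
### only what the commands above show (mobility.py's download/proc
### subcommands, svgs/<name>.svg output, cwd set to the extractor dir),
### so verify before use:
# for st in ['US-Alabama', 'US-Wyoming']:
#     subprocess.call(['python', './mobility.py', 'download', '-p', st])
#     subprocess.call(['python', './mobility.py', 'proc', f'svgs/{st}.svg', './output'])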
### the six plot categories, repeated twice (each pdf page carries two
### locations x six plots)
seg_list = [x for _ in range(2)
            for x in ['Retail & recreation', 'Grocery & pharmacy', 'Parks',
                      'Transit stations', 'Workplace', 'Residential']]

### set these paths
path = '/home/uname/foo'  # base path containing mobility-report-data-extractor
dir_mob = 'mobility-report-data-extractor'

### one output directory per processed state, e.g. US-Alabama
areas = [d for d in os.listdir(os.path.join(path, dir_mob, 'output'))
         if d.startswith('US')]
# start = time.time()
data_all = []
for area in areas:
    ### convert the pdf to text with pdftotext (poppler-utils)
    base = os.path.join(path, dir_mob, 'pdfs', area)
    subprocess.call(['/usr/bin/pdftotext', '-layout', '-raw',
                     f'{base}.pdf', f'{base}.txt'])
    with open(f'{base}.txt') as f:
        lines = [l for l in f.read().split('\n') if l.strip()]

    ### header line looks like 'Alabama March 29, 2020'; pull the state
    ### name and reformat the date as iso (yyyy-mm-dd)
    header = re.split(', | ', lines[1])
    date = f'{header[-1]}-{header[-3]}-{header[-2]}'
    date = datetime.datetime.strptime(date, '%Y-%B-%d').strftime('%Y-%m-%d')
    area = ' '.join(header[:-3])

    data = []
    for i, line in enumerate(lines):
        ### statewide summary block near the top of page 1
        if re.findall('Retail & recreation', line) and i < 20:
            vals = [re.sub(r'%|\+', '', lines[i + x])
                    for x in [1, 13, 26, 38, 49, 59]]
            rows = [{'area': area, 'loc': 'summary', 'seg': seg_list[n],
                     'ast': None, 'value': vals[n]} for n in range(6)]
            data.extend(rows)
        ### county pages start at a form feed (pdftotext's page break);
        ### each page holds two locations x six plots
        if re.findall('\f', line) and i > 50:
            locs = [x.strip() for x in [lines[i], lines[i + 13]]
                    for _ in range(6)]
            locs = [l for l in locs if len(l.split(' ')) < 4]
            ### a bare '*' above a plot is the report's low-data marker
            asts = [lines[i + n - 1] for n, x in enumerate(lines[i:i + 110])
                    if x.startswith('Sun')]
            asts = [ast if ast == '*' else None for ast in asts]
            vals = [re.sub(r'%|\+|compared to baseline', '', lines[i + x])
                    for x in [2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 23, 25]]
            vals = [val.strip(' ') if val != 'Not enough data for this date'
                    else None for val in vals]
            segs = [lines[i + n + 1]
                    for n in [0, 2, 4, 6, 8, 10, 13, 15, 17, 19, 21, 23]]
            for j, loc in enumerate(locs):
                if segs[j] not in seg_list:
                    continue
                data.append({'area': area, 'loc': loc, 'seg': segs[j],
                             'ast': asts[j], 'value': vals[j]})

    ### number rows 1..n to match the csv names the extractor wrote
    for i, d in enumerate(data):
        seq = i + 1  # i runs over every (loc, seg) row, so seq is the 1-based row number
        data[i]['i'] = seq
        data[i]['path'] = f'output/US-{d["area"]}/{seq}.csv'
    data_all.extend(data)
df = pd.DataFrame(data_all)
### strings like '-41' become numeric; the None placeholders become NaN
df['value'] = pd.to_numeric(df['value'])
# end = time.time()
# timing on the above:
# print(end - start)  # 8.454864130020142
### example output (run on a directory of all 50 states)
# df
#           area            loc                  seg   ast value    i                        path
# 0      Alabama        summary  Retail & recreation  None   -41    1     output/US-Alabama/1.csv
# 1      Alabama        summary   Grocery & pharmacy  None   -13    2     output/US-Alabama/2.csv
# 2      Alabama        summary                Parks  None    19    3     output/US-Alabama/3.csv
# 3      Alabama        summary     Transit stations  None   -30    4     output/US-Alabama/4.csv
# 4      Alabama        summary            Workplace  None   -32    5     output/US-Alabama/5.csv
# ...        ...            ...                  ...   ...   ...  ...                         ...
# 17095  Wyoming  Weston County   Grocery & pharmacy     *   -24  134   output/US-Wyoming/134.csv
# 17096  Wyoming  Weston County                Parks     *  None  135   output/US-Wyoming/135.csv
# 17097  Wyoming  Weston County     Transit stations     *  None  136   output/US-Wyoming/136.csv
# 17098  Wyoming  Weston County            Workplace     *   -34  137   output/US-Wyoming/137.csv
# 17099  Wyoming  Weston County          Residential     *  None  138   output/US-Wyoming/138.csv
#
# [17100 rows x 7 columns]
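### example: pulling the time series behind one row of df. a minimal
### sketch assuming only what this script sets up: the 'path' column
### points at the csv mobility-report-data-extractor wrote for that
### plot. the columns inside those csvs are whatever the extractor
### emits, so inspect curve.head() before relying on names.
row = df.iloc[0]
curve = pd.read_csv(os.path.join(path, dir_mob, row['path']))
print(row['area'], row['loc'], row['seg'])
print(curve.head())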