dannguyen · January 20, 2016 07:16
diff --git a/_README.md b/_README.md
diff --git a/collate.py b/collate.py
 ## This expects fetcher.py to have been run first.
 ## It compiles all the downloaded spreadsheets into two output files:
 ##   - k-immune.csv
 ##   - k-immune.json

 import csv
 import json
 import os.path
 import re
 from glob import glob
 from os import makedirs
 from xlrd import open_workbook
 XLS_DIR = "./data-hold/xls/immunization"
 FINISHED_DIR = './data-hold/finished'

 # Column names zipped, in order, onto each data row of a spreadsheet whose
 # school year starts before 2012.
 pre_2012_headers = ['school_code', 'county', 'school_type', 'district_code', 'school_name',
 'enrollment', 'uptodate_num', 'uptodate_pct', 'conditional_num', 'conditional_pct',
 'pme_num', 'pme_pct', 'pbe_num', 'pbe_pct', 'dtp_num', 'dtp_pct', 'polio_num',
 'polio_pct', 'mmr1_num', 'mmr1_pct', 'mmr2_num', 'mmr2_pct', 'hepb_num', 'hepb_pct',
 'vari_num', 'vari_pct']
 # Column names used for school years starting in 2012 or later.
 post_2012_headers = ['school_code', 'county', 'school_type', 'district_name', 'city',
 'school_name', 'enrollment', 'uptodate_num', 'uptodate_pct', 'conditional_num', 'conditional_pct',
 'pme_num', 'pme_pct', 'pbe_num', 'pbe_pct', 'dtp_num', 'dtp_pct', 'polio_num',
 'polio_pct', 'mmr2_num', 'mmr2_pct', 'hepb_num', 'hepb_pct', 'vari_num', 'vari_pct', 'reported']
 # differences between pre/post 2012:
 #  post-2012 only records 2-dose MMR, e.g. `mmr2_num` and `mmr2_pct`
 #  post-2012 lists `district_name` and `city` instead of `district_code`,
 #  and adds a trailing `reported` column

 makedirs(XLS_DIR, exist_ok=True)
 makedirs(FINISHED_DIR, exist_ok=True)

 # Compiled once: a data row is one whose first cell contains 7 consecutive digits.
 school_code_re = re.compile(r'\d{7}')
 year_range_re = re.compile(r'(\d{4})-(\d{4})')

 data = []
 # glob returns paths in arbitrary order; sort so the output is the same on every run
 for xlsname in sorted(glob(os.path.join(XLS_DIR, '*.xls*'))):
     # extract the starting year from the file name
     # e.g. "2006" from "K--2006-2007.xls"
     year_match = year_range_re.search(xlsname)
     if year_match is None:
         # not one of the files named by the fetching script; don't crash on it
         print("Skipping", xlsname, "(no year range in file name)")
         continue
     year = int(year_match.group(1))
     headers = pre_2012_headers if year < 2012 else post_2012_headers
     # open the Excel workbook
     book = open_workbook(xlsname)
     # open the first non-empty spreadsheet
     sheet = next((s for s in book.sheets() if s.nrows > 0), None)
     if sheet is None:
         print("Skipping", xlsname, "(no non-empty sheet)")
         continue
     print(xlsname, "has", sheet.nrows, "rows")
     # Row 0 is skipped. Every other row, including the last one, is checked:
     # rows whose first cell has no 7-digit run are ignored by the test below.
     for x in range(1, sheet.nrows):
         row = sheet.row_values(x)
         if school_code_re.search(str(row[0])):
             d = dict(zip(headers, row))
             d['year'] = year
             data.append(d)


 print("There are", len(data), 'data rows all together')

 # write a JSON
 # Note: fields that don't exist in a given layout are *not* included as null values.
 #       they are simply left out of each dict
 jname = os.path.join(FINISHED_DIR, 'k-immune.json')
 print("Writing to JSON:", jname)
 with open(jname, "w") as jfile:
     jfile.write(json.dumps(data, indent=4))

 # write a CSV
 cname = os.path.join(FINISHED_DIR, 'k-immune.csv')
 print("Writing to CSV:", cname)
 # Union of both layouts plus 'year', de-duplicated in first-seen order.
 # (A set would give a different column order from one run to the next.)
 fieldnames = []
 for name in pre_2012_headers + post_2012_headers + ['year']:
     if name not in fieldnames:
         fieldnames.append(name)
 # newline='' is what the csv module expects of the file it writes to;
 # the with-block makes sure the file is flushed and closed.
 with open(cname, 'w', encoding='utf-8', newline='') as cfile:
     writer = csv.DictWriter(cfile, fieldnames=fieldnames, delimiter=',')
     writer.writeheader()
     # fields missing from a row's layout are written as empty cells
     writer.writerows(data)
diff --git a/fetcher.py b/fetcher.py
 # just fetches the spreadsheets from the California site
 import re
 import os.path
 import requests
 from urllib.parse import urljoin
 from lxml import html
 from os import makedirs

 XLS_DIR = "./data-hold/xls/immunization"
 INDEX_URL = "http://www.cdph.ca.gov/programs/immunize/pages/immunizationlevels.aspx"
 makedirs(XLS_DIR, exist_ok=True)
 # Download the HTML listing
 response = requests.get(INDEX_URL)
 # without the index page there is nothing to do, so fail loudly on an HTTP error
 response.raise_for_status()
 doc = html.fromstring(response.text)
 # hrefs of every link whose URL mentions both "Kinder" and "xls"
 all_urls = doc.xpath('//a[contains(@href, "Kinder") and contains(@href, "xls")]/@href')
 for url in all_urls:
     # The second year may be written with 2 or 4 digits; either way only
     # its last 2 digits are captured.
     year_match = re.search(r'(\d{2})-(?:\d{2})?(\d{2})', url)
     if year_match is None:
         print("Skipping (no year range in URL):", url)
         continue
     # y1 and y2 are the 2 digit years
     y1, y2 = year_match.groups()
     ext = os.path.splitext(url)[1]

     # Now rename to a proper year
     # e.g. ./data-hold/xls/K--2005-2006.xls
     oname = os.path.join(XLS_DIR, "K--20{0}-20{1}{2}".format(y1, y2, ext))
     full_url = urljoin(INDEX_URL, url)
     print("Downloading:\n {0}\n into: {1}".format(full_url, oname))

     # Sample output:
     # Downloading:
     #  http://www.cdph.ca.gov/programs/immunize/Documents/2007-2008%20CA%20Kindergarten%20Data.xls
     #  into: ./data-hold/xls/K--2007-2008.xls
     xlsfile = requests.get(full_url)
     if not xlsfile.ok:
         # don't save an HTTP error page under a spreadsheet file name
         print("Download failed with status", xlsfile.status_code, "for:", full_url)
         continue
     with open(oname, 'wb') as ofile:
         ofile.write(xlsfile.content)
diff --git a/get_cde_data.py b/get_cde_data.py
 # In progress...this script, when done, should produce a lookup table to find CDS code given a school code and county name/code

 # Ideally, this would run _after_ the collate.py step, which could attach CDScodes to the health data...however
 # there are a number of complexities, foremost being that school districts change over the years, and the CDE only 
 # has the file for the current date. So a decision has to be made by the user on how much to care about tracking
 # historical data across different schools.

 # Landing page
 # http://www.cde.ca.gov/ds/si/ds/pubschls.asp
 # More about schools
 # http://www.cde.ca.gov/ds/si/ds/

 from os import makedirs
 from urllib.request import urlretrieve
 import csv
 import os.path
 import shutil

 DATA_DIR = './data-hold/cde'
 # For each data set: the remote URL to fetch and the local path to save it to.
 DATA_FILES = {
     'district': {
         "url": 'http://www.cde.ca.gov/ds/si/ds/documents/legdist2014.xls',
         'local': os.path.join(DATA_DIR, 'legdist2014.xls')
     },
     'schools': {
         "url": 'ftp://ftp.cde.ca.gov/demo/schlname/pubschls.txt',
         'local': os.path.join(DATA_DIR, 'pubschls.txt')
     }
 }
 # TODO: pubschls gets updated daily; might be worth timestamping it upon each download
 makedirs(DATA_DIR, exist_ok=True)
 for d in DATA_FILES.values():
     # Download straight to the local path. Called without a target file name,
     # urlretrieve saves to a temporary file that is left behind afterwards.
     urlretrieve(d['url'], d['local'])
     print("Copied", d['url'], 'to:', d['local'])

 # Now work with the local schools file:
 # TODO
 # txt = open(DATA_FILES['schools']['local'], encoding = 'latin-1').read()
 # rows = list(csv.DictReader(txt.splitlines(), delimiter = "\t"))
	## This expects fetcher.py to have been run first.
	## It compiles all the downloaded spreadsheets into two output files:
	## - k-immune.csv
	## - k-immune.json

	import csv
	import json
	import os.path
	import re
	from glob import glob
	from os import makedirs
	from xlrd import open_workbook
	XLS_DIR = "./data-hold/xls/immunization"
	FINISHED_DIR = './data-hold/finished'

	# Column names zipped, in order, onto each data row of a spreadsheet whose
	# school year starts before 2012.
	pre_2012_headers = ['school_code', 'county', 'school_type', 'district_code', 'school_name',
	'enrollment', 'uptodate_num', 'uptodate_pct', 'conditional_num', 'conditional_pct',
	'pme_num', 'pme_pct', 'pbe_num', 'pbe_pct', 'dtp_num', 'dtp_pct', 'polio_num',
	'polio_pct', 'mmr1_num', 'mmr1_pct', 'mmr2_num', 'mmr2_pct', 'hepb_num', 'hepb_pct',
	'vari_num', 'vari_pct']
	# Column names used for school years starting in 2012 or later.
	post_2012_headers = ['school_code', 'county', 'school_type', 'district_name', 'city',
	'school_name', 'enrollment', 'uptodate_num', 'uptodate_pct', 'conditional_num', 'conditional_pct',
	'pme_num', 'pme_pct', 'pbe_num', 'pbe_pct', 'dtp_num', 'dtp_pct', 'polio_num',
	'polio_pct', 'mmr2_num', 'mmr2_pct', 'hepb_num', 'hepb_pct', 'vari_num', 'vari_pct', 'reported']
	# differences between pre/post 2012:
	#  post-2012 only records 2-dose MMR, e.g. `mmr2_num` and `mmr2_pct`
	#  post-2012 lists `district_name` and `city` instead of `district_code`,
	#  and adds a trailing `reported` column

	makedirs(XLS_DIR, exist_ok=True)
	makedirs(FINISHED_DIR, exist_ok=True)

	# Compiled once: a data row is one whose first cell contains 7 consecutive digits.
	school_code_re = re.compile(r'\d{7}')
	year_range_re = re.compile(r'(\d{4})-(\d{4})')

	data = []
	# '*.xls*' picks up both .xls and .xlsx downloads (a bare '.xls' pattern
	# would only match a file literally named ".xls").
	# glob returns paths in arbitrary order; sort so the output is the same on every run
	for xlsname in sorted(glob(os.path.join(XLS_DIR, '*.xls*'))):
	    # extract the starting year from the file name
	    # e.g. "2006" from "K--2006-2007.xls"
	    year_match = year_range_re.search(xlsname)
	    if year_match is None:
	        # not one of the files named by the fetching script; don't crash on it
	        print("Skipping", xlsname, "(no year range in file name)")
	        continue
	    year = int(year_match.group(1))
	    headers = pre_2012_headers if year < 2012 else post_2012_headers
	    # open the Excel workbook
	    book = open_workbook(xlsname)
	    # open the first non-empty spreadsheet
	    sheet = next((s for s in book.sheets() if s.nrows > 0), None)
	    if sheet is None:
	        print("Skipping", xlsname, "(no non-empty sheet)")
	        continue
	    print(xlsname, "has", sheet.nrows, "rows")
	    # Row 0 is skipped. Every other row, including the last one, is checked:
	    # rows whose first cell has no 7-digit run are ignored by the test below.
	    for x in range(1, sheet.nrows):
	        row = sheet.row_values(x)
	        if school_code_re.search(str(row[0])):
	            d = dict(zip(headers, row))
	            d['year'] = year
	            data.append(d)


	print("There are", len(data), 'data rows all together')

	# write a JSON
	# Note: fields that don't exist in a given layout are *not* included as null values.
	#       they are simply left out of each dict
	jname = os.path.join(FINISHED_DIR, 'k-immune.json')
	print("Writing to JSON:", jname)
	with open(jname, "w") as jfile:
	    jfile.write(json.dumps(data, indent=4))

	# write a CSV
	cname = os.path.join(FINISHED_DIR, 'k-immune.csv')
	print("Writing to CSV:", cname)
	# Union of both layouts plus 'year', de-duplicated in first-seen order.
	# (A set would give a different column order from one run to the next.)
	fieldnames = []
	for name in pre_2012_headers + post_2012_headers + ['year']:
	    if name not in fieldnames:
	        fieldnames.append(name)
	# newline='' is what the csv module expects of the file it writes to;
	# the with-block makes sure the file is flushed and closed.
	with open(cname, 'w', encoding='utf-8', newline='') as cfile:
	    writer = csv.DictWriter(cfile, fieldnames=fieldnames, delimiter=',')
	    writer.writeheader()
	    # fields missing from a row's layout are written as empty cells
	    writer.writerows(data)
	# just fetches the spreadsheets from the California site
	import re
	import os.path
	import requests
	from urllib.parse import urljoin
	from lxml import html
	from os import makedirs

	XLS_DIR = "./data-hold/xls/immunization"
	INDEX_URL = "http://www.cdph.ca.gov/programs/immunize/pages/immunizationlevels.aspx"
	makedirs(XLS_DIR, exist_ok=True)
	# Download the HTML listing
	response = requests.get(INDEX_URL)
	# without the index page there is nothing to do, so fail loudly on an HTTP error
	response.raise_for_status()
	doc = html.fromstring(response.text)
	# hrefs of every link whose URL mentions both "Kinder" and "xls"
	all_urls = doc.xpath('//a[contains(@href, "Kinder") and contains(@href, "xls")]/@href')
	for url in all_urls:
	    # The second year may be written with 2 or 4 digits; either way only
	    # its last 2 digits are captured.
	    year_match = re.search(r'(\d{2})-(?:\d{2})?(\d{2})', url)
	    if year_match is None:
	        print("Skipping (no year range in URL):", url)
	        continue
	    # y1 and y2 are the 2 digit years
	    y1, y2 = year_match.groups()
	    ext = os.path.splitext(url)[1]

	    # Now rename to a proper year
	    # e.g. ./data-hold/xls/K--2005-2006.xls
	    oname = os.path.join(XLS_DIR, "K--20{0}-20{1}{2}".format(y1, y2, ext))
	    full_url = urljoin(INDEX_URL, url)
	    print("Downloading:\n {0}\n into: {1}".format(full_url, oname))

	    # Sample output:
	    # Downloading:
	    #  http://www.cdph.ca.gov/programs/immunize/Documents/2007-2008%20CA%20Kindergarten%20Data.xls
	    #  into: ./data-hold/xls/K--2007-2008.xls
	    xlsfile = requests.get(full_url)
	    if not xlsfile.ok:
	        # don't save an HTTP error page under a spreadsheet file name
	        print("Download failed with status", xlsfile.status_code, "for:", full_url)
	        continue
	    with open(oname, 'wb') as ofile:
	        ofile.write(xlsfile.content)
	# In progress...this script, when done, should produce a lookup table to find CDS code given a school code and county name/code

	# Ideally, this would run _after_ the collate.py step, which could attach CDScodes to the health data...however
	# there are a number of complexities, foremost being that school districts change over the years, and the CDE only
	# has the file for the current date. So a decision has to be made by the user on how much to care about tracking
	# historical data across different schools.

	# Landing page
	# http://www.cde.ca.gov/ds/si/ds/pubschls.asp
	# More about schools
	# http://www.cde.ca.gov/ds/si/ds/

	from os import makedirs
	from urllib.request import urlretrieve
	import csv
	import os.path
	import shutil

	DATA_DIR = './data-hold/cde'
	# For each data set: the remote URL to fetch and the local path to save it to.
	DATA_FILES = {
	    'district': {
	        "url": 'http://www.cde.ca.gov/ds/si/ds/documents/legdist2014.xls',
	        'local': os.path.join(DATA_DIR, 'legdist2014.xls')
	    },
	    'schools': {
	        "url": 'ftp://ftp.cde.ca.gov/demo/schlname/pubschls.txt',
	        'local': os.path.join(DATA_DIR, 'pubschls.txt')
	    }
	}
	# TODO: pubschls gets updated daily; might be worth timestamping it upon each download
	makedirs(DATA_DIR, exist_ok=True)
	for d in DATA_FILES.values():
	    # Download straight to the local path. Called without a target file name,
	    # urlretrieve saves to a temporary file that is left behind afterwards.
	    urlretrieve(d['url'], d['local'])
	    print("Copied", d['url'], 'to:', d['local'])

	# Now work with the local schools file:
	# TODO
	# txt = open(DATA_FILES['schools']['local'], encoding = 'latin-1').read()
	# rows = list(csv.DictReader(txt.splitlines(), delimiter = "\t"))