Created
August 19, 2019 14:52
-
-
Save jtrussell/1a70b8b888e1957d4cd5cdaedc4e6381 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: wingardium-leviosa
dependencies:
  - python=3.7.3
  - pandas
  - requests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download, combine, and summarize CMS SynPUF inpatient-claims sample data.

Fetches the 20 zip archives that make up the DE-SynPUF inpatient claims
sample for 2008-2010, extracts the CSVs, concatenates them into a single
pandas DataFrame, prints some sanity-check stats, and writes the combined
CSV to ``final/``.
"""
import zipfile
import requests
import os
import pandas as pd
from os.path import join, isfile

# Create a bunch of folders for us to put stuff in
print('Creating directories...')
os.makedirs('tmp/zip', exist_ok=True)
os.makedirs('tmp/extract', exist_ok=True)
os.makedirs('final', exist_ok=True)

# These years define the data range for the synpuf files we want to download
year_from = '2008'
year_to = '2010'

# The URL to download synpuf IP claims. The years are baked in up front;
# ``{file_index}`` is filled per archive in the loop below via str.format.
base_url = (
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/'
    'Downloadable-Public-Use-Files/SynPUFs/Downloads/'
    'DE1_0_{year_from}_to_{year_to}_Inpatient_Claims_Sample_{{file_index}}.zip'
).format(year_from=year_from, year_to=year_to)

# Yeah, they broke the "large" data file into a number of "smaller" ones to make
# it "easier" for people to use
omg_how_many_files_are_there = 20

# We have to download and unzip each file separately.
print('Downloading data files...')
for file_index in range(1, omg_how_many_files_are_there + 1):
    # Download data files - these will be zip archives
    url = base_url.format(file_index=file_index)
    response = requests.get(url)
    # Fail fast on a bad download: without this, an HTML error page would be
    # written to disk as a "zip" and blow up later with a confusing ZipFile
    # error instead of a clear HTTP one.
    response.raise_for_status()
    zip_file_path = f'tmp/zip/ip_claims_{file_index}.zip'
    with open(zip_file_path, 'wb') as zip_file:
        zip_file.write(response.content)
    # Extract the file(s) inside the zip
    with zipfile.ZipFile(zip_file_path, 'r') as zip_archive:
        zip_archive.extractall('tmp/extract')

# Get a list of the data file (CSV) paths. Filter to regular .csv files so
# any stray non-data entries in the extract dir are ignored.
data_files = [
    join('tmp/extract', name)
    for name in os.listdir('tmp/extract')
]
data_files = [
    path for path in data_files
    if isfile(path) and path.lower().endswith('.csv')
]

# Load in each CSV as a data frame
#
# If we wanted to be more precise we could tell pandas what data type to use
# for each column. I.e. integers vs decimals vs strings, etc.
print('Loading data...')
data_frames = [pd.read_csv(path) for path in data_files]

# Combine into single data frame
df = pd.concat(data_frames, ignore_index=True)

# Here, we could perform validation and update routines on the final set. For
# now, we'll just grab some high level stats to demonstrate and make sure we
# actually got something reasonable.
num_rows = len(df)
num_cols = len(df.columns)
pmt_avg = df.CLM_PMT_AMT.mean()
pmt_med = df.CLM_PMT_AMT.median()
pmt_std = df.CLM_PMT_AMT.std()
print('')
print(' Stats:')
print('')
print(f' Total Rows: {num_rows}')
print(f' Total Columns: {num_cols}')
print('')
print(f' Average claims payment: {pmt_avg:,}')
print(f' Standard deviation of claims payments: {pmt_std:,}')
print(f' Median claims payment: {pmt_med:,}')
print('')

# Save the final result
print('Saving final CSV...')
final_path = f'final/ip_claims_{year_from}_{year_to}.csv'
df.to_csv(final_path, index=False)

# Here we could also clean up our tmp files, we'll leave them in for demo
# purposes.
#import shutil
#shutil.rmtree('tmp')
print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment