Created
August 19, 2019 14:52
-
-
Save jtrussell/1a70b8b888e1957d4cd5cdaedc4e6381 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: wingardium-leviosa
dependencies:
  - python=3.7.3
  - pandas
  - requests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download, combine, and summarize CMS SynPUF inpatient-claims sample data.

Fetches the 20 zip archives that make up the DE-SynPUF inpatient claims
sample for 2008-2010, extracts the CSVs, concatenates them into a single
pandas DataFrame, prints some sanity-check stats, and writes the combined
CSV to ``final/``.
"""
import zipfile
import requests
import os
import pandas as pd
from os.path import join, isfile

# Create a bunch of folders for us to put stuff in
print('Creating directories...')
os.makedirs('tmp/zip', exist_ok=True)
os.makedirs('tmp/extract', exist_ok=True)
os.makedirs('final', exist_ok=True)

# These years define the data range for the synpuf files we want to download
year_from = '2008'
year_to = '2010'

# The URL to download synpuf IP claims. The years are baked in up front;
# ``{file_index}`` is filled per archive in the loop below via str.format.
base_url = (
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/'
    'Downloadable-Public-Use-Files/SynPUFs/Downloads/'
    'DE1_0_{year_from}_to_{year_to}_Inpatient_Claims_Sample_{{file_index}}.zip'
).format(year_from=year_from, year_to=year_to)

# Yeah, they broke the "large" data file into a number of "smaller" ones to make
# it "easier" for people to use
omg_how_many_files_are_there = 20

# We have to download and unzip each file separately.
print('Downloading data files...')
for file_index in range(1, omg_how_many_files_are_there + 1):
    # Download data files - these will be zip archives
    url = base_url.format(file_index=file_index)
    response = requests.get(url)
    # Fail fast on a bad download: without this, an HTML error page would be
    # written to disk as a "zip" and blow up later with a confusing ZipFile
    # error instead of a clear HTTP one.
    response.raise_for_status()
    zip_file_path = f'tmp/zip/ip_claims_{file_index}.zip'
    with open(zip_file_path, 'wb') as zip_file:
        zip_file.write(response.content)
    # Extract the file(s) inside the zip
    with zipfile.ZipFile(zip_file_path, 'r') as zip_archive:
        zip_archive.extractall('tmp/extract')

# Get a list of the data file (CSV) paths. Filter to regular .csv files so
# any stray non-data entries in the extract dir are ignored.
data_files = [
    join('tmp/extract', name)
    for name in os.listdir('tmp/extract')
]
data_files = [
    path for path in data_files
    if isfile(path) and path.lower().endswith('.csv')
]

# Load in each CSV as a data frame
#
# If we wanted to be more precise we could tell pandas what data type to use
# for each column. I.e. integers vs decimals vs strings, etc.
print('Loading data...')
data_frames = [pd.read_csv(path) for path in data_files]

# Combine into single data frame
df = pd.concat(data_frames, ignore_index=True)

# Here, we could perform validation and update routines on the final set. For
# now, we'll just grab some high level stats to demonstrate and make sure we
# actually got something reasonable.
num_rows = len(df)
num_cols = len(df.columns)
pmt_avg = df.CLM_PMT_AMT.mean()
pmt_med = df.CLM_PMT_AMT.median()
pmt_std = df.CLM_PMT_AMT.std()
print('')
print(' Stats:')
print('')
print(f' Total Rows: {num_rows}')
print(f' Total Columns: {num_cols}')
print('')
print(f' Average claims payment: {pmt_avg:,}')
print(f' Standard deviation of claims payments: {pmt_std:,}')
print(f' Median claims payment: {pmt_med:,}')
print('')

# Save the final result
print('Saving final CSV...')
final_path = f'final/ip_claims_{year_from}_{year_to}.csv'
df.to_csv(final_path, index=False)

# Here we could also clean up our tmp files, we'll leave them in for demo
# purposes.
#import shutil
#shutil.rmtree('tmp')
print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment