Created
October 12, 2016 21:34
-
-
Save gregjurman/3f67128102564fd88fef0dce6582a5b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import json | |
import urllib | |
import boto3 | |
import csv | |
import zipfile | |
# Emitted once, when this module is first imported.
print('Loading function')

# Module-level S3 client; lambda_handler reads it as a global.
s3 = boto3.client(service_name='s3')
# ZIP code prefixes (first five characters of column 5) that are kept.
_DANBURY_ZIP_CODES = frozenset(
    ['06810', '06811', '06817', '06813', '06814', '06816'])

# Destination of the filtered archive.
_OUTBOUND_BUCKET = "cod-yelp-outbound"
_OUTBOUND_KEY = "healthinsp.zip"


def _rejection_message(row):
    """Return the log line explaining why a business row is dropped.

    Columns are read positionally: 0 = id, 1 = name, 3 = city, 4 = state,
    5 = zip code. Checks run in the order city, state, zip and the first
    failing check decides the message.

    Returns None when the row passes every check. The id is only converted
    with int() for rejected rows, so a kept row is never validated that way.
    """
    if str(row[3]).lower() != "danbury":
        template = "Removed '%s'(%i) from file: City not Danbury. Got: %s"
        got = row[3]
    elif str(row[4]).lower() != "ct":
        template = "Removed '%s'(%i) from file: State not CT. Got: %s"
        got = row[4]
    elif str(row[5])[0:5] not in _DANBURY_ZIP_CODES:
        template = "Removed '%s'(%i) from file: Zipcode doesn't belong. Got: %s"
        got = row[5]
    else:
        return None
    return template % (row[1], int(row[0]), got)


def _filter_businesses(zfile, out_path):
    """Copy businesses.csv from the open archive to out_path, dropping rejects.

    Returns a tuple (removed_ids, log): the set of integer ids of the dropped
    businesses and the list of log lines describing each removal.
    """
    removed_ids = set()
    log = []
    # Binary mode for the output matches Python 2's csv module requirements.
    with zfile.open("businesses.csv") as src, open(out_path, "wb") as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        writer.writerow(next(reader))  # write header out
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to
                # test or copy, and indexing it would raise IndexError.
                continue
            message = _rejection_message(row)
            if message is None:
                writer.writerow(row)  # all tests passed
            else:
                removed_ids.add(int(row[0]))
                log.append(message)
    return removed_ids, log


def _copy_rows_excluding(zfile, member, out_path, removed_ids):
    """Copy CSV `member` of the archive to out_path, skipping removed ids.

    The header row is always copied. A data row is skipped when int() of its
    first column is in removed_ids.
    """
    with zfile.open(member) as src, open(out_path, "wb") as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        writer.writerow(next(reader))  # copy header
        for row in reader:
            if not row:
                continue  # blank line, see _filter_businesses
            if int(row[0]) in removed_ids:
                continue
            writer.writerow(row)


def lambda_handler(event, context):
    """Filter an uploaded inspection archive and publish the result.

    Downloads the zip named by the first record of the S3 event, keeps only
    the businesses whose city is "danbury", state is "ct" and zip code is in
    the accepted list, removes the inspections and violations rows belonging
    to the dropped businesses, and uploads a new zip containing
    feed_info.csv, businesses.csv, violations.csv and inspections.csv to the
    outbound bucket. One log line per removed business is printed.

    Note: this code targets Python 2 APIs (urllib.unquote_plus, csv files
    opened in binary mode).

    Args:
        event: S3 notification event; only event['Records'][0] is read.
        context: Lambda context object (unused).

    Returns:
        None.

    Raises:
        Exception: whatever the S3 download or upload raised, re-raised
            after an explanatory message is printed.
    """
    #print("Received event: " + json.dumps(event, indent=2))

    # Get the object from the event
    s3_info = event['Records'][0]['s3']
    bucket = s3_info['bucket']['name']
    key = urllib.unquote_plus(s3_info['object']['key']).decode('utf8')

    try:
        s3.download_file(Bucket=bucket, Key=key, Filename="/tmp/inbound.zip")
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise  # bare raise keeps the original traceback

    # open our inbound zip
    with zipfile.ZipFile("/tmp/inbound.zip") as zfile:
        # filter the businesses, remembering which ids were dropped
        removed_ids, log = _filter_businesses(zfile, "/tmp/businesses.csv")

        # purge inspections and violations of the dropped businesses
        _copy_rows_excluding(
            zfile, "inspections.csv", "/tmp/inspections.csv", removed_ids)
        _copy_rows_excluding(
            zfile, "violations.csv", "/tmp/violations.csv", removed_ids)

        # the feed info data is passed through untouched
        zfile.extract("feed_info.csv", "/tmp/")

    # done purging data, make a new zip file
    with zipfile.ZipFile("/tmp/outbound.zip", "w") as out_zip:
        out_zip.write("/tmp/feed_info.csv", "feed_info.csv")
        out_zip.write("/tmp/businesses.csv", "businesses.csv")
        out_zip.write("/tmp/violations.csv", "violations.csv")
        out_zip.write("/tmp/inspections.csv", "inspections.csv")

    # spit out log for CloudWatch
    for line in log:
        print(line)

    try:
        s3.upload_file(Filename="/tmp/outbound.zip", Bucket=_OUTBOUND_BUCKET, Key=_OUTBOUND_KEY)
    except Exception as e:
        print(e)
        print('Error putting object {} into bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(_OUTBOUND_KEY, _OUTBOUND_BUCKET))
        raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment