Created
June 21, 2016 19:04
-
-
Save bblincoe/c1f647c55c704b6e0b18f70fec4b999b to your computer and use it in GitHub Desktop.
Copy timestamped data from AWS S3 to HDFS in Python (boto3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import botocore | |
def copy_s3_to_hdfs(client, job_flow_id, day,
                    src='s3n://path/to/my/data',
                    dest='hdfs:///path/to/hdfs/input'):
    """Add an s3-dist-cp step that copies one day's files from S3 to HDFS.

    The step is submitted by passing ``client.add_job_flow_steps`` and its
    keyword arguments to ``client_throttle``. Files are selected with a
    ``--srcPattern`` regex built from a fixed ``prefix`` marker, ``day``
    formatted with ``PATTERN_DATE_FORMAT``, and a three-part numeric
    timestamp.

    Args:
        client: Object exposing ``add_job_flow_steps`` (presumably a boto3
            EMR client -- confirm against the caller).
        job_flow_id: Identifier of the job flow the step is added to.
        day: Object with a ``strftime`` method (e.g. ``datetime.date``)
            naming the day whose files should be copied.
        src: Source location handed to ``--src``. Defaults to the
            placeholder path the function previously hard-coded.
        dest: Destination location handed to ``--dest``. Defaults to the
            placeholder path the function previously hard-coded.

    Returns:
        Whatever ``client_throttle`` returns for the submitted call.

    Note:
        ``client_throttle`` and ``PATTERN_DATE_FORMAT`` are expected to be
        defined elsewhere in this module.
    """
    # NOTE(review): the '.' characters in this pattern are unescaped, so each
    # one matches ANY single character rather than a literal dot. Left as-is
    # because that may be intentional (arbitrary separators) -- confirm.
    src_pattern = (
        '.*prefix.'
        + day.strftime(PATTERN_DATE_FORMAT)
        + '.([0-9]+.[0-9]+.[0-9]+).*'
    )

    copy_step = {
        'Name': 'Copy from S3 to HDFS',
        # A failed copy terminates the whole cluster.
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': '/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar',
            'Args': [
                '--s3Endpoint',
                's3.amazonaws.com',
                '--src',
                src,
                '--srcPattern',
                src_pattern,
                '--dest',
                dest,
            ],
        },
    }

    return client_throttle(
        client.add_job_flow_steps,
        JobFlowId=job_flow_id,
        Steps=[copy_step],
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment