jsamuel1 · June 9, 2020 04:24
diff --git a/aws-athena-auto-partition-lambda.py b/aws-athena-auto-partition-lambda.py
 # Lambda function to create partition for Cloudtrail log on daily basis.
 # You need to schedule it in AWS Lambda.

 '''
 -------------------------------------------
 AWS Athena Create Partitions Automatically
 -------------------------------------------

 Version 1.0
 Author: SqlAdmin
 Twitter: https://twitter.com/SqlAdmin
 License: Free for educational purpose.                 

 Minor modifications for my personal account:  use environment variables + support organizations.
 NOTE: 
 -----
 1) Before scheduling it, you need to create the partitions up to the current date.
 2) This will start creating partitions from the current day [current date].
 3) This does not report whether the Athena query succeeded. It only
   prints the Query Execution ID.

 HOW THIS WORKS:
 ---------------
 1) It lists the regions for which CloudTrail logs exist in S3, because
   some people only use particular regions and therefore have no logs
   in the other regions.
 2) Then it runs the create-partition query for each of those
   regions.

 Example Cloudtrail Path:
 -----------------------
 s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files

 ENVIRONMENT VARIABLES CAN BE SET:
 ---------------------------
 * organizationid
 * accountid - comma delimited list of accounts
 * s3_bucket - Bucket name where your cloudtrail logs stored.
 * s3_prefix - Path for your cloudtrail logs (give the prefix before the regions. 
   for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
   So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
 * s3_output - Path for where your Athena query results need to be saved.
 * database - Name of the DB where your Cloudtrail logs table located.
 * table_name - Name of the table where your Cloudtrail logs table located.
 * dateoverride - run for a specific date, instead of current date

 DEBUGGING:
 ----------
 1) Comment out the run_query(query, database, s3_output) call at the end of lambda_handler.
 2) Use the print(get_region) and print(query) lines just above it to inspect the region and the generated query.
 ---------------------------------------------------------------------------------'''

 #Import libraries 
 import boto3
 from datetime import datetime
 import os

 # Shared AWS clients, created once at import time and used by the functions below.
 s3 = boto3.client('s3')
 athena = boto3.client('athena')

 # Year, month and day of the partition to create: the 'dateoverride'
 # environment variable (YYYY-MM-DD) when set, otherwise the current date.
 # NOTE: these values are evaluated once, at import time.
 if os.getenv('dateoverride'):
     date = datetime.strptime(os.getenv('dateoverride'), '%Y-%m-%d')
 else:
     date = datetime.now()
 athena_year = str(date.year)
 athena_month = str(date.month).rjust(2, '0')
 athena_day = str(date.day).rjust(2, '0')

 # Parameters for the S3 log location and the Athena table. Each one is read
 # from the environment variable of the same name and falls back to the
 # default shown (read the commented section on top for their meaning).
 organizationid = os.getenv('organizationid') or 'o-xxxxxxx'
 # A boto3 S3 client has no 'accountid' attribute, so when the variable is
 # not set, ask STS for the account ID of the credentials in use.
 accountid = os.getenv('accountid') or boto3.client('sts').get_caller_identity()['Account']
 s3_bucket = os.getenv('s3_bucket') or 'cloudtrail-logs'
 database = os.getenv('database') or 'athena_log_database'
 s3_output = os.getenv('s3_output') or f's3://{s3_bucket}/queryresults'


 #Executing the athena query:
 def run_query(query, database, s3_output):
     """Start an Athena query and return the start_query_execution response.

     Args:
         query: SQL text to execute.
         database: Name of the Athena database used as the query context.
         s3_output: S3 location where Athena writes the query results.

     Returns:
         The response of athena.start_query_execution. Its
         'QueryExecutionId' is also printed. Whether the query later
         succeeds is not checked here.
     """
     request = {
         'QueryString': query,
         'QueryExecutionContext': {'Database': database},
         'ResultConfiguration': {'OutputLocation': s3_output},
     }
     started = athena.start_query_execution(**request)
     print('Execution ID: ' + started['QueryExecutionId'])
     return started
    
 #Main function for get regions and run the query on the captured regions
 def lambda_handler(event, context):
     """Start one ADD PARTITION query per account and region for the run date.

     For each account in the comma-delimited 'accountid' setting, the region
     folders directly under the account's CloudTrail prefix are listed in S3
     and one ALTER TABLE ... ADD PARTITION query per region is started
     through run_query.

     Args:
         event: Lambda invocation event (not used).
         context: Lambda context object (not used).

     Returns:
         None. run_query prints the execution ID of every started query.
     """
     # Work out the partition date on every invocation: Lambda can reuse an
     # already-imported module for later invocations, so a date computed at
     # import time could be stale when the next scheduled run happens.
     override = os.getenv('dateoverride')
     run_date = datetime.strptime(override, '%Y-%m-%d') if override else datetime.now()
     year = str(run_date.year)
     month = str(run_date.month).rjust(2, '0')
     day = str(run_date.day).rjust(2, '0')

     for raw_account in accountid.split(','):
         # Tolerate spaces around the commas and skip empty entries.
         account = raw_account.strip()
         if not account:
             continue
         print(f'account: {account}')

         s3_prefix = os.getenv('s3_prefix') or f'{organizationid}/AWSLogs/{account}/CloudTrail/'
         s3_input = f's3://{s3_bucket}/{s3_prefix}'
         table_name = os.getenv('table_name') or 'cloudtrail_logs_' + account

         result = s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter='/')
         # 'CommonPrefixes' is missing from the response when nothing exists
         # under the prefix; treat that as "no regions" instead of crashing.
         for region_entry in result.get('CommonPrefixes') or []:
             get_region = region_entry.get('Prefix', '').replace(s3_prefix, '').replace('/', '')
             # IF NOT EXISTS keeps a repeated run for the same day from
             # failing on partitions that were already added.
             query = f'''
         ALTER TABLE {table_name}
         ADD IF NOT EXISTS PARTITION (
           region='{get_region}',
           year='{year}',
           month='{month}',
           day='{day}'
         ) location '{s3_input}{get_region}/{year}/{month}/{day}/';
         '''

             print(get_region)  # -- for debug
             print(query)  # -- for debug
             run_query(query, database, s3_output)
	# Lambda function to create partition for Cloudtrail log on daily basis.
	# You need to schedule it in AWS Lambda.

	'''
	-------------------------------------------
	AWS Athena Create Partitions Automatically
	-------------------------------------------

	Version 1.0
	Author: SqlAdmin
	Twitter: https://twitter.com/SqlAdmin
	License: Free for educational purpose.

	Minor modifications for my personal account: use environment variables + support organizations.
	NOTE:
	-----
	1) Before scheduling it, you need to create the partitions up to the current date.
	2) This will start creating partitions from the current day [current date].
	3) This does not report whether the Athena query succeeded. It only
	prints the Query Execution ID.

	HOW THIS WORKS:
	---------------
	1) It lists the regions for which CloudTrail logs exist in S3, because
	some people only use particular regions and therefore have no logs
	in the other regions.
	2) Then it runs the create-partition query for each of those
	regions.

	Example Cloudtrail Path:
	-----------------------
	s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files

	ENVIRONMENT VARIABLES CAN BE SET:
	---------------------------
	* organizationid
	* accountid - comma delimited list of accounts
	* s3_bucket - Bucket name where your cloudtrail logs stored.
	* s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
	for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
	So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
	* s3_output - Path for where your Athena query results need to be saved.
	* database - Name of the DB where your Cloudtrail logs table located.
	* table_name - Name of the table where your Cloudtrail logs table located.
	* dateoverride - run for a specific date, instead of current date

	DEBUGGING:
	----------
	1) Comment out the run_query(query, database, s3_output) call at the end of lambda_handler.
	2) Use the print(get_region) and print(query) lines just above it to inspect the region and the generated query.
	---------------------------------------------------------------------------------'''

	#Import libraries
	import boto3
	from datetime import datetime
	import os

	# Shared AWS clients, created once at import time and used by the functions below.
	s3 = boto3.client('s3')
	athena = boto3.client('athena')

	# Year, month and day of the partition to create: the 'dateoverride'
	# environment variable (YYYY-MM-DD) when set, otherwise the current date.
	# NOTE: these values are evaluated once, at import time.
	if os.getenv('dateoverride'):
	    date = datetime.strptime(os.getenv('dateoverride'), '%Y-%m-%d')
	else:
	    date = datetime.now()
	athena_year = str(date.year)
	athena_month = str(date.month).rjust(2, '0')
	athena_day = str(date.day).rjust(2, '0')

	# Parameters for the S3 log location and the Athena table. Each one is read
	# from the environment variable of the same name and falls back to the
	# default shown (read the commented section on top for their meaning).
	organizationid = os.getenv('organizationid') or 'o-xxxxxxx'
	# A boto3 S3 client has no 'accountid' attribute, so when the variable is
	# not set, ask STS for the account ID of the credentials in use.
	accountid = os.getenv('accountid') or boto3.client('sts').get_caller_identity()['Account']
	s3_bucket = os.getenv('s3_bucket') or 'cloudtrail-logs'
	database = os.getenv('database') or 'athena_log_database'
	s3_output = os.getenv('s3_output') or f's3://{s3_bucket}/queryresults'


	#Executing the athena query:
	def run_query(query, database, s3_output):
	    """Start an Athena query and return the start_query_execution response.

	    Args:
	        query: SQL text to execute.
	        database: Name of the Athena database used as the query context.
	        s3_output: S3 location where Athena writes the query results.

	    Returns:
	        The response of athena.start_query_execution. Its
	        'QueryExecutionId' is also printed. Whether the query later
	        succeeds is not checked here.
	    """
	    query_response = athena.start_query_execution(
	        QueryString=query,
	        QueryExecutionContext={
	            'Database': database,
	        },
	        ResultConfiguration={
	            'OutputLocation': s3_output,
	        },
	    )
	    print('Execution ID: ' + query_response['QueryExecutionId'])
	    return query_response

	#Main function for get regions and run the query on the captured regions
	def lambda_handler(event, context):
	    """Start one ADD PARTITION query per account and region for the run date.

	    For each account in the comma-delimited 'accountid' setting, the region
	    folders directly under the account's CloudTrail prefix are listed in S3
	    and one ALTER TABLE ... ADD PARTITION query per region is started
	    through run_query.

	    Args:
	        event: Lambda invocation event (not used).
	        context: Lambda context object (not used).

	    Returns:
	        None. run_query prints the execution ID of every started query.
	    """
	    # Work out the partition date on every invocation: Lambda can reuse an
	    # already-imported module for later invocations, so a date computed at
	    # import time could be stale when the next scheduled run happens.
	    override = os.getenv('dateoverride')
	    run_date = datetime.strptime(override, '%Y-%m-%d') if override else datetime.now()
	    year = str(run_date.year)
	    month = str(run_date.month).rjust(2, '0')
	    day = str(run_date.day).rjust(2, '0')

	    for raw_account in accountid.split(','):
	        # Tolerate spaces around the commas and skip empty entries.
	        account = raw_account.strip()
	        if not account:
	            continue
	        print(f'account: {account}')

	        s3_prefix = os.getenv('s3_prefix') or f'{organizationid}/AWSLogs/{account}/CloudTrail/'
	        s3_input = f's3://{s3_bucket}/{s3_prefix}'
	        table_name = os.getenv('table_name') or 'cloudtrail_logs_' + account

	        result = s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter='/')
	        # 'CommonPrefixes' is missing from the response when nothing exists
	        # under the prefix; treat that as "no regions" instead of crashing.
	        for region_entry in result.get('CommonPrefixes') or []:
	            get_region = region_entry.get('Prefix', '').replace(s3_prefix, '').replace('/', '')
	            # IF NOT EXISTS keeps a repeated run for the same day from
	            # failing on partitions that were already added.
	            query = f'''
	        ALTER TABLE {table_name}
	        ADD IF NOT EXISTS PARTITION (
	          region='{get_region}',
	          year='{year}',
	          month='{month}',
	          day='{day}'
	        ) location '{s3_input}{get_region}/{year}/{month}/{day}/';
	        '''

	            print(get_region)  # -- for debug
	            print(query)  # -- for debug
	            run_query(query, database, s3_output)