Last active
September 8, 2020 07:20
-
-
Save cobookman/0dd1ef504da96502cb7e2ad1488f6866 to your computer and use it in GitHub Desktop.
Dump AWS Glue metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Lab Doc | |
https://docs.google.com/document/d/1Kw4dhhjFLvRXUMZzr9HsIDt8mQfccXopP9jbPdd5NX8/edit?usp=sharing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Dumps Databases, Tables, and Partitions from AWS Glue.""" | |
import logging | |
import sys | |
import boto3 | |
import json | |
# Push logs to STDERR. | |
logging.basicConfig(stream=sys.stderr, level=logging.ERROR) | |
def main():
    """Dump every Glue database, with its tables and partitions, to STDOUT as JSON.

    Builds a Glue client from the constants below, lists all databases in the
    catalog, attaches each database's tables under the ``'Tables'`` key and
    each table's partitions under the ``'Partitions'`` key, then prints the
    whole structure as a single JSON document of the form
    ``{"Databases": [...]}``.
    """
    # AWS credentials of a user with Glue GetDatabase/GetDatabases/
    # GetPartition/GetPartitions/GetTable/GetTables IAM permissions.
    # These are placeholders: fill them in locally and never commit real keys.
    ACCESS_KEY = '****'
    SECRET_KEY = '****'
    REGION = 'us-west-2'
    # CatalogId (string) is the AWS Account ID where the Glue catalog resides.
    # If CatalogId is not provided, the authenticated user's AWS Account ID is
    # used by default.
    CATALOG_ID = '***'

    # Use REGION here so the region is configured in exactly one place.
    glue_client = boto3.client(
        service_name='glue',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY)

    databases = get_glue_databases(glue_client, CATALOG_ID)
    for database in databases:
        tables = get_glue_tables(glue_client, CATALOG_ID, database['Name'])
        for table in tables:
            table['Partitions'] = get_glue_partitions(
                glue_client, CATALOG_ID, database['Name'], table['Name'])
        database['Tables'] = tables

    # default=str stringifies any value json cannot serialize natively
    # (presumably the timestamp fields in Glue responses).
    print(json.dumps({'Databases': databases}, indent=2, sort_keys=True, default=str))
def get_glue_partitions(glue_client, catalog_id, database_name, table_name):
    """Collect all partitions of one table, walking every result page.

    Args:
        glue_client: Client object exposing ``get_partitions``.
        catalog_id: Value sent as ``CatalogId``.
        database_name: Value sent as ``DatabaseName``.
        table_name: Value sent as ``TableName``.

    Returns:
        A list holding the ``'Partitions'`` entries of every page, in order.
    """
    collected = []
    token = ''  # The first request is sent with an empty token.
    last_page = False
    while not last_page:
        page = glue_client.get_partitions(
            CatalogId=catalog_id,
            DatabaseName=database_name,
            TableName=table_name,
            NextToken=token)
        collected.extend(page['Partitions'])
        # A page without 'NextToken' is the final one.
        last_page = 'NextToken' not in page
        if not last_page:
            token = page['NextToken']
    return collected
def get_glue_tables(glue_client, catalog_id, database_name):
    """Collect all tables of one database, walking every result page.

    Args:
        glue_client: Client object exposing ``get_tables``.
        catalog_id: Value sent as ``CatalogId``.
        database_name: Value sent as ``DatabaseName``.

    Returns:
        A list holding the ``'TableList'`` entries of every page, in order.
    """
    # Only the token changes between requests, so keep the arguments together.
    request = {
        'CatalogId': catalog_id,
        'DatabaseName': database_name,
        'NextToken': '',
    }
    found = []
    while True:
        page = glue_client.get_tables(**request)
        found.extend(page['TableList'])
        if 'NextToken' in page:
            request['NextToken'] = page['NextToken']
            continue
        # No token in the response: this was the last page.
        return found
def get_glue_databases(glue_client, catalog_id):
    """Grab all Glue databases in a catalog, iterating through pagination.

    Args:
        glue_client: Client object exposing ``get_databases``
            (e.g. ``boto3.client('glue')``).
        catalog_id: Value sent as ``CatalogId``.

    Returns:
        A list holding the ``'DatabaseList'`` entries of every page, in order.
    """
    databases = []
    next_token = ''
    while True:  # Emulating do-while: at least one request is always made.
        # Send the token of the previous page; a constant empty token would
        # fetch the first page again on every iteration.
        resp = glue_client.get_databases(
            CatalogId=catalog_id,
            NextToken=next_token)
        databases += resp['DatabaseList']
        # The token lives in the response, not in the accumulated list.
        # An absent or empty token means there are no more pages.
        next_token = resp.get('NextToken')
        if not next_token:
            break
    return databases
# Run the dump only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
**Note:** If there is a substantial number of tables or databases, you might want to use a worker thread pool to shard the GetTables / GetPartitions requests across many threads.