Last active
July 10, 2023 01:41
-
-
Save geminixiang/93042b06131e416a6299dee03e060b60 to your computer and use it in GitHub Desktop.
Top N table sizes in BigQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ref. https://stackoverflow.com/a/59633371
#
# Walks every GCP project visible to the active credentials, prints each
# BigQuery dataset and the size of every table in it, then reports the
# TOP_N largest tables per project.
#
# Before using:
#   pip install oauth2client google-api-python-client google-cloud-bigquery
#   export GOOGLE_APPLICATION_CREDENTIALS=<CREDENTIALS>.json
import logging

from google.cloud import bigquery
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

TOP_N = 10  # number of largest tables to report per project
BYTES_PER_GIB = 1024 ** 3  # 1073741824

logging.basicConfig(level=logging.INFO)

# Leverage the Application Default Credentials for authentication.
credentials = GoogleCredentials.get_application_default()
service = discovery.build("cloudresourcemanager", "v1", credentials=credentials)

# List every project these credentials can see.
response = service.projects().list().execute()

for project in response.get("projects", []):
    project_id = project["projectId"]
    try:
        # Start the client scoped to the right project.
        client = bigquery.Client(project_id)
        datasets = list(client.list_datasets())
        if not datasets:
            print("{} project does not contain any datasets.".format(project_id))
            continue

        print("Datasets in project {}:".format(project_id))
        # (dataset_id, table_id, size_gib) for every table in the project.
        all_tables = []
        for dataset in datasets:
            print(" - {}".format(dataset.dataset_id))
            # __TABLES__ is BigQuery's per-dataset meta-table: one row per
            # table with its size in bytes. Fully qualify and backtick-quote
            # the reference so dataset ids needing quoting still work.
            query = (
                "SELECT table_id, size_bytes / {} AS sizeGiB "
                "FROM `{}.{}.__TABLES__`".format(
                    BYTES_PER_GIB, project_id, dataset.dataset_id
                )
            )
            for table in client.query(query).result():
                print("\t{}: {} GiB".format(table.table_id, table.sizeGiB))
                all_tables.append(
                    (dataset.dataset_id, table.table_id, table.sizeGiB)
                )

        if all_tables:
            # Largest first; print the top TOP_N (or fewer if the project
            # has fewer tables than that).
            all_tables.sort(key=lambda row: row[2], reverse=True)
            print("Max table sizes in project {}:".format(project_id))
            for dataset_id, table_id, size_gib in all_tables[:TOP_N]:
                print("\t{}.{}: {} GiB".format(dataset_id, table_id, size_gib))
    except Exception:
        # Best-effort per project: some projects have BigQuery disabled or
        # deny access. Log the failure instead of silently swallowing it
        # (the original `pass` also hid a NameError on the no-datasets path).
        logging.exception("Skipping project %s", project_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment