Last active
July 10, 2023 01:41
-
-
Save geminixiang/93042b06131e416a6299dee03e060b60 to your computer and use it in GitHub Desktop.
Top N table sizes in BigQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ref. https://stackoverflow.com/a/59633371
#
# Walks every GCP project visible to the active credentials, prints each
# BigQuery dataset and the size of every table in it, then reports the
# TOP_N largest tables per project.
#
# Before using:
#   pip install oauth2client google-api-python-client google-cloud-bigquery
#   export GOOGLE_APPLICATION_CREDENTIALS=<CREDENTIALS>.json
import logging

from google.cloud import bigquery
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

TOP_N = 10  # number of largest tables to report per project
BYTES_PER_GIB = 1024 ** 3  # 1073741824

logging.basicConfig(level=logging.INFO)

# Leverage the Application Default Credentials for authentication.
credentials = GoogleCredentials.get_application_default()
service = discovery.build("cloudresourcemanager", "v1", credentials=credentials)

# List every project these credentials can see.
response = service.projects().list().execute()

for project in response.get("projects", []):
    project_id = project["projectId"]
    try:
        # Start the client scoped to the right project.
        client = bigquery.Client(project_id)
        datasets = list(client.list_datasets())
        if not datasets:
            print("{} project does not contain any datasets.".format(project_id))
            continue

        print("Datasets in project {}:".format(project_id))
        # (dataset_id, table_id, size_gib) for every table in the project.
        all_tables = []
        for dataset in datasets:
            print(" - {}".format(dataset.dataset_id))
            # __TABLES__ is BigQuery's per-dataset meta-table: one row per
            # table with its size in bytes. Fully qualify and backtick-quote
            # the reference so dataset ids needing quoting still work.
            query = (
                "SELECT table_id, size_bytes / {} AS sizeGiB "
                "FROM `{}.{}.__TABLES__`".format(
                    BYTES_PER_GIB, project_id, dataset.dataset_id
                )
            )
            for table in client.query(query).result():
                print("\t{}: {} GiB".format(table.table_id, table.sizeGiB))
                all_tables.append(
                    (dataset.dataset_id, table.table_id, table.sizeGiB)
                )

        if all_tables:
            # Largest first; print the top TOP_N (or fewer if the project
            # has fewer tables than that).
            all_tables.sort(key=lambda row: row[2], reverse=True)
            print("Max table sizes in project {}:".format(project_id))
            for dataset_id, table_id, size_gib in all_tables[:TOP_N]:
                print("\t{}.{}: {} GiB".format(dataset_id, table_id, size_gib))
    except Exception:
        # Best-effort per project: some projects have BigQuery disabled or
        # deny access. Log the failure instead of silently swallowing it
        # (the original `pass` also hid a NameError on the no-datasets path).
        logging.exception("Skipping project %s", project_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment