Last active
June 7, 2022 09:42
-
-
Save sounishnath003/a6dd848cff9c07046d7783435c051605 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
# _* coding: utf8 *_ | |
filename: excel_to_bigquery_connector.py | |
@author: sounishnath | |
createdAt: 2022-06-03 23:47:47 | |
# Modules Required: | |
- pip install google-cloud-bigquery | |
- pip install pandas-gbq | |
""" | |
# Upload Excel file to BigQuery Table from local file system | |
from argparse import ArgumentParser | |
from glob import glob | |
from tqdm import tqdm | |
def upload_and_convert_excel_files_into_bigquery( | |
directory_path, bq_project_name, credential_json_filepath | |
): | |
""" | |
- Grab all the excel files from the directory_path | |
- Create a BigQuery table from the excel files | |
- Upload the excel files to the BigQuery table | |
""" | |
# Import the necessary packages | |
import os | |
import pandas as pd | |
from google.cloud import bigquery | |
from google.oauth2 import service_account | |
# Set the environment variable | |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_json_filepath | |
# Initialize the credentials | |
credentials = service_account.Credentials.from_service_account_file( | |
credential_json_filepath | |
) | |
# Initialize the BigQuery client | |
client = bigquery.Client(credentials=credentials, project=bq_project_name) | |
# Grab the excel files from the directory_path | |
excel_files = glob("{}/*.xlsx".format(directory_path)) | |
# Create a BigQuery table from the excel files | |
for excel_file in tqdm(excel_files): | |
# Grab the excel file name | |
excel_file_name = os.path.basename(excel_file) | |
# Grab the excel file name without the extension | |
excel_file_name_without_extension = os.path.splitext(excel_file_name)[0] | |
# Grab the excel file name without the extension | |
excel_file_name_without_extension = os.path.splitext(excel_file_name)[0] | |
# Create the table name | |
table_name = "{}:samples.{}".format( | |
bq_project_name, excel_file_name_without_extension | |
) | |
df = pd.read_excel(excel_file) | |
column_names = df.columns.values.tolist() | |
# Create the table schema | |
table_schema = [ | |
bigquery.SchemaField(col, "STRING", mode="REQUIRED") | |
if df[col].dtype == "object" | |
else bigquery.SchemaField(col, "INT", mode="REQUIRED") | |
for col in column_names | |
] | |
# Create the table | |
table = bigquery.Table(table_name, schema=table_schema) | |
table = client.create_table(table) | |
# Upload the excel files to the BigQuery table | |
df.to_gbq( | |
table_name, | |
project_id=bq_project_name, | |
if_exists="append", | |
credentials=credentials, | |
progress_bar=True, | |
location='asia-south1', # Mumbai | |
) | |
if __name__ == "__main__": | |
parser=ArgumentParser(description="migrate directory excel files into BQ", usage="excel-to-bigquery --directory='' --bq_project_name=<name> --credentials=<credentials.json>") | |
parser.add_argument('--directory', type=str) | |
parser.add_argument('--bq_project_name', type=str) | |
parser.add_argument('--credentials', type=str) | |
config=vars(parser.parse_args()) | |
print(config) | |
# Set the directory path | |
directory_path = config.get('directory') | |
# Set the BigQuery project name | |
bq_project_name = config.get('bq_project_name') | |
# Set the credential json file path | |
credential_json_filepath = config.get('credentials') | |
# Upload the excel files to the BigQuery table | |
upload_and_convert_excel_files_into_bigquery( | |
directory_path, bq_project_name, credential_json_filepath | |
) | |
# References | |
# https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment