bucket-migration
#!/bin/bash
# Purpose: This Bash script automates data migration between BigQuery projects, using Google Cloud
# Storage as intermediate storage. It exports table schemas to JSON, extracts table data into
# Cloud Storage buckets, copies that data between buckets, and loads it back into BigQuery in the
# destination project. It also handles bucket creation for organized data and added redundancy.
# Value: For GCP users, this script simplifies and speeds up data migration tasks between BigQuery
# projects, making it a useful resource for data management.
# How to Run: Before executing, set the environment variables `SOURCE_PROJECT_ID`,
# `DESTINATION_PROJECT_ID`, and `DATASET_NAME`. Then prepare an input file listing the table names
# you want to process, one per line. Run the script in your terminal using:
# `./script.sh input_file.txt`, replacing `script.sh` with your file name and `input_file.txt`
# with your table list. Pass `-v` to enable bash trace output.
# Steps Covered:
# 1. Export Table Schemas to JSON: documents each table's structure for the load step.
# 2. Load CSV Data into BigQuery: recreates the exported tables in the destination project.
# 3. Efficient Data Transfer: tunes gsutil for faster copies between Cloud Storage buckets.
# 4. Bucket Management: creates the required storage buckets, keeping the migration organized.
# Overall, the script accelerates and simplifies BigQuery-to-BigQuery migration, offering an
# efficient approach to data management and backup. An example invocation is sketched below.
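# Worked example (a sketch; the project IDs, dataset, table names, and file names below are
# placeholders, not values from this gist). Given a file `tables.txt` with one table name per line:
#   orders
#   customers
# the script could be invoked as:
#   SOURCE_PROJECT_ID=my-src-project DESTINATION_PROJECT_ID=my-dst-project \
#   DATASET_NAME=analytics ./bucket-migration.sh tables.txt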
# Enable bash command tracing when -v is passed anywhere on the command line.
if [[ "$*" == *-v* ]]; then
  set -x
fi
# Required configuration, taken from the environment.
source_project_id="${SOURCE_PROJECT_ID}"
destination_project_id="${DESTINATION_PROJECT_ID}"
dataset_name="${DATASET_NAME}"
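# Added safety check (an assumption, not part of the original gist): fail fast when any required
# environment variable is missing, so later bq/gsutil calls do not operate on empty names.
if [ -z "${source_project_id}" ] || [ -z "${destination_project_id}" ] || [ -z "${dataset_name}" ]; then
  echo "Error: SOURCE_PROJECT_ID, DESTINATION_PROJECT_ID, and DATASET_NAME must be set." >&2
  exit 1
fi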
# Export a table's schema from the source project to a local JSON file named <table>.json.
export_schema_to_json() {
  local table_name=$1
  bq show --format=json "${source_project_id}:${dataset_name}.${table_name}" | jq '.schema.fields' > "${table_name}.json"
  echo "Schema exported to ${table_name}.json"
}
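# For reference, the exported file holds the BigQuery schema field list; the field names and types
# below are illustrative only:
#   [ { "name": "id", "type": "INTEGER", "mode": "NULLABLE" },
#     { "name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE" } ]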
# Load the exported CSV shards into the destination project, using the schema JSON exported earlier.
# The bucket name must match the destination bucket written by move_between_buckets.
load_data_from_csv() {
  local table_name=$1
  local bucket_name="${table_name}-bkp-ax"
  bq load --source_format=CSV --skip_leading_rows=1 "${destination_project_id}:${dataset_name}.${table_name}" "gs://${bucket_name}/*" "./${table_name}.json"
  echo "Data loaded to ${destination_project_id}:${dataset_name}.${table_name}"
}
# Copy a bucket's contents into the sibling "<bucket>-ax" bucket, with gsutil tuned for parallel,
# multi-threaded transfers. Copying "gs://<bucket>/*" (rather than the bucket itself) keeps object
# names flat in the destination so the later bq load wildcard matches them.
move_between_buckets() {
  local bucket_name=$1
  gsutil -o "GSUtil:parallel_composite_upload_threshold=150M" \
    -o "GSUtil:parallel_thread_count=10" \
    -o "GSUtil:check_hashes=if_fast_else_skip" \
    -h "Content-Encoding:gzip" \
    -o "GSUtil:sliced_object_download=true" \
    -o "GSUtil:sliced_object_upload=true" \
    -m cp -r "gs://${bucket_name}/*" "gs://${bucket_name}-ax/"
}
# Create a bucket in the given project if it does not already exist. Bucket URLs are global,
# so existence is checked against gs://<bucket> rather than a project-scoped path.
check_and_create_bucket() {
  local project_id=$1
  local bucket_name=$2
  if gsutil ls -b "gs://${bucket_name}" &>/dev/null; then
    echo "Bucket gs://${bucket_name} already exists."
  else
    gsutil mb -p "${project_id}" "gs://${bucket_name}"
  fi
}
# Extract a table from the source project into a "<table>-bkp" bucket as GZIP-compressed CSV
# shards, then copy those shards into the "<table>-bkp-ax" bucket used by the load step.
export_table_to_bucket() {
  local table_name=$1
  local bucket_name="${table_name}-bkp"
  check_and_create_bucket "${destination_project_id}" "${bucket_name}"
  check_and_create_bucket "${destination_project_id}" "${bucket_name}-ax"
  bq extract --compression GZIP "${source_project_id}:${dataset_name}.${table_name}" "gs://${bucket_name}/x-*"
  move_between_buckets "${bucket_name}"
  echo "Data migrated from ${source_project_id}:${dataset_name}.${table_name} to gs://${bucket_name}-ax"
}
file_path=$1
if [ ! -f "$file_path" ]; then
  echo "Error: File '$file_path' not found."
  exit 1
fi

# Process each table listed in the input file: back it up to Cloud Storage, export its schema,
# and load it into the destination project.
while IFS= read -r table_name; do
  (
    echo "Table: $table_name"
    echo "Source: ${source_project_id}:${dataset_name}.${table_name}"
    echo "Destination: ${destination_project_id}:${dataset_name}.${table_name}"
    echo ""
  )
  export_table_to_bucket "$table_name"
  export_schema_to_json "$table_name"
  load_data_from_csv "$table_name"
done < "$file_path"
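# After a run, a migrated table can be spot-checked with bq (the table name here is illustrative,
# not one created by this gist):
#   bq show "${DESTINATION_PROJECT_ID}:${DATASET_NAME}.orders"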