bucket-migration
#!/bin/bash
# Purpose: This Bash script automates data migration between BigQuery projects, using Google Cloud
# Storage as intermediate storage. It exports table schemas to JSON, extracts table data into
# Cloud Storage buckets, copies that data between buckets, and loads it back into BigQuery in the
# destination project. It also handles bucket creation for organized data and added redundancy.
# Value: For GCP users, this script simplifies and speeds up data migration tasks between BigQuery
# projects, making it a useful resource for data management.
# How to Run: Before executing, set the environment variables `SOURCE_PROJECT_ID`,
# `DESTINATION_PROJECT_ID`, and `DATASET_NAME`. Then prepare an input file listing the table names
# you want to process, one per line. Run the script in your terminal using:
# `./script.sh input_file.txt`, replacing `script.sh` with your file name and `input_file.txt`
# with your table list. Pass `-v` to enable bash trace output.
# Steps Covered:
# 1. Export Table Schemas to JSON: documents each table's structure for the load step.
# 2. Load CSV Data into BigQuery: recreates the exported tables in the destination project.
# 3. Efficient Data Transfer: tunes gsutil for faster copies between Cloud Storage buckets.
# 4. Bucket Management: creates the required storage buckets, keeping the migration organized.
# Overall, the script accelerates and simplifies BigQuery-to-BigQuery migration, offering an
# efficient approach to data management and backup. An example invocation is sketched below.
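# Worked example (a sketch; the project IDs, dataset, table names, and file names below are
# placeholders, not values from this gist). Given a file `tables.txt` with one table name per line:
#   orders
#   customers
# the script could be invoked as:
#   SOURCE_PROJECT_ID=my-src-project DESTINATION_PROJECT_ID=my-dst-project \
#   DATASET_NAME=analytics ./bucket-migration.sh tables.txt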
# Enable bash command tracing when -v is passed anywhere on the command line.
if [[ "$*" == *-v* ]]; then
  set -x
fi
# Required configuration, taken from the environment.
source_project_id="${SOURCE_PROJECT_ID}"
destination_project_id="${DESTINATION_PROJECT_ID}"
dataset_name="${DATASET_NAME}"
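# Added safety check (an assumption, not part of the original gist): fail fast when any required
# environment variable is missing, so later bq/gsutil calls do not operate on empty names.
if [ -z "${source_project_id}" ] || [ -z "${destination_project_id}" ] || [ -z "${dataset_name}" ]; then
  echo "Error: SOURCE_PROJECT_ID, DESTINATION_PROJECT_ID, and DATASET_NAME must be set." >&2
  exit 1
fi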
# Export a table's schema from the source project to a local JSON file named <table>.json.
export_schema_to_json() {
  local table_name=$1
  bq show --format=json "${source_project_id}:${dataset_name}.${table_name}" | jq '.schema.fields' > "${table_name}.json"
  echo "Schema exported to ${table_name}.json"
}
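# For reference, the exported file holds the BigQuery schema field list; the field names and types
# below are illustrative only:
#   [ { "name": "id", "type": "INTEGER", "mode": "NULLABLE" },
#     { "name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE" } ]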
# Load the exported CSV shards into the destination project, using the schema JSON exported earlier.
# The bucket name must match the destination bucket written by move_between_buckets.
load_data_from_csv() {
  local table_name=$1
  local bucket_name="${table_name}-bkp-ax"
  bq load --source_format=CSV --skip_leading_rows=1 "${destination_project_id}:${dataset_name}.${table_name}" "gs://${bucket_name}/*" "./${table_name}.json"
  echo "Data loaded to ${destination_project_id}:${dataset_name}.${table_name}"
}
# Copy a bucket's contents into the sibling "<bucket>-ax" bucket, with gsutil tuned for parallel,
# multi-threaded transfers. Copying "gs://<bucket>/*" (rather than the bucket itself) keeps object
# names flat in the destination so the later bq load wildcard matches them.
move_between_buckets() {
  local bucket_name=$1
  gsutil -o "GSUtil:parallel_composite_upload_threshold=150M" \
    -o "GSUtil:parallel_thread_count=10" \
    -o "GSUtil:check_hashes=if_fast_else_skip" \
    -h "Content-Encoding:gzip" \
    -o "GSUtil:sliced_object_download=true" \
    -o "GSUtil:sliced_object_upload=true" \
    -m cp -r "gs://${bucket_name}/*" "gs://${bucket_name}-ax/"
}
# Create a bucket in the given project if it does not already exist. Bucket URLs are global,
# so existence is checked against gs://<bucket> rather than a project-scoped path.
check_and_create_bucket() {
  local project_id=$1
  local bucket_name=$2
  if gsutil ls -b "gs://${bucket_name}" &>/dev/null; then
    echo "Bucket gs://${bucket_name} already exists."
  else
    gsutil mb -p "${project_id}" "gs://${bucket_name}"
  fi
}
# Extract a table from the source project into a "<table>-bkp" bucket as GZIP-compressed CSV
# shards, then copy those shards into the "<table>-bkp-ax" bucket used by the load step.
export_table_to_bucket() {
  local table_name=$1
  local bucket_name="${table_name}-bkp"
  check_and_create_bucket "${destination_project_id}" "${bucket_name}"
  check_and_create_bucket "${destination_project_id}" "${bucket_name}-ax"
  bq extract --compression GZIP "${source_project_id}:${dataset_name}.${table_name}" "gs://${bucket_name}/x-*"
  move_between_buckets "${bucket_name}"
  echo "Data migrated from ${source_project_id}:${dataset_name}.${table_name} to gs://${bucket_name}-ax"
}
file_path=$1
if [ ! -f "$file_path" ]; then
  echo "Error: File '$file_path' not found."
  exit 1
fi

# Process each table listed in the input file: back it up to Cloud Storage, export its schema,
# and load it into the destination project.
while IFS= read -r table_name; do
  (
    echo "Table: $table_name"
    echo "Source: ${source_project_id}:${dataset_name}.${table_name}"
    echo "Destination: ${destination_project_id}:${dataset_name}.${table_name}"
    echo ""
  )
  export_table_to_bucket "$table_name"
  export_schema_to_json "$table_name"
  load_data_from_csv "$table_name"
done < "$file_path"
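# After a run, a migrated table can be spot-checked with bq (the table name here is illustrative,
# not one created by this gist):
#   bq show "${DESTINATION_PROJECT_ID}:${DATASET_NAME}.orders"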