Justin Naldzin justinnaldzin

Unique ID column

Generate a deterministic unique identifier for each row by hashing the row's values, so the same values always produce the same ID. The ID column is placed first in the DataFrame.

from pyspark.sql.functions import coalesce, col, concat_ws, lit, sha2

# `df` is the input DataFrame and `id_col` is the name of the ID column to
# add; both must already be defined before this snippet runs.
columns = df.columns

# concat_ws() skips NULL values, so rows such as ("a", NULL) and (NULL, "a")
# would concatenate to the same string and receive the same ID. Cast every
# column to string and replace NULL with an explicit marker so each value
# keeps its position in the hashed string (and NULL stays distinct from "").
null_marker = lit("\u2400")  # marker chosen to be unlikely in real data
hash_inputs = [coalesce(col(c).cast("string"), null_marker) for c in columns]
df = df.withColumn(id_col, sha2(concat_ws("||", *hash_inputs), 256))

# Put the ID column first, followed by the original columns in their order.
df = df.select([id_col] + columns)

Load data from BigQuery

Using the BigQuery client library

pip install --upgrade google-cloud-bigquery

from google.cloud import bigquery

	import boto3


	def get_matching_s3_objects(bucket, prefix="", suffix=""):
	"""
	Generate objects in an S3 bucket.

	:param bucket: Name of the S3 bucket.
	:param prefix: Only fetch objects whose key starts with this prefix (optional).
	:param suffix: Only fetch objects whose keys end with this suffix (optional).

	LOG_GROUP='my/log/group'
	AWS_PROFILE='my-profile'

	# Delete all log streams within a log group.
	# awk prints the second whitespace-separated field of each table line, grep drops the empty lines,
	# and every remaining value is passed to delete-log-stream as a stream name.
	aws logs describe-log-streams --profile "$AWS_PROFILE" --log-group-name "$LOG_GROUP" --query 'logStreams[*].logStreamName' --output table | awk '{print $2}' | grep -v '^$' | while read -r x; do aws logs delete-log-stream --profile "$AWS_PROFILE" --log-group-name "$LOG_GROUP" --log-stream-name "$x"; done

	# Delete log streams starting with string (here: stream names beginning with 2020/04/3).
	aws logs describe-log-streams --profile "$AWS_PROFILE" --log-group-name "$LOG_GROUP" --query 'logStreams[?starts_with(logStreamName,`2020/04/3`)].logStreamName' --output table | awk '{print $2}' | grep -v '^$' | while read -r x; do aws logs delete-log-stream --profile "$AWS_PROFILE" --log-group-name "$LOG_GROUP" --log-stream-name "$x"; done

	import os
	import sys
	import logging


	# Shared log-record layout: timestamp, logger name, level, module, message.
	FORMATTER = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(message)s')
	# NOTE(review): presumably the directory log files are written to; its use is not visible here - confirm.
	LOG_FOLDER = 'log'


	def get_console_handler():

	# Delete all tables matching a grep pattern in a BigQuery dataset

	DATASET=my_dataset
	TABLE_PATTERN=my_table_

	# Confirm the table names before deleting.
	# The listing is filtered to lines containing TABLE and the pattern; awk keeps the first field (the table name).
	for TABLE in $(bq ls --max_results=10000 "$DATASET" | grep TABLE | grep "$TABLE_PATTERN" | awk '{print $1}'); do echo "$TABLE"; done

	# Delete the tables; USE WITH CAUTION!
	# -f removes each table without prompting for confirmation.
	for TABLE in $(bq ls --max_results=10000 "$DATASET" | grep TABLE | grep "$TABLE_PATTERN" | awk '{print $1}'); do echo "$TABLE"; bq rm -f -t "$DATASET.$TABLE"; done

	from google.cloud import kms_v1


	def encrypt(project_id, location_id, key_ring_id, crypto_key_id, plaintext):
	"""Encrypts input plaintext data using the provided symmetric CryptoKey."""

	# Creates an API client for the KMS API.
	client = kms_v1.KeyManagementServiceClient()

	# The resource name of the CryptoKey.

	# Google Cloud Composer - manually trigger DAG runs (Airflow v1.10+)
	ENVIRONMENT_NAME=my-composer
	LOCATION=us-east1

	# Trigger a single DAG run for one execution date.
	# Everything after "--" is handed to the Airflow trigger_dag subcommand:
	# -r sets the run id, -e sets the execution date.
	DAG_ID=my_daily_dag
	EXEC_DATE=2019-02-11
	gcloud composer environments run "${ENVIRONMENT_NAME}" \
		--location "${LOCATION}" \
		trigger_dag -- \
		-r "manual__${EXEC_DATE}" \
		-e "${EXEC_DATE}" \
		"${DAG_ID}"

	# Trigger DAG - multiple

	# Copy zip files from S3 to local directory, unzip and upload to S3

	# Download everything under the prefix into the current directory.
	aws s3 cp s3://bucket/folder/ . --recursive
	# Quote the file name so archives whose names contain whitespace are passed to unzip as one argument.
	for f in *.zip; do unzip "$f"; done
	# Upload the extracted files, leaving the original archives out.
	aws s3 cp . s3://bucket/folder/ --recursive --exclude "*.zip"

	import boto3


	def get_matching_s3_objects(bucket, prefix='', suffix=''):
	"""
	Fetch objects in an S3 bucket.

	:param bucket: Name of the S3 bucket.
	:param prefix: Only fetch objects whose key starts with
	this prefix (optional).