Skip to content

Instantly share code, notes, and snippets.

@rssanders3
Created November 12, 2022 19:55
Show Gist options
  • Save rssanders3/d24e4427e5951b14da2872d61e94a54b to your computer and use it in GitHub Desktop.
Save rssanders3/d24e4427e5951b14da2872d61e94a54b to your computer and use it in GitHub Desktop.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkConf, SparkContext
from pyspark.sql import SparkSession
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
# Resolve the arguments passed to the job on the command line; only
# JOB_NAME is requested here.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

AWS_BUCKET = "tbd"  # Update with your Bucket Name
GLUE_DB_NAME = "tbd_db"  # Update with your GlueDB

# (key, value) pairs applied to the SparkConf before the session is created.
conf_list = [
    # General Spark configs
    (
        "hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    ),
    ("spark.sql.parquet.writeLegacyFormat", "true"),
    ("hive.exec.dynamic.partition.mode", "nonstrict"),
    ("spark.sql.hive.caseSensitiveInferenceMode", "INFER_ONLY"),
    # NOTE: the key must be spelled exactly like this. Spark silently accepts
    # unknown keys, so a misspelled name leaves the default ("static")
    # overwrite mode in effect without any error.
    ("spark.sql.sources.partitionOverwriteMode", "dynamic"),
    # Configs needed for Iceberg
    (
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    ),
    # Registers a Spark catalog named "iceberg_catalog"; the remaining
    # "spark.sql.catalog.iceberg_catalog.*" keys configure that catalog.
    ("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog"),
    (
        "spark.sql.catalog.iceberg_catalog.warehouse",
        f"s3://{AWS_BUCKET}/iceberg_catalog/",
    ),
    (
        "spark.sql.catalog.iceberg_catalog.catalog-impl",
        "org.apache.iceberg.aws.glue.GlueCatalog",
    ),
]

# Build the session from the explicit configuration, then wrap its
# SparkContext in a GlueContext and initialise the Glue job.
spark_conf = SparkConf().setAll(conf_list)
spark = SparkSession.builder.config(conf=spark_conf).enableHiveSupport().getOrCreate()
glue_context = GlueContext(spark.sparkContext)
job = Job(glue_context)
job.init(args['JOB_NAME'], args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment