Cleansing and transforming schema-drifted CSV files into relational data with incremental loads in Azure Databricks
# Python/PySpark code for cleansing and transforming schema-drifted CSV files into relational data with incremental loads in Azure Databricks
# Author: Dhyanendra Singh Rathore

# Define the variables used for creating connection strings
adlsAccountName = "dlscsvdataproject"
adlsContainerName = "csv-data-store"
adlsFolderName = "covid19-data"
mountPoint = "/mnt/csvFiles"

# Application (Client) ID
applicationId = dbutils.secrets.get(scope="CSVProjectKeyVault", key="ClientId")
# Application (Client) Secret Key
authenticationKey = dbutils.secrets.get(scope="CSVProjectKeyVault", key="ClientSecret")
# Directory (Tenant) ID
tenantId = dbutils.secrets.get(scope="CSVProjectKeyVault", key="TenantId")

endpoint = "https://login.microsoftonline.com/" + tenantId + "/oauth2/token"
source = "abfss://" + adlsContainerName + "@" + adlsAccountName + ".dfs.core.windows.net/" + adlsFolderName

# Connect using Service Principal secrets and OAuth
configs = {"fs.azure.account.auth.type": "OAuth",
           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
           "fs.azure.account.oauth2.client.id": applicationId,
           "fs.azure.account.oauth2.client.secret": authenticationKey,
           "fs.azure.account.oauth2.client.endpoint": endpoint}

# Mount ADLS storage to DBFS, but only if it's not already mounted
if not any(mount.mountPoint == mountPoint for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source=source,
        mount_point=mountPoint,
        extra_configs=configs)
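
# With the variables above, 'source' resolves to
# abfss://csv-data-store@dlscsvdataproject.dfs.core.windows.net/covid19-data
# Optional sanity check (an illustrative addition, not part of the original gist):
# list the mounted folder to confirm the CSV files are visible through DBFS.
# display(dbutils.fs.ls(mountPoint))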
# Declare variables for creating the JDBC URL
jdbcHostname = "sql-csv-data-server.database.windows.net"
jdbcPort = 1433
jdbcDatabase = "syn-csv-data-dw"
jdbcTable = "csvData.covidcsvdata"

# Connection secrets from the vault
jdbcUsername = dbutils.secrets.get(scope="CSVProjectKeyVault", key="SQLAdmin")
jdbcPassword = dbutils.secrets.get(scope="CSVProjectKeyVault", key="SQLAdminPwd")

# Create the JDBC URL
jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}
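
# Optional connectivity check (a minimal sketch added for illustration; not in the
# original gist): run a trivial query through the same JDBC settings to fail fast
# on bad credentials before the main load. 'testDf' is a hypothetical name.
testDf = spark.read.jdbc(url=jdbcUrl, table="(SELECT 1 AS ok) t", properties=connectionProperties)
# testDf.show()  # expect a single row with ok = 1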
# Import lit() to create new columns in our dataframe
from pyspark.sql.functions import lit

# Function to flatten the column names by removing ' ', '/', and '_' and converting them to lowercase
def rename_columns(rename_df):
    for column in rename_df.columns:
        new_column = column.replace(' ', '').replace('/', '').replace('_', '')
        rename_df = rename_df.withColumnRenamed(column, new_column.lower())
    return rename_df
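
# Illustrative example (header names assume CSVs shaped like the Johns Hopkins
# COVID-19 daily reports, which the target columns below suggest):
#   'Province/State' -> 'provincestate'
#   'Country/Region' -> 'countryregion'
#   'Last Update'    -> 'lastupdate'
#   'Long_'          -> 'long'   (renamed to 'longitude' further down)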
# List all the files in the store so we can iterate through them one by one
file_list = [file.name for file in dbutils.fs.ls("dbfs:{}".format(mountPoint))]

# Find the last loaded file to use as the cut-off for the incremental load
lastLoadedFileQuery = "(SELECT MAX(sourcefile) as sourcefile FROM csvData.covidcsvdata) t"
lastFileDf = spark.read.jdbc(url=jdbcUrl, table=lastLoadedFileQuery, properties=connectionProperties)
lastFile = lastFileDf.collect()[0][0]

# Find the index of that file in the list; start from the beginning if nothing has been loaded yet
loadFrom = file_list.index('{}.csv'.format(lastFile)) + 1 if lastFile else 0

# Trim the list, keeping only the files that still need to be processed
file_list = file_list[loadFrom:]
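
# Illustrative example (the file names are assumptions): if MAX(sourcefile) in the
# table is '01-22-2020' and the mount holds ['01-21-2020.csv', '01-22-2020.csv',
# '01-23-2020.csv'], loadFrom becomes 2 and only '01-23-2020.csv' is processed.
# On the very first run lastFile is None, so the whole list is loaded.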
# Iterate through the files
for file in file_list:
    loadFile = "{0}/{1}".format(mountPoint, file)
    # Read the CSV file with the first line as header, comma (,) as separator, and the schema inferred from the file
    csvDf = spark.read.format("csv") \
        .option("inferSchema", "true") \
        .option("header", "true") \
        .option("sep", ",") \
        .load(loadFile)
    csvDf = rename_columns(csvDf)

    # Check the dataframe and add/rename columns to fit the database table structure
    if 'lat' in csvDf.columns:
        csvDf = csvDf.withColumnRenamed('lat', 'latitude')
    if 'long' in csvDf.columns:
        csvDf = csvDf.withColumnRenamed('long', 'longitude')
    if 'active' not in csvDf.columns:
        csvDf = csvDf.withColumn('active', lit(None).cast("int"))
    if 'latitude' not in csvDf.columns:
        csvDf = csvDf.withColumn('latitude', lit(None).cast("decimal"))
    if 'longitude' not in csvDf.columns:
        csvDf = csvDf.withColumn('longitude', lit(None).cast("decimal"))

    # Add the source file name (without the extension) as an additional column to help us keep track of the data source
    csvDf = csvDf.withColumn("sourcefile", lit(file.split('.')[0]))
    csvDf = csvDf.select("provincestate", "countryregion", "lastupdate", "confirmed", "deaths", "recovered", "active", "latitude", "longitude", "sourcefile")

    # Write the cleansed data to the database
    csvDf.createOrReplaceTempView("finalCsvData")
    spark.table("finalCsvData").write.mode("append").jdbc(url=jdbcUrl, table=jdbcTable, properties=connectionProperties)
# Unmount only if the directory is mounted
if any(mount.mountPoint == mountPoint for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mountPoint)