normalize column names #Spark
import org.apache.spark.sql.DataFrame

// Lower-case every column name, replace whitespace with underscores and strip double quotes.
def normalize_column_names(df: DataFrame): DataFrame = {
  df.toDF(df.columns.map(_
    .toLowerCase
    .replaceAll("\\s", "_")
    .replaceAll("\"+", "")
  ): _*)
}
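// Quick check of normalize_column_names on hypothetical sample data (not from this gist);
// assumes a spark-shell / notebook session where spark.implicits._ is already in scope:
val demo = Seq((1, "x")).toDF("Fund Manager", "\"Ticker\"")
normalize_column_names(demo).columns // Array(fund_manager, ticker)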
// For each (view name, delimiter, path) triple: read the CSV, normalize its column
// names and register the result as a temp view under the given name.
def load_path_as_df(dict: List[List[String]]): Unit = {
  dict.foreach { l =>
    val name :: delimiter :: path :: extra = l
    val df = spark.read
      .option("header", "true")
      .option("delimiter", delimiter)
      .csv(path.replaceAll("%3D", "=")) // decode URL-encoded '=' in partition paths
    normalize_column_names(df).createOrReplaceTempView(name)
  }
}
val lt = List(
  List("standard_thesaurus" , ",", "s3a://adaptive-data-lake/staging/data_provider%3DAdaptive/standard_tickers.csv")
, List("all_thesaurus_names", ",", "s3://adaptive-data-lake/prod/data_provider=Adaptive/prod_thesauri_with_names.csv")
, List("all_sections"       , ",", "s3://adaptive-data-lake/prod/data_provider%3DAdaptive/a_section__201911221610.csv")
, List("mscience"           , ",", "s3://adaptive-data-lake/prod/data_provider%3Dmscience/namespace%3Dapi.mscience.swipev2.txs_per_shopper/year%3D2019/month%3D12/day%3D15/")
, List("req_daily"          , ",", "s3://adaptive-data-lake/dev/test_stanis/miscellaneous/fundmanager_inf_manually_changed.csv")
, List("req_full"           , ",", "s3://adaptive-data-lake/dev/test_stanis/miscellaneous/fundmanager_inf.csv")
)
load_path_as_df(lt)
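// Usage sketch (assumption: the registered temp views are queried afterwards by name), e.g.:
spark.sql("SELECT * FROM req_daily LIMIT 10").show()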