Created
June 6, 2017 12:00
-
-
Save bigorn0/70fe35929a8a4d9bdefdcb779d3de400 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
You can use `select` with varargs, including `*`:
import spark.implicits._

// Add a cumulative sum for each of A, B, C while keeping every
// pre-existing column: `$"*"` selects all current columns, the mapped
// window expressions are appended, and `: _*` expands the sequence
// into the varargs expected by `select`.
df.select($"*" +: Seq("A", "B", "C").map(c =>
  sum(c).over(Window.partitionBy("ID").orderBy("time")).alias(s"cum$c")
): _*)
This:

- Maps column names to window expressions with `Seq("A", ...).map(...)`.
- Prepends all pre-existing columns with `$"*" +: ...`.
- Unpacks the combined sequence with `... : _*`.

and can be generalized as:
import org.apache.spark.sql.{Column, DataFrame}

/**
 * Appends one derived column per name in `cols` to `df`, keeping all
 * pre-existing columns (selected via `$"*"`).
 *
 * Note: the output column names are whatever `f` assigns via `alias`;
 * unaliased expressions keep their generated names.
 *
 * @param cols a sequence of column names to transform
 * @param df   an input DataFrame
 * @param f    a function mapping a column name to the Column expression
 *             to append
 * @return `df` extended with one column per entry in `cols`
 */
def withColumns(cols: Seq[String], df: DataFrame, f: String => Column): DataFrame =
  df.select($"*" +: cols.map(f): _*)
If you find the `withColumn` syntax more readable, you can use `foldLeft`:
// Equivalent formulation with `withColumn`: start from `df` and fold,
// adding one cumulative-sum column per name. The accumulator is named
// `acc` so it does not shadow the outer `df`.
Seq("A", "B", "C").foldLeft(df)((acc, c) =>
  acc.withColumn(s"cum$c", sum(c).over(Window.partitionBy("ID").orderBy("time")))
)
which can be generalized, for example, to:
/**
 * Adds one column per name in `cols` to `df`, computed by `f`.
 *
 * @param cols a sequence of column names to transform
 * @param df   an input DataFrame
 * @param f    a function to be applied on each col in cols
 * @param name a function mapping an input column name to the output
 *             column name; defaults to `identity`, which replaces each
 *             column in place (withColumn overwrites same-named columns)
 * @return `df` with the generated columns added (or replaced)
 */
def withColumns(cols: Seq[String], df: DataFrame,
    f: String => Column, name: String => String = identity): DataFrame =
  // `acc` (not `df`) as the fold accumulator avoids shadowing the parameter
  cols.foldLeft(df)((acc, c) => acc.withColumn(name(c), f(c)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment