Created
August 28, 2017 09:01
-
-
Save SlavikBaranov/9cbc12f2676431392db7ca07c02de73b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The next two lines exist only so that IDEA can resolve `spark` for code
// completion — do NOT paste them into a real spark-shell session, where
// `spark` is already bound.
val spark: org.apache.spark.sql.SparkSession
import spark.implicits._

import scala.util.Random

// spark-shell already provides the SparkContext; @transient keeps the REPL
// wrapper object from trying to serialize it into task closures.
@transient val sc = spark.sparkContext

// Number of generated rows; 1M-10M is a reasonable range. The self-join
// below is quadratic, so doubling numRows roughly quadruples the run time.
val numRows = 1000 * 1000
/**
 * One synthetic row of the benchmark data set.
 *
 * Marked `final`: case classes should not be extended (equals/hashCode
 * would break for subclasses).
 *
 * @param id     unique row id (0 until numRows)
 * @param bucket join key drawn from nextInt(200), i.e. 0..199
 * @param val1   payload drawn from nextInt(1000), i.e. 0..999
 * @param val2   payload drawn from nextInt(10), i.e. 0..9
 */
final case class Record(
  id: Int,
  bucket: Int,
  val1: Int,
  val2: Int
)
// Generate numRows synthetic records in a single partition. Each partition
// seeds its own RNG from its index, so the data set is fully reproducible.
val r0 = sc.parallelize(0 until numRows, 1)
val r1 = r0.mapPartitionsWithIndex { (partIdx, ids) =>
  val rng = new Random(100500 + partIdx)
  ids.map { id =>
    Record(id, rng.nextInt(200), rng.nextInt(1000), rng.nextInt(10))
  }
}
// Register the generated data as a temp view and run the quadratic
// bucket self-join that this benchmark is about.
val t1 = r1.toDS()
t1.createOrReplaceTempView("t1")
t1.show(5)

// Self-join on bucket with an extra inequality predicate, aggregated per
// bucket. Runtime is O(n^2) in rows-per-bucket, hence the quadratic note.
val query =
  "select a.bucket, sum(a.val2) tot " +
  "from t1 a, t1 b " +
  "where a.bucket=b.bucket and a.val1+b.val1<1000 " +
  "group by a.bucket " +
  "order by a.bucket"
spark.sql(query).show(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.