// Gist by @SlavikBaranov, created August 28, 2017
// These two lines are only for code completion in IDEA;
// don't paste them into spark-shell, which already defines `spark`
// and imports the implicits.
val spark: org.apache.spark.sql.SparkSession
import spark.implicits._
import scala.util.Random
// @transient keeps sc out of serialized closures
@transient val sc = spark.sparkContext
// Number of rows: 1M-10M is a reasonable range. Running time is
// quadratic in the row count (the query below is a self-join within
// buckets), so doubling the number of rows increases running time
// by a factor of ~4. See the timing sketch after the query.
val numRows = 1000000
case class Record(
  id: Int,
  bucket: Int,
  val1: Int,
  val2: Int
)
// Generate numRows random records; seeding the RNG per partition
// makes the data reproducible across runs.
val r0 = sc.parallelize(0 until numRows, 1)
val r1 = r0.mapPartitionsWithIndex { case (idx, it) =>
  val rnd = new Random(100500 + idx)
  for (id <- it) yield {
    Record(id, rnd.nextInt(200), rnd.nextInt(1000), rnd.nextInt(10))
  }
}
// Register the data as a temp view for SQL and preview a few rows
val t1 = r1.toDS()
t1.createOrReplaceTempView("t1")
t1.show(5)
// Self-join within each bucket: val1 is uniform in [0, 1000), so the
// predicate a.val1 + b.val1 < 1000 passes roughly half of all pairs,
// making the join output quadratic in the rows per bucket.
spark.sql(
  """select a.bucket, sum(a.val2) tot
    |from t1 a, t1 b
    |where a.bucket = b.bucket and a.val1 + b.val1 < 1000
    |group by a.bucket
    |order by a.bucket""".stripMargin).show(10)