devender-yadav · June 19, 2021 04:41 · devender-yadav · Jun 19, 2021
diff --git a/pyspark-jdbc-intro.md b/pyspark-jdbc-intro.md
diff --git a/spark-jdbc-partition.py b/spark-jdbc-partition.py
 from pyspark.sql import SparkSession

 import time as t


 # Benchmark: count every row of post_history through a partitioned JDBC read
 # and print the elapsed time followed by the row count.
 #
 # perf_counter() is monotonic, so the measured duration cannot be skewed by
 # wall-clock adjustments the way time.time() can.
 t1 = t.perf_counter()

 spark = SparkSession.builder.master("local[4]").appName("Test-JDBC").getOrCreate()

 # Connection settings shared by both reads below (previously duplicated inline).
 # NOTE(review): credentials are hard-coded; acceptable for a local demo only.
 jdbc_url = "jdbc:mysql://localhost:3306/stackexchange"
 jdbc_properties = {
     "user": "devender",
     "password": "password@123",
     "driver": "com.mysql.jdbc.Driver",
 }

 # Probe the id range first; it supplies the bounds for the partitioned read.
 ds = spark.read.jdbc(
     jdbc_url,
     "(select min(id), max(id) from post_history) as ph",
     properties=jdbc_properties,
 )

 r = ds.head()

 minId = r[0]
 maxId = r[1]

 # min()/max() over an empty table come back as NULL (None). DataFrameReader.jdbc
 # rejects None bounds when a partition column is given, so fall back to a plain
 # unpartitioned read in that case instead of crashing.
 # Per the Spark JDBC data source docs the bounds only decide the partition
 # stride; they do not filter rows, so the row holding max(id) is still read.
 partitioning = (
     {}
     if minId is None or maxId is None
     else {"numPartitions": 4, "column": "id", "lowerBound": minId, "upperBound": maxId}
 )

 ds = spark.read.jdbc(
     jdbc_url,
     "(select * from post_history) as ph",
     properties=jdbc_properties,
     **partitioning
 )

 # count() is the action that actually triggers the JDBC read.
 count = ds.count()

 t2 = t.perf_counter()

 # It took less time with partitioning
 print("Total time taken by spark - " + str(t2 - t1) + " seconds")

 print(count)



diff --git a/spark-jdbc.py b/spark-jdbc.py
 from pyspark.sql import SparkSession

 import time as t


 # Baseline benchmark: count every row of post_history through a single,
 # unpartitioned JDBC read, then print the elapsed time and the row count.
 t1 = t.time()

 # local[1]: run Spark locally with one worker thread.
 session_builder = SparkSession.builder.master("local[1]")
 spark = session_builder.appName("Test-JDBC").getOrCreate()

 # NOTE: a `.option("fetchsize", "10000")` reader setting is left disabled here.

 jdbc_url = "jdbc:mysql://localhost:3306/stackexchange"
 source_query = "(select * from post_history) as ph"
 connection_properties = {
     "user": "devender",
     "password": "password@123",
     "driver": "com.mysql.jdbc.Driver",
 }

 ds = spark.read.jdbc(jdbc_url, source_query, properties=connection_properties)

 # count() is the action that actually triggers the JDBC read.
 count = ds.count()

 t2 = t.time()

 elapsed_seconds = t2 - t1
 print("Total time taken by spark - " + str(elapsed_seconds) + " seconds")

 print(count)
	from pyspark.sql import SparkSession

	import time as t


	# Benchmark: count every row of post_history through a partitioned JDBC read
	# and print the elapsed time followed by the row count.
	#
	# perf_counter() is monotonic, so the measured duration cannot be skewed by
	# wall-clock adjustments the way time.time() can.
	t1 = t.perf_counter()

	spark = SparkSession.builder.master("local[4]").appName("Test-JDBC").getOrCreate()

	# Connection settings shared by both reads below (previously duplicated inline).
	# NOTE(review): credentials are hard-coded; acceptable for a local demo only.
	jdbc_url = "jdbc:mysql://localhost:3306/stackexchange"
	jdbc_properties = {
	    "user": "devender",
	    "password": "password@123",
	    "driver": "com.mysql.jdbc.Driver",
	}

	# Probe the id range first; it supplies the bounds for the partitioned read.
	ds = spark.read.jdbc(
	    jdbc_url,
	    "(select min(id), max(id) from post_history) as ph",
	    properties=jdbc_properties,
	)

	r = ds.head()

	minId = r[0]
	maxId = r[1]

	# min()/max() over an empty table come back as NULL (None). DataFrameReader.jdbc
	# rejects None bounds when a partition column is given, so fall back to a plain
	# unpartitioned read in that case instead of crashing.
	# Per the Spark JDBC data source docs the bounds only decide the partition
	# stride; they do not filter rows, so the row holding max(id) is still read.
	partitioning = (
	    {}
	    if minId is None or maxId is None
	    else {"numPartitions": 4, "column": "id", "lowerBound": minId, "upperBound": maxId}
	)

	ds = spark.read.jdbc(
	    jdbc_url,
	    "(select * from post_history) as ph",
	    properties=jdbc_properties,
	    **partitioning
	)

	# count() is the action that actually triggers the JDBC read.
	count = ds.count()

	t2 = t.perf_counter()

	# It took less time with partitioning
	print("Total time taken by spark - " + str(t2 - t1) + " seconds")

	print(count)