Skip to content

Instantly share code, notes, and snippets.

@Megaprog
Created July 12, 2023 11:18
Show Gist options
  • Save Megaprog/89313a0ee654925d736ce185db2ca12f to your computer and use it in GitHub Desktop.
Save Megaprog/89313a0ee654925d736ce185db2ca12f to your computer and use it in GitHub Desktop.
Spark local read from JDBC and write to parquet
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
/**
 * Copies one day of rows from the `usdt_trx_udm_v2_transactions` PostgreSQL table
 * (read through Spark's JDBC data source) into `./transactions.parquet`,
 * overwriting any previous output at that path.
 *
 * The Spark session is stopped in a `finally` block so it is released
 * even when the JDBC read or the Parquet write throws.
 */
fun main() {
    val session = createSparkSession("test", 1)
    try {
        // trimIndent() strips the source indentation, so the SQL text sent to the
        // database is the same three lines as before, without leading whitespace.
        val query = """
            select * from usdt_trx_udm_v2_transactions
            where consensus_time >= '2023-07-11 00:00:00Z' and consensus_time < '2023-07-12 00:00:00Z'
            order by consensus_time, hash
        """.trimIndent()

        val df = session.read().format("jdbc")
            .option("url", "jdbc:postgresql://localhost:6432/pg-atlas-v2-stg-other?user=postgres")
            .option("query", query)
            .option("fetchsize", 5000)
            .load()

        df.write()
            .mode(SaveMode.Overwrite)
            .parquet("./transactions.parquet")
    } finally {
        // Always release the session, including on failure.
        session.stop()
    }
}
/**
 * Configures a local-mode [SparkSession] and obtains it via `getOrCreate()`.
 *
 * @param name application name handed to the session builder.
 * @param threads number substituted into the `local[N]` master URL.
 * @return the session produced by the builder.
 */
fun createSparkSession(name: String, threads: Int): SparkSession {
    val masterUrl = "local[$threads]"

    // Application identity plus general Spark / Spark SQL switches.
    val sqlConfigured = SparkSession.builder()
        .appName(name)
        .master(masterUrl)
        .config("spark.ui.enabled", false)
        .config("spark.sql.caseSensitive", true)
        .config("spark.sql.datetime.java8API.enabled", true)
        .config("spark.sql.legacy.utcTimestampFunc.enabled", true)
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.parquet.enableVectorizedReader", false)

    // Serializer selection and its maximum buffer size.
    val serializerConfigured = sqlConfigured
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1g")

    // Parquet writer settings, keyed by the ParquetOutputFormat constants.
    val parquetConfigured = serializerConfigured
        .config(ParquetOutputFormat.COMPRESSION, "snappy")
        .config(ParquetOutputFormat.BLOCK_SIZE, 512 * 1024 * 1024L)
        .config(ParquetOutputFormat.PAGE_SIZE, 16 * 1024 * 1024L)
        .config(ParquetOutputFormat.DICTIONARY_PAGE_SIZE, 32 * 1024 * 1024L)
        .config(ParquetOutputFormat.MIN_MEMORY_ALLOCATION, 8 * 1024 * 1024L)
        .config(ParquetOutputFormat.MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, 500)
        .config(ParquetOutputFormat.WRITER_VERSION, "PARQUET_2_0")

    return parquetConfigured
        .config("spark.driver.maxResultSize", "4g")
        .getOrCreate()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment