Last active
June 22, 2019 15:00
-
-
Save bvaradar/e18d96f9b99980dfb67a6601de5aa626 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Configs: Attached (ds_configs.tgz)
Upload configs:
# Unpack the attached config bundle and upload it to DFS.
tar -zxvf ds_configs.tgz
# NOTE(review): 'hadoop fs -copyFromLocal' has no '-r' option (directories are
# copied recursively by default), so the original '-r' made the command fail.
# Replace <DFS_CONFIG_ROOT> with the real DFS destination before running.
hadoop fs -copyFromLocal ds_configs "<DFS_CONFIG_ROOT>/"
Spark Submit Command:
# Environment for the run: Java 8, the cluster Hadoop config dir, and the
# Hudi (hoodie) 0.4.8-SNAPSHOT utility/test bundles.
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_CONF_DIR=/home/guoyihua/wireline/hadoop-conf
# NOTE(review): the <PATH_TO> placeholders were unquoted in the original, so
# the shell parsed '<PATH_TO>' as an input redirection and the snippet could
# not even be sourced. Quoted here so the template parses; substitute the
# real jar locations before running.
export HUDI_UTILITIES_BUNDLE="<PATH_TO>/hoodie-utilities-0.4.8-SNAPSHOT.jar"
export HUDI_UTILITIES_TEST_BUNDLE="<PATH_TO>/hoodie-utilities-0.4.8-SNAPSHOT-tests.jar"
export HUDI_CLIENT_TEST="<PATH_TO>/hoodie-client-0.4.8-SNAPSHOT-tests.jar"
# ... (additional site-specific setup elided in the original snippet)
# Launch HoodieDeltaStreamer on YARN in continuous mode against the
# DistributedTestDataSource, writing a MERGE_ON_READ table.
#
# NOTE(review): in the original, two comment lines sat in the middle of the
# backslash-continuation chain. A comment line following a trailing '\'
# terminates the command, so every argument after each comment
# (--source-limit, --continuous, --storage-type, all --hoodie-conf flags)
# was silently dropped. The comments are moved up here:
#   --source-limit 10000000                  -> ~10M records per batch
#   ...test.max_unique_records=100000000     -> ~100M unique records total
#
# Replace <QUEUE_NAME> with the target YARN queue before running.
spark-submit --packages com.databricks:spark-avro_2.11:4.0.0 \
  --jars "$HUDI_UTILITIES_TEST_BUNDLE,$HUDI_CLIENT_TEST" \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 200 \
  --executor-memory 5g \
  --driver-memory 13g \
  --conf spark.driver.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof" \
  --conf spark.executor.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_executor.hprof" \
  --queue <QUEUE_NAME> \
  --conf spark.scheduler.mode=FAIR \
  --conf spark.yarn.executor.memoryOverhead=11072 \
  --conf spark.yarn.driver.memoryOverhead=3072 \
  --conf spark.task.cpus=1 \
  --conf spark.executor.cores=1 \
  --conf spark.task.maxFailures=100 \
  --conf spark.memory.fraction=0.4 \
  --conf spark.rdd.compress=true \
  --conf spark.kryoserializer.buffer.max=2000m \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.memory.storageFraction=0.1 \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.sql.hive.convertMetastoreParquet=false \
  --conf spark.ui.port=5555 \
  --conf spark.driver.maxResultSize=12g \
  --conf spark.executor.heartbeatInterval=120s \
  --conf spark.network.timeout=600s \
  --conf spark.eventLog.overwrite=true \
  --conf spark.eventLog.enabled=true \
  --conf spark.yarn.max.executor.failures=10 \
  --conf spark.sql.catalogImplementation=hive \
  --conf spark.sql.shuffle.partitions=1000 \
  --driver-class-path "$HADOOP_CONF_DIR" \
  --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer "$HUDI_UTILITIES_BUNDLE" \
  --source-ordering-field timestamp \
  --target-base-path /user/varadarb/deltastreamer/run2 \
  --target-table vb_hoodie_trips \
  --props /user/varadarb/deltastreamer/configs/test-source.properties \
  --source-class com.uber.hoodie.utilities.sources.DistributedTestDataSource \
  --schemaprovider-class com.uber.hoodie.utilities.schema.FilebasedSchemaProvider \
  --source-limit 10000000 \
  --continuous \
  --storage-type MERGE_ON_READ \
  --hoodie-conf "hoodie.deltastreamer.source.test.num_partitions=100" \
  --hoodie-conf "hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=true" \
  --hoodie-conf "hoodie.deltastreamer.source.test.max_unique_records=100000000" \
  --hoodie-conf "hoodie.embed.timeline.server=true"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment