Skip to content

Instantly share code, notes, and snippets.

@freemandealer
Created February 16, 2023 12:50
Show Gist options
  • Save freemandealer/aef136754d258e77d6ea6acd314eeddd to your computer and use it in GitHub Desktop.
Generate LowCardinality Data as JSON & ORC format
#!/usr/bin/env python
# pip install pyspark before execution
import json
import os
import random
import shutil
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession
# Number of data rows to generate in output.json.
ROW_NUM = 500
# Number of columns per row: col_0 is the 1-based row id, the rest hold
# random fruit names (low-cardinality values).
COL_NUM = 50
def delete_file_or_dir(path):
    """Remove *path*, whether it is a regular file or a directory tree.

    Silently does nothing when *path* does not exist.
    """
    if os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        # shutil.rmtree replaces os.system('rm -rf ' + path): the shell
        # version broke on paths containing spaces and was vulnerable to
        # shell injection via metacharacters in *path*.
        shutil.rmtree(path)
def random_fruit():
    """Return one fruit name chosen uniformly at random from a fixed set."""
    choices = (
        "Apple", "Banana", "Orange", "Grapes", "Strawberry", "Mango",
        "Kiwi", "Pineapple", "Cherry", "Watermelon", "Lemon", "Peach",
        "Plum", "Lychee", "Avocado", "Blueberry", "Raspberry",
    )
    return random.choice(choices)
def gen_sparse_json(path='output.json'):
    """Write ROW_NUM newline-delimited JSON rows to *path*.

    Each row has COL_NUM columns: 'col_0' holds the 1-based row number,
    and 'col_1' .. 'col_{COL_NUM-1}' hold random fruit names, giving the
    low-cardinality data set described in the gist title.

    Fixes vs. the original:
      * rows are serialized with json.dumps() -- str(dict) emits Python
        repr (single-quoted keys), which is not valid JSON;
      * the file is opened in 'w' mode so a re-run overwrites previous
        output instead of appending duplicate rows ('a+' did).
    """
    with open(path, 'w') as f:
        for i in range(1, ROW_NUM + 1):
            row = {'col_0': i}
            for j in range(1, COL_NUM):
                row['col_' + str(j)] = random_fruit()
            f.write(json.dumps(row) + "\n")
def convert_json_to_orc(src='output.json', dst='output.orc'):
    """Convert the newline-delimited JSON file *src* to ORC under *dst*
    using a local Spark job, producing a single output file.

    Spark's default save mode errors if *dst* already exists, so remove
    stale output before calling (the script entry point does this).
    """
    # Must be set before the SparkContext exists; enlarges executor memory
    # because the wide rows otherwise OOM the JVM.
    SparkContext.setSystemProperty('spark.executor.memory', '300g')
    sc = SparkContext("local", "spark.py")
    try:
        spark = SparkSession.builder.getOrCreate()
        df = spark.read.load(src, format="json")
        # coalesce(1) stops Spark from splitting the result into multiple
        # part-files -- we want one ORC output.
        df.coalesce(1).write.format("orc").save(dst)
    finally:
        # Original leaked the context; always release the JVM/driver
        # resources, even when the conversion fails.
        sc.stop()
# Script entry point: clear any stale outputs, then regenerate the data
# and convert it to ORC.
for stale in ('output.json', 'output.orc'):
    delete_file_or_dir(stale)
gen_sparse_json()
convert_json_to_orc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment