bench parquet vs sqlite vs raw — times writing 100,000 rows and selecting 100 random keys from a Parquet file (queried via PySpark), a SQLite database (with and without a PRIMARY KEY constraint), and a plain text file.
#!/usr/bin/env python3
import sqlite3
import random
import string
from uuid import uuid4
from typing import List, Tuple
from statistics import mean

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.lib import Schema
from pyspark.sql import SparkSession, SQLContext
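
# Benchmark overview: generate 100,000 rows (a UUID key plus four random
# 40-50 character strings), write them to a plain text file, to SQLite
# (with and without a PRIMARY KEY on txt1), and to Parquet, then time
# point lookups of 100 random keys in each store. Parquet is queried
# through PySpark, both via Spark SQL and via the DataFrame API.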
def random_str():
    str_length = random.randint(40, 50)
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))


def sql_init(db_file: str):
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    configure_sqlite(c)
    c.execute('''CREATE TABLE IF NOT EXISTS some_text
                 (txt1 TEXT, txt2 TEXT, txt3 TEXT, txt4 TEXT, txt5 TEXT)''')
    return conn


def sql_init_with_constraint(db_file: str):
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    configure_sqlite(c)
    c.execute('''CREATE TABLE IF NOT EXISTS some_text
                 (txt1 TEXT PRIMARY KEY, txt2 TEXT, txt3 TEXT, txt4 TEXT, txt5 TEXT)''')
    return conn


def configure_sqlite(cursor: sqlite3.Cursor):
    cursor.execute('PRAGMA optimize')
    cursor.execute('PRAGMA journal_mode=WAL')        # write-ahead logging: faster bulk writes
    cursor.execute('PRAGMA locking_mode=EXCLUSIVE')  # single-writer benchmark, skip lock churn
    cursor.execute('PRAGMA synchronous=NORMAL')      # safe with WAL, fewer fsyncs
    # cache_size and threads are the runtime pragmas; SQLITE_DEFAULT_CACHE_SIZE,
    # SQLITE_CONFIG_MULTITHREAD and THREAD are compile-time options and are
    # silently ignored when issued as pragmas.
    cursor.execute('PRAGMA cache_size=-4000')        # ~4 MB page cache
    cursor.execute('PRAGMA threads=4')               # up to 4 helper threads for sorting
def parquet_init() -> Schema:
    my_schema = pa.schema([('txt1', pa.string()),
                           ('txt2', pa.string()),
                           ('txt3', pa.string()),
                           ('txt4', pa.string()),
                           ('txt5', pa.string())])
    return my_schema


def spark_init() -> Tuple[SparkSession, SQLContext]:
    spark = SparkSession.builder.master("local") \
                                .appName("Word selector") \
                                .getOrCreate()
    sc = SQLContext(spark.sparkContext, sparkSession=spark)
    return spark, sc
def sql_write(conn: sqlite3.Connection, data: List[List[str]]):
    cursor = conn.cursor()
    cursor.executemany('''INSERT INTO some_text VALUES
                          (?,?,?,?,?)''', data)
    conn.commit()
    conn.close()


def txt_write(txt_file: str, data: List[List[str]]):
    with open(txt_file, 'w') as f:
        for row in data:
            f.write(', '.join(row) + '\n')


def parquet_write(parquet_file: str, schema, data: List[List[str]]):
    # pivot the row-oriented data into columns; each row has the same number of fields
    columns = [[] for _ in range(len(data[0]))]
    for row in data:
        for i, item in enumerate(row):
            columns[i].append(item)
    arrays = [pa.array(column) for column in columns]
    table = pa.Table.from_arrays(arrays, schema=schema)
    with pq.ParquetWriter(parquet_file, table.schema,
                          use_dictionary=True, version='2.0') as writer:
        writer.write_table(table)  # write a single row group
def sql_select(conn: sqlite3.Connection, to_select: List[str]):
    cursor = conn.cursor()
    query = f'''SELECT * FROM some_text
                WHERE txt1 IN ({','.join(['?']*len(to_select))})'''
    cursor.execute(query, to_select)
    return cursor.fetchall()


def txt_select(txt_file: str, to_select: List[str]):
    result = []
    with open(txt_file, 'r') as f:
        for line in f:
            # rows were written as ', '-separated fields, so split on that
            # delimiter; a bare split() would leave a trailing comma on txt1
            # and the membership test would never match
            fields = line.rstrip('\n').split(', ')
            if fields[0] in to_select:
                result.append(line)
    return result


def parquet_sql_select(parquet_file: str, sc: SQLContext, to_select: List[str]):
    data_frame = sc.read.parquet(parquet_file)
    data_frame.createOrReplaceTempView("some_text")
    query = f'''SELECT * FROM some_text
                WHERE txt1 IN ({','.join(f'"{txt1}"' for txt1 in to_select)})'''
    results = sc.sql(query)
    return results.collect()


def parquet_sql_select2(parquet_file: str, sc: SQLContext, spark: SparkSession, to_select: List[str]):
    # join-based variant; defined for comparison but not timed below
    data_frame = sc.read.parquet(parquet_file)
    filter_df = spark.createDataFrame(to_select, data_frame.schema['txt1'].dataType)
    return data_frame.join(filter_df, data_frame['txt1'] == filter_df["value"])


def parquet_dataframe_select(parquet_file: str, sc: SQLContext, to_select: List[str]):
    data_frame = sc.read.parquet(parquet_file)
    results = data_frame.where(data_frame.txt1.isin(to_select))
    return results.collect()  # collect() forces execution; DataFrames are lazy
nb_rows = 100000
rows = [[str(uuid4())] + [random_str() for _ in range(4)] for _ in range(nb_rows)]
txt1_field_selector = [rows[random.randint(0, nb_rows-1)][0] for _ in range(100)]
db_file = 'perf_pq_db_txt.db'
db_constraint_file = 'perf_pq_db_txt.constraint.db'
txt_file = 'perf_pq_db_txt.txt'
pq_file = 'perf_pq_db_txt.parquet'
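
# Each timing below uses timeit.repeat with 6 repetitions of a single run.
# The per-store setup (removing stale files, opening connections, starting
# Spark) executes once per repetition and is excluded from the measured
# time; the reported figure is the mean over the 6 runs.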
if __name__ == '__main__':
    from timeit import repeat

    txt_write_t1 = repeat(stmt='txt_write(txt_file,rows)', repeat=6, number=1, setup='''
from __main__ import txt_write, rows, txt_file
import os
if os.path.exists(txt_file):
    os.remove(txt_file)
''')
    sql_write_t1 = repeat(stmt='sql_write(conn,rows)', repeat=6, number=1, setup='''
from __main__ import sql_init, sql_write, rows, db_file
import os
if os.path.exists(db_file):
    os.remove(db_file)
conn = sql_init(db_file)
''')
    sql_write_t2 = repeat(stmt='sql_write(conn,rows)', repeat=6, number=1, setup='''
from __main__ import sql_init_with_constraint, sql_write, rows, db_constraint_file
import os
if os.path.exists(db_constraint_file):
    os.remove(db_constraint_file)
conn = sql_init_with_constraint(db_constraint_file)
''')
    parquet_write_t1 = repeat(stmt='parquet_write(pq_file,schema,rows)', repeat=6, number=1, setup='''
from __main__ import parquet_init, parquet_write, rows, pq_file
import os
if os.path.exists(pq_file):
    os.remove(pq_file)
schema = parquet_init()
''')
    txt_select_t1 = repeat(stmt='txt_select(txt_file,txt1_field_selector)', repeat=6, number=1, setup='''
from __main__ import txt_select, txt_file, txt1_field_selector
''')
    sql_select_t1 = repeat(stmt='sql_select(conn,txt1_field_selector)', repeat=6, number=1, setup='''
from __main__ import sql_init, sql_select, txt1_field_selector, db_file
conn = sql_init(db_file)
''')
    sql_select_t2 = repeat(stmt='sql_select(conn,txt1_field_selector)', repeat=6, number=1, setup='''
from __main__ import sql_init, sql_select, txt1_field_selector, db_constraint_file
conn = sql_init(db_constraint_file)
''')
    parquet_select_t1 = repeat(stmt='parquet_sql_select(pq_file,sc,txt1_field_selector)', repeat=6, number=1, setup='''
from __main__ import spark_init, parquet_sql_select, txt1_field_selector, pq_file
spark, sc = spark_init()
''')
    parquet_select_t2 = repeat(stmt='parquet_dataframe_select(pq_file,sc,txt1_field_selector)', repeat=6, number=1, setup='''
from __main__ import spark_init, parquet_dataframe_select, txt1_field_selector, pq_file
spark, sc = spark_init()
''')
print(f'{"text":<40} was writed in {mean(txt_write_t1):.3f} seconds.') | |
print(f'{"database":<40} was writed in {mean(sql_write_t1):.3f} seconds.') | |
print(f'{"database with constraint":<40} was writed in {mean(sql_write_t2):.3f} seconds.') | |
print(f'{"parquet":<40} was writed in {mean(parquet_write_t1):.3f} seconds.') | |
print(f'{"text":<40} selected in {mean(txt_select_t1):.3f} seconds.') | |
print(f'{"database":<40} selected in {mean(sql_select_t1):.3f} seconds.') | |
print(f'{"database with constraint":<40} selected in {mean(sql_select_t2):.3f} seconds.') | |
print(f'{"parquet SQL":<40} selected in {mean(parquet_select_t1):.3f} seconds.') | |
print(f'{"parquet using dataframe":<40} selected in {mean(parquet_select_t2):.3f} seconds.') |