bourdeau · November 10, 2022 12:52
diff --git a/test-pandas.py b/test-pandas.py
 import pandas as pd
 import time
 from contextlib import contextmanager
 import dask.dataframe
 import redis
 import struct
 import numpy as np
 import pickle


 @contextmanager
 def timer(message: str) -> None:
    """Time the body of a ``with`` block and print the result.

    The ``with`` target receives an empty dict. When the block exits,
    normally or through an exception, one line of the form
    ``'<message>' --> <seconds> sec.`` is printed, where the seconds are the
    ``time.perf_counter()`` difference rounded to three decimals.
    """
    try:
        began = time.perf_counter()
        yield {}
    finally:
        # Runs on both the success and the error path, so a timing line is
        # always emitted.
        elapsed = time.perf_counter() - began
        seconds = round(elapsed, 3)
        print(f"'{message}' --> {seconds} sec.")


 """
 Loading a 2 GB CSV file with 100,000,000 rows and 3 columns
 """


 """
 Redis
 Output: Error 104 while writing to socket. Connection reset by peer
 """
 # Experiment: round-trip the whole DataFrame through Redis as one JSON value.
 # Client for the Redis server at host "panda_redis", port 6379, database 0.
 r = redis.StrictRedis(host='panda_redis', port=6379, db=0)

 # Read the full CSV into memory and serialise the frame to a single JSON string.
 df = pd.read_csv("test.csv")
 data = df.to_json()

 # Store the entire payload under one key.
 # NOTE(review): Redis limits a string value to 512 MB, which presumably
 # explains the "Error 104" recorded in the header string above -- confirm.
 r.set("redis-key", data) # Fail, too large...

 # Fetch the stored value back from Redis.
 results = r.get('redis-key')

 # Rebuild a DataFrame from the fetched JSON and print it.
 print(pd.read_json(results))




 """
 Loading CSV:
 Output: 'Panda loading CSV' --> 5.022 sec.
 """
 # Baseline: time a pandas CSV read with no engine or dtype options given.
 with timer("Panda loading CSV"):
    df = pd.read_csv("test.csv")

 """
 Loading CSV with dtypes:
 Output: 'Panda loading CSV with dtypes' --> 4.603 sec.
 """
 # Same read as the baseline, but with an explicit dtype for each of the three
 # columns ("Titre", "Taille", "Age") passed to read_csv.
 with timer("Panda loading CSV with dtypes"):
    df = pd.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})


 """
 Loading CSV with Pyarrow:
 Output: Crashes Python......
 """
 # Same read as the baseline, but requesting the "pyarrow" parser engine.
 with timer("Panda loading CSV with Pyarrow"):
    df = pd.read_csv("test.csv", engine="pyarrow")

 """
 Loading CSV with dtypes and Parquet:
 Output: Erreur de segmentation (core dumped)  [French locale message for "Segmentation fault"]
 Note: the file seems too big to be saved as Parquet...
 """
 # Read the CSV with explicit dtypes, then write the frame to "test.parquet"
 # using the fastparquet engine. Both the read and the write are inside the
 # timed region.
 with timer("Panda loading CSV with parquet"):
    df = pd.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})
    df.to_parquet("test.parquet", engine="fastparquet")

 """
 Loading CSV with Dask:
 Output: Crashes
 """  
 # Create a Dask dataframe over the same CSV with the same explicit dtypes.
 # NOTE(review): dask.dataframe.read_csv is normally lazy, so this may time
 # only the setup and not the actual load -- confirm (no .compute() is called).
 with timer("Panda loading CSV with Dask"):
    data = dask.dataframe.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})
	import pandas as pd
	import time
	from contextlib import contextmanager
	import dask.dataframe
	import redis
	import struct
	import numpy as np
	import pickle


	@contextmanager
	def timer(message: str) -> None:
	    """Time the body of a ``with`` block and print the result.

	    Args:
	        message: Label placed in single quotes at the start of the
	            printed line.

	    Yields:
	        An empty dict, bound to the ``with ... as`` target if one is given.

	    On exit, normally or through an exception, prints
	    ``'<message>' --> <seconds> sec.`` where the seconds are the
	    ``time.perf_counter()`` difference rounded to three decimals.
	    """
	    try:
	        start = time.perf_counter()
	        yield {}
	    finally:
	        # finally: report the timing even when the timed body raised.
	        time_taken = round(time.perf_counter() - start, 3)
	        print(f"'{message}' --> {time_taken} sec.")


	"""
	Loading a 2 GB CSV file with 100,000,000 rows and 3 columns
	"""


	"""
	Redis
	Output: Error 104 while writing to socket. Connection reset by peer
	"""
	# Experiment: round-trip the whole DataFrame through Redis as one JSON value.
	# Client for the Redis server at host "panda_redis", port 6379, database 0.
	r = redis.StrictRedis(host='panda_redis', port=6379, db=0)

	# Read the full CSV into memory and serialise the frame to a single JSON string.
	df = pd.read_csv("test.csv")
	data = df.to_json()

	# Store the entire payload under one key.
	# NOTE(review): Redis limits a string value to 512 MB, which presumably
	# explains the "Error 104" recorded in the header string above -- confirm.
	r.set("redis-key", data) # Fail, too large...

	# Fetch the stored value back from Redis.
	results = r.get('redis-key')

	# Rebuild a DataFrame from the fetched JSON and print it.
	print(pd.read_json(results))




	"""
	Loading CSV:
	Output: 'Panda loading CSV' --> 5.022 sec.
	"""
	# Baseline: time a pandas CSV read with no engine or dtype options given.
	# The read must be indented under the `with` so that it is the timed body.
	with timer("Panda loading CSV"):
	    df = pd.read_csv("test.csv")

	"""
	Loading CSV with dtypes:
	Output: 'Panda loading CSV with dtypes' --> 4.603 sec.
	"""
	# Same read as the baseline, but with an explicit dtype for each of the three
	# columns ("Titre", "Taille", "Age") passed to read_csv.
	# The read must be indented under the `with` so that it is the timed body.
	with timer("Panda loading CSV with dtypes"):
	    df = pd.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})


	"""
	Loading CSV with Pyarrow:
	Output: Crashes Python......
	"""
	# Same read as the baseline, but requesting the "pyarrow" parser engine.
	# The read must be indented under the `with` so that it is the timed body.
	with timer("Panda loading CSV with Pyarrow"):
	    df = pd.read_csv("test.csv", engine="pyarrow")

	"""
	Loading CSV with dtypes and Parquet:
	Output: Erreur de segmentation (core dumped)  [French locale message for "Segmentation fault"]
	Note: the file seems too big to be saved as Parquet...
	"""
	# Read the CSV with explicit dtypes, then write the frame to "test.parquet"
	# using the fastparquet engine. Both statements are indented under the
	# `with`, so the read and the write are timed together.
	with timer("Panda loading CSV with parquet"):
	    df = pd.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})
	    df.to_parquet("test.parquet", engine="fastparquet")

	"""
	Loading CSV with Dask:
	Output: Crashes
	"""
	# Create a Dask dataframe over the same CSV with the same explicit dtypes.
	# The call must be indented under the `with` so that it is the timed body.
	# NOTE(review): dask.dataframe.read_csv is normally lazy, so this may time
	# only the setup and not the actual load -- confirm (no .compute() is called).
	with timer("Panda loading CSV with Dask"):
	    data = dask.dataframe.read_csv("test.csv", dtype={"Titre": "object", "Taille": "float64", "Age": "int64"})