Last active
June 28, 2024 07:10
-
-
Save FabienArcellier/cb7fb74fd89981696b5c3d1d8423a8ff to your computer and use it in GitHub Desktop.
pyarrow export for list of records and array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Result
# pyarrow.Table
# 0: string
# 1: int64
# 2: string
# __index_level_0__: int64
# ----
# 0: [["Alice","Bob","Charlie","Charlie"]]
# 1: [[30,25,35,28]]
# 2: [["New York","San Francisco","Chicago","Chicago"]]
# __index_level_0__: [[0,1,2,3]]
import pyarrow as pa

# Step 1: input data as a list of rows (each inner list is one row).
data = [
    ["Alice", 30, "New York"],
    ["Bob", 25, "San Francisco"],
    ["Charlie", 35, "Chicago"],
    ["Charlie", 28, "Chicago"],
]

# Step 2: transpose the rows into columns.
# zip() stops at the shortest row, so all rows are assumed to have the same
# length. An empty `data` list simply produces no value columns.
transposed_data = [list(column) for column in zip(*data)]

# Step 3: name every value column after its position ("0", "1", ...).
# The names are derived from the transposed columns (not from data[0]) so the
# number of names always matches the number of arrays.
column_names = [str(position) for position in range(len(transposed_data))]

# Step 4: append an explicit row-index column.
column_names.append("__index_level_0__")
transposed_data.append(list(range(len(data))))

# Step 5: build one PyArrow array per column; types are inferred from the values.
pyarrow_columns = [pa.array(column) for column in transposed_data]

# Step 6: define the table schema from the inferred types (optional; passing
# `names=column_names` to from_arrays would also work).
schema = pa.schema(
    [(name, array.type) for name, array in zip(column_names, pyarrow_columns)]
)

# Step 7: assemble the PyArrow table.
table = pa.Table.from_arrays(pyarrow_columns, schema=schema)

# Print the table to check the result.
print(table)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Result
# pyarrow.Table
# name: string
# age: int64
# city: string
# __index_level_0__: int64
# ----
# name: [["Alice","Bob","Charlie","Charlie"]]
# age: [[30,25,35,28]]
# city: [["New York","San Francisco","Chicago","Chicago"]]
# __index_level_0__: [[0,1,2,3]]
import pyarrow as pa

# Step 1: input data as a list of records (one dict per row).
data = [
    {"name": "Alice", "age": 30, "city": "New York"},
    {"name": "Bob", "age": 25, "city": "San Francisco"},
    {"name": "Charlie", "age": 35, "city": "Chicago"},
    {"name": "Charlie", "age": 28, "city": "Chicago"},
]

# Step 2: pivot the records into columns.
# The keys of the first record define the columns and their order; every
# record must provide all of those keys (a missing key raises KeyError).
# An empty `data` list produces no value columns instead of an IndexError.
value_keys = list(data[0]) if data else []
columns = {key: [record[key] for record in data] for key in value_keys}

# Step 3: append an explicit row-index column.
columns["__index_level_0__"] = list(range(len(data)))

# Single source of truth for column order, used for both schema and arrays.
column_names = list(columns)

# Step 4: build one PyArrow array per column; types are inferred from the values.
pyarrow_columns = {key: pa.array(values) for key, values in columns.items()}

# Step 5: define the table schema from the inferred types (optional, but it
# makes the column names and types explicit).
schema = pa.schema([(name, pyarrow_columns[name].type) for name in column_names])

# Step 6: assemble the PyArrow table, with arrays in the same order as the schema.
table = pa.Table.from_arrays(
    [pyarrow_columns[name] for name in column_names],
    schema=schema,
)

# Print the table to check the result.
print(table)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Result
# pyarrow.Table
# name: string
# age: int64
# city: string
# __index_level_0__: int64
# ----
# name: [["Alice","Bob","Charlie"]]
# age: [[30,25,35]]
# city: [["New York","San Francisco","Chicago"]]
# __index_level_0__: [[0,1,2]]
import pandas as pd
import pyarrow as pa

# Sample data, one list of values per column.
columns = {
    "name": ["Alice", "Bob", "Charlie"],
    "age": [30, 25, 35],
    "city": ["New York", "San Francisco", "Chicago"],
}

# Explicit row labels for the DataFrame index.
row_labels = [0, 1, 2]

# Example pandas DataFrame with an explicit index.
df = pd.DataFrame(columns, index=row_labels)

# Convert the DataFrame to a PyArrow table. preserve_index=True keeps the
# DataFrame index as a regular column (shown as `__index_level_0__` above).
table = pa.Table.from_pandas(df, preserve_index=True)

print(table)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Result
# pyarrow.Table
# __index_level_0__: uint32
# name: large_string
# age: int64
# city: large_string
# ----
# __index_level_0__: [[0,1,2]]
# name: [["Alice","Bob","Charlie"]]
# age: [[30,25,35]]
# city: [["New York","San Francisco","Chicago"]]
import polars as pl
from pyarrow.interchange import from_dataframe

# Example Polars DataFrame.
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [30, 25, 35],
    "city": ["New York", "San Francisco", "Chicago"],
})

# Polars has no pandas-style index, so add an explicit row-number column.
# It is inserted as the first column (see the result above).
# `with_row_index` replaces `with_row_count`, deprecated since Polars 0.20.4.
df = df.with_row_index("__index_level_0__")

# Convert the Polars DataFrame to a PyArrow table through the dataframe
# interchange protocol; the row-number column is carried over like any other.
table = from_dataframe(df)

print(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment