Created
June 27, 2021 12:45
-
-
Save grantmwilliams/143fd60b3891959a733d0ce5e195f71d to your computer and use it in GitHub Desktop.
PyArrow iter_batches as a Python-native iterable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import chain
from typing import Any, Iterator, List, Optional, Tuple

import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
def iter_parquet(
    s3_uri: str,
    columns: Optional[List[str]] = None,
    batch_size: int = 1_000,
) -> Iterator[Tuple[Any, ...]]:
    """Lazily yield the rows of a Parquet file on S3 as tuples of Python objects.

    This is a generator: nothing is opened or read until iteration starts,
    and the file is read one record batch at a time.

    Args:
        s3_uri: Location of the Parquet file, passed as-is to
            ``s3fs.S3FileSystem.open``.
        columns: Forwarded unchanged to ``ParquetFile.iter_batches``. With
            the default ``None`` the column selection is left to pyarrow
            (per its documentation, all columns are read).
        batch_size: Forwarded to ``ParquetFile.iter_batches`` as the batch
            size used for each read.

    Yields:
        One tuple per row, holding that row's values in the column order of
        the record batch. A batch that has no columns contributes no rows,
        because the rows are produced by zipping the columns together.
    """
    # create file system for file interface objects from S3
    fs = s3fs.S3FileSystem()
    # The ``with`` block wraps the yields on purpose: the file must stay open
    # while the caller is still consuming rows, and is closed once the
    # generator is exhausted or closed.
    with fs.open(s3_uri) as fp:
        # convert the python file object into a ParquetFile object for iterating
        parquet_file = pq.ParquetFile(fp)
        # an iterator of pyarrow.RecordBatch
        record_batches = parquet_file.iter_batches(
            batch_size=batch_size, columns=columns
        )
        for batch in record_batches:
            # convert from columnar format of pyarrow arrays to a row format
            # of python objects: each column becomes a Python list, and
            # zipping those lists transposes them into row tuples
            yield from zip(*(col.to_pylist() for col in batch.columns))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment