Skip to content

Instantly share code, notes, and snippets.

@ianmcook
Last active August 5, 2024 14:48
Show Gist options
  • Save ianmcook/79f5ed7c73b0663a35ae325b71f52630 to your computer and use it in GitHub Desktop.
Save ianmcook/79f5ed7c73b0663a35ae325b71f52630 to your computer and use it in GitHub Desktop.
Zero null-masked bytes of a fixed-width array in PyArrow
import pyarrow as pa
import numpy as np
import pandas as pd
# Create an array of some fixed-width type containing nulls
a = pa.array(obj=pd.Series([1, 2, 3]), type=pa.int64(), mask=np.array([1, 0, 1], dtype=bool))
# Get the values buffer as a mutable bytearray (a copy; the original
# array's buffer is left untouched)
b = a.buffers()
v = bytearray(b[1].to_pybytes())
# Width in bytes of one element; constant for a fixed-width type, so
# compute it once instead of on every iteration
byte_width = a.type.byte_width
# For each null-masked value...
# (the null mask is computed once and converted to plain Python bools)
# NOTE(review): the byte offsets below assume a.offset == 0, which holds
# for this freshly constructed array but not for a sliced one.
for i, is_null in enumerate(a.is_null().to_pylist()):
    if not is_null:
        continue
    # ...locate the associated bytes in the bytearray
    bytes_start = i * byte_width
    bytes_end = bytes_start + byte_width
    # Examine the bytes and notice they are not zeroed
    print('Original bytes ' + v[bytes_start:bytes_end].hex(), end=' ')
    # Replace them with zero bytes (bytearray(n) yields n zero bytes)
    v[bytes_start:bytes_end] = bytearray(byte_width)
    print('zeroed')
# Replace the values buffer and reassemble the array
b[1] = pa.py_buffer(v)
a_new = pa.Array.from_buffers(a.type, len(a), b)
# Notice PyArrow says the new array is the same as the original
# (equality ignores the bytes underneath null slots); in a REPL/notebook
# this expression displays True
a_new == a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment