# What are the size overheads for serializing tensors?
#
import io
import sys
import numpy as np
# https://huggingface.co/docs/safetensors/index
#from safetensors.torch import save_file
# https://huggingface.co/docs/safetensors/api/numpy
from safetensors.numpy import save
# Grid sizes come from the first three command-line arguments.
if len(sys.argv) < 4:
    # Fail with a readable message instead of a tuple-unpacking error.
    sys.exit("usage: python3 sizes.py nx ny nz")
nx, ny, nz = map(int, sys.argv[1:4])
print(nx, ny, nz)

# Squared coordinates along each axis, spanning [0, 4) for x and [0, 5) for y, z.
x2 = (np.arange(nx)*4/nx)**2
y2 = (np.arange(ny)*5/ny)**2
z2 = (np.arange(nz)*5/nz)**2

# Peak amplitude of the Gaussian test data; it fits in int16.
A = 10000

# x: (nx, ny) float32 Gaussian.  y: (nx, nz) Gaussian cast to int16, so small
# values in the tail become exact zeros.
x = (A*np.exp(-0.5*(x2[:,None] + y2[None,:]))).astype('float32')
y = (A*np.exp(-0.5*(x2[:,None] + z2[None,:]))).astype('int16')

# Raw payload size, the baseline every format is compared against.
print(x.nbytes + y.nbytes)
# Payload size if only the nonzero entries of y had to be stored.
print(x.nbytes + np.count_nonzero(y) * y.itemsize)  # bytes in nonzeros
# Named arrays shared by the dict-based serializers below.
data = {"x": x, "y": y}

# np.savez expects arrays as keyword arguments.  Passing the dict positionally
# stores it as a single pickled object array ("arr_0") instead of two named
# arrays, so it must be unpacked.  Sizes recorded from earlier runs used the
# pickled form and differ slightly.
with io.BytesIO() as f:
    np.savez(f, **data)
    sz = f.getbuffer().nbytes
print(f"np.savez: {sz}")

# safetensors' numpy `save` returns the serialized bytes directly.
sz = len(save(data))
print(f"safetensors: {sz}")
import torch

# The same two arrays, converted to torch tensors under the same names.
tdata = {name: torch.tensor(arr) for name, arr in (("x", x), ("y", y))}
# Raw tensor payload, for comparison with the numpy byte count.
print(sum(t.nbytes for t in tdata.values()))

# Serialize into an in-memory buffer and report how many bytes were written.
with io.BytesIO() as buf:
    torch.save(tdata, buf)
    sz = buf.getbuffer().nbytes
print(f"torch.save: {sz}")
import h5py

# https://docs.h5py.org/en/stable/high/dataset.html#shuffle-filter
# Alternative filter: compression = "gzip"
compression = "lzf"

# Every dataset gets the same compression filter plus the shuffle filter.
filter_opts = {"compression": compression, "shuffle": True}

with io.BytesIO() as buf:
    with h5py.File(buf, "w") as h5:
        for name in data:
            h5.create_dataset(name, data=data[name], **filter_opts)
    # Read the size only after the HDF5 file has been closed.
    sz = buf.getbuffer().nbytes
print(f"hdf5: {sz}")
import zfpy  # github.com/llnl/zfp

# zfpy neither names tensors nor accepts int16, which is acceptable here:
# y is widened to int32, and a small name header is charged to keep the
# comparison with the other formats fair.
x_stream = zfpy.compress_numpy(x, write_header=True)
y_stream = zfpy.compress_numpy(y.astype('int32'), write_header=True)
name_header = b'{"x":____,"y": ____}'

sz = len(x_stream) + len(y_stream) + len(name_header)
print(f"zfpy: {sz}")
% python3 sizes.py 100 1000 1
100 1000 1
400200
400200
np.savez: 400802
safetensors: 400344
400200
torch.save: 401560
hdf5: 280551
zfpy: 195340
% python3 sizes.py 50 50 30
50 50 30
13000
12192
np.savez: 13604
safetensors: 13136
13000
torch.save: 14360
hdf5: 16784
zfpy: 8356
% python3 sizes.py 500 500 30
500 500 30
1030000
1021746
np.savez: 1030607
safetensors: 1030144
1030000
torch.save: 1031384
hdf5: 704563
zfpy: 418020