Created
April 6, 2021 13:18
-
-
Save ales-erjavec/1753ba18f90756158e4c030f6a57fdd4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Read the IDX file format as described by http://yann.lecun.com/exdb/mnist/ | |
""" | |
import os | |
import io | |
import struct | |
import mmap | |
from functools import reduce | |
from typing import IO, Tuple, Union | |
import numpy as np | |
def parse_magic(mg: bytes) -> Tuple[np.dtype, int]:
    """Parse the 4-byte IDX magic header.

    The header consists of two zero bytes, a one-byte type code and a
    one-byte dimension count.

    :param mg: exactly the four magic bytes found at the start of the stream.
    :return: ``(dtype, ndim)`` -- the element dtype and the number of
        dimensions. Multi-byte dtypes carry an explicit big-endian byte
        order, because IDX stores values most-significant byte first.
    :raises ValueError: if `mg` is not four bytes long, the two leading
        bytes are not zero, or the type code is not a known IDX type.
    """
    if len(mg) != 4:
        raise ValueError(
            "IDX magic header must be 4 bytes, got %d" % len(mg)
        )
    b1, b2, typecode, ndim = struct.unpack("BBBB", mg)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if b1 != 0 or b2 != 0:
        raise ValueError(
            "Invalid IDX magic header: first two bytes must be zero"
        )
    # Byte order is spelled out so the payload decodes identically on
    # little- and big-endian hosts; single-byte types have no byte order.
    mapping = {
        0x08: "u1",   # unsigned byte
        0x09: "i1",   # signed byte
        0x0B: ">i2",  # short
        0x0C: ">i4",  # int
        0x0D: ">f4",  # float
        0x0E: ">f8",  # double
    }
    try:
        spec = mapping[typecode]
    except KeyError:
        raise ValueError(
            "Unknown IDX type code: 0x%02X" % typecode
        ) from None
    return np.dtype(spec), ndim
def read_idx(f: IO[bytes]) -> np.ndarray:
    """Load a complete IDX stream from `f` into an in-memory array.

    Only `f.read` is used, so `f` may be any binary file object,
    including one that decompresses on the fly.

    The stream is consumed to its end: a 4-byte magic header, one 4-byte
    size per dimension, then the element data.
    """
    element_type, rank = parse_magic(f.read(4))
    # Each dimension size is a big-endian signed 32-bit integer.
    dims = tuple(struct.unpack(">i", f.read(4))[0] for _ in range(rank))
    payload = f.read()
    return np.frombuffer(payload, element_type).reshape(dims)
def mmap_idx(f: Union[int, io.FileIO]) -> np.ndarray:
    """Memory map the contents of an IDX file.

    `f` must be uncompressed and reside on the local filesystem. It is
    either an open file descriptor or a binary file object positioned at
    the start of the IDX header. A descriptor passed as an `int` is not
    closed by this function.

    The returned array is a read-only view over the mapped file, unless
    the shape contains a zero-sized dimension, in which case an ordinary
    empty array is returned and nothing is mapped.

    :raises ValueError: if a dimension size in the header is negative, or
        (from `mmap`) if the file is shorter than the header declares.
    """
    if isinstance(f, int):
        # closefd=False: the caller keeps ownership of the descriptor.
        f = os.fdopen(f, 'rb', buffering=0, closefd=False)
    magic = f.read(4)
    dtype, ndim = parse_magic(magic)
    # One big-endian signed 32-bit size per dimension follows the magic.
    shape = struct.unpack(">%di" % ndim, f.read(4 * ndim))
    if any(dim < 0 for dim in shape):
        raise ValueError("Negative dimension in IDX header: %r" % (shape,))
    # The payload begins right after the header, i.e. at the current position.
    data_start = f.tell()
    count = reduce(int.__mul__, shape, 1)
    if count == 0:
        # mmap treats length=0 as "map the whole file"; nothing to map here.
        return np.empty(shape, dtype)
    # A mapping always starts at a multiple of the allocation granularity, so
    # map from the beginning of the file through the end of the payload and
    # let frombuffer skip the header instead of using mmap's own offset.
    buffer = mmap.mmap(
        f.fileno(),
        length=data_start + count * dtype.itemsize,
        access=mmap.ACCESS_READ,
    )
    return np.frombuffer(
        buffer, dtype, count=count, offset=data_start
    ).reshape(shape)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment