Skip to content

Instantly share code, notes, and snippets.

@deepanshu-yadav
Last active June 7, 2022 10:46
Show Gist options
  • Save deepanshu-yadav/154c4acfc79d22acaf6b8d2865dfd2e9 to your computer and use it in GitHub Desktop.
Save deepanshu-yadav/154c4acfc79d22acaf6b8d2865dfd2e9 to your computer and use it in GitHub Desktop.
import glob
import os

import numpy as np
from sklearn.preprocessing import MinMaxScaler
def npy_header_offset(npy_path):
    """Return the byte offset where the data section of a .npy file begins.

    Validates the NPY magic string, reads the format version and the
    (little-endian) header-length field, consumes the header, and reports
    the resulting file position — i.e. the total number of header bytes.

    Raises:
        ValueError: if the magic string is wrong, the header is not
            newline-terminated, or the format version is unsupported.
    """
    with open(str(npy_path), 'rb') as f:
        magic = f.read(6)
        if magic != b'\x93NUMPY':
            raise ValueError('Invalid NPY file.')
        version_major, version_minor = f.read(2)
        # Format version 1.x stores the header length in 2 bytes,
        # version 2.x in 4 bytes (both little-endian).
        if version_major == 1:
            length_field_size = 2
        elif version_major == 2:
            length_field_size = 4
        else:
            raise ValueError('Unknown NPY file version {}.{}'.format(
                version_major, version_minor))
        header_len = int.from_bytes(f.read(length_field_size), 'little')
        header = f.read(header_len)
        # A well-formed header is padded out and terminated with a newline.
        if not header.endswith(b'\n'):
            raise ValueError('Invalid NPY file.')
        return f.tell()
# Fit the min-max scaler on the training data and count how many examples
# are available for training; validation file list is collected here too.
training_files = glob.glob(os.path.join(train_dir, '*'))
validation_files = glob.glob(os.path.join(validation_dir, '*'))
BATCH_SIZE = 32
NO_OF_EPOCHS = 3
min_max_scaler_train = MinMaxScaler()
total_examples_train = 0
for train_file_name in training_files:
    file_np = np.load(train_file_name)
    rows = file_np.shape[0]
    total_examples_train += rows
    # partial_fit updates the scaler one file at a time, so the whole
    # training set never has to fit in memory at once.
    min_max_scaler_train.partial_fit(file_np)
# Count validation examples. The scaler is deliberately NOT fitted here:
# scaling parameters must come from the training data only.
total_examples_validation = 0
for validation_file_name in validation_files:
    file_np = np.load(validation_file_name)
    rows = file_np.shape[0]
    total_examples_validation += rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment