# Triggering code for tensorflow/tensorflow issue #43119
import tensorflow as tf
# These are the "givens" from the issue -- essentially they're set to mimic
# the actual values I was using when I ran into the bug
ABSURDLY_LARGE_NUMBER = 2 * 10**17
YOUR_CSV_PATH = "temp-data.csv"
LABEL_COLUMN = "label"
INPUT_SIZE = 300
INPUT_COLUMNS = [ str(x) for x in range(INPUT_SIZE) ]
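# A rough sense of scale (an assumption about the mechanism, per the issue):
# the shuffle buffer reserves space proportional to its size, so if each
# buffered element costs even ~64 bytes, the requested allocation already
# exceeds what a signed 64-bit size can represent:
assert ABSURDLY_LARGE_NUMBER * 64 > 2**63 - 1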
# This function is the culprit -- setting `shuffle_buffer_size` to a number
# large enough to cause the requested allocation size to overflow turns out to
# break things down the line. More information in the issue.
def get_dataset(file_path, **kwargs):
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=1,
        num_epochs=1,
        label_name=LABEL_COLUMN,
        select_columns=[LABEL_COLUMN] + INPUT_COLUMNS,
        header=True,
        shuffle=True,
        shuffle_buffer_size=ABSURDLY_LARGE_NUMBER,
        **kwargs)
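# For comparison, a hypothetical safer variant (not part of the issue): cap
# the buffer at a size that can't overflow. Assuming the oversized allocation
# is indeed the culprit, this version shouldn't trigger the crash.
def get_dataset_safe(file_path, **kwargs):
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=1,
        num_epochs=1,
        label_name=LABEL_COLUMN,
        select_columns=[LABEL_COLUMN] + INPUT_COLUMNS,
        header=True,
        shuffle=True,
        shuffle_buffer_size=10_000,  # small enough to allocate safely
        **kwargs)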
# In order to run `get_dataset`, we'll first write a small CSV to a file
#
# Aside from the header, all of the values in the dataset will be zero,
# because it doesn't actually matter what's there
header_cols = [LABEL_COLUMN] + INPUT_COLUMNS
header = ','.join(header_cols)
line = ','.join(['0' for _ in header_cols])
with open(YOUR_CSV_PATH, "w+") as f:
    s = '\n'.join([header, line])
    f.write(s)
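# Optional sanity check: the file should hold exactly two lines -- a
# (1 + INPUT_SIZE)-column header and one all-zero row
with open(YOUR_CSV_PATH) as f:
    written = f.read().splitlines()
assert len(written) == 2
assert written[0].split(',') == header_cols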
# And there we go! That should be enough to get us going :)
################################################################################
# The next few pieces are mostly adapted from this tutorial:
# https://www.tensorflow.org/tutorials/load_data/csv
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names
        self.batch_size = 1

    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.uint8) for feat in numeric_features]
        numeric_features = tf.reshape(numeric_features, (self.batch_size, INPUT_SIZE))
        features['numeric'] = numeric_features
        return features, labels
def packed_dataset(file_path):
    raw_data = get_dataset(file_path)
    numeric_features = INPUT_COLUMNS
    return raw_data.map(PackNumericFeatures(numeric_features))
numeric_column = tf.feature_column.numeric_column(
    'numeric', shape=[INPUT_SIZE])
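# To see what the packing step produces without building the (broken) dataset,
# we can call it on a hand-built features dict; the zero tensors here are
# stand-ins for one parsed CSV row
demo_features = {name: tf.zeros([1], dtype=tf.int32) for name in INPUT_COLUMNS}
demo_packed, _ = PackNumericFeatures(INPUT_COLUMNS)(demo_features, tf.zeros([1]))
assert demo_packed['numeric'].shape == (1, INPUT_SIZE)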
################################################################################
# And then we get to constructing the model itself. Not too much to see here
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures([numeric_column]),
    tf.keras.layers.Dense(1, activation='relu'),
])
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.SGD(),
    metrics=['accuracy'],
)
# Finally, when we try to evaluate this model (using the badly prepared
# dataset from above), our good friend Mr. Segfault shows up :)
model.evaluate(packed_dataset(YOUR_CSV_PATH))
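# For contrast (assuming the capped-buffer sketch above is correct), the same
# model should evaluate cleanly on a dataset built with a sane buffer size.
# Left commented out, since the line above segfaults before reaching it:
#   safe_data = get_dataset_safe(YOUR_CSV_PATH).map(PackNumericFeatures(INPUT_COLUMNS))
#   model.evaluate(safe_data)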