# Triggering code for tensorflow/tensorflow issue #43119
import tensorflow as tf
# These are the "givens" from the issue -- essentially they're set to mimic
# the actual values I was using when I ran into the bug
ABSURDLY_LARGE_NUMBER = 2 * 10**17
YOUR_CSV_PATH = "temp-data.csv"
LABEL_COLUMN = "label"
INPUT_SIZE = 300
INPUT_COLUMNS = [ str(x) for x in range(INPUT_SIZE) ]
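# A rough sense of scale (an assumption about the mechanism, per the issue):
# the shuffle buffer reserves space proportional to its size, so if each
# buffered element costs even ~64 bytes, the requested allocation already
# exceeds what a signed 64-bit size can represent:
assert ABSURDLY_LARGE_NUMBER * 64 > 2**63 - 1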
# This function is the culprit -- setting `shuffle_buffer_size` to a number
# large enough to cause the requested allocation size to overflow turns out to
# break things down the line. More information in the issue.
def get_dataset(file_path, **kwargs):
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=1,
        num_epochs=1,
        label_name=LABEL_COLUMN,
        select_columns=[LABEL_COLUMN] + INPUT_COLUMNS,
        header=True,
        shuffle=True,
        shuffle_buffer_size=ABSURDLY_LARGE_NUMBER,
        **kwargs)
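# For comparison, a hypothetical safer variant (not part of the issue): cap
# the buffer at a size that can't overflow. Assuming the oversized allocation
# is indeed the culprit, this version shouldn't trigger the crash.
def get_dataset_safe(file_path, **kwargs):
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=1,
        num_epochs=1,
        label_name=LABEL_COLUMN,
        select_columns=[LABEL_COLUMN] + INPUT_COLUMNS,
        header=True,
        shuffle=True,
        shuffle_buffer_size=10_000,  # small enough to allocate safely
        **kwargs)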
# In order to run `get_dataset`, we'll first write a small CSV to a file
#
# Aside from the header, all of the values in the dataset will be zero,
# because it doesn't actually matter what's there
header_cols = [LABEL_COLUMN] + INPUT_COLUMNS
header = ','.join(header_cols)
line = ','.join(['0' for _ in header_cols])
with open(YOUR_CSV_PATH, "w+") as f:
    s = '\n'.join([header, line])
    f.write(s)
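# Optional sanity check: the file should hold exactly two lines -- a
# (1 + INPUT_SIZE)-column header and one all-zero row
with open(YOUR_CSV_PATH) as f:
    written = f.read().splitlines()
assert len(written) == 2
assert written[0].split(',') == header_cols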
# And there we go! That should be enough to get us going :)
################################################################################
# The next few pieces are mostly adapted from this tutorial:
# https://www.tensorflow.org/tutorials/load_data/csv
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names
        self.batch_size = 1

    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.uint8) for feat in numeric_features]
        numeric_features = tf.reshape(numeric_features, (self.batch_size, INPUT_SIZE))
        features['numeric'] = numeric_features
        return features, labels
def packed_dataset(file_path):
    raw_data = get_dataset(file_path)
    numeric_features = INPUT_COLUMNS
    return raw_data.map(PackNumericFeatures(numeric_features))
numeric_column = tf.feature_column.numeric_column(
    'numeric', shape=[INPUT_SIZE])
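# To see what the packing step produces without building the (broken) dataset,
# we can call it on a hand-built features dict; the zero tensors here are
# stand-ins for one parsed CSV row
demo_features = {name: tf.zeros([1], dtype=tf.int32) for name in INPUT_COLUMNS}
demo_packed, _ = PackNumericFeatures(INPUT_COLUMNS)(demo_features, tf.zeros([1]))
assert demo_packed['numeric'].shape == (1, INPUT_SIZE)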
################################################################################
# And then we get to constructing the model itself. Not too much to see here
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures([numeric_column]),
    tf.keras.layers.Dense(1, activation='relu'),
])
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.SGD(),
    metrics=['accuracy'],
)
# Finally, when we try to evaluate this model (using the badly prepared
# dataset from above), our good friend Mr. Segfault shows up :)
model.evaluate(packed_dataset(YOUR_CSV_PATH))
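# For contrast (assuming the capped-buffer sketch above is correct), the same
# model should evaluate cleanly on a dataset built with a sane buffer size.
# Left commented out, since the line above segfaults before reaching it:
#   safe_data = get_dataset_safe(YOUR_CSV_PATH).map(PackNumericFeatures(INPUT_COLUMNS))
#   model.evaluate(safe_data)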