# Created April 2, 2018 11:53
# Test mixed precision training on Volta / CuDNN autotune
import tensorflow as tf
import numpy as np
import math
import os
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    *args, **kwargs):
    """Custom variable getter that forces trainable variables to be stored in
    float32 precision and then casts them to the training precision.

    Intended to be passed as ``custom_getter`` to ``tf.variable_scope``.

    Args:
        getter: The underlying getter callable; it is invoked to actually
            create or fetch the variable.
        name: Variable name, forwarded to ``getter``.
        shape: Variable shape, forwarded to ``getter``.
        dtype: Requested (training) dtype. Trainable variables are stored as
            ``tf.float32`` and cast to this dtype before being returned.
        initializer: Forwarded to ``getter``.
        regularizer: Forwarded to ``getter``.
        *args: Extra positional arguments, forwarded to ``getter``.
        **kwargs: Extra keyword arguments, forwarded to ``getter``. The
            ``trainable`` entry (default ``True`` when absent) decides
            whether float32 storage is applied.

    Returns:
        The variable returned by ``getter``; for trainable variables whose
        requested dtype is not float32, a ``tf.cast`` of it to ``dtype``.
    """
    # `trainable` is expected to arrive as a keyword argument. It is read
    # with .get() rather than popped so that it is still forwarded to
    # `getter` through **kwargs below.
    trainable = kwargs.get('trainable', True)
    storage_dtype = tf.float32 if trainable else dtype
    variable = getter(name, shape, dtype=storage_dtype,
                      initializer=initializer, regularizer=regularizer,
                      *args, **kwargs)
    if trainable and dtype != tf.float32:
        # Hand the model a low-precision view of the float32 master copy.
        variable = tf.cast(variable, dtype)
    return variable
def gradients_with_loss_scaling(loss, variables, loss_scale):
    """Gradient calculation with loss scaling to improve numerical stability
    when training with float16.

    The loss is multiplied by ``loss_scale`` before differentiation and each
    resulting gradient is divided by ``loss_scale`` again.

    Args:
        loss: Scalar loss tensor.
        variables: List of variables to differentiate with respect to.
        loss_scale: Scaling factor applied to the loss.

    Returns:
        A list with one entry per element of ``variables``: the unscaled
        gradient, or ``None`` where ``tf.gradients`` reported no gradient.
    """
    scaled_grads = tf.gradients(loss * loss_scale, variables)
    # tf.gradients yields None for variables not connected to the loss;
    # pass those through instead of failing on `None / loss_scale`.
    return [None if grad is None else grad / loss_scale
            for grad in scaled_grads]
def _conv_relu(inputs, kernel_shape, strides, dtype, name):
    """Build one conv2d + bias + ReLU layer inside name scope ``name``."""
    with tf.name_scope(name) as scope:
        # The kernel is created in `dtype` so it matches the activations;
        # conv2d requires input and filter to share a dtype.
        kernel = tf.Variable(tf.truncated_normal(kernel_shape, dtype=dtype,
                                                 stddev=1e-1), name='weights')
        conv = tf.nn.conv2d(inputs, kernel, strides, padding='SAME')
        biases = tf.Variable(tf.constant(0.0, shape=[kernel_shape[-1]],
                                         dtype=dtype),
                             trainable=True, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        return tf.nn.relu(bias, name=scope)


def _max_pool_3x3(inputs, name):
    """Apply a 3x3 max pool with stride 2 to ``inputs``."""
    # NOTE(review): the padding argument was missing from the pasted source;
    # 'VALID' is assumed here -- confirm against the original script.
    return tf.nn.max_pool(inputs,
                          ksize=[1, 3, 3, 1],
                          strides=[1, 2, 2, 1],
                          padding='VALID',
                          name=name)


def create_simple_model(nbatch, dtype):
    """Build a five-convolution network on random images and return its loss.

    The graph is: conv1 -> pool1 -> conv2 -> pool2 -> conv3 -> conv4 ->
    conv5 -> pool5, followed by ``tf.nn.l2_loss`` of the final activations.

    Args:
        nbatch: Number of images in the (randomly initialised) input batch.
        dtype: TensorFlow dtype used for the images, kernels and biases.

    Returns:
        Scalar tensor holding the L2 loss of the ``pool5`` output.
    """
    # NOTE(review): every variable here is created with tf.Variable, which
    # does not go through a variable scope's custom_getter; a float32-storage
    # getter installed by the caller therefore has no effect on them.
    image_size = 224
    # conv2d needs a 4-D NHWC input whose channel count (3) matches conv1's
    # kernel, and whose dtype matches the kernels.
    # NOTE(review): stddev was lost from the pasted source; 1e-1 is assumed
    # to match the other initialisers in this function -- confirm.
    images = tf.Variable(tf.random_normal([nbatch,
                                           image_size,
                                           image_size, 3],
                                          dtype=dtype,
                                          stddev=1e-1))
    conv1 = _conv_relu(images, [11, 11, 3, 64], [1, 4, 4, 1], dtype, 'conv1')
    pool1 = _max_pool_3x3(conv1, 'pool1')
    conv2 = _conv_relu(pool1, [5, 5, 64, 192], [1, 1, 1, 1], dtype, 'conv2')
    pool2 = _max_pool_3x3(conv2, 'pool2')
    conv3 = _conv_relu(pool2, [3, 3, 192, 384], [1, 1, 1, 1], dtype, 'conv3')
    conv4 = _conv_relu(conv3, [3, 3, 384, 256], [1, 1, 1, 1], dtype, 'conv4')
    conv5 = _conv_relu(conv4, [3, 3, 256, 256], [1, 1, 1, 1], dtype, 'conv5')
    pool5 = _max_pool_3x3(conv5, 'pool5')
    loss = tf.nn.l2_loss(pool5)
    return loss
if __name__ == '__main__':
    # Benchmark script: build the model, run `nminibatches` training steps
    # and report the mean and standard deviation of the per-step wall time.
    import time

    # Turn cuDNN autotuning off for this run.
    os.environ['TF_CUDNN_USE_AUTOTUNE'] = "0"
    nbatch = 32
    learning_rate = 0.1
    momentum = 0.9
    loss_scale = 128
    dtype = tf.float16
    nminibatches = 5000

    # Create training graph
    with tf.device('/gpu:0'), \
            tf.variable_scope('fp32_storage',
                              custom_getter=float32_variable_storage_getter):
        loss = create_simple_model(nbatch, dtype)
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        grad = gradients_with_loss_scaling(loss, variables, loss_scale)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        training_step_op = optimizer.apply_gradients(
            list(zip(grad, variables)))

    # Build an initialization operation.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'

    total_duration = 0.0
    total_duration_squared = 0.0
    # The context manager closes the session (and frees its resources) even
    # if a training step raises.
    with tf.Session(config=config) as sess:
        # Variables must be initialised before the first training step.
        sess.run(init)

        # Run training
        for step in range(nminibatches):
            start_time = time.time()
            _ = sess.run(training_step_op)
            duration = time.time() - start_time
            total_duration += duration
            total_duration_squared += duration * duration

    mn = total_duration / float(nminibatches)
    # E[x^2] - E[x]^2 can round slightly below zero; clamp so sqrt is safe.
    vr = max(total_duration_squared / float(nminibatches) - mn * mn, 0.0)
    sd = math.sqrt(vr)
    print ('Mean batch processing time {} +- {}'.format(mn, sd))
# I always get the error below when running the line
# `training_step_op = optimizer.apply_gradients(zip(grad, variables))`:
#
#     Tensor objects are only iterable when eager execution is enabled.
#     To iterate over this tensor use tf.map_fn.
#
# Does anybody have a clue? Thanks!

