Last active
September 17, 2021 09:32
-
-
Save ay27/caba007cb2e27ca824c587c80303aafa to your computer and use it in GitHub Desktop.
[TF CUDA Optimization Options] All the TensorFlow-with-CUDA optimizations you need! Gives a speedup of at least 1.3x on the Volta and Turing architectures. Only works with a TensorFlow GPU version built from source with CUDA/cuDNN support, or with the Docker image from NVIDIA GPU Cloud <ngc.nvidia.com>. #tensorflow #cuda
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mainly from:
# 1. https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
# 2. https://github.com/NVIDIA/DeepLearningExamples/issues/57
# 3. https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#variablesaddtf
def is_using_hvd():
    """Report whether this process should be treated as a Horovod worker.

    The decision is based purely on the environment: both
    ``OMPI_COMM_WORLD_RANK`` and ``OMPI_COMM_WORLD_SIZE`` must be present
    in ``os.environ``. Their values are not inspected.

    Returns:
        bool: True when both variables are set, False otherwise.
    """
    required_vars = ("OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE")
    # all() already yields the boolean we want; no if/else or list needed.
    return all(name in os.environ for name in required_vars)
def cuda_opt(use_mixed_precision=True):
    """Export the CUDA / TensorFlow tuning flags into ``os.environ``.

    Args:
        use_mixed_precision: When true, additionally set the three
            ``TF_ENABLE_AUTO_MIXED_PRECISION*`` variables.

    Returns:
        None. The only effect is mutation of the process environment.
    """
    # NOTE(review): presumably this has to run before TensorFlow reads these
    # variables (i.e. before the first session is created) -- confirm.
    env = os.environ

    # ============================================
    # Optimisation flags - do not remove
    # ============================================
    env.update({
        "CUDA_CACHE_DISABLE": "0",
        "HOROVOD_GPU_ALLREDUCE": "NCCL",
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "TF_GPU_THREAD_MODE": "gpu_private",
    })

    # One GPU thread for a standalone process, otherwise the Horovod world size.
    env["TF_GPU_THREAD_COUNT"] = str(hvd.size()) if is_using_hvd() else "1"

    env.update({
        "TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT": "1",
        "TF_ADJUST_HUE_FUSED": "1",
        "TF_ADJUST_SATURATION_FUSED": "1",
        "TF_ENABLE_WINOGRAD_NONFUSED": "1",
        "TF_SYNC_ON_FINISH": "0",
        "TF_AUTOTUNE_THRESHOLD": "2",
        # NVTX = NVIDIA Tools Extension; only needed for debugging/profiling.
        "TF_DISABLE_NVTX_RANGES": "1",
        # Fast math (essentially the tensor cores).
        "TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32": "1",
        "TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32": "1",
        "TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32": "1",
    })

    if not use_mixed_precision:
        return

    # TF automatic mixed precision performs two things automatically:
    #   1. Inserts the appropriate cast operations into the TensorFlow graph
    #      to use float16 execution and storage where appropriate, enabling
    #      tensor cores plus memory storage and bandwidth savings
    #      (individually: TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE).
    #   2. Turns on automatic loss scaling inside the training Optimizer
    #      (individually: TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING).
    env.update({
        "TF_ENABLE_AUTO_MIXED_PRECISION": "1",
        "TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE": "1",
        "TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING": "1",
    })
def get_session_config(is_training, use_xla=True):
    """Create the ``tf.ConfigProto`` used to open a session.

    Args:
        is_training: When true, the intra-op and inter-op thread pool sizes
            are set as well.
        use_xla: When true, set ``TF_ENABLE_XLA=1`` in the environment and
            switch the global JIT level to ``ON_1``.

    Returns:
        The populated ``tf.ConfigProto`` instance.
    """
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    distributed = is_using_hvd()

    gpu_options = config.gpu_options
    gpu_options.allow_growth = True
    if distributed:
        # Horovod: pin the GPU used by this process to its local rank
        # (one GPU per process).
        gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        os.environ["TF_ENABLE_XLA"] = "1"
        config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1
        )

    gpu_options.force_gpu_compatible = True  # Force pinned memory

    if not is_training:
        return config

    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    # Under Horovod, split the CPU cores between workers (minus two each),
    # never dropping below two threads; standalone runs use a fixed four.
    config.inter_op_parallelism_threads = (
        max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        if distributed
        else 4
    )
    return config
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment