Created
March 23, 2018 12:02
-
-
Save pkit/e06c2d23046a265a4f0fea302a5ce539 to your computer and use it in GitHub Desktop.
Patch TensorFlow 1.6.0 to load libcuda.so.1 dynamically at runtime (via dlopen/DsoLoader shims) instead of linking against the CUDA driver library at build time, so CPU-only hosts can run the same binary.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc..5963c8a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,9 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "cuda/include/cuda.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/core/platform/default/logging.h"
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -25,6 +28,41 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
+namespace dynload {
+
+namespace gpu = ::perftools::gputools;
+
+#define CUDAMALLOC_LIBCUDA_WRAP(__name) \
+  struct DynLoadShim__##__name { \
+    static const char *kName; \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+    static void *GetDsoHandle() { \
+      static auto status = gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+      return status.ValueOrDie(); \
+    } \
+    static FuncPointerT LoadOrDie() { \
+      void *f; \
+      gpu::port::Status s = gpu::port::Env::Default()->GetSymbolFromLibrary( \
+          GetDsoHandle(), kName, &f); \
+      CHECK(s.ok()) << "could not find " << kName \
+                    << " in libcuda DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPointerT>(f); \
+    } \
+    static FuncPointerT DynLoad() { \
+      static FuncPointerT f = LoadOrDie(); \
+      return f; \
+    } \
+    template <typename... Args> \
+    CUresult operator()(Args... args) { \
+      return DynLoad()(args...); \
+    } \
+  } __name; \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+  CUDAMALLOC_LIBCUDA_WRAP(cuMemAlloc_v2);
+  CUDAMALLOC_LIBCUDA_WRAP(cuMemFree_v2);
+}  // namespace dynload
+
 namespace tensorflow {
 
 GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
@@ -40,7 +78,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // allocate with cudaMalloc
   gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
-  CUresult res = cuMemAlloc(&rv, num_bytes);
+  CUresult res = dynload::cuMemAlloc_v2(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
     return nullptr;
@@ -53,7 +91,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #ifdef GOOGLE_CUDA
   // free with cudaFree
-  CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+  CUresult res = dynload::cuMemFree_v2(reinterpret_cast<CUdeviceptr>(ptr));
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemFree failed to free " << ptr;
   }
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240..50b2930 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -76,7 +76,6 @@ cc_library(
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
        "@local_config_cuda//cuda:cublas",
-        "@local_config_cuda//cuda:cuda_driver",
        "@local_config_cuda//cuda:cudnn",
        "@local_config_cuda//cuda:cufft",
        "@local_config_cuda//cuda:curand",
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc | |
index a017ff6..7c5147a 100644 | |
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc | |
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc | |
@@ -21,7 +21,9 @@ limitations under the License. | |
#include <set> | |
#include <utility> | |
+#include "tensorflow/stream_executor/platform/port.h" | |
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" | |
+#include "tensorflow/stream_executor/dso_loader.h" | |
#include "tensorflow/stream_executor/lib/casts.h" | |
#include "tensorflow/stream_executor/lib/env.h" | |
#include "tensorflow/stream_executor/lib/error.h" | |
@@ -57,6 +59,107 @@ namespace perftools { | |
namespace gputools { | |
namespace cuda { | |
+namespace dynload { | |
+ | |
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \ | |
+ struct DynLoadShim__##__name { \ | |
+ static const char *kName; \ | |
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ | |
+ static void *GetDsoHandle() { \ | |
+ static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \ | |
+ return status.ValueOrDie(); \ | |
+ } \ | |
+ static FuncPointerT LoadOrDie() { \ | |
+ void *f; \ | |
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \ | |
+ GetDsoHandle(), kName, &f); \ | |
+ CHECK(s.ok()) << "could not find " << kName \ | |
+ << " in libcuda DSO; dlerror: " << s.error_message(); \ | |
+ return reinterpret_cast<FuncPointerT>(f); \ | |
+ } \ | |
+ static FuncPointerT DynLoad() { \ | |
+ static FuncPointerT f = LoadOrDie(); \ | |
+ return f; \ | |
+ } \ | |
+ template <typename... Args> \ | |
+ CUresult operator()(Args... args) { \ | |
+ return DynLoad()(args...); \ | |
+ } \ | |
+ } __name; \ | |
+ const char *DynLoadShim__##__name::kName = #__name; | |
+ | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2); | |
+#if CUDA_VERSION >= 7000 | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRetain); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRelease); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxSetFlags); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxGetState); | |
+#endif | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8Async); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent); | |
+ | |
+} // namespace dynload | |
+ | |
namespace { | |
// Manages the singleton map of contexts that we've created, mapping | |
@@ -273,7 +376,7 @@ namespace { | |
// Call cuCtxtSynchronize and crash if it doesn't succeed. | |
void SynchronizeOrDie() { | |
- auto res = cuCtxSynchronize(); | |
+ auto res = dynload::cuCtxSynchronize(); | |
if (res != CUDA_SUCCESS) { | |
LOG(FATAL) << "Synchronize found " | |
<< ToString(res) << " :: " << port::CurrentStackTrace(); | |
@@ -309,7 +412,7 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) { | |
to_restore_ = (tls->depth == 1 ? nullptr : tls->context); | |
// Set the context and update thread local. | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context())); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(cuda_context->context())); | |
tls->id = cuda_context->id(); | |
tls->context = cuda_context; | |
} | |
@@ -334,7 +437,7 @@ ScopedActivateContext::~ScopedActivateContext() { | |
} | |
// Set context and update thread local. | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context())); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(to_restore_->context())); | |
tls->id = to_restore_->id(); | |
tls->context = to_restore_; | |
} | |
@@ -395,8 +498,10 @@ static port::Status InternalInit() { | |
CUresult res = CUDA_ERROR_NO_DEVICE; | |
if (FLAGS_gpuexec_cuda_driver_inject_init_error) { | |
LOG(ERROR) << "injecting CUDA init error; initialization will fail"; | |
- } else { | |
- res = cuInit(0 /* = flags */); | |
+ } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) { | |
+ // We only call cuInit if we can dynload libcuda. | |
+ | |
+ res = dynload::cuInit(0 /* = flags */); | |
} | |
if (res == CUDA_SUCCESS) { | |
@@ -429,7 +534,7 @@ static port::Status InternalInit() { | |
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal, | |
CUdevice *device) { | |
- CUresult res = cuDeviceGet(device, device_ordinal); | |
+ CUresult res = dynload::cuDeviceGet(device, device_ordinal); | |
if (res == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} | |
@@ -443,7 +548,8 @@ static port::Status InternalInit() { | |
string *device_name) { | |
static const size_t kCharLimit = 64; | |
port::InlinedVector<char, 4> chars(kCharLimit); | |
- CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device); | |
+ CUresult res = | |
+ dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get device name for " << device << ": " | |
<< ToString(res); | |
@@ -499,8 +605,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
unsigned int former_primary_context_flags; | |
int former_primary_context_is_active; | |
CHECK_EQ(CUDA_SUCCESS, | |
- cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, | |
- &former_primary_context_is_active)); | |
+ dynload::cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, | |
+ &former_primary_context_is_active)); | |
if (former_primary_context_flags != flags) { | |
if (former_primary_context_is_active) { | |
LOG(ERROR) | |
@@ -508,16 +614,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
<< former_primary_context_flags << ") than the desired flag set (" | |
<< flags << ")."; | |
} else { | |
- CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags)); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuDevicePrimaryCtxSetFlags(device, flags)); | |
} | |
} | |
} | |
former_context = CUDADriver::CurrentContextOrDie(); | |
- res = cuDevicePrimaryCtxRetain(&new_context, device); | |
+ res = dynload::cuDevicePrimaryCtxRetain(&new_context, device); | |
if (former_context != nullptr) { | |
CUdevice former_device; | |
- if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) { | |
+ if (dynload::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) { | |
if (former_device == device) { | |
if (former_context == new_context) { | |
VLOG(2) << "The primary context " << former_context | |
@@ -544,10 +650,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
<< "creating context when one is currently active; existing: " | |
<< former_context; | |
} | |
- res = cuCtxCreate(&new_context, flags, device); | |
+ res = dynload::cuCtxCreate(&new_context, flags, device); | |
#endif | |
} | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context)); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(former_context)); | |
if (res == CUDA_SUCCESS) { | |
*context = CreatedContexts::Add(new_context); | |
@@ -580,14 +686,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
} | |
#if CUDA_VERSION >= 7000 | |
CUcontext former_context = CurrentContext(); | |
- CUresult res = cuCtxSetCurrent(context->context()); | |
+ CUresult res = dynload::cuCtxSetCurrent(context->context()); | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
- cuCtxSetCurrent(former_context); | |
+ dynload::cuCtxGetDevice(&device); | |
+ dynload::cuCtxSetCurrent(former_context); | |
- res = cuDevicePrimaryCtxRelease(device); | |
+ res = dynload::cuDevicePrimaryCtxRelease(device); | |
#else | |
- CUresult res = cuCtxDestroy(context->context()); | |
+ CUresult res = dynload::cuCtxDestroy_v2(context->context()); | |
#endif | |
if (res != CUDA_SUCCESS) { | |
@@ -600,7 +706,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, | |
CUfunction func, | |
int *attribute_value) { | |
- CUresult res = cuFuncGetAttribute(attribute_value, attribute, func); | |
+ CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query kernel attribute. kernel: " << func | |
<< ", attribute: " << attribute; | |
@@ -611,7 +717,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, | |
CUfunc_cache cache_config) { | |
- CUresult res = cuFuncSetCacheConfig(function, cache_config); | |
+ CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function | |
<< ", config: " << cache_config << ", result: " << ToString(res); | |
@@ -625,10 +731,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUsharedconfig shared_mem_config; | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config); | |
+ CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config); | |
if (result != CUDA_SUCCESS) { | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
+ dynload::cuCtxGetDevice(&device); | |
LOG(ERROR) << "failed to get CUDA device shared memory config. " | |
<< "Context device ID: " << device | |
<< ", result: " << ToString(result); | |
@@ -642,10 +748,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( | |
CudaContext* context, CUsharedconfig shared_mem_config) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuCtxSetSharedMemConfig(shared_mem_config); | |
+ CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); | |
if (result != CUDA_SUCCESS) { | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
+ dynload::cuCtxGetDevice(&device); | |
LOG(ERROR) << "failed to set CUDA device shared memory config. " | |
<< "Context device ID: " << device | |
<< ", config: " << shared_mem_config | |
@@ -668,9 +774,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z | |
<< " bdx: " << block_dim_x << " bdy: " << block_dim_y | |
<< " bdz: " << block_dim_z; | |
- CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, | |
- block_dim_x, block_dim_y, block_dim_z, | |
- shared_mem_bytes, stream, kernel_params, extra); | |
+ CUresult res = dynload::cuLaunchKernel( | |
+ function, grid_dim_x, grid_dim_y, grid_dim_z, | |
+ block_dim_x, block_dim_y, block_dim_z, | |
+ shared_mem_bytes, stream, kernel_params, extra); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to launch CUDA kernel: " << function | |
<< "; result: " << ToString(res); | |
@@ -684,7 +791,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
const char *cubin_bytes, | |
CUmodule *module) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuModuleLoadFatBinary(module, cubin_bytes); | |
+ CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{port::error::INTERNAL, | |
"failed to load in-memory CUBIN: " + ToString(result)}; | |
@@ -727,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
// TODO(leary) Need to see if NVIDIA can expunge the leakiness in their | |
// module loading: see http://b/13248943 | |
- res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options, | |
- option_values); | |
+ res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), | |
+ options, option_values); | |
} | |
// The PTX JIT mutates the values in the option values array to reflect the | |
@@ -767,7 +874,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr location, | |
uint8 value, size_t size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD8(location, value, size); | |
+ CUresult res = dynload::cuMemsetD8_v2(location, value, size); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to memset memory: " << ToString(res); | |
return false; | |
@@ -780,7 +887,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint32 value, | |
size_t uint32_count) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD32(location, value, uint32_count); | |
+ CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to memset memory: " << ToString(res); | |
return false; | |
@@ -794,7 +901,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
size_t uint32_count, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD8Async(location, value, uint32_count, stream); | |
+ CUresult res = | |
+ dynload::cuMemsetD8Async(location, value, uint32_count, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); | |
return false; | |
@@ -809,7 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
size_t uint32_count, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD32Async(location, value, uint32_count, stream); | |
+ CUresult res = | |
+ dynload::cuMemsetD32Async(location, value, uint32_count, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); | |
return false; | |
@@ -823,7 +932,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
StreamCallback callback, | |
void *data) { | |
// Note: flags param is required to be zero according to CUDA 6.0. | |
- CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */); | |
+ CUresult res = | |
+ dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "unable to add host callback: " << ToString(res); | |
return false; | |
@@ -837,7 +947,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUfunction *function) { | |
ScopedActivateContext activated{context}; | |
CHECK(module != nullptr && kernel_name != nullptr); | |
- CUresult res = cuModuleGetFunction(function, module, kernel_name); | |
+ CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name | |
<< "\" from module: " << ToString(res); | |
@@ -855,7 +965,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
CHECK(module != nullptr && symbol_name != nullptr && | |
(dptr != nullptr || bytes != nullptr)); | |
- CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name); | |
+ CUresult res = | |
+ dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name); | |
if (res != CUDA_SUCCESS) { | |
// symbol may not be found in the current module, but it may reside in | |
// another module. | |
@@ -870,7 +981,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ void CUDADriver::UnloadModule(CudaContext *context, | |
CUmodule module) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuModuleUnload(module); | |
+ CUresult res = dynload::cuModuleUnload(module); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to unload module " << module | |
<< "; leaking: " << ToString(res); | |
@@ -881,7 +992,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
CUdevice device = -1; | |
- CUresult result = cuCtxGetDevice(&device); | |
+ CUresult result = dynload::cuCtxGetDevice(&device); | |
if (result == CUDA_SUCCESS) { | |
return device; | |
} | |
@@ -897,7 +1008,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
// up synchronization with respect to memsets and any other things that have | |
// to occur on the default stream? | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuStreamCreate(out, 0); | |
+ CUresult res = dynload::cuStreamCreate(out, 0); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not allocate CUDA stream for context " << context | |
<< ": " << ToString(res); | |
@@ -916,7 +1027,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuStreamDestroy(*stream); | |
+ CUresult res = dynload::cuStreamDestroy_v2(*stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to destroy CUDA stream for context " << context | |
<< ": " << ToString(res); | |
@@ -931,7 +1042,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 bytes) { | |
ScopedActivateContext activated{context}; | |
CUdeviceptr result = 0; | |
- CUresult res = cuMemAlloc(&result, bytes); | |
+ CUresult res = dynload::cuMemAlloc_v2(&result, bytes); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to allocate " | |
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes | |
@@ -948,7 +1059,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
void *location) { | |
ScopedActivateContext activation{context}; | |
CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); | |
- CUresult res = cuMemFree(pointer); | |
+ CUresult res = dynload::cuMemFree_v2(pointer); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to free device memory at " << location | |
<< "; result: " << ToString(res); | |
@@ -962,7 +1073,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
void *host_mem = nullptr; | |
// "Portable" memory is visible to all CUDA contexts. Safe for our use model. | |
- CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); | |
+ CUresult res = | |
+ dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to alloc " << bytes | |
<< " bytes on host: " << ToString(res); | |
@@ -973,7 +1085,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ void CUDADriver::HostDeallocate(CudaContext* context, | |
void *location) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemFreeHost(location); | |
+ CUresult res = dynload::cuMemFreeHost(location); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error deallocating host memory at " << location << ": " | |
<< ToString(res); | |
@@ -985,7 +1097,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
// "Portable" memory is visible to all CUDA contexts. Safe for our use model. | |
CUresult res = | |
- cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); | |
+ dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error registering host memory at " << location << ": " | |
<< ToString(res); | |
@@ -997,7 +1109,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ bool CUDADriver::HostUnregister(CudaContext* context, | |
void *location) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemHostUnregister(location); | |
+ CUresult res = dynload::cuMemHostUnregister(location); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error unregistering host memory at " << location << ": " | |
<< ToString(res); | |
@@ -1014,7 +1126,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventDestroy(*event); | |
+ CUresult res = dynload::cuEventDestroy_v2(*event); | |
*event = nullptr; | |
switch (res) { | |
@@ -1038,7 +1150,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUevent event, | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventRecord(event, stream); | |
+ CUresult res = dynload::cuEventRecord(event, stream); | |
switch (res) { | |
case CUDA_SUCCESS: | |
return port::Status::OK(); | |
@@ -1059,7 +1171,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent( | |
CudaContext *context, CUevent event) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventQuery(event); | |
+ CUresult res = dynload::cuEventQuery(event); | |
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { | |
return port::Status{ | |
port::error::INTERNAL, | |
@@ -1075,12 +1187,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
// The stop event must have completed in order for cuEventElapsedTime to | |
// work. | |
- CUresult res = cuEventSynchronize(stop); | |
+ CUresult res = dynload::cuEventSynchronize(stop); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); | |
return false; | |
} | |
- res = cuEventElapsedTime(elapsed_milliseconds, start, stop); | |
+ res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get elapsed time between events: " | |
<< ToString(res); | |
@@ -1094,7 +1206,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream, | |
CUevent event) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */); | |
+ CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not wait stream on event: " << ToString(res); | |
return false; | |
@@ -1105,7 +1217,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuCtxSynchronize(); | |
+ CUresult res = dynload::cuCtxSynchronize(); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) | |
<< " :: " << port::CurrentStackTrace(); | |
@@ -1119,7 +1231,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
CHECK(stream != nullptr); | |
- CUresult res = cuStreamSynchronize(stream); | |
+ CUresult res = dynload::cuStreamSynchronize(stream); | |
if (res != CUDA_SUCCESS) { | |
port::Status status = port::InternalError( | |
port::StrCat("could not synchronize on CUDA stream: ", ToString(res))); | |
@@ -1135,7 +1247,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
CHECK(stream != nullptr); | |
- CUresult res = cuStreamQuery(stream); | |
+ CUresult res = dynload::cuStreamQuery(stream); | |
if (res == CUDA_SUCCESS) { | |
return true; | |
} | |
@@ -1151,7 +1263,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr gpu_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size); | |
+ CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError( | |
port::Printf("failed to synchronous memcpy from device to host: %s; " | |
@@ -1169,7 +1281,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
const void *host_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size); | |
+ CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError(port::Printf( | |
"failed to synchronous memcpy from host to device: %s; GPU dst: %p;" | |
@@ -1186,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr gpu_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size); | |
+ CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError(port::Printf( | |
"failed to synchronous memcpy from host to device: %s; GPU dst: %p; " | |
@@ -1204,7 +1316,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream); | |
+ CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from device to host: %s; host dst: %p; " | |
@@ -1224,7 +1336,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream); | |
+ CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " | |
@@ -1243,7 +1355,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); | |
+ CUresult result = | |
+ dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream); | |
if (result != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from device to device: %s" | |
@@ -1279,7 +1392,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventCreate(result, cuflags); | |
+ CUresult res = dynload::cuEventCreate(result, cuflags); | |
if (res == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
@@ -1295,7 +1408,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ int CUDADriver::GetDeviceCount() { | |
int device_count = 0; | |
- CUresult res = cuDeviceGetCount(&device_count); | |
+ CUresult res = dynload::cuDeviceGetCount(&device_count); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); | |
return 0; | |
@@ -1310,8 +1423,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext( | |
CUdeviceptr pointer) { | |
CudaContext* context = nullptr; | |
- CUresult result = | |
- cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); | |
+ CUresult result = dynload::cuPointerGetAttribute( | |
+ &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); | |
if (result == CUDA_SUCCESS) { | |
CHECK(context != nullptr) << "success should entail non-null context"; | |
return context; | |
@@ -1326,8 +1439,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( | |
CUdeviceptr pointer) { | |
unsigned int value; | |
- CUresult result = | |
- cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); | |
+ CUresult result = dynload::cuPointerGetAttribute( | |
+ &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); | |
if (result == CUDA_SUCCESS) { | |
switch (value) { | |
case CU_MEMORYTYPE_DEVICE: | |
@@ -1350,7 +1463,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr, | |
CUdeviceptr *base, | |
size_t *size) { | |
- CUresult result = cuMemGetAddressRange(base, size, dptr); | |
+ CUresult result = dynload::cuMemGetAddressRange(base, size, dptr); | |
if (result == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} else if (result == CUDA_ERROR_NOT_FOUND) { | |
@@ -1384,7 +1497,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdevice device) { | |
*cc_major = 0; | |
*cc_minor = 0; | |
- CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device); | |
+ CUresult result = | |
+ dynload::cuDeviceComputeCapability(cc_major, cc_minor, device); | |
if (result == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} | |
@@ -1401,7 +1515,7 @@ template <typename T> | |
static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
CUdevice_attribute attribute) { | |
int value = -1; | |
- CUresult result = cuDeviceGetAttribute(&value, attribute, device); | |
+ CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{ | |
port::error::NOT_FOUND, | |
@@ -1456,24 +1570,24 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z, | |
CUdevice device) { | |
int value; | |
- CUresult res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); | |
+ CUresult res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); | |
return false; | |
} | |
*x = value; | |
- res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); | |
+ res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); | |
return false; | |
} | |
*y = value; | |
- res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); | |
+ res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); | |
return false; | |
@@ -1483,7 +1597,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
} | |
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { | |
- CUresult res = cuDriverGetVersion(driver_version); | |
+ CUresult res = dynload::cuDriverGetVersion(driver_version); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query driver version: " << ToString(res); | |
return false; | |
@@ -1494,7 +1608,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, | |
int device_ordinal) { | |
- CUresult res = cuDeviceGetProperties(device_properties, device_ordinal); | |
+ CUresult res = | |
+ dynload::cuDeviceGetProperties(device_properties, device_ordinal); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query device properties: " << ToString(res); | |
return false; | |
@@ -1505,8 +1620,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { | |
int value = -1; | |
- CUresult res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); | |
+ CUresult res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query ECC status: " << ToString(res); | |
return false; | |
@@ -1522,7 +1637,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
ScopedActivateContext activation{context}; | |
size_t free = 0; | |
size_t total = 0; | |
- CUresult res = cuMemGetInfo(&free, &total); | |
+ CUresult res = dynload::cuMemGetInfo_v2(&free, &total); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query device memory info: " << ToString(res); | |
return false; | |
@@ -1536,7 +1651,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, | |
uint64 *result) { | |
size_t value = -1; | |
- CUresult res = cuDeviceTotalMem(&value, device); | |
+ CUresult res = dynload::cuDeviceTotalMem_v2(&value, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query total available memory: " << ToString(res); | |
return false; | |
@@ -1551,7 +1666,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
static const int kBufferSize = 64; | |
port::InlinedVector<char, 4> chars(kBufferSize); | |
chars[kBufferSize - 1] = '\0'; | |
- CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); | |
+ CUresult res = | |
+ dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); | |
return pci_bus_id; | |
@@ -1579,7 +1695,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
<< to_device.status(); | |
return false; | |
} | |
- CUresult res = cuDeviceCanAccessPeer( | |
+ CUresult res = dynload::cuDeviceCanAccessPeer( | |
&can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); | |
@@ -1596,7 +1712,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
} | |
ScopedActivateContext activated{from}; | |
- CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */); | |
+ CUresult result = | |
+ dynload::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */); | |
if (result != CUDA_SUCCESS && | |
result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { | |
return port::Status{ | |
@@ -1614,7 +1731,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
ScopedActivateContext activation{context}; | |
int max_blocks; | |
- CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor( | |
+ CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor( | |
&max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{ | |
@@ -1628,7 +1745,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ CUcontext CUDADriver::CurrentContextOrDie() { | |
CUcontext current = nullptr; | |
- CUresult result = cuCtxGetCurrent(¤t); | |
+ CUresult result = dynload::cuCtxGetCurrent(¤t); | |
if (result != CUDA_SUCCESS) { | |
LOG(FATAL) << "failed to query current context: " << ToString(result); | |
} | |
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
index 4bbd531..e840e6c 100644 | |
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
@@ -24,7 +24,9 @@ limitations under the License. | |
#else | |
#include <unistd.h> | |
#endif | |
+#include "tensorflow/stream_executor/platform/port.h" | |
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" | |
+#include "tensorflow/stream_executor/dso_loader.h" | |
#include "tensorflow/stream_executor/cuda/cuda_driver.h" | |
#include "tensorflow/stream_executor/cuda/cuda_event.h" | |
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" | |
@@ -1159,6 +1161,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { | |
namespace gpu = ::perftools::gputools; | |
void initialize_cuda_gpu_executor() { | |
+ port::StatusOr<void *> status = | |
+ gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); | |
+ if (!status.ok()) { | |
+ gpu::cuda::Diagnostician::LogDriverVersionInformation(); | |
+ LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH"); | |
+ LOG(INFO) << "failed to find libcuda.so on this system: " | |
+ << status.status(); | |
+ } | |
+ | |
+ // TODO(b/22689637): Temporary until users are migrated off of PlatformKind. | |
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId( | |
+ gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId); | |
+ | |
*gpu::internal::MakeCUDAExecutorImplementation() = []( | |
const gpu::PluginConfig &config) { | |
return new gpu::cuda::CUDAExecutor{config}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment