Created
March 23, 2018 12:02
-
-
Save pkit/e06c2d23046a265a4f0fea302a5ce539 to your computer and use it in GitHub Desktop.
Patch TensorFlow 1.6.0 to load libcuda.so.1 dynamically at runtime (via dlopen/DsoLoader shims) instead of linking against the CUDA driver library at build time, so CPU-only hosts can run the same binary.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc..5963c8a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,9 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "cuda/include/cuda.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/core/platform/default/logging.h"
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -25,6 +28,41 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
+namespace dynload {
+
+namespace gpu = ::perftools::gputools;
+
+#define CUDAMALLOC_LIBCUDA_WRAP(__name) \
+  struct DynLoadShim__##__name { \
+    static const char *kName; \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+    static void *GetDsoHandle() { \
+      static auto status = gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+      return status.ValueOrDie(); \
+    } \
+    static FuncPointerT LoadOrDie() { \
+      void *f; \
+      gpu::port::Status s = gpu::port::Env::Default()->GetSymbolFromLibrary( \
+          GetDsoHandle(), kName, &f); \
+      CHECK(s.ok()) << "could not find " << kName \
+                    << " in libcuda DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPointerT>(f); \
+    } \
+    static FuncPointerT DynLoad() { \
+      static FuncPointerT f = LoadOrDie(); \
+      return f; \
+    } \
+    template <typename... Args> \
+    CUresult operator()(Args... args) { \
+      return DynLoad()(args...); \
+    } \
+  } __name; \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+  CUDAMALLOC_LIBCUDA_WRAP(cuMemAlloc_v2);
+  CUDAMALLOC_LIBCUDA_WRAP(cuMemFree_v2);
+}  // namespace dynload
+
 namespace tensorflow {
 
 GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
@@ -40,7 +78,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // allocate with cudaMalloc
   gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
-  CUresult res = cuMemAlloc(&rv, num_bytes);
+  CUresult res = dynload::cuMemAlloc_v2(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
     return nullptr;
@@ -53,7 +91,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #ifdef GOOGLE_CUDA
   // free with cudaFree
-  CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+  CUresult res = dynload::cuMemFree_v2(reinterpret_cast<CUdeviceptr>(ptr));
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemFree failed to free " << ptr;
   }
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240..50b2930 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -76,7 +76,6 @@ cc_library(
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
        "@local_config_cuda//cuda:cublas",
-        "@local_config_cuda//cuda:cuda_driver",
        "@local_config_cuda//cuda:cudnn",
        "@local_config_cuda//cuda:cufft",
        "@local_config_cuda//cuda:curand",
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc | |
index a017ff6..7c5147a 100644 | |
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc | |
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc | |
@@ -21,7 +21,9 @@ limitations under the License. | |
#include <set> | |
#include <utility> | |
+#include "tensorflow/stream_executor/platform/port.h" | |
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" | |
+#include "tensorflow/stream_executor/dso_loader.h" | |
#include "tensorflow/stream_executor/lib/casts.h" | |
#include "tensorflow/stream_executor/lib/env.h" | |
#include "tensorflow/stream_executor/lib/error.h" | |
@@ -57,6 +59,107 @@ namespace perftools { | |
namespace gputools { | |
namespace cuda { | |
+namespace dynload { | |
+ | |
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \ | |
+ struct DynLoadShim__##__name { \ | |
+ static const char *kName; \ | |
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \ | |
+ static void *GetDsoHandle() { \ | |
+ static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \ | |
+ return status.ValueOrDie(); \ | |
+ } \ | |
+ static FuncPointerT LoadOrDie() { \ | |
+ void *f; \ | |
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \ | |
+ GetDsoHandle(), kName, &f); \ | |
+ CHECK(s.ok()) << "could not find " << kName \ | |
+ << " in libcuda DSO; dlerror: " << s.error_message(); \ | |
+ return reinterpret_cast<FuncPointerT>(f); \ | |
+ } \ | |
+ static FuncPointerT DynLoad() { \ | |
+ static FuncPointerT f = LoadOrDie(); \ | |
+ return f; \ | |
+ } \ | |
+ template <typename... Args> \ | |
+ CUresult operator()(Args... args) { \ | |
+ return DynLoad()(args...); \ | |
+ } \ | |
+ } __name; \ | |
+ const char *DynLoadShim__##__name::kName = #__name; | |
+ | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2); | |
+#if CUDA_VERSION >= 7000 | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRetain); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRelease); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxSetFlags); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxGetState); | |
+#endif | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8Async); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize); | |
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent); | |
+ | |
+} // namespace dynload | |
+ | |
namespace { | |
// Manages the singleton map of contexts that we've created, mapping | |
@@ -273,7 +376,7 @@ namespace { | |
// Call cuCtxtSynchronize and crash if it doesn't succeed. | |
void SynchronizeOrDie() { | |
- auto res = cuCtxSynchronize(); | |
+ auto res = dynload::cuCtxSynchronize(); | |
if (res != CUDA_SUCCESS) { | |
LOG(FATAL) << "Synchronize found " | |
<< ToString(res) << " :: " << port::CurrentStackTrace(); | |
@@ -309,7 +412,7 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) { | |
to_restore_ = (tls->depth == 1 ? nullptr : tls->context); | |
// Set the context and update thread local. | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context())); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(cuda_context->context())); | |
tls->id = cuda_context->id(); | |
tls->context = cuda_context; | |
} | |
@@ -334,7 +437,7 @@ ScopedActivateContext::~ScopedActivateContext() { | |
} | |
// Set context and update thread local. | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context())); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(to_restore_->context())); | |
tls->id = to_restore_->id(); | |
tls->context = to_restore_; | |
} | |
@@ -395,8 +498,10 @@ static port::Status InternalInit() { | |
CUresult res = CUDA_ERROR_NO_DEVICE; | |
if (FLAGS_gpuexec_cuda_driver_inject_init_error) { | |
LOG(ERROR) << "injecting CUDA init error; initialization will fail"; | |
- } else { | |
- res = cuInit(0 /* = flags */); | |
+ } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) { | |
+ // We only call cuInit if we can dynload libcuda. | |
+ | |
+ res = dynload::cuInit(0 /* = flags */); | |
} | |
if (res == CUDA_SUCCESS) { | |
@@ -429,7 +534,7 @@ static port::Status InternalInit() { | |
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal, | |
CUdevice *device) { | |
- CUresult res = cuDeviceGet(device, device_ordinal); | |
+ CUresult res = dynload::cuDeviceGet(device, device_ordinal); | |
if (res == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} | |
@@ -443,7 +548,8 @@ static port::Status InternalInit() { | |
string *device_name) { | |
static const size_t kCharLimit = 64; | |
port::InlinedVector<char, 4> chars(kCharLimit); | |
- CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device); | |
+ CUresult res = | |
+ dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get device name for " << device << ": " | |
<< ToString(res); | |
@@ -499,8 +605,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
unsigned int former_primary_context_flags; | |
int former_primary_context_is_active; | |
CHECK_EQ(CUDA_SUCCESS, | |
- cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, | |
- &former_primary_context_is_active)); | |
+ dynload::cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, | |
+ &former_primary_context_is_active)); | |
if (former_primary_context_flags != flags) { | |
if (former_primary_context_is_active) { | |
LOG(ERROR) | |
@@ -508,16 +614,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
<< former_primary_context_flags << ") than the desired flag set (" | |
<< flags << ")."; | |
} else { | |
- CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags)); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuDevicePrimaryCtxSetFlags(device, flags)); | |
} | |
} | |
} | |
former_context = CUDADriver::CurrentContextOrDie(); | |
- res = cuDevicePrimaryCtxRetain(&new_context, device); | |
+ res = dynload::cuDevicePrimaryCtxRetain(&new_context, device); | |
if (former_context != nullptr) { | |
CUdevice former_device; | |
- if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) { | |
+ if (dynload::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) { | |
if (former_device == device) { | |
if (former_context == new_context) { | |
VLOG(2) << "The primary context " << former_context | |
@@ -544,10 +650,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
<< "creating context when one is currently active; existing: " | |
<< former_context; | |
} | |
- res = cuCtxCreate(&new_context, flags, device); | |
+ res = dynload::cuCtxCreate(&new_context, flags, device); | |
#endif | |
} | |
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context)); | |
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(former_context)); | |
if (res == CUDA_SUCCESS) { | |
*context = CreatedContexts::Add(new_context); | |
@@ -580,14 +686,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
} | |
#if CUDA_VERSION >= 7000 | |
CUcontext former_context = CurrentContext(); | |
- CUresult res = cuCtxSetCurrent(context->context()); | |
+ CUresult res = dynload::cuCtxSetCurrent(context->context()); | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
- cuCtxSetCurrent(former_context); | |
+ dynload::cuCtxGetDevice(&device); | |
+ dynload::cuCtxSetCurrent(former_context); | |
- res = cuDevicePrimaryCtxRelease(device); | |
+ res = dynload::cuDevicePrimaryCtxRelease(device); | |
#else | |
- CUresult res = cuCtxDestroy(context->context()); | |
+ CUresult res = dynload::cuCtxDestroy_v2(context->context()); | |
#endif | |
if (res != CUDA_SUCCESS) { | |
@@ -600,7 +706,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, | |
CUfunction func, | |
int *attribute_value) { | |
- CUresult res = cuFuncGetAttribute(attribute_value, attribute, func); | |
+ CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query kernel attribute. kernel: " << func | |
<< ", attribute: " << attribute; | |
@@ -611,7 +717,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, | |
CUfunc_cache cache_config) { | |
- CUresult res = cuFuncSetCacheConfig(function, cache_config); | |
+ CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function | |
<< ", config: " << cache_config << ", result: " << ToString(res); | |
@@ -625,10 +731,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, | |
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUsharedconfig shared_mem_config; | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config); | |
+ CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config); | |
if (result != CUDA_SUCCESS) { | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
+ dynload::cuCtxGetDevice(&device); | |
LOG(ERROR) << "failed to get CUDA device shared memory config. " | |
<< "Context device ID: " << device | |
<< ", result: " << ToString(result); | |
@@ -642,10 +748,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig( | |
CudaContext* context, CUsharedconfig shared_mem_config) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuCtxSetSharedMemConfig(shared_mem_config); | |
+ CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config); | |
if (result != CUDA_SUCCESS) { | |
CUdevice device; | |
- cuCtxGetDevice(&device); | |
+ dynload::cuCtxGetDevice(&device); | |
LOG(ERROR) << "failed to set CUDA device shared memory config. " | |
<< "Context device ID: " << device | |
<< ", config: " << shared_mem_config | |
@@ -668,9 +774,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z | |
<< " bdx: " << block_dim_x << " bdy: " << block_dim_y | |
<< " bdz: " << block_dim_z; | |
- CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, | |
- block_dim_x, block_dim_y, block_dim_z, | |
- shared_mem_bytes, stream, kernel_params, extra); | |
+ CUresult res = dynload::cuLaunchKernel( | |
+ function, grid_dim_x, grid_dim_y, grid_dim_z, | |
+ block_dim_x, block_dim_y, block_dim_z, | |
+ shared_mem_bytes, stream, kernel_params, extra); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to launch CUDA kernel: " << function | |
<< "; result: " << ToString(res); | |
@@ -684,7 +791,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
const char *cubin_bytes, | |
CUmodule *module) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuModuleLoadFatBinary(module, cubin_bytes); | |
+ CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{port::error::INTERNAL, | |
"failed to load in-memory CUBIN: " + ToString(result)}; | |
@@ -727,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
// TODO(leary) Need to see if NVIDIA can expunge the leakiness in their | |
// module loading: see http://b/13248943 | |
- res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options, | |
- option_values); | |
+ res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), | |
+ options, option_values); | |
} | |
// The PTX JIT mutates the values in the option values array to reflect the | |
@@ -767,7 +874,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr location, | |
uint8 value, size_t size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD8(location, value, size); | |
+ CUresult res = dynload::cuMemsetD8_v2(location, value, size); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to memset memory: " << ToString(res); | |
return false; | |
@@ -780,7 +887,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint32 value, | |
size_t uint32_count) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD32(location, value, uint32_count); | |
+ CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to memset memory: " << ToString(res); | |
return false; | |
@@ -794,7 +901,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
size_t uint32_count, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD8Async(location, value, uint32_count, stream); | |
+ CUresult res = | |
+ dynload::cuMemsetD8Async(location, value, uint32_count, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); | |
return false; | |
@@ -809,7 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
size_t uint32_count, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemsetD32Async(location, value, uint32_count, stream); | |
+ CUresult res = | |
+ dynload::cuMemsetD32Async(location, value, uint32_count, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); | |
return false; | |
@@ -823,7 +932,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
StreamCallback callback, | |
void *data) { | |
// Note: flags param is required to be zero according to CUDA 6.0. | |
- CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */); | |
+ CUresult res = | |
+ dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "unable to add host callback: " << ToString(res); | |
return false; | |
@@ -837,7 +947,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUfunction *function) { | |
ScopedActivateContext activated{context}; | |
CHECK(module != nullptr && kernel_name != nullptr); | |
- CUresult res = cuModuleGetFunction(function, module, kernel_name); | |
+ CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name | |
<< "\" from module: " << ToString(res); | |
@@ -855,7 +965,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
CHECK(module != nullptr && symbol_name != nullptr && | |
(dptr != nullptr || bytes != nullptr)); | |
- CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name); | |
+ CUresult res = | |
+ dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name); | |
if (res != CUDA_SUCCESS) { | |
// symbol may not be found in the current module, but it may reside in | |
// another module. | |
@@ -870,7 +981,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ void CUDADriver::UnloadModule(CudaContext *context, | |
CUmodule module) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuModuleUnload(module); | |
+ CUresult res = dynload::cuModuleUnload(module); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to unload module " << module | |
<< "; leaking: " << ToString(res); | |
@@ -881,7 +992,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
CUdevice device = -1; | |
- CUresult result = cuCtxGetDevice(&device); | |
+ CUresult result = dynload::cuCtxGetDevice(&device); | |
if (result == CUDA_SUCCESS) { | |
return device; | |
} | |
@@ -897,7 +1008,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
// up synchronization with respect to memsets and any other things that have | |
// to occur on the default stream? | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuStreamCreate(out, 0); | |
+ CUresult res = dynload::cuStreamCreate(out, 0); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not allocate CUDA stream for context " << context | |
<< ": " << ToString(res); | |
@@ -916,7 +1027,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuStreamDestroy(*stream); | |
+ CUresult res = dynload::cuStreamDestroy_v2(*stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to destroy CUDA stream for context " << context | |
<< ": " << ToString(res); | |
@@ -931,7 +1042,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 bytes) { | |
ScopedActivateContext activated{context}; | |
CUdeviceptr result = 0; | |
- CUresult res = cuMemAlloc(&result, bytes); | |
+ CUresult res = dynload::cuMemAlloc_v2(&result, bytes); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to allocate " | |
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes | |
@@ -948,7 +1059,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
void *location) { | |
ScopedActivateContext activation{context}; | |
CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); | |
- CUresult res = cuMemFree(pointer); | |
+ CUresult res = dynload::cuMemFree_v2(pointer); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to free device memory at " << location | |
<< "; result: " << ToString(res); | |
@@ -962,7 +1073,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
void *host_mem = nullptr; | |
// "Portable" memory is visible to all CUDA contexts. Safe for our use model. | |
- CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); | |
+ CUresult res = | |
+ dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to alloc " << bytes | |
<< " bytes on host: " << ToString(res); | |
@@ -973,7 +1085,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ void CUDADriver::HostDeallocate(CudaContext* context, | |
void *location) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemFreeHost(location); | |
+ CUresult res = dynload::cuMemFreeHost(location); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error deallocating host memory at " << location << ": " | |
<< ToString(res); | |
@@ -985,7 +1097,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
// "Portable" memory is visible to all CUDA contexts. Safe for our use model. | |
CUresult res = | |
- cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); | |
+ dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error registering host memory at " << location << ": " | |
<< ToString(res); | |
@@ -997,7 +1109,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ bool CUDADriver::HostUnregister(CudaContext* context, | |
void *location) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemHostUnregister(location); | |
+ CUresult res = dynload::cuMemHostUnregister(location); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "error unregistering host memory at " << location << ": " | |
<< ToString(res); | |
@@ -1014,7 +1126,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventDestroy(*event); | |
+ CUresult res = dynload::cuEventDestroy_v2(*event); | |
*event = nullptr; | |
switch (res) { | |
@@ -1038,7 +1150,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUevent event, | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventRecord(event, stream); | |
+ CUresult res = dynload::cuEventRecord(event, stream); | |
switch (res) { | |
case CUDA_SUCCESS: | |
return port::Status::OK(); | |
@@ -1059,7 +1171,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent( | |
CudaContext *context, CUevent event) { | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventQuery(event); | |
+ CUresult res = dynload::cuEventQuery(event); | |
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { | |
return port::Status{ | |
port::error::INTERNAL, | |
@@ -1075,12 +1187,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
ScopedActivateContext activated{context}; | |
// The stop event must have completed in order for cuEventElapsedTime to | |
// work. | |
- CUresult res = cuEventSynchronize(stop); | |
+ CUresult res = dynload::cuEventSynchronize(stop); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); | |
return false; | |
} | |
- res = cuEventElapsedTime(elapsed_milliseconds, start, stop); | |
+ res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to get elapsed time between events: " | |
<< ToString(res); | |
@@ -1094,7 +1206,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream, | |
CUevent event) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */); | |
+ CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not wait stream on event: " << ToString(res); | |
return false; | |
@@ -1105,7 +1217,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuCtxSynchronize(); | |
+ CUresult res = dynload::cuCtxSynchronize(); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) | |
<< " :: " << port::CurrentStackTrace(); | |
@@ -1119,7 +1231,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
CHECK(stream != nullptr); | |
- CUresult res = cuStreamSynchronize(stream); | |
+ CUresult res = dynload::cuStreamSynchronize(stream); | |
if (res != CUDA_SUCCESS) { | |
port::Status status = port::InternalError( | |
port::StrCat("could not synchronize on CUDA stream: ", ToString(res))); | |
@@ -1135,7 +1247,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUstream stream) { | |
ScopedActivateContext activated{context}; | |
CHECK(stream != nullptr); | |
- CUresult res = cuStreamQuery(stream); | |
+ CUresult res = dynload::cuStreamQuery(stream); | |
if (res == CUDA_SUCCESS) { | |
return true; | |
} | |
@@ -1151,7 +1263,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr gpu_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size); | |
+ CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError( | |
port::Printf("failed to synchronous memcpy from device to host: %s; " | |
@@ -1169,7 +1281,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
const void *host_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size); | |
+ CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError(port::Printf( | |
"failed to synchronous memcpy from host to device: %s; GPU dst: %p;" | |
@@ -1186,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdeviceptr gpu_src, | |
uint64 size) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size); | |
+ CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size); | |
if (res != CUDA_SUCCESS) { | |
return port::InternalError(port::Printf( | |
"failed to synchronous memcpy from host to device: %s; GPU dst: %p; " | |
@@ -1204,7 +1316,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream); | |
+ CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from device to host: %s; host dst: %p; " | |
@@ -1224,7 +1336,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream); | |
+ CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " | |
@@ -1243,7 +1355,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
uint64 size, | |
CUstream stream) { | |
ScopedActivateContext activation{context}; | |
- CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); | |
+ CUresult result = | |
+ dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream); | |
if (result != CUDA_SUCCESS) { | |
LOG(ERROR) << port::Printf( | |
"failed to enqueue async memcpy from device to device: %s" | |
@@ -1279,7 +1392,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
} | |
ScopedActivateContext activated{context}; | |
- CUresult res = cuEventCreate(result, cuflags); | |
+ CUresult res = dynload::cuEventCreate(result, cuflags); | |
if (res == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
@@ -1295,7 +1408,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ int CUDADriver::GetDeviceCount() { | |
int device_count = 0; | |
- CUresult res = cuDeviceGetCount(&device_count); | |
+ CUresult res = dynload::cuDeviceGetCount(&device_count); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); | |
return 0; | |
@@ -1310,8 +1423,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext( | |
CUdeviceptr pointer) { | |
CudaContext* context = nullptr; | |
- CUresult result = | |
- cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); | |
+ CUresult result = dynload::cuPointerGetAttribute( | |
+ &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); | |
if (result == CUDA_SUCCESS) { | |
CHECK(context != nullptr) << "success should entail non-null context"; | |
return context; | |
@@ -1326,8 +1439,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( | |
CUdeviceptr pointer) { | |
unsigned int value; | |
- CUresult result = | |
- cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); | |
+ CUresult result = dynload::cuPointerGetAttribute( | |
+ &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); | |
if (result == CUDA_SUCCESS) { | |
switch (value) { | |
case CU_MEMORYTYPE_DEVICE: | |
@@ -1350,7 +1463,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr, | |
CUdeviceptr *base, | |
size_t *size) { | |
- CUresult result = cuMemGetAddressRange(base, size, dptr); | |
+ CUresult result = dynload::cuMemGetAddressRange(base, size, dptr); | |
if (result == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} else if (result == CUDA_ERROR_NOT_FOUND) { | |
@@ -1384,7 +1497,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { | |
CUdevice device) { | |
*cc_major = 0; | |
*cc_minor = 0; | |
- CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device); | |
+ CUresult result = | |
+ dynload::cuDeviceComputeCapability(cc_major, cc_minor, device); | |
if (result == CUDA_SUCCESS) { | |
return port::Status::OK(); | |
} | |
@@ -1401,7 +1515,7 @@ template <typename T> | |
static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
CUdevice_attribute attribute) { | |
int value = -1; | |
- CUresult result = cuDeviceGetAttribute(&value, attribute, device); | |
+ CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{ | |
port::error::NOT_FOUND, | |
@@ -1456,24 +1570,24 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z, | |
CUdevice device) { | |
int value; | |
- CUresult res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); | |
+ CUresult res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); | |
return false; | |
} | |
*x = value; | |
- res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); | |
+ res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); | |
return false; | |
} | |
*y = value; | |
- res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); | |
+ res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); | |
return false; | |
@@ -1483,7 +1597,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
} | |
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { | |
- CUresult res = cuDriverGetVersion(driver_version); | |
+ CUresult res = dynload::cuDriverGetVersion(driver_version); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query driver version: " << ToString(res); | |
return false; | |
@@ -1494,7 +1608,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, | |
int device_ordinal) { | |
- CUresult res = cuDeviceGetProperties(device_properties, device_ordinal); | |
+ CUresult res = | |
+ dynload::cuDeviceGetProperties(device_properties, device_ordinal); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query device properties: " << ToString(res); | |
return false; | |
@@ -1505,8 +1620,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { | |
int value = -1; | |
- CUresult res = | |
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); | |
+ CUresult res = dynload::cuDeviceGetAttribute( | |
+ &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query ECC status: " << ToString(res); | |
return false; | |
@@ -1522,7 +1637,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
ScopedActivateContext activation{context}; | |
size_t free = 0; | |
size_t total = 0; | |
- CUresult res = cuMemGetInfo(&free, &total); | |
+ CUresult res = dynload::cuMemGetInfo_v2(&free, &total); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query device memory info: " << ToString(res); | |
return false; | |
@@ -1536,7 +1651,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, | |
uint64 *result) { | |
size_t value = -1; | |
- CUresult res = cuDeviceTotalMem(&value, device); | |
+ CUresult res = dynload::cuDeviceTotalMem_v2(&value, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query total available memory: " << ToString(res); | |
return false; | |
@@ -1551,7 +1666,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
static const int kBufferSize = 64; | |
port::InlinedVector<char, 4> chars(kBufferSize); | |
chars[kBufferSize - 1] = '\0'; | |
- CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); | |
+ CUresult res = | |
+ dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); | |
return pci_bus_id; | |
@@ -1579,7 +1695,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
<< to_device.status(); | |
return false; | |
} | |
- CUresult res = cuDeviceCanAccessPeer( | |
+ CUresult res = dynload::cuDeviceCanAccessPeer( | |
&can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); | |
if (res != CUDA_SUCCESS) { | |
LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); | |
@@ -1596,7 +1712,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
} | |
ScopedActivateContext activated{from}; | |
- CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */); | |
+ CUresult result = | |
+ dynload::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */); | |
if (result != CUDA_SUCCESS && | |
result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { | |
return port::Status{ | |
@@ -1614,7 +1731,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
ScopedActivateContext activation{context}; | |
int max_blocks; | |
- CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor( | |
+ CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor( | |
&max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); | |
if (result != CUDA_SUCCESS) { | |
return port::Status{ | |
@@ -1628,7 +1745,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device, | |
/* static */ CUcontext CUDADriver::CurrentContextOrDie() { | |
CUcontext current = nullptr; | |
- CUresult result = cuCtxGetCurrent(¤t); | |
+ CUresult result = dynload::cuCtxGetCurrent(¤t); | |
if (result != CUDA_SUCCESS) { | |
LOG(FATAL) << "failed to query current context: " << ToString(result); | |
} | |
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
index 4bbd531..e840e6c 100644 | |
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | |
@@ -24,7 +24,9 @@ limitations under the License. | |
#else | |
#include <unistd.h> | |
#endif | |
+#include "tensorflow/stream_executor/platform/port.h" | |
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" | |
+#include "tensorflow/stream_executor/dso_loader.h" | |
#include "tensorflow/stream_executor/cuda/cuda_driver.h" | |
#include "tensorflow/stream_executor/cuda/cuda_event.h" | |
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h" | |
@@ -1159,6 +1161,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { | |
namespace gpu = ::perftools::gputools; | |
void initialize_cuda_gpu_executor() { | |
+ port::StatusOr<void *> status = | |
+ gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); | |
+ if (!status.ok()) { | |
+ gpu::cuda::Diagnostician::LogDriverVersionInformation(); | |
+ LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH"); | |
+ LOG(INFO) << "failed to find libcuda.so on this system: " | |
+ << status.status(); | |
+ } | |
+ | |
+ // TODO(b/22689637): Temporary until users are migrated off of PlatformKind. | |
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId( | |
+ gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId); | |
+ | |
*gpu::internal::MakeCUDAExecutorImplementation() = []( | |
const gpu::PluginConfig &config) { | |
return new gpu::cuda::CUDAExecutor{config}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment