PyTorch patch for building on JetPack >= 4.4
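The diffs below are the patch files from this gist, one per PyTorch release; the comment thread at the bottom refers to them by names such as pytorch-1.9-jetpack-4.5.1.patch. Each one fixes build or runtime failures hit when compiling PyTorch from source on Jetson devices.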
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
index 2aac442d21..f2321dad7a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -713,6 +716,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa
return Vectorized<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 91a61b04b8..5e9c128eed 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,9 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
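Two changes recur throughout the per-version patches below. First, on aarch64 the NEON-vectorized float path is compiled only under Clang or GCC newer than 8.3, since older GCC miscompiles it (see the linked issue pytorch/pytorch#47098). Second, the default launch configuration is halved, both by clamping the reported maxThreadsPerBlock and by dropping CUDA_NUM_THREADS from 1024 to 512: at 1024 threads per block, register-heavy kernels can exceed the per-block register budget of the Jetson's integrated GPU, which fails at runtime with "too many resources requested for launch". The following is a minimal standalone sketch of the ceiling division that turns an element count into a grid size; it is not PyTorch's exact code (the real GET_BLOCKS above also accepts a max_threads_per_block argument):

```cpp
// Minimal sketch (not PyTorch's actual implementation) of how the
// patched CUDA_NUM_THREADS feeds the kernel launch configuration.
#include <cstdint>
#include <cstdio>

// Halved from 1024 by the patch: fewer threads per block leaves more
// registers available per thread, avoiding launch failures on
// register-heavy kernels.
constexpr int CUDA_NUM_THREADS = 512;

// Number of blocks needed to cover N elements, one thread per element.
inline int GET_BLOCKS(const int64_t N) {
  return static_cast<int>((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}

int main() {
  const int64_t N = 1000000;
  std::printf("kernel<<<%d, %d>>> covers %lld elements\n",
              GET_BLOCKS(N), CUDA_NUM_THREADS,
              static_cast<long long>(N));
  return 0;
}
```

Halving the block size does not change the amount of work per launch: the same N threads run, just spread over twice as many blocks, so only the per-block resource footprint shrinks.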
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 00e6d5d45e..2ee832051a 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1595,7 +1595,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
])
supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
- '7.0', '7.2', '7.5', '8.0', '8.6']
+ '7.0', '7.2', '7.5', '8.0', '8.6', '8.7']
valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
# The default is sm_30 for CUDA 9.x and 10.x
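This one-line change lets torch.utils.cpp_extension accept compute capability 8.7, which corresponds to the Jetson Orin-generation integrated GPU, when building CUDA extensions against this PyTorch. As a usage note (assuming a standard PyTorch build environment), the architecture list can also be pinned explicitly via the TORCH_CUDA_ARCH_LIST environment variable, e.g. TORCH_CUDA_ARCH_LIST="7.2;8.7" for Xavier and Orin, before compiling an extension.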
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b03..0ecc111c4b 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index a5c50b90df..22d738e0c1 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -147,7 +147,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
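The last two hunks in this patch are cuDNN 8 compatibility fixes rather than resource workarounds: cuDNN 8 dropped the old cudnnSetRNNDescriptor entry point (the _v6 variant remains), and it moved the version macros out of cudnn.h into the new cudnn_version.h, which is why the CMake version probe is redirected. A quick way to confirm what the toolchain will see (a sketch, assuming a cuDNN 8 installation with headers on the include path):

```cpp
// Prints the cuDNN version from the same header the patched
// cuda.cmake now reads; requires cuDNN >= 8.
#include <cudnn_version.h>
#include <cstdio>

int main() {
  std::printf("cuDNN %d.%d.%d\n",
              CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
  return 0;
}
```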
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 545d6cd924..235beaeda8 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -145,7 +145,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index cfe6b0ea0f..d1e75ab9af 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -25,6 +25,8 @@ namespace {
// https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
@@ -665,6 +667,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
-#endif
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
+#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 4e9c799986..12c1453073 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -24,7 +24,13 @@ using namespace vec256;
// copysign faster for the half-precision types
template<typename T>
T copysign(T a, T b) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
return std::copysign(a, b);
+#else
+ return std::signbit(b) ? -std::abs(a) : std::abs(a);
+#endif
}
// Implement copysign for half precision floats using bit ops
@@ -149,6 +155,18 @@ void div_trunc_kernel(TensorIterator& iter) {
}
}
+// this is a function because MSVC does not like us to use #if inside AT_DISPATCH
+template <typename scalar_t>
+static inline scalar_t signed_zero(scalar_t sign) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return std::copysign(scalar_t(0), sign);
+#else
+ return std::signbit(sign) ? -scalar_t(0) : scalar_t(0);
+#endif
+}
+
// NOTE: [Floor Division in Python]
// Python's __floordiv__ operator is more complicated than just floor(a / b).
// It aims to maintain the property: a == (a // b) * b + remainder(a, b)
@@ -201,7 +219,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = copysign(scalar_t(0), a / b);
+ floordiv = signed_zero(a / b);
}
return floordiv;
});
diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index e3ac2665a4..77e866b7f3 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -1,10 +1,11 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
-#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
-#include <ATen/native/BinaryOps.h>
#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/native/cuda/Loops.cuh>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
@@ -139,7 +140,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a * inv_b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a * inv_b);
}
return floordiv;
});
@@ -160,7 +163,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a / b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a / b);
}
return floordiv;
});
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h
index 1fb0c3ec29..a4c6655859 100644
--- a/c10/cuda/CUDAMathCompat.h
+++ b/c10/cuda/CUDAMathCompat.h
@@ -42,11 +42,80 @@ __MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
+__MATH_FUNCTIONS_DECL__ float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+ return as_float(w);
+#elif defined(__CUDA_ARCH__)
+ return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+ return _castu32_f32(w);
+#else
+ union {
+ uint32_t as_bits;
+ float as_value;
+ } fp32 = {w};
+ return fp32.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+ return as_uint(f);
+#elif defined(__CUDA_ARCH__)
+ return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+ return _castf32_u32(f);
+#else
+ union {
+ float as_value;
+ uint32_t as_bits;
+ } fp32 = {f};
+ return fp32.as_bits;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ double fp64_from_bits(uint64_t w) {
+#if defined(__CUDA_ARCH__)
+ return __longlong_as_double(w);
+#else
+ union {
+ uint64_t as_bits;
+ double as_value;
+ } fp64 = {w};
+ return fp64.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint64_t fp64_to_bits(double f) {
+#if defined(__CUDA_ARCH__)
+ return __double_as_longlong(f);
+#else
+ union {
+ double as_value;
+ int64_t as_bits;
+ } fp64 = {f};
+ return fp64.as_bits;
+#endif
+}
+
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
- return ::copysignf(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return ::copysignf(x, y);
+#else
+ return fp32_from_bits(
+ (fp32_to_bits(x) & 0x7fffffffu) | (fp32_to_bits(y) & 0x80000000u));
+#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
- return ::copysign(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ return ::copysign(x, y);
+#else
+ return fp64_from_bits(
+ (fp64_to_bits(x) & 0x7fffffffffffffffull) |
+ (fp64_to_bits(y) & 0x8000000000000000ull));
+#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
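The copysign hunks in this patch all work around one toolchain bug: std::copysign ICEs or segfaults under GCC 7/8 on arm64 (PyTorch PR #51834, referenced in the comments above). On the CPU side the patch falls back to std::signbit plus std::abs; on the CUDA side it routes through c10::cuda::compat::copysign, whose fallback transplants the sign bit by hand. The sketch below mirrors that bit-level path; fp32_copysign is a hypothetical name used only for this example, and 0x80000000u / 0x7fffffffu are the IEEE-754 single-precision sign and magnitude masks:

```cpp
// Sketch of the bit-level copysign fallback added to CUDAMathCompat.h.
// fp32_copysign is a hypothetical stand-in name for this example.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float fp32_copysign(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);  // bit-cast without undefined behavior
  std::memcpy(&yb, &y, sizeof yb);
  // Keep x's magnitude (low 31 bits), take y's sign (top bit).
  const uint32_t rb = (xb & 0x7fffffffu) | (yb & 0x80000000u);
  float r;
  std::memcpy(&r, &rb, sizeof r);
  return r;
}

int main() {
  assert(fp32_copysign(3.0f, -1.0f) == -3.0f);
  assert(fp32_copysign(-2.0f, 1.0f) == 2.0f);
  // Signed zero matters: div_floor uses copysign(0, a/b) so that a
  // zero quotient still carries the sign of the exact result.
  assert(std::signbit(fp32_copysign(0.0f, -1.0f)));
  return 0;
}
```

The signed_zero helper added to BinaryOpsKernel.cpp exists for the same reason: it keeps the #if out of the AT_DISPATCH lambda while preserving the signed-zero semantics of the original copysign call.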
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index 5f6134112c..15b311d4d6 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -710,6 +713,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e707e94cc1..4c6fcf3e5e 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..54455ab4b0 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
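To use any of these, pick the patch matching your PyTorch source version (see the comments below), then apply it from the root of the PyTorch checkout before building, e.g. with git apply <name>.patch or patch -p1 < <name>.patch; the a/ and b/ prefixes in the diffs strip cleanly at -p1.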
Is the idea of cutting the number of threads in half still needed with torch 1.8.1 / 1.8.2 and JetPack 4.6?
Never mind: the patch for 1.8 applies cleanly to 1.8.1, and the JetPack version doesn't seem to matter; at least this still works on the new 4.6.
Hi @dusty-nv, does pytorch-1.9-jetpack-4.5.1.patch work for JetPack 4.6?
@maaaxac It's been a while since I built PyTorch for JetPack 4, but these patches weren't really tied to a specific JetPack 4 version; what matters is the PyTorch version (1.9 in this case).
@dusty-nv thank you