PyTorch patch for building on JetPack >= 4.4
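The diffs below are the patch files from this gist, one per PyTorch release; the comment thread at the bottom refers to them by names such as pytorch-1.9-jetpack-4.5.1.patch. Each one fixes build or runtime failures hit when compiling PyTorch from source on Jetson devices.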
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
index 2aac442d21..f2321dad7a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -713,6 +716,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa
return Vectorized<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 91a61b04b8..5e9c128eed 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,9 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
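Two changes recur throughout the per-version patches below. First, on aarch64 the NEON-vectorized float path is compiled only under Clang or GCC newer than 8.3, since older GCC miscompiles it (see the linked issue pytorch/pytorch#47098). Second, the default launch configuration is halved, both by clamping the reported maxThreadsPerBlock and by dropping CUDA_NUM_THREADS from 1024 to 512: at 1024 threads per block, register-heavy kernels can exceed the per-block register budget of the Jetson's integrated GPU, which fails at runtime with "too many resources requested for launch". The following is a minimal standalone sketch of the ceiling division that turns an element count into a grid size; it is not PyTorch's exact code (the real GET_BLOCKS above also accepts a max_threads_per_block argument):

```cpp
// Minimal sketch (not PyTorch's actual implementation) of how the
// patched CUDA_NUM_THREADS feeds the kernel launch configuration.
#include <cstdint>
#include <cstdio>

// Halved from 1024 by the patch: fewer threads per block leaves more
// registers available per thread, avoiding launch failures on
// register-heavy kernels.
constexpr int CUDA_NUM_THREADS = 512;

// Number of blocks needed to cover N elements, one thread per element.
inline int GET_BLOCKS(const int64_t N) {
  return static_cast<int>((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}

int main() {
  const int64_t N = 1000000;
  std::printf("kernel<<<%d, %d>>> covers %lld elements\n",
              GET_BLOCKS(N), CUDA_NUM_THREADS,
              static_cast<long long>(N));
  return 0;
}
```

Halving the block size does not change the amount of work per launch: the same N threads run, just spread over twice as many blocks, so only the per-block resource footprint shrinks.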
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 00e6d5d45e..2ee832051a 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1595,7 +1595,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
])
supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
- '7.0', '7.2', '7.5', '8.0', '8.6']
+ '7.0', '7.2', '7.5', '8.0', '8.6', '8.7']
valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
# The default is sm_30 for CUDA 9.x and 10.x
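This one-line change lets torch.utils.cpp_extension accept compute capability 8.7, which corresponds to the Jetson Orin-generation integrated GPU, when building CUDA extensions against this PyTorch. As a usage note (assuming a standard PyTorch build environment), the architecture list can also be pinned explicitly via the TORCH_CUDA_ARCH_LIST environment variable, e.g. TORCH_CUDA_ARCH_LIST="7.2;8.7" for Xavier and Orin, before compiling an extension.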
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b03..0ecc111c4b 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index a5c50b90df..22d738e0c1 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -147,7 +147,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
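The last two hunks in this patch are cuDNN 8 compatibility fixes rather than resource workarounds: cuDNN 8 dropped the old cudnnSetRNNDescriptor entry point (the _v6 variant remains), and it moved the version macros out of cudnn.h into the new cudnn_version.h, which is why the CMake version probe is redirected. A quick way to confirm what the toolchain will see (a sketch, assuming a cuDNN 8 installation with headers on the include path):

```cpp
// Prints the cuDNN version from the same header the patched
// cuda.cmake now reads; requires cuDNN >= 8.
#include <cudnn_version.h>
#include <cstdio>

int main() {
  std::printf("cuDNN %d.%d.%d\n",
              CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
  return 0;
}
```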
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 545d6cd924..235beaeda8 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -145,7 +145,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index cfe6b0ea0f..d1e75ab9af 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -25,6 +25,8 @@ namespace {
// https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
@@ -665,6 +667,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
-#endif
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
+#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 4e9c799986..12c1453073 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -24,7 +24,13 @@ using namespace vec256;
// copysign faster for the half-precision types
template<typename T>
T copysign(T a, T b) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
return std::copysign(a, b);
+#else
+ return std::signbit(b) ? -std::abs(a) : std::abs(a);
+#endif
}
// Implement copysign for half precision floats using bit ops
@@ -149,6 +155,18 @@ void div_trunc_kernel(TensorIterator& iter) {
}
}
+// this is a function because MSVC does not like us to use #if inside AT_DISPATCH
+template <typename scalar_t>
+static inline scalar_t signed_zero(scalar_t sign) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return std::copysign(scalar_t(0), sign);
+#else
+ return std::signbit(sign) ? -scalar_t(0) : scalar_t(0);
+#endif
+}
+
// NOTE: [Floor Division in Python]
// Python's __floordiv__ operator is more complicated than just floor(a / b).
// It aims to maintain the property: a == (a // b) * b + remainder(a, b)
@@ -201,7 +219,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = copysign(scalar_t(0), a / b);
+ floordiv = signed_zero(a / b);
}
return floordiv;
});
diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index e3ac2665a4..77e866b7f3 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -1,10 +1,11 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
-#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
-#include <ATen/native/BinaryOps.h>
#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/native/cuda/Loops.cuh>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
@@ -139,7 +140,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a * inv_b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a * inv_b);
}
return floordiv;
});
@@ -160,7 +163,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a / b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a / b);
}
return floordiv;
});
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h
index 1fb0c3ec29..a4c6655859 100644
--- a/c10/cuda/CUDAMathCompat.h
+++ b/c10/cuda/CUDAMathCompat.h
@@ -42,11 +42,80 @@ __MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
+__MATH_FUNCTIONS_DECL__ float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+ return as_float(w);
+#elif defined(__CUDA_ARCH__)
+ return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+ return _castu32_f32(w);
+#else
+ union {
+ uint32_t as_bits;
+ float as_value;
+ } fp32 = {w};
+ return fp32.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+ return as_uint(f);
+#elif defined(__CUDA_ARCH__)
+ return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+ return _castf32_u32(f);
+#else
+ union {
+ float as_value;
+ uint32_t as_bits;
+ } fp32 = {f};
+ return fp32.as_bits;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ double fp64_from_bits(uint64_t w) {
+#if defined(__CUDA_ARCH__)
+ return __longlong_as_double(w);
+#else
+ union {
+ uint64_t as_bits;
+ double as_value;
+ } fp64 = {w};
+ return fp64.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint64_t fp64_to_bits(double f) {
+#if defined(__CUDA_ARCH__)
+ return __double_as_longlong(f);
+#else
+ union {
+ double as_value;
+ int64_t as_bits;
+ } fp64 = {f};
+ return fp64.as_bits;
+#endif
+}
+
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
- return ::copysignf(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return ::copysignf(x, y);
+#else
+ return fp32_from_bits(
+ (fp32_to_bits(x) & 0x7fffffffu) | (fp32_to_bits(y) & 0x80000000u));
+#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
- return ::copysign(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ return ::copysign(x, y);
+#else
+ return fp64_from_bits(
+ (fp64_to_bits(x) & 0x7fffffffffffffffull) |
+ (fp64_to_bits(y) & 0x8000000000000000ull));
+#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
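The copysign hunks in this patch all work around one toolchain bug: std::copysign ICEs or segfaults under GCC 7/8 on arm64 (PyTorch PR #51834, referenced in the comments above). On the CPU side the patch falls back to std::signbit plus std::abs; on the CUDA side it routes through c10::cuda::compat::copysign, whose fallback transplants the sign bit by hand. The sketch below mirrors that bit-level path; fp32_copysign is a hypothetical name used only for this example, and 0x80000000u / 0x7fffffffu are the IEEE-754 single-precision sign and magnitude masks:

```cpp
// Sketch of the bit-level copysign fallback added to CUDAMathCompat.h.
// fp32_copysign is a hypothetical stand-in name for this example.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float fp32_copysign(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);  // bit-cast without undefined behavior
  std::memcpy(&yb, &y, sizeof yb);
  // Keep x's magnitude (low 31 bits), take y's sign (top bit).
  const uint32_t rb = (xb & 0x7fffffffu) | (yb & 0x80000000u);
  float r;
  std::memcpy(&r, &rb, sizeof r);
  return r;
}

int main() {
  assert(fp32_copysign(3.0f, -1.0f) == -3.0f);
  assert(fp32_copysign(-2.0f, 1.0f) == 2.0f);
  // Signed zero matters: div_floor uses copysign(0, a/b) so that a
  // zero quotient still carries the sign of the exact result.
  assert(std::signbit(fp32_copysign(0.0f, -1.0f)));
  return 0;
}
```

The signed_zero helper added to BinaryOpsKernel.cpp exists for the same reason: it keeps the #if out of the AT_DISPATCH lambda while preserving the signed-zero semantics of the original copysign call.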
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index 5f6134112c..15b311d4d6 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -710,6 +713,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e707e94cc1..4c6fcf3e5e 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..54455ab4b0 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
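To use any of these, pick the patch matching your PyTorch source version (see the comments below), then apply it from the root of the PyTorch checkout before building, e.g. with git apply <name>.patch or patch -p1 < <name>.patch; the a/ and b/ prefixes in the diffs strip cleanly at -p1.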
Is the idea of cutting the number of threads in half still needed with torch 1.8.1 / 1.8.2 and JetPack 4.6?
Never mind: the patch for 1.8 applies cleanly to 1.8.1, and the JetPack version doesn't seem to matter; at least this still works on the new 4.6.
Hi @dusty-nv, does pytorch-1.9-jetpack-4.5.1.patch work for JetPack 4.6?
@maaaxac It's been a while since I built PyTorch for JetPack 4, but these patches weren't really tied to a specific JetPack 4 version; what matters is the PyTorch version (1.9 in this case).
@dusty-nv thank you