@alcides · Last active November 12, 2021
MWE for grid-wide synchronization in CUDA 9+. Source: https://stackoverflow.com/questions/6404992/cuda-block-synchronization
Three variants follow: a cooperative-groups kernel launched through cudaLaunchCooperativeKernel, the same kernel with no grid-wide barrier at all (to show the interleaving), and a hand-rolled atomic-bitmask phaser.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include "workshop.h"

#define N 10

__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    cooperative_groups::grid_group g = cooperative_groups::this_grid();
    for (int i = 0; i < N; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
        g.sync();  // grid-wide barrier: no block starts iteration i+1 early
    }
}

int main(int argc, char **argv) {
    // This will not work! grid_group::sync() is only valid in kernels
    // started through the cooperative launch API, not <<<...>>>:
    //block_wide_kernel<<<64, 1024>>>();

    // You have to use the low-level cudaLaunchCooperativeKernel API.
    // Note the required cooperative_groups include above.
    void *kernelArgs[] = { /* add kernel args */ };
    // All blocks must be co-resident on the device; size the grid
    // accordingly, e.g. deviceProp.multiProcessorCount * numBlocksPerSm
    // (see the sketch below).
    dim3 dimGrid(64, 1, 1);
    dim3 dimBlock(1024, 1, 1);
    cudaLaunchCooperativeKernel((void *)block_wide_kernel, dimGrid, dimBlock, kernelArgs);
    cudaDeviceSynchronize();
    return 0;
}
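The grid-sizing comment above is load-bearing: a cooperative launch fails unless every block can be resident at once. Below is a minimal sketch of the support check and the occupancy-based grid sizing it alludes to, using the CUDA runtime calls cudaDeviceGetAttribute and cudaOccupancyMaxActiveBlocksPerMultiprocessor; the 1024-thread block size is carried over from the gist, and error handling is elided.

#include <cuda_runtime_api.h>
#include <stdio.h>

__global__ void block_wide_kernel();  // the kernel defined above

int main(int argc, char **argv) {
    int dev = 0;

    // Cooperative launch is only available where the device reports support.
    int supportsCoopLaunch = 0;
    cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
    if (!supportsCoopLaunch) {
        fprintf(stderr, "device %d does not support cooperative launch\n", dev);
        return 1;
    }

    // Every block of a cooperative launch must be resident simultaneously,
    // so derive the grid size from the occupancy calculator instead of
    // hard-coding 64 blocks.
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    int numBlocksPerSm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, block_wide_kernel, 1024, 0);

    dim3 dimGrid(deviceProp.multiProcessorCount * numBlocksPerSm, 1, 1);
    dim3 dimBlock(1024, 1, 1);
    void *kernelArgs[] = { nullptr };  // placeholder: the kernel takes no arguments
    cudaLaunchCooperativeKernel((void *)block_wide_kernel, dimGrid, dimBlock, kernelArgs);
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}

Grid-wide sync also needs relocatable device code and a Pascal-or-newer target, e.g. nvcc -arch=sm_60 -rdc=true.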
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include "workshop.h"

#define N 10

// Same kernel without any grid-wide barrier: blocks free-run through the
// loop, so the phases interleave (see the output below).
__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < N; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
    }
}

int main(int argc, char **argv) {
    block_wide_kernel<<<64, 1024>>>();
    cudaDeviceSynchronize();
    return 0;
}
/*
prints something like the following; note iteration 9 of one block appearing
among the iteration-7 lines, because nothing keeps the blocks in phase:
...
7 | 41984
7 | 46080
7 | 47104
9 | 26624
7 | 15360
7 | 17408
7 | 8192
7 | 14336
...
*/
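A __syncthreads() inside the loop would not help here, because it is a block-local barrier. For contrast, here is a sketch (added for illustration, not part of the original gist) of the block-level cooperative group, whose sync() is equivalent to __syncthreads() and still leaves blocks free to drift apart:

#include <cuda_runtime_api.h>
#include <cooperative_groups.h>
#include <stdio.h>
namespace cg = cooperative_groups;

__global__ void block_local_sync_kernel() {
    cg::thread_block block = cg::this_thread_block();
    for (int i = 0; i < 10; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, blockIdx.x * blockDim.x + threadIdx.x);
        block.sync();  // barrier within this block only, like __syncthreads();
                       // other blocks can already be several iterations ahead
    }
}

int main(int argc, char **argv) {
    block_local_sync_kernel<<<64, 1024>>>();  // plain launch is fine: no grid sync
    cudaDeviceSynchronize();
    return 0;
}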
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include "workshop.h"

#define N 10

// One bit per block. A 64-bit mask covers up to 63 blocks with the
// full-mask computation below (shifting 1ULL by 64 would be undefined);
// 64-bit atomics are needed, since int atomics would cap this at 32 blocks.
__device__ unsigned long long phaser_between_32_groups;

__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < N; i++) {
        // Reset the mask at the beginning of each phase, in one thread only.
        if (blockIdx.x == 0 && threadIdx.x == 0) {
            atomicAnd(&phaser_between_32_groups, 0ULL);
        }
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
        __syncthreads();
        // Busy-wait with atomicOr until every block has set its bit.
        const unsigned long long phaser_complete = (1ULL << gridDim.x) - 1;
        if (threadIdx.x == 0) {
            while (atomicOr(&phaser_between_32_groups, 1ULL << blockIdx.x) != phaser_complete) { /* spin */ }
        }
        __syncthreads();  // hold the rest of the block until thread 0 is released
        // Caveat: the reset above can race with late spinners from the
        // previous phase, so strict phase separation is not guaranteed.
    }
}

int main(int argc, char **argv) {
    // 32 blocks: each owns one bit of the phaser mask. This only works
    // because all 32 blocks fit on the device at once; a block that never
    // gets scheduled would deadlock the busy-wait.
    block_wide_kernel<<<32, 1024>>>();
    cudaDeviceSynchronize();
    return 0;
}
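The bitmask phaser works for this demo, but as the caveat in the kernel notes, the shared mask is reset while stragglers may still be spinning on it. A more robust hand-rolled pattern is a generation-counter barrier, sketched below under the same all-blocks-resident assumption; arrived, generation, and grid_barrier are names introduced here, not from the gist:

#include <cuda_runtime_api.h>

__device__ unsigned int arrived = 0;     // blocks that reached the barrier
__device__ unsigned int generation = 0;  // bumped once per completed phase

// Grid-wide software barrier. Like the phaser, it deadlocks unless every
// block of the grid is resident on the device simultaneously.
__device__ void grid_barrier() {
    __syncthreads();  // quiesce this block before announcing arrival
    if (threadIdx.x == 0) {
        unsigned int my_gen = generation;
        __threadfence();  // make this block's prior writes visible first
        if (atomicAdd(&arrived, 1) == gridDim.x - 1) {
            // Last block to arrive: reset the counter, then release everyone.
            // Nothing touches `arrived` between these two statements, because
            // every other block is still spinning on `generation`.
            arrived = 0;
            __threadfence();
            atomicAdd(&generation, 1);
        } else {
            while (atomicAdd(&generation, 0) == my_gen) { /* spin */ }
        }
    }
    __syncthreads();  // release the rest of the block
}

Calling grid_barrier() in place of g.sync() in the first kernel would give the same phase separation without the cooperative launch API, but the residency assumption is then on you rather than enforced by the runtime.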