@alcides · Last active November 12, 2021
MWE for grid-wide synchronization in CUDA 9+. Source: https://stackoverflow.com/questions/6404992/cuda-block-synchronization
Three variants follow: a cooperative-groups kernel launched through cudaLaunchCooperativeKernel, the same kernel with no grid-wide barrier at all (to show the interleaving), and a hand-rolled atomic-bitmask phaser.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include "workshop.h"

#define N 10

__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    cooperative_groups::grid_group g = cooperative_groups::this_grid();
    for (int i = 0; i < N; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
        g.sync();  // grid-wide barrier: no block starts iteration i+1 early
    }
}

int main(int argc, char **argv) {
    // This will not work! grid_group::sync() is only valid in kernels
    // started through the cooperative launch API, not <<<...>>>:
    //block_wide_kernel<<<64, 1024>>>();

    // You have to use the low-level cudaLaunchCooperativeKernel API.
    // Note the required cooperative_groups include above.
    void *kernelArgs[] = { /* add kernel args */ };
    // All blocks must be co-resident on the device; size the grid
    // accordingly, e.g. deviceProp.multiProcessorCount * numBlocksPerSm
    // (see the sketch below).
    dim3 dimGrid(64, 1, 1);
    dim3 dimBlock(1024, 1, 1);
    cudaLaunchCooperativeKernel((void *)block_wide_kernel, dimGrid, dimBlock, kernelArgs);
    cudaDeviceSynchronize();
    return 0;
}
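The grid-sizing comment above is load-bearing: a cooperative launch fails unless every block can be resident at once. Below is a minimal sketch of the support check and the occupancy-based grid sizing it alludes to, using the CUDA runtime calls cudaDeviceGetAttribute and cudaOccupancyMaxActiveBlocksPerMultiprocessor; the 1024-thread block size is carried over from the gist, and error handling is elided.

#include <cuda_runtime_api.h>
#include <stdio.h>

__global__ void block_wide_kernel();  // the kernel defined above

int main(int argc, char **argv) {
    int dev = 0;

    // Cooperative launch is only available where the device reports support.
    int supportsCoopLaunch = 0;
    cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
    if (!supportsCoopLaunch) {
        fprintf(stderr, "device %d does not support cooperative launch\n", dev);
        return 1;
    }

    // Every block of a cooperative launch must be resident simultaneously,
    // so derive the grid size from the occupancy calculator instead of
    // hard-coding 64 blocks.
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    int numBlocksPerSm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, block_wide_kernel, 1024, 0);

    dim3 dimGrid(deviceProp.multiProcessorCount * numBlocksPerSm, 1, 1);
    dim3 dimBlock(1024, 1, 1);
    void *kernelArgs[] = { nullptr };  // placeholder: the kernel takes no arguments
    cudaLaunchCooperativeKernel((void *)block_wide_kernel, dimGrid, dimBlock, kernelArgs);
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}

Grid-wide sync also needs relocatable device code and a Pascal-or-newer target, e.g. nvcc -arch=sm_60 -rdc=true.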
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include "workshop.h"

#define N 10

// Same kernel without any grid-wide barrier: blocks free-run through the
// loop, so the phases interleave (see the output below).
__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < N; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
    }
}

int main(int argc, char **argv) {
    block_wide_kernel<<<64, 1024>>>();
    cudaDeviceSynchronize();
    return 0;
}
/*
prints something like the following; note iteration 9 of one block appearing
among the iteration-7 lines, because nothing keeps the blocks in phase:
...
7 | 41984
7 | 46080
7 | 47104
9 | 26624
7 | 15360
7 | 17408
7 | 8192
7 | 14336
...
*/
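A __syncthreads() inside the loop would not help here, because it is a block-local barrier. For contrast, here is a sketch (added for illustration, not part of the original gist) of the block-level cooperative group, whose sync() is equivalent to __syncthreads() and still leaves blocks free to drift apart:

#include <cuda_runtime_api.h>
#include <cooperative_groups.h>
#include <stdio.h>
namespace cg = cooperative_groups;

__global__ void block_local_sync_kernel() {
    cg::thread_block block = cg::this_thread_block();
    for (int i = 0; i < 10; i++) {
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, blockIdx.x * blockDim.x + threadIdx.x);
        block.sync();  // barrier within this block only, like __syncthreads();
                       // other blocks can already be several iterations ahead
    }
}

int main(int argc, char **argv) {
    block_local_sync_kernel<<<64, 1024>>>();  // plain launch is fine: no grid sync
    cudaDeviceSynchronize();
    return 0;
}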
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <stdio.h>
#include "workshop.h"

#define N 10

// One bit per block. A 64-bit mask covers up to 63 blocks with the
// full-mask computation below (shifting 1ULL by 64 would be undefined);
// 64-bit atomics are needed, since int atomics would cap this at 32 blocks.
__device__ unsigned long long phaser_between_32_groups;

__global__ void block_wide_kernel() {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < N; i++) {
        // Reset the mask at the beginning of each phase, in one thread only.
        if (blockIdx.x == 0 && threadIdx.x == 0) {
            atomicAnd(&phaser_between_32_groups, 0ULL);
        }
        if (threadIdx.x == 0)
            printf("%d | %d\n", i, index);
        __syncthreads();
        // Busy-wait with atomicOr until every block has set its bit.
        const unsigned long long phaser_complete = (1ULL << gridDim.x) - 1;
        if (threadIdx.x == 0) {
            while (atomicOr(&phaser_between_32_groups, 1ULL << blockIdx.x) != phaser_complete) { /* spin */ }
        }
        __syncthreads();  // hold the rest of the block until thread 0 is released
        // Caveat: the reset above can race with late spinners from the
        // previous phase, so strict phase separation is not guaranteed.
    }
}

int main(int argc, char **argv) {
    // 32 blocks: each owns one bit of the phaser mask. This only works
    // because all 32 blocks fit on the device at once; a block that never
    // gets scheduled would deadlock the busy-wait.
    block_wide_kernel<<<32, 1024>>>();
    cudaDeviceSynchronize();
    return 0;
}
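The bitmask phaser works for this demo, but as the caveat in the kernel notes, the shared mask is reset while stragglers may still be spinning on it. A more robust hand-rolled pattern is a generation-counter barrier, sketched below under the same all-blocks-resident assumption; arrived, generation, and grid_barrier are names introduced here, not from the gist:

#include <cuda_runtime_api.h>

__device__ unsigned int arrived = 0;     // blocks that reached the barrier
__device__ unsigned int generation = 0;  // bumped once per completed phase

// Grid-wide software barrier. Like the phaser, it deadlocks unless every
// block of the grid is resident on the device simultaneously.
__device__ void grid_barrier() {
    __syncthreads();  // quiesce this block before announcing arrival
    if (threadIdx.x == 0) {
        unsigned int my_gen = generation;
        __threadfence();  // make this block's prior writes visible first
        if (atomicAdd(&arrived, 1) == gridDim.x - 1) {
            // Last block to arrive: reset the counter, then release everyone.
            // Nothing touches `arrived` between these two statements, because
            // every other block is still spinning on `generation`.
            arrived = 0;
            __threadfence();
            atomicAdd(&generation, 1);
        } else {
            while (atomicAdd(&generation, 0) == my_gen) { /* spin */ }
        }
    }
    __syncthreads();  // release the rest of the block
}

Calling grid_barrier() in place of g.sync() in the first kernel would give the same phase separation without the cooperative launch API, but the residency assumption is then on you rather than enforced by the runtime.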