Last active
January 29, 2021 23:29
-
-
Save sandeepkumar-skb/3ebfb6f918ed7c37aae550f775305ed3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h>
#include <cooperative_groups.h>

#include <algorithm>
#include <stdio.h>
#include <stdlib.h>  // exit, EXIT_FAILURE (used by gpuAssert)

using namespace cooperative_groups;
// Abort with a human-readable message when a CUDA runtime call fails.
// `file`/`line` identify the call site (supplied by the gpuErrchk macro).
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        // Report to stderr (not stdout) so the error survives redirection
        // of normal program output and is not lost in buffered stdout.
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Wrap every CUDA runtime call: gpuErrchk(cudaMalloc(...));
// do { } while (0) makes the macro expand to a single statement, so it is
// safe as the body of an if/else without braces (the original bare-brace
// form produced `{...};`, which breaks `if (c) gpuErrchk(x); else ...`).
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
// Per-thread partial sum of input[0..n) using a grid-stride loop over int4
// vector loads (4 ints per memory transaction). Requires `input` to be
// 16-byte aligned, which holds for cudaMalloc/cudaMallocManaged allocations.
__device__ int thread_sum(int *input, int n)
{
    int sum = 0;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Vectorized body: each iteration consumes 4 consecutive ints.
    for (int i = tid; i < n / 4; i += stride)
    {
        int4 in = ((int4*)input)[i];
        sum += in.x + in.y + in.z + in.w;
    }

    // Tail: pick up the 0-3 leftover elements when n is not a multiple of 4
    // (the original version silently dropped them).
    for (int i = (n / 4) * 4 + tid; i < n; i += stride)
        sum += input[i];

    return sum;
}
// Tree reduction of each thread's `val` across cooperative group `g`.
// `temp` must point to shared memory with at least g.size() ints; it is
// clobbered as scratch space. Only the thread with rank 0 returns the
// complete group sum — other threads return partial values.
// NOTE(review): assumes g.size() is a power of two (true for a full thread
// block with power-of-two blockDim); for other sizes the first halving
// drops elements — confirm launch configuration guarantees this.
__device__ int reduce_sum(thread_group g, int *temp, int val)
{
    int lane = g.thread_rank();

    // Each iteration halves the number of active threads.
    // Each thread adds its partial sum[i] to sum[lane+i].
    for (int i = g.size() / 2; i > 0; i /= 2)
    {
        temp[lane] = val;
        g.sync();                       // wait for all threads to store
        if (lane < i) val += temp[lane + i];
        g.sync();                       // wait for all threads to load
                                        // (temp is rewritten next iteration)
    }
    return val;                         // note: only thread 0 holds the full sum
}
// Kernel: every thread computes a grid-stride partial sum, each block
// reduces its threads' partials in shared memory, and one atomicAdd per
// block accumulates the block total into *sum.
// Launch with dynamic shared memory of blockDim.x * sizeof(int) bytes.
__global__ void sum_kernel_block(int *sum, int *input, int n)
{
    extern __shared__ int temp[];       // one scratch slot per thread

    thread_block block = this_thread_block();

    // Per-thread partial over the whole input, then block-wide reduction.
    int partial = thread_sum(input, n);
    int block_total = reduce_sum(block, temp, partial);

    // Only rank 0 holds the complete block sum after reduce_sum.
    if (block.thread_rank() == 0)
        atomicAdd(sum, block_total);
}
// Sums n ones on the GPU with the block-reduction kernel, times the kernel
// with CUDA events, and prints the result against the expected value.
int main(){
    int n = 1 << 24;                       // element count (multiple of 4)
    int blockSize = 256;
    // thread_sum consumes int4s via a grid-stride loop, so size the grid
    // for n/4 work items instead of n (the original launched 4x the blocks
    // actually needed; grid-stride makes either grid correct).
    int nBlocks = (n / 4 + blockSize - 1) / blockSize;
    int sharedBytes = blockSize * sizeof(int);  // reduce_sum scratch: 1 int/thread

    int *sum, *data;
    gpuErrchk(cudaMallocManaged(&sum, sizeof(int)));       // was unchecked
    gpuErrchk(cudaMallocManaged(&data, n * sizeof(int)));  // was unchecked
    std::fill_n(data, n, 1);               // host-side init of managed memory
    gpuErrchk(cudaMemset(sum, 0, sizeof(int)));            // was unchecked

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start));
    sum_kernel_block<<<nBlocks, blockSize, sharedBytes>>>(sum, data, n);
    gpuErrchk(cudaGetLastError());         // catch launch-configuration errors
    gpuErrchk(cudaEventRecord(stop));
    // Blocks until `stop` completes; since it was recorded after the kernel
    // in the same (default) stream, this also guarantees the kernel finished
    // before we read *sum on the host.
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Effective time: %.3f ms\n", milliseconds);
    printf("Final Result: %d Expected Result: %d\n", *sum, n);

    gpuErrchk(cudaEventDestroy(start));    // original leaked both events
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(sum));
    gpuErrchk(cudaFree(data));
    return 0;
}
Author
sandeepkumar-skb
commented
Jan 29, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment