Last active
January 30, 2021 04:35
-
-
Save sandeepkumar-skb/abed94574cfefe83fcedbb530c14a2a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <iostream> | |
#include <chrono> | |
#define BLOCK_SIZE 256 | |
#define GRID_SIZE 72 //Turing Titan RTX | |
#define OUT_SIZE 256 | |
/*
 * gpuAssert: abort the program if a CUDA runtime call reported an error.
 *
 * err  - status code returned by a CUDA runtime call.
 * file - source file name to include in the diagnostic.
 * line - source line number to include in the diagnostic.
 *
 * On cudaSuccess this is a no-op. On any other status it prints the
 * human-readable error string together with file/line and terminates the
 * process with EXIT_FAILURE.
 */
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    // Fast path: nothing to report.
    if (err == cudaSuccess) {
        return;
    }

    const char *message = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", message, file, line);
    exit(EXIT_FAILURE);
}
/*
 * gpuErrchk(ans): evaluate a CUDA runtime call and hand its status to
 * gpuAssert together with the call site's file name and line number.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement: `gpuErrchk(x);` is then safe inside an unbraced
 * if/else, where a bare `{ ... };` expansion would detach the else branch.
 */
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
/*
 * histo_d: GPU histogram of an image.
 *
 * Each pixel value is truncated to int and reduced modulo the number of bins;
 * the matching counter is incremented. Every block first accumulates into a
 * private histogram in shared memory and then adds that partial histogram to
 * the global `out` array with atomics.
 *
 * img      - height*width pixel values, read only.
 * height   - image height in pixels.
 * width    - image width in pixels.
 * out      - histogram counters; counts are ADDED to the existing contents,
 *            so the caller must zero it beforehand for a fresh histogram.
 * out_size - number of counters available in `out`.
 *
 * Launch layout: 1-D grid of 1-D blocks, any size (the pixel loop strides by
 * the total number of launched threads). Static shared memory:
 * OUT_SIZE * sizeof(unsigned int) per block.
 *
 * Number of bins used is min(out_size, OUT_SIZE): the shared histogram cannot
 * hold more than OUT_SIZE bins, and `out` cannot hold more than out_size.
 * With out_size >= OUT_SIZE this is exactly OUT_SIZE bins.
 */
__global__
void histo_d(float* img, int height, int width, unsigned int *out, int out_size){
    __shared__ unsigned int shmem[OUT_SIZE];

    const int bins = (out_size < OUT_SIZE) ? out_size : OUT_SIZE;
    if (bins <= 0) {
        // `bins` depends only on a kernel argument, so every thread of the
        // block takes this exit together and no barrier is left waiting.
        return;
    }

    // Zero the block-private histogram. Stride by the real block size so the
    // kernel stays correct for launches that do not use BLOCK_SIZE threads.
    for (int i = threadIdx.x; i < bins; i += blockDim.x) {
        shmem[i] = 0;
    }
    __syncthreads();

    // 64-bit pixel count and index: height*width and blockIdx*blockDim may
    // not fit in 32 bits for large images / grids.
    const long long total  = static_cast<long long>(height) * width;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < total; i += stride) {
        int bucket = static_cast<int>(img[i]) % bins;
        // C++ '%' keeps the sign of the dividend; wrap negative pixel values
        // into [0, bins) instead of indexing before the start of shmem.
        if (bucket < 0) {
            bucket += bins;
        }
        atomicAdd(&shmem[bucket], 1u);
    }
    // All shared-memory increments of this block must land before the flush.
    __syncthreads();

    // Merge this block's partial histogram into the global result.
    for (int i = threadIdx.x; i < bins; i += blockDim.x) {
        atomicAdd(&out[i], shmem[i]);
    }
}
/*
 * histo_h: CPU reference histogram, using the same binning rule as the GPU
 * kernel so the two results can be compared element by element.
 *
 * img      - height*width pixel values, read only.
 * height   - image height in pixels.
 * width    - image width in pixels.
 * out      - histogram counters; counts are ADDED to the existing contents.
 * out_size - number of counters available in `out`.
 *
 * Number of bins used is min(out_size, OUT_SIZE); with out_size >= OUT_SIZE
 * this is exactly OUT_SIZE bins. Nothing is written when that is <= 0.
 */
void histo_h(float* img, int height, int width, unsigned int *out, int out_size){
    const int bins = (out_size < OUT_SIZE) ? out_size : OUT_SIZE;
    if (bins <= 0) {
        return;
    }

    // 64-bit pixel count: height*width may overflow a 32-bit int.
    const long long total = static_cast<long long>(height) * width;

    for (long long i = 0; i < total; ++i) {
        int bucket = static_cast<int>(img[i]) % bins;
        // '%' keeps the sign of the dividend; wrap negative pixel values
        // into [0, bins) instead of writing before the start of `out`.
        if (bucket < 0) {
            bucket += bins;
        }
        ++out[bucket];
    }
}
/*
 * Benchmark driver: builds a 1024x1024 test image in managed memory, computes
 * its histogram on the GPU (timed with CUDA events) and on the CPU (timed
 * with std::chrono), reports every bin where the two disagree, and prints
 * both timings.
 *
 * Returns EXIT_SUCCESS when all bins match, EXIT_FAILURE on a mismatch or a
 * host allocation failure. CUDA API failures terminate through gpuErrchk.
 */
int main(){
    const int out_size = OUT_SIZE;
    const int height = 1024;
    const int width = 1024;
    const int num_pixels = height * width;

    float *img = nullptr;
    unsigned int *out = nullptr;
    gpuErrchk(cudaMallocManaged(&img, num_pixels * sizeof(float)));
    gpuErrchk(cudaMallocManaged(&out, out_size * sizeof(unsigned int)));

    unsigned int *out_h = (unsigned int*) malloc(out_size * sizeof(unsigned int));
    if (out_h == nullptr) {
        fprintf(stderr, "host allocation of %d histogram bins failed\n", out_size);
        cudaFree(img);
        cudaFree(out);
        return EXIT_FAILURE;
    }

    // Pixel i holds the value i; both histograms bucket it by modulo.
    for (int i = 0; i < num_pixels; ++i) {
        img[i] = static_cast<float>(i);
    }
    // Both histogram routines accumulate, so start from zeroed counters.
    for (int i = 0; i < out_size; ++i) {
        out[i] = 0;
        out_h[i] = 0;
    }

    // ---- GPU run, timed with events recorded around the launch ----
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start));
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));
    // Waiting on `stop` also guarantees the kernel has finished before the
    // host reads `out` below.
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

    // ---- CPU reference run, timed with the host clock ----
    const std::chrono::high_resolution_clock::time_point ch_start =
        std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    const std::chrono::high_resolution_clock::time_point ch_end =
        std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> span =
        std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    // ---- Compare GPU result against the CPU reference ----
    int mismatches = 0;
    for (int i = 0; i < out_size; ++i) {
        if (out[i] != out_h[i]) {
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
            ++mismatches;
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    // ---- Release every resource acquired above ----
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(img));
    gpuErrchk(cudaFree(out));
    free(out_h);

    // Let scripts detect a wrong GPU result through the exit status.
    return (mismatches == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile & Run: