Created
December 21, 2020 01:36
-
-
Save sandeepkumar-skb/2e2254fe1ec05d2dd18180d362bd3eab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <chrono>
#include <cstdlib>
#include <iostream>
// Number of threads per block used for every kernel launch in this file.
#define BLOCK_SIZE 128
// Terminates the program when a CUDA runtime call did not succeed.
//
//   err  - status returned by the CUDA runtime call being checked.
//   file - source file of the call site (gpuErrchk passes __FILE__).
//   line - source line of the call site (gpuErrchk passes __LINE__).
//
// On cudaSuccess this is a no-op. Otherwise it prints the CUDA error string
// together with the call-site location and exits with EXIT_FAILURE.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        // Use the caller-supplied location; __FILE__/__LINE__ here would
        // always point at this helper instead of the failing call.
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Checks the status of a CUDA runtime call: forwards the result of `ans`,
// plus the call site's file and line, to gpuAssert.
// The do/while(0) wrapper makes the expansion a single statement, so
// `gpuErrchk(x);` stays correct inside an unbraced if/else.
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
// Baseline gather kernel (no const / __restrict__ qualifiers on the pointers).
// Each thread computes its global index g from the block and thread indices
// and, when g < size, stores out[g] = inp[idx[g]]. Threads with g >= size
// do nothing, so the launch grid may cover more than `size` threads.
__global__
void copy_d(int* inp,
            int* idx,
            int* out,
            int size)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= size)
        return;

    const int src = idx[gid];
    out[gid] = inp[src];
}
// Gather kernel with aliasing hints: the two inputs are `const __restrict__`
// and the output is `__restrict__`, i.e. the caller promises the three
// buffers do not overlap.
// Each thread computes its global index g and, when g < size, stores
// out[g] = inp[idx[g]]. Threads with g >= size do nothing.
__global__
void copy_restrict_d(const int* __restrict__ inp,
                     const int* __restrict__ idx,
                     int* __restrict__ out,
                     int size)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= size)
        return;

    const int src = idx[gid];
    out[gid] = inp[src];
}
// Sequential host reference for the gather kernels:
// for every position p in [0, size) it stores out[p] = inp[idx[p]].
// A size of zero (or less) leaves `out` untouched.
void copy_cpu(int* inp,
              int* idx,
              int* out,
              int size)
{
    int pos = 0;
    while (pos < size){
        const int src = idx[pos];
        out[pos] = inp[src];
        ++pos;
    }
}
// Times `launch` with CUDA events: invokes it warmup + num_iter times and
// returns the mean elapsed time, in microseconds, of the last num_iter runs.
//
//   launch      - callable that enqueues exactly one kernel launch.
//   start, stop - already-created events recorded around each launch.
//   warmup      - number of leading runs excluded from the average.
//   num_iter    - number of runs that contribute to the average.
template <typename Launch>
static float average_kernel_time_us(Launch launch,
                                    cudaEvent_t start,
                                    cudaEvent_t stop,
                                    int warmup,
                                    int num_iter)
{
    float total_ms = 0.0f;
    for (int i = 0; i < warmup + num_iter; ++i){
        gpuErrchk(cudaEventRecord(start));
        launch();
        gpuErrchk(cudaEventRecord(stop));
        // A kernel launch returns no status itself; launch failures are
        // picked up here. Checked after recording `stop` so the timed
        // region contains only the launch.
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaEventSynchronize(stop));
        float ms = 0.0f;
        gpuErrchk(cudaEventElapsedTime(&ms, start, stop));
        // Skip exactly `warmup` runs so that num_iter samples are averaged
        // (`i > warmup` would drop one extra sample).
        if (i >= warmup)
            total_ms += ms;
    }
    return (total_ms / num_iter) * 1000;
}

// Benchmarks copy_d against copy_restrict_d on a reversed-index gather of
// 1M ints, times the CPU reference, and compares the CPU result with the
// device output left by the last kernel run (copy_restrict_d).
// Returns EXIT_SUCCESS when the results match, EXIT_FAILURE otherwise.
int main(){
    const int size = 1024*1024;
    int *inp = NULL, *out = NULL, *idx = NULL, *out_cpu = NULL;
    gpuErrchk(cudaMallocManaged(&inp, size*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&out, size*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&idx, size*sizeof(int)));
    out_cpu = (int*) malloc(size*sizeof(int));
    if (out_cpu == NULL){
        printf("Host allocation of %zu bytes failed\n", size*sizeof(int));
        exit(EXIT_FAILURE);
    }

    // inp holds 0..size-1 and idx reverses the order, so the expected
    // output is out[i] == size - i - 1.
    for (int i=0; i<size; ++i){
        inp[i] = i;
        out[i] = 0;
        idx[i] = size - i - 1;
        out_cpu[i] = 0;
    }

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    dim3 num_threads(BLOCK_SIZE);
    dim3 num_blocks((size-1)/BLOCK_SIZE + 1);  // ceil(size / BLOCK_SIZE)
    const int warmup = 20;
    const int num_iter = 100;

    const float orig_us = average_kernel_time_us(
        [&](){ copy_d<<<num_blocks, num_threads>>>(inp, idx, out, size); },
        start, stop, warmup, num_iter);
    printf("Effective time orig: %f us\n", orig_us);

    const float restrict_us = average_kernel_time_us(
        [&](){ copy_restrict_d<<<num_blocks, num_threads>>>(inp, idx, out, size); },
        start, stop, warmup, num_iter);
    printf("Effective time with restrict: %f us\n", restrict_us);

    // Make sure the device is idle before the host touches managed memory.
    gpuErrchk(cudaDeviceSynchronize());

    std::chrono::high_resolution_clock::time_point ch_start;
    std::chrono::high_resolution_clock::time_point ch_end;
    std::chrono::duration<double> span;
    ch_start = std::chrono::high_resolution_clock::now();
    copy_cpu(inp, idx, out_cpu, size);
    ch_end = std::chrono::high_resolution_clock::now();
    span = std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    // Compare against the device output; report once instead of once per
    // mismatching element.
    int mismatches = 0;
    for (int i=0; i<size; ++i){
        if (out_cpu[i] != out[i])
            ++mismatches;
    }
    if (mismatches != 0)
        printf("Error!! %d mismatching elements\n", mismatches);

    // Release everything that was acquired above.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(inp));
    gpuErrchk(cudaFree(out));
    gpuErrchk(cudaFree(idx));
    free(out_cpu);

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile & Run:
nvcc pointer_aliasing_demo.cu -o out && ./out