@jatesy · Created April 1, 2014 18:17
Graphics Processing Unit (GPU) programming: parallel matrix multiplication implemented in CUDA
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
// Thread block size (the block is BLOCK_SIZE x BLOCK_SIZE threads, so values
// above 32 are invalid: 32 * 32 = 1024 is the per-block thread limit)
#define BLOCK_SIZE 8 // can be changed to 16 or 32
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA 8  // Matrix A width;  can be changed to 16, 32, 64, 128, 256, 512 or 1024
#define HA 8  // Matrix A height; can be changed to 16, 32, 64, 128, 256, 512 or 1024
#define WB 8  // Matrix B width;  can be changed to 16, 32, 64, 128, 256, 512 or 1024
#define HB WA // Matrix B height (must equal A's width for the product to exist)
#define WC WB // Matrix C width  (inherited from B)
#define HC HA // Matrix C height (inherited from A)
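// Minimal error-checking helper (a common CUDA idiom added here for
// robustness; it is not part of the CUDA headers themselves): abort with
// the failing call's location when a runtime API call returns an error.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",            \
                    __FILE__, __LINE__, cudaGetErrorString(err));   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)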
// Fills an already-allocated matrix with random float entries in [0, 1].
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
// Computes Pd = Md * Nd for square Width x Width matrices;
// each thread computes one element of Pd.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Row index of the Pd element (and of the Md row it reads)
    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    // Column index of the Pd element (and of the Nd column it reads)
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    // dot product of one row of Md with one column of Nd
    float Pvalue = 0;
    for (int k = 0; k < Width; ++k)
        Pvalue += Md[Row * Width + k] * Nd[k * Width + Col];
    Pd[Row * Width + Col] = Pvalue;
}
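// For comparison, below is a sketch of the classic shared-memory tiled
// variant (not launched by main() and not part of the timing runs below).
// Each block stages BLOCK_SIZE x BLOCK_SIZE tiles of Md and Nd in shared
// memory, so each global-memory element is fetched Width/BLOCK_SIZE times
// instead of Width times. Like MatrixMulKernel, it assumes Width is a
// multiple of BLOCK_SIZE.
__global__ void MatrixMulTiledKernel(float* Md, float* Nd, float* Pd, int Width)
{
    __shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Ns[BLOCK_SIZE][BLOCK_SIZE];
    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float Pvalue = 0;
    for (int t = 0; t < Width / BLOCK_SIZE; ++t) {
        // each thread loads one element of each tile, then the whole
        // block multiplies out of shared memory
        Ms[threadIdx.y][threadIdx.x] = Md[Row * Width + t * BLOCK_SIZE + threadIdx.x];
        Ns[threadIdx.y][threadIdx.x] = Nd[(t * BLOCK_SIZE + threadIdx.y) * Width + Col];
        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Pvalue += Ms[threadIdx.y][k] * Ns[k][threadIdx.x];
        __syncthreads();
    }
    Pd[Row * Width + Col] = Pvalue;
}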
// Host reference implementation; accumulates in double for a more
// accurate baseline than the float arithmetic on the device.
void MatrixMulOnHost(float* M, float* N, float* P, int Width)
{
    for (int i = 0; i < Width; ++i)
        for (int j = 0; j < Width; ++j) {
            double sum = 0;
            for (int k = 0; k < Width; ++k) {
                double a = M[i * Width + k];
                double b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = (float)sum;
        }
}
int main()
{
    //cudaSetDevice( cutGetMaxGflopsDeviceId() );
    srand(2006);
    double ctime, cudatime, htime;
    clock_t stime, stime2, etime, etime2;
    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);
    // initialize host memory with random data
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);
    stime = clock();
    // allocate device memory (checked, since a silent failure here would
    // invalidate everything downstream)
    float* d_A;
    CUDA_CHECK(cudaMalloc((void**) &d_A, mem_size_A));
    float* d_B;
    CUDA_CHECK(cudaMalloc((void**) &d_B, mem_size_B));
    // copy host memory to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
    // allocate device memory for the result
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    CUDA_CHECK(cudaMalloc((void**) &d_C, mem_size_C));
    stime2 = clock();
    // set up execution parameters: one thread per element of C, so the
    // grid must tile the whole WC x HC output
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / BLOCK_SIZE, HC / BLOCK_SIZE);
    // execute the kernel
    MatrixMulKernel<<< grid, threads >>>(d_A, d_B, d_C, WB);
    CUDA_CHECK(cudaGetLastError());      // catch launch-time failures
    CUDA_CHECK(cudaDeviceSynchronize()); // wait for the kernel so the timing is meaningful
    etime2 = clock();
    cudatime = (double)(etime2 - stime2) / CLOCKS_PER_SEC;
    // allocate host memory for the result
    float* h_C = (float*) malloc(mem_size_C);
    // copy result from device to host
    CUDA_CHECK(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost));
    etime = clock();
    ctime = (double)(etime - stime) / CLOCKS_PER_SEC;
    // compute the reference solution on the host
    float* reference = (float*) malloc(mem_size_C);
    stime = clock();
    MatrixMulOnHost(h_A, h_B, reference, WB);
    etime = clock();
    htime = (double)(etime - stime) / CLOCKS_PER_SEC;
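    // Sanity check: compare the GPU result against the host reference.
    // A small tolerance is needed because the kernel accumulates in float
    // while the reference accumulates in double.
    int errors = 0;
    for (unsigned int i = 0; i < size_C; ++i)
        if (fabsf(h_C[i] - reference[i]) > 1e-4f)
            ++errors;
    if (errors)
        printf("verification FAILED: %d of %u elements differ\n", errors, size_C);
    else
        printf("verification PASSED\n");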
printf("host: %.10f\ncuda: %.10f\ncuda, w/copy: %.10f\n",htime,cudatime,ctime);
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaThreadExit();
}
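To build and run (assuming the source is saved as matmul.cu, a name chosen here for illustration, and the CUDA toolkit's nvcc compiler is on the PATH):

nvcc -o matmul matmul.cu
./matmul

At the default 8 x 8 size the CUDA timings are dominated by kernel-launch and copy overhead, so the host version will typically appear faster; the GPU advantage only shows up at the larger sizes the #defines allow.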