Skip to content

Instantly share code, notes, and snippets.

@y-tag
Created February 3, 2014 23:25
Show Gist options
  • Save y-tag/8794544 to your computer and use it in GitHub Desktop.
Save y-tag/8794544 to your computer and use it in GitHub Desktop.
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
#include "blas.h"
#include "utility.h"
// $ g++ -O3 -o bench_sgemm bench_sgemm.cc -lblas -lpthread -Wall -Werror
/*
 * CPU SGEMM benchmark.
 *
 * Runs C = alpha * A * B + beta * C through the Fortran BLAS entry point
 * sgemm_ for loop_num iterations on 2048 x 2048 x 2048 operands, accumulates
 * the time spent inside sgemm_ only, and prints the total elapsed time and
 * the resulting GFLOPS figure to stdout.
 *
 * Returns 0 on success and 1 if a host buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int loop_num = 10;

    /* Kept non-const: sgemm_ takes every scalar by address. */
    int m = 1 << 11;
    int n = 1 << 11;
    int k = 1 << 11;

    /*
     * malloc is used instead of new[] so that the NULL test below is
     * meaningful; a plain new[] throws on failure and never yields NULL.
     * sizeof comes first so the byte count is computed in size_t.
     */
    float *A = static_cast<float *>(std::malloc(sizeof(float) * m * k));
    float *B = static_cast<float *>(std::malloc(sizeof(float) * k * n));
    float *C = static_cast<float *>(std::malloc(sizeof(float) * m * n));
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "malloc error!\n");
        /* free(NULL) is a no-op, so release whatever did get allocated. */
        std::free(A);
        std::free(B);
        std::free(C);
        return 1;
    }

    float alpha = 1.0f;
    float beta = 1.0f;

    double elapsed_time = 0.0;
    for (int l = 0; l < loop_num; ++l) {
        /* Refill the operands outside the timed region.
         * NOTE(review): randn presumably writes random values into the
         * buffer; it is declared in utility.h - confirm. */
        randn(A, m * k);
        randn(B, k * n);
        randn(C, m * n);

        /* Only the sgemm_ call itself is timed. */
        const double start = gettimeofday_sec();
        sgemm_("N", "N", &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
        elapsed_time += gettimeofday_sec() - start;
    }

    /*
     * 2*m*n*k floating-point operations per GEMM. Computed in double so the
     * count cannot overflow on platforms where long is only 32 bits wide.
     */
    const double ops_num = 2.0 * m * n * k * loop_num;
    const double gflops = ops_num / (1000 * 1000 * 1000 * elapsed_time);

    fprintf(stdout, "elapsed_time: %f\n", elapsed_time);
    fprintf(stdout, "GFLOPS: %f\n", gflops);

    std::free(A); A = NULL;
    std::free(B); B = NULL;
    std::free(C); C = NULL;
    return 0;
}
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "utility.h"
// $ nvcc -o bench_sgemm_cublas bench_sgemm_cublas.cc -I/usr/local/cuda-5.5/include -lcublas
/*
 * cuBLAS SGEMM benchmark.
 *
 * For loop_num iterations: refills the host matrices, then times the
 * host-to-device upload of A, B and C, the cublasSgemm call
 * (C = alpha * A * B + beta * C, no transposes), and the device-to-host
 * download of all three matrices. Prints the accumulated time and the
 * resulting GFLOPS figure to stdout when every iteration succeeded.
 *
 * All host and device resources are released on every exit path.
 * Returns 0 on success and 1 if any allocation or cuBLAS call failed.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int loop_num = 10;
    const int m = 1 << 11;
    const int n = 1 << 11;
    const int k = 1 << 11;

    /* Byte counts are computed in size_t (sizeof comes first). */
    const size_t a_bytes = sizeof(float) * m * k;
    const size_t b_bytes = sizeof(float) * k * n;
    const size_t c_bytes = sizeof(float) * m * n;

    const float alpha = 1.0f;
    const float beta = 1.0f;
    double elapsed_time = 0.0;

    /* Everything starts out NULL / not-created so the single cleanup section
     * at the bottom is safe no matter where a failure occurred. */
    float *devA = NULL;
    float *devB = NULL;
    float *devC = NULL;
    cublasHandle_t handle;
    bool handle_created = false;
    bool ok = true;

    /*
     * malloc is used instead of new[] so that the NULL test is meaningful;
     * a plain new[] throws on failure and never yields NULL.
     */
    float *A = static_cast<float *>(std::malloc(a_bytes));
    float *B = static_cast<float *>(std::malloc(b_bytes));
    float *C = static_cast<float *>(std::malloc(c_bytes));
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "malloc error!\n");
        ok = false;
    }

    if (ok && cudaMalloc((void **)&devA, a_bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc error! (devA)\n");
        devA = NULL;
        ok = false;
    }
    if (ok && cudaMalloc((void **)&devB, b_bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc error! (devB)\n");
        devB = NULL;
        ok = false;
    }
    if (ok && cudaMalloc((void **)&devC, c_bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc error! (devC)\n");
        devC = NULL;
        ok = false;
    }

    if (ok) {
        if (cublasCreate(&handle) == CUBLAS_STATUS_SUCCESS) {
            handle_created = true;
        } else {
            /* The handle is not valid here, so it must not be destroyed. */
            fprintf(stderr, "cublasCreate error!\n");
            ok = false;
        }
    }

    for (int l = 0; ok && l < loop_num; ++l) {
        /* Refill the operands outside the timed region.
         * NOTE(review): randn presumably writes random values into the
         * buffer; it is declared in utility.h - confirm. */
        randn(A, m * k);
        randn(B, k * n);
        randn(C, m * n);

        /* The timed region covers upload, GEMM and download. */
        const double start = gettimeofday_sec();
        cublasStatus_t stat;

        stat = cublasSetMatrix(m, k, sizeof(*A), A, m, devA, m);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasSetMatrix error!(devA)\n");
            ok = false;
            break;
        }
        stat = cublasSetMatrix(k, n, sizeof(*B), B, k, devB, k);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasSetMatrix error!(devB)\n");
            ok = false;
            break;
        }
        stat = cublasSetMatrix(m, n, sizeof(*C), C, m, devC, m);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasSetMatrix error!(devC)\n");
            ok = false;
            break;
        }

        stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                           &alpha, devA, m, devB, k, &beta, devC, m);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasSgemm error!\n");
            ok = false;
            break;
        }

        /* A and B are downloaded as well as C; this is kept as-is so the
         * reported timings stay comparable with earlier runs. */
        stat = cublasGetMatrix(m, k, sizeof(*A), devA, m, A, m);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasGetMatrix error!(devA)\n");
            ok = false;
            break;
        }
        stat = cublasGetMatrix(k, n, sizeof(*B), devB, k, B, k);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasGetMatrix error!(devB)\n");
            ok = false;
            break;
        }
        stat = cublasGetMatrix(m, n, sizeof(*C), devC, m, C, m);
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasGetMatrix error!(devC)\n");
            ok = false;
            break;
        }

        elapsed_time += gettimeofday_sec() - start;
    }

    if (ok) {
        /*
         * 2*m*n*k floating-point operations per GEMM. Computed in double so
         * the count cannot overflow where long is only 32 bits wide.
         */
        const double ops_num = 2.0 * m * n * k * loop_num;
        const double gflops = ops_num / (1000 * 1000 * 1000 * elapsed_time);
        fprintf(stdout, "elapsed_time: %f\n", elapsed_time);
        fprintf(stdout, "GFLOPS: %f\n", gflops);
    }

    /* Single cleanup path: cudaFree(NULL) and free(NULL) are no-ops. */
    cudaFree(devA); devA = NULL;
    cudaFree(devB); devB = NULL;
    cudaFree(devC); devC = NULL;
    if (handle_created) {
        cublasDestroy(handle);
    }
    std::free(A); A = NULL;
    std::free(B); B = NULL;
    std::free(C); C = NULL;

    /* Report failure to the caller instead of always exiting with 0. */
    return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment