Skip to content

Instantly share code, notes, and snippets.

@yashi
Created September 20, 2024 07:52
Show Gist options
  • Save yashi/31c0fbc9d56b44712a0ce5503715ad92 to your computer and use it in GitHub Desktop.
Save yashi/31c0fbc9d56b44712a0ce5503715ad92 to your computer and use it in GitHub Desktop.
OpenCL Hello World with ROCm specific options
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#define VECTOR_SIZE 1024
// OpenCL kernel which is run for every work item created.
const char *saxpy_kernel =
"__kernel \n"
"void saxpy_kernel(float alpha, \n"
" __global float *A, \n"
" __global float *B, \n"
" __global float *C) \n"
"{ \n"
" // Get the index of the work-item \n"
" int index = get_global_id(0); \n"
" C[index] = alpha * A[index] + B[index]; \n"
"} \n";
int main(void)
{
int i;
cl_int clStatus;
// Allocate and initialize pointers to NULL
float alpha = 2.0f;
float *A = NULL;
float *B = NULL;
float *C = NULL;
cl_platform_id *platforms = NULL;
cl_uint num_platforms = 0;
cl_device_id *device_list = NULL;
cl_uint num_devices = 0;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem A_clmem = NULL;
cl_mem B_clmem = NULL;
cl_mem C_clmem = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
// Allocate space for vectors A, B, and C
A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
C = (float*)malloc(sizeof(float) * VECTOR_SIZE);
// Check for successful memory allocation
if (!A || !B || !C) {
fprintf(stderr, "Failed to allocate host memory.\n");
goto cleanup;
}
// Initialize vectors
for(i = 0; i < VECTOR_SIZE; i++) {
A[i] = (float)i;
B[i] = (float)(VECTOR_SIZE - i);
C[i] = 0.0f;
}
// Get platform and device information
// Retrieve the number of platforms
clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
if (clStatus != CL_SUCCESS || num_platforms == 0) {
fprintf(stderr, "Failed to find any OpenCL platforms.\n");
goto cleanup;
}
// Allocate memory for platform IDs
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
if (!platforms) {
fprintf(stderr, "Failed to allocate memory for platform IDs.\n");
goto cleanup;
}
// Retrieve the platform IDs
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to get the platform IDs.\n");
goto cleanup;
}
// Get the devices list and choose the device you want to run on
// Retrieve the number of GPU devices
clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
if (clStatus != CL_SUCCESS || num_devices == 0) {
fprintf(stderr, "Failed to find any GPU devices.\n");
goto cleanup;
}
// Allocate memory for device IDs
device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices);
if (!device_list) {
fprintf(stderr, "Failed to allocate memory for device IDs.\n");
goto cleanup;
}
// Retrieve the device IDs
clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to get the device IDs.\n");
goto cleanup;
}
// Create an OpenCL context for the selected device
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create an OpenCL context.\n");
goto cleanup;
}
// Create a command queue using clCreateCommandQueueWithProperties
cl_queue_properties properties[] = {0}; // No specific properties
command_queue = clCreateCommandQueueWithProperties(context, device_list[0], properties, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create a command queue.\n");
goto cleanup;
}
// Create memory buffers on the device for each vector
A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create buffer for vector A.\n");
goto cleanup;
}
B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create buffer for vector B.\n");
goto cleanup;
}
C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create buffer for vector C.\n");
goto cleanup;
}
// Copy the Buffer A and B to the device
clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to write data to buffer A.\n");
goto cleanup;
}
clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to write data to buffer B.\n");
goto cleanup;
}
// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1, &saxpy_kernel, NULL, &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create program from source.\n");
goto cleanup;
}
// Build the program
const char *build_options = "-time -buildlog=stderr";
clStatus = clBuildProgram(program, 1, device_list, build_options, NULL, NULL);
if (clStatus != CL_SUCCESS) {
// Determine the reason for the error
size_t log_size;
clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char*)malloc(log_size);
if (log) {
clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
fprintf(stderr, "Error in kernel:\n%s\n", log);
free(log);
} else {
fprintf(stderr, "Failed to allocate memory for build log.\n");
}
goto cleanup;
}
// Create the OpenCL kernel
kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to create kernel.\n");
goto cleanup;
}
// Set the arguments of the kernel
clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
clStatus |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
clStatus |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
clStatus |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to set kernel arguments.\n");
goto cleanup;
}
// Execute the OpenCL kernel on the list
size_t global_size = VECTOR_SIZE; // Process the entire lists
size_t local_size = 64; // Process in work-groups of size 64
// Ensure that VECTOR_SIZE is divisible by local_size
if (VECTOR_SIZE % local_size != 0) {
fprintf(stderr, "VECTOR_SIZE (%d) is not divisible by local_size (%zu).\n", VECTOR_SIZE, local_size);
goto cleanup;
}
clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to enqueue NDRange kernel.\n");
goto cleanup;
}
// Read the cl memory C_clmem on device to the host variable C
clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to read buffer C from device.\n");
goto cleanup;
}
// Clean up and wait for all the commands to complete.
clStatus = clFlush(command_queue);
clStatus |= clFinish(command_queue);
if (clStatus != CL_SUCCESS) {
fprintf(stderr, "Failed to flush or finish the command queue.\n");
// Proceeding to clean up resources even if this fails
}
// Display the result to the screen
for(i = 0; i < VECTOR_SIZE; i++) {
printf("%f * %f + %f = %f\n", alpha, A[i], B[i], C[i]);
}
cleanup:
// Release OpenCL resources
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (C_clmem) clReleaseMemObject(C_clmem);
if (B_clmem) clReleaseMemObject(B_clmem);
if (A_clmem) clReleaseMemObject(A_clmem);
if (command_queue) clReleaseCommandQueue(command_queue);
if (context) clReleaseContext(context);
if (device_list) free(device_list);
if (platforms) free(platforms);
// Free host memory
if (A) free(A);
if (B) free(B);
if (C) free(C);
return (clStatus == CL_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment