yashi · September 20, 2024 07:52
diff --git a/gistfile1.txt b/gistfile1.txt
 #include <stdio.h>
 #include <stdlib.h>
 #include <CL/cl.h>

 #define VECTOR_SIZE 1024

 // OpenCL kernel which is run for every work item created.
 const char *saxpy_kernel =
 	"__kernel                                   \n"
 	"void saxpy_kernel(float alpha,             \n"
 	"                  __global float *A,       \n"
 	"                  __global float *B,       \n"
 	"                  __global float *C)       \n"
 	"{                                          \n"
 	"    // Get the index of the work-item      \n"
 	"    int index = get_global_id(0);          \n"
 	"    C[index] = alpha * A[index] + B[index]; \n"
 	"}                                          \n";

 int main(void)
 {
 	int i;
 	cl_int clStatus;

 	// Allocate and initialize pointers to NULL
 	float alpha = 2.0f;
 	float *A = NULL;
 	float *B = NULL;
 	float *C = NULL;

 	cl_platform_id *platforms = NULL;
 	cl_uint num_platforms = 0;

 	cl_device_id *device_list = NULL;
 	cl_uint num_devices = 0;

 	cl_context context = NULL;
 	cl_command_queue command_queue = NULL;

 	cl_mem A_clmem = NULL;
 	cl_mem B_clmem = NULL;
 	cl_mem C_clmem = NULL;

 	cl_program program = NULL;
 	cl_kernel kernel = NULL;

 	// Allocate space for vectors A, B, and C
 	A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
 	B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
 	C = (float*)malloc(sizeof(float) * VECTOR_SIZE);

 	// Check for successful memory allocation
 	if (!A || !B || !C) {
 		fprintf(stderr, "Failed to allocate host memory.\n");
 		goto cleanup;
 	}

 	// Initialize vectors
 	for(i = 0; i < VECTOR_SIZE; i++) {
 		A[i] = (float)i;
 		B[i] = (float)(VECTOR_SIZE - i);
 		C[i] = 0.0f;
 	}

 	// Get platform and device information

 	// Retrieve the number of platforms
 	clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
 	if (clStatus != CL_SUCCESS || num_platforms == 0) {
 		fprintf(stderr, "Failed to find any OpenCL platforms.\n");
 		goto cleanup;
 	}

 	// Allocate memory for platform IDs
 	platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
 	if (!platforms) {
 		fprintf(stderr, "Failed to allocate memory for platform IDs.\n");
 		goto cleanup;
 	}

 	// Retrieve the platform IDs
 	clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to get the platform IDs.\n");
 		goto cleanup;
 	}

 	// Get the devices list and choose the device you want to run on

 	// Retrieve the number of GPU devices
 	clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
 	if (clStatus != CL_SUCCESS || num_devices == 0) {
 		fprintf(stderr, "Failed to find any GPU devices.\n");
 		goto cleanup;
 	}

 	// Allocate memory for device IDs
 	device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices);
 	if (!device_list) {
 		fprintf(stderr, "Failed to allocate memory for device IDs.\n");
 		goto cleanup;
 	}

 	// Retrieve the device IDs
 	clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to get the device IDs.\n");
 		goto cleanup;
 	}

 	// Create an OpenCL context for the selected device
 	context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create an OpenCL context.\n");
 		goto cleanup;
 	}

 	// Create a command queue using clCreateCommandQueueWithProperties
 	cl_queue_properties properties[] = {0}; // No specific properties
 	command_queue = clCreateCommandQueueWithProperties(context, device_list[0], properties, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create a command queue.\n");
 		goto cleanup;
 	}

 	// Create memory buffers on the device for each vector
 	A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create buffer for vector A.\n");
 		goto cleanup;
 	}

 	B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create buffer for vector B.\n");
 		goto cleanup;
 	}

 	C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create buffer for vector C.\n");
 		goto cleanup;
 	}

 	// Copy the Buffer A and B to the device
 	clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to write data to buffer A.\n");
 		goto cleanup;
 	}

 	clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to write data to buffer B.\n");
 		goto cleanup;
 	}

 	// Create a program from the kernel source
 	program = clCreateProgramWithSource(context, 1, &saxpy_kernel, NULL, &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create program from source.\n");
 		goto cleanup;
 	}

 	// Build the program
 	const char *build_options = "-time -buildlog=stderr";
 	clStatus = clBuildProgram(program, 1, device_list, build_options, NULL, NULL);
 	if (clStatus != CL_SUCCESS) {
 		// Determine the reason for the error
 		size_t log_size;
 		clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
 		char *log = (char*)malloc(log_size);
 		if (log) {
 			clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
 			fprintf(stderr, "Error in kernel:\n%s\n", log);
 			free(log);
 		} else {
 			fprintf(stderr, "Failed to allocate memory for build log.\n");
 		}
 		goto cleanup;
 	}

 	// Create the OpenCL kernel
 	kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to create kernel.\n");
 		goto cleanup;
 	}

 	// Set the arguments of the kernel
 	clStatus  = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
 	clStatus |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
 	clStatus |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
 	clStatus |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to set kernel arguments.\n");
 		goto cleanup;
 	}

 	// Execute the OpenCL kernel on the list
 	size_t global_size = VECTOR_SIZE; // Process the entire lists
 	size_t local_size  = 64;          // Process in work-groups of size 64

 	// Ensure that VECTOR_SIZE is divisible by local_size
 	if (VECTOR_SIZE % local_size != 0) {
 		fprintf(stderr, "VECTOR_SIZE (%d) is not divisible by local_size (%zu).\n", VECTOR_SIZE, local_size);
 		goto cleanup;
 	}

 	clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to enqueue NDRange kernel.\n");
 		goto cleanup;
 	}

 	// Read the cl memory C_clmem on device to the host variable C
 	clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to read buffer C from device.\n");
 		goto cleanup;
 	}

 	// Clean up and wait for all the commands to complete.
 	clStatus = clFlush(command_queue);
 	clStatus |= clFinish(command_queue);
 	if (clStatus != CL_SUCCESS) {
 		fprintf(stderr, "Failed to flush or finish the command queue.\n");
 		// Proceeding to clean up resources even if this fails
 	}

 	// Display the result to the screen
 	for(i = 0; i < VECTOR_SIZE; i++) {
 		printf("%f * %f + %f = %f\n", alpha, A[i], B[i], C[i]);
 	}

 cleanup:
 	// Release OpenCL resources
 	if (kernel) clReleaseKernel(kernel);
 	if (program) clReleaseProgram(program);
 	if (C_clmem) clReleaseMemObject(C_clmem);
 	if (B_clmem) clReleaseMemObject(B_clmem);
 	if (A_clmem) clReleaseMemObject(A_clmem);
 	if (command_queue) clReleaseCommandQueue(command_queue);
 	if (context) clReleaseContext(context);
 	if (device_list) free(device_list);
 	if (platforms) free(platforms);

 	// Free host memory
 	if (A) free(A);
 	if (B) free(B);
 	if (C) free(C);

 	return (clStatus == CL_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE;
 }
	#include <stdio.h>
	#include <stdlib.h>
	#include <CL/cl.h>

	#define VECTOR_SIZE 1024

	// OpenCL kernel which is run for every work item created.
	const char *saxpy_kernel =
	"__kernel \n"
	"void saxpy_kernel(float alpha, \n"
	" __global float *A, \n"
	" __global float *B, \n"
	" __global float *C) \n"
	"{ \n"
	" // Get the index of the work-item \n"
	" int index = get_global_id(0); \n"
	" C[index] = alpha * A[index] + B[index]; \n"
	"} \n";

	int main(void)
	{
	int i;
	cl_int clStatus;

	// Allocate and initialize pointers to NULL
	float alpha = 2.0f;
	float *A = NULL;
	float *B = NULL;
	float *C = NULL;

	cl_platform_id *platforms = NULL;
	cl_uint num_platforms = 0;

	cl_device_id *device_list = NULL;
	cl_uint num_devices = 0;

	cl_context context = NULL;
	cl_command_queue command_queue = NULL;

	cl_mem A_clmem = NULL;
	cl_mem B_clmem = NULL;
	cl_mem C_clmem = NULL;

	cl_program program = NULL;
	cl_kernel kernel = NULL;

	// Allocate space for vectors A, B, and C
	A = (float)malloc(sizeof(float) VECTOR_SIZE);
	B = (float)malloc(sizeof(float) VECTOR_SIZE);
	C = (float)malloc(sizeof(float) VECTOR_SIZE);

	// Check for successful memory allocation
	if (!A \|\| !B \|\| !C) {
	fprintf(stderr, "Failed to allocate host memory.\n");
	goto cleanup;
	}

	// Initialize vectors
	for(i = 0; i < VECTOR_SIZE; i++) {
	A[i] = (float)i;
	B[i] = (float)(VECTOR_SIZE - i);
	C[i] = 0.0f;
	}

	// Get platform and device information

	// Retrieve the number of platforms
	clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
	if (clStatus != CL_SUCCESS \|\| num_platforms == 0) {
	fprintf(stderr, "Failed to find any OpenCL platforms.\n");
	goto cleanup;
	}

	// Allocate memory for platform IDs
	platforms = (cl_platform_id)malloc(sizeof(cl_platform_id) num_platforms);
	if (!platforms) {
	fprintf(stderr, "Failed to allocate memory for platform IDs.\n");
	goto cleanup;
	}

	// Retrieve the platform IDs
	clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to get the platform IDs.\n");
	goto cleanup;
	}

	// Get the devices list and choose the device you want to run on

	// Retrieve the number of GPU devices
	clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
	if (clStatus != CL_SUCCESS \|\| num_devices == 0) {
	fprintf(stderr, "Failed to find any GPU devices.\n");
	goto cleanup;
	}

	// Allocate memory for device IDs
	device_list = (cl_device_id)malloc(sizeof(cl_device_id) num_devices);
	if (!device_list) {
	fprintf(stderr, "Failed to allocate memory for device IDs.\n");
	goto cleanup;
	}

	// Retrieve the device IDs
	clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to get the device IDs.\n");
	goto cleanup;
	}

	// Create an OpenCL context for the selected device
	context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create an OpenCL context.\n");
	goto cleanup;
	}

	// Create a command queue using clCreateCommandQueueWithProperties
	cl_queue_properties properties[] = {0}; // No specific properties
	command_queue = clCreateCommandQueueWithProperties(context, device_list[0], properties, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create a command queue.\n");
	goto cleanup;
	}

	// Create memory buffers on the device for each vector
	A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create buffer for vector A.\n");
	goto cleanup;
	}

	B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create buffer for vector B.\n");
	goto cleanup;
	}

	C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create buffer for vector C.\n");
	goto cleanup;
	}

	// Copy the Buffer A and B to the device
	clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to write data to buffer A.\n");
	goto cleanup;
	}

	clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to write data to buffer B.\n");
	goto cleanup;
	}

	// Create a program from the kernel source
	program = clCreateProgramWithSource(context, 1, &saxpy_kernel, NULL, &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create program from source.\n");
	goto cleanup;
	}

	// Build the program
	const char *build_options = "-time -buildlog=stderr";
	clStatus = clBuildProgram(program, 1, device_list, build_options, NULL, NULL);
	if (clStatus != CL_SUCCESS) {
	// Determine the reason for the error
	size_t log_size;
	clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
	char log = (char)malloc(log_size);
	if (log) {
	clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
	fprintf(stderr, "Error in kernel:\n%s\n", log);
	free(log);
	} else {
	fprintf(stderr, "Failed to allocate memory for build log.\n");
	}
	goto cleanup;
	}

	// Create the OpenCL kernel
	kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to create kernel.\n");
	goto cleanup;
	}

	// Set the arguments of the kernel
	clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
	clStatus \|= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
	clStatus \|= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
	clStatus \|= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to set kernel arguments.\n");
	goto cleanup;
	}

	// Execute the OpenCL kernel on the list
	size_t global_size = VECTOR_SIZE; // Process the entire lists
	size_t local_size = 64; // Process in work-groups of size 64

	// Ensure that VECTOR_SIZE is divisible by local_size
	if (VECTOR_SIZE % local_size != 0) {
	fprintf(stderr, "VECTOR_SIZE (%d) is not divisible by local_size (%zu).\n", VECTOR_SIZE, local_size);
	goto cleanup;
	}

	clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to enqueue NDRange kernel.\n");
	goto cleanup;
	}

	// Read the cl memory C_clmem on device to the host variable C
	clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to read buffer C from device.\n");
	goto cleanup;
	}

	// Clean up and wait for all the commands to complete.
	clStatus = clFlush(command_queue);
	clStatus \|= clFinish(command_queue);
	if (clStatus != CL_SUCCESS) {
	fprintf(stderr, "Failed to flush or finish the command queue.\n");
	// Proceeding to clean up resources even if this fails
	}

	// Display the result to the screen
	for(i = 0; i < VECTOR_SIZE; i++) {
	printf("%f * %f + %f = %f\n", alpha, A[i], B[i], C[i]);
	}

	cleanup:
	// Release OpenCL resources
	if (kernel) clReleaseKernel(kernel);
	if (program) clReleaseProgram(program);
	if (C_clmem) clReleaseMemObject(C_clmem);
	if (B_clmem) clReleaseMemObject(B_clmem);
	if (A_clmem) clReleaseMemObject(A_clmem);
	if (command_queue) clReleaseCommandQueue(command_queue);
	if (context) clReleaseContext(context);
	if (device_list) free(device_list);
	if (platforms) free(platforms);

	// Free host memory
	if (A) free(A);
	if (B) free(B);
	if (C) free(C);

	return (clStatus == CL_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE;
	}