Related
I'm new to OpenCL programming, and this is the problem I'm facing while executing a simple vector addition.
I have the following kernel code
#include <CL/cl.hpp>
#include<iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
/* Element-wise sum: the work-item with global id g stores A[g] + B[g] in C[g].
 * The index is used without a range check. */
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    const int gid = get_global_id(0);
    C[gid] = A[gid] + B[gid];
}
I have an integrated Intel GPU and an AMD GPU in my system. I'm trying to perform the vector addition on the Intel GPU (3rd-gen i7 processor with HD Graphics), for which I have installed the Intel OpenCL drivers.
I have the below openCL code
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
int i;
const int LIST_SIZE = 50;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//std::cout<<source_str<<std::endl;
// Get platform and device information
cl_platform_id* platforms1 = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, platforms1, &ret_num_platforms);
platforms1= (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);
clGetPlatformIDs(ret_num_platforms, platforms1, NULL);
/*
* Platform 0: Intel Graphics
* Platform 1 : AMD Graphics
*/
//CHANGE THE PLATFORM ACCORDING TO YOUR SYSTEM!!!!
ret = clGetDeviceIDs( platforms1[0], CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 16; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
//FREE
return 0;
}
If LIST_SIZE is 50, it prints correct results only up to element 48, that is 16*3. It only seems to process a multiple of local_item_size, and I'm not able to figure out why.
OpenCL kernels execute only for a multiple of the local thread block size (the local range, local_item_size in your code), which should not be smaller than 32 and must be a multiple of 2 (so it can be 32, 64, 128, 256, ...). If you set it to 16, half of the GPU will be idle at any time. global_item_size must be a multiple of local_item_size. You need at least 32 data items for the kernel to function, and a lot more for it to yield good performance.
Also the part
#include <CL/cl.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
is not OpenCL C code and does not belong in the .cl source file. If it is not too lengthy, you can write the OpenCL C code directly in the .cpp file as a raw string:
// OpenCL C source embedded as a raw string literal, so no separate .cl file
// has to be read at run time.
const std::string kernel_code = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
)";
// c_str() returns a pointer to const characters owned by kernel_code, so the
// pointer must be const too and is only valid while kernel_code is alive.
// kernel_code.size() is the matching source length in bytes.
const char* source_str = kernel_code.c_str();
Imagine a binary operation (let's name it "+") with the associative property. Then you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'm learning OpenCL and trying to implement this approach to sum all elements of an array. I am a total newbie in this technology, so the program might look somewhat weird.
This is the kernel:
/* Partial-sum kernel: the work-item with global id g adds up a run of
 * get_local_size(0) consecutive floats of `input`, starting at element
 * get_local_size(0) * g, and stores the total in output[g]. */
__kernel void reduce (__global float *input, __global float *output)
{
    const size_t item = get_global_id (0);
    const size_t chunk = get_local_size (0);
    const size_t first = chunk * item;   /* index of the first element of this run */
    float total = 0;
    int k;
    for (k = 0; k < chunk; k++) {
        total += input[first + k];
    }
    output[item] = total;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
/* Returns the time reported by gettimeofday() as one double: whole seconds
 * plus the microsecond part scaled to seconds. */
double gettime ()
{
    struct timeval now;
    double whole_seconds;
    double fraction;

    gettimeofday (&now, NULL);
    whole_seconds = (double)now.tv_sec;
    fraction = 0.000001 * (double)now.tv_usec;
    return whole_seconds + fraction;
}
/*
 * Sums the N floats 0, 1, ..., N-1 on the first GPU of the first OpenCL
 * platform by repeatedly running the "reduce" kernel from kernel.cl, then
 * prints the result and the elapsed time.
 *
 * Every pass turns `size` values into size/64 partial sums; input and output
 * buffers are swapped between passes until a single value is left.
 * Returns 0 on success and 1 when any step fails.
 */
int main()
{
    int i, fd = -1, res = 0;
    void* kernel_source = MAP_FAILED;
    const char *source_text;
    size_t source_length = 0;
    struct stat s;
    /* Handles start as NULL so cleanup releases only what was really created. */
    cl_context context = NULL;
    cl_context_properties properties[3];
    cl_kernel kernel = NULL;
    cl_command_queue command_queue = NULL;
    cl_program program = NULL;
    cl_int err;
    cl_uint num_of_platforms=0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices=0;
    cl_mem input = NULL, output = NULL, tmp;
    size_t global, local, size;
    double time;
    cl_float *array = malloc (sizeof (cl_float)*N);

    if (array == NULL) {
        perror ("Cannot allocate host array");
        res = 1;
        goto cleanup;
    }
    for (i=0; i<N; i++) array[i] = i;

    /* Map the kernel source file into memory. */
    fd = open ("kernel.cl", O_RDONLY);
    if (fd == -1) {
        perror ("Cannot open kernel");
        res = 1;
        goto cleanup;
    }
    res = fstat (fd, &s);
    if (res == -1) {
        perror ("Cannot stat() kernel");
        res = 1;
        goto cleanup;
    }
    kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (kernel_source == MAP_FAILED) {
        perror ("Cannot map() kernel");
        res = 1;
        goto cleanup;
    }
    source_text = (const char *)kernel_source;
    source_length = (size_t)s.st_size;

    if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
        printf("Unable to get platform_id\n");
        res = 1;
        goto cleanup;
    }
    if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
                       &num_of_devices) != CL_SUCCESS)
    {
        printf("Unable to get device_id\n");
        res = 1;
        goto cleanup;
    }
    properties[0]= CL_CONTEXT_PLATFORM;
    properties[1]= (cl_context_properties) platform_id;
    properties[2]= 0;
    context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
    if (err != CL_SUCCESS) {
        printf("Unable to create context\n");
        res = 1;
        goto cleanup;
    }
    command_queue = clCreateCommandQueue(context, device_id, 0, &err);
    if (err != CL_SUCCESS) {
        printf("Unable to create command queue\n");
        res = 1;
        goto cleanup;
    }
    /* The mapped file is not NUL-terminated, so its length is passed explicitly. */
    program = clCreateProgramWithSource(context, 1, &source_text, &source_length, &err);
    if (err != CL_SUCCESS) {
        printf("Unable to create program\n");
        res = 1;
        goto cleanup;
    }
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
        char buffer[4096];
        size_t len;
        buffer[0] = '\0';
        printf("Error building program\n");
        clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
        printf ("%s\n", buffer);
        res = 1;
        goto cleanup;
    }
    kernel = clCreateKernel(program, "reduce", &err);
    if (err != CL_SUCCESS) {
        printf("Unable to create kernel\n");
        res = 1;
        goto cleanup;
    }
    // create buffers for the input and output
    input = clCreateBuffer(context, CL_MEM_READ_ONLY,
                           sizeof(cl_float) * N, NULL, &err);
    if (err == CL_SUCCESS)
        output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                sizeof(cl_float) * N, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("Unable to create buffers\n");
        res = 1;
        goto cleanup;
    }
    // load data into the input buffer
    if (clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
                             sizeof(cl_float) * N, array, 0, NULL, NULL) != CL_SUCCESS) {
        printf("Unable to write input buffer\n");
        res = 1;
        goto cleanup;
    }

    size = N;
    local = 64;
    time = gettime();
    while (size > 1)
    {
        // set the argument list for the kernel command
        if (clSetKernelArg(kernel, 0, sizeof(cl_mem), &input) != CL_SUCCESS ||
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &output) != CL_SUCCESS) {
            printf("Unable to set kernel arguments\n");
            res = 1;
            goto cleanup;
        }
        /* Each work-item consumes `local` inputs, so size/local work-items
         * cover all valid data; launching `size` of them would read far past
         * the end of the buffer. The global size has to stay a multiple of
         * `local`, so the final pass still launches `local` work-items: their
         * extra reads stay inside the N-element buffer (N >= local*local) and
         * their extra outputs are never used. */
        global = size / local;
        if (global < local) global = local;
        // enqueue the kernel command for execution
        if (clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
                                   &local, 0, NULL, NULL) != CL_SUCCESS) {
            printf("Unable to enqueue kernel\n");
            res = 1;
            goto cleanup;
        }
        /* No clFinish here: the queue is in-order, so passes already run one
         * after another, and the blocking read below waits for the last one. */
        size = size / local;
        tmp = output;
        output = input;
        input = tmp;
    }
    /* After the final swap `input` is the buffer the last pass wrote to. */
    if (clEnqueueReadBuffer(command_queue, input, CL_TRUE, 0,
                            sizeof(cl_float), array, 0, NULL, NULL) != CL_SUCCESS) {
        printf("Unable to read result\n");
        res = 1;
        goto cleanup;
    }
    time = gettime() - time;
    printf ("%f %f\n", array[0], time);
cleanup:
    free (array);
    if (input != NULL) clReleaseMemObject(input);
    if (output != NULL) clReleaseMemObject(output);
    if (program != NULL) clReleaseProgram(program);
    if (kernel != NULL) clReleaseKernel(kernel);
    if (command_queue != NULL) clReleaseCommandQueue(command_queue);
    if (context != NULL) clReleaseContext(context);
    if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
    if (fd != -1) close (fd);
    fflush (stdout); /* _Exit does not flush stdio buffers */
    _Exit (res); // Kludge
    return res;
}
So I re-run the kernel until there is only one element in the buffer. Is this the correct approach to compute the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.
I'm new to OpenCL and I have a problem with clCreateKernel: it returns CL_INVALID_PROGRAM_EXECUTABLE. Could anybody help? The code is based on http://www.cs.bris.ac.uk/home/simonm/workshops/OpenCL_lecture3.pdf , the last optimization.
Here is the code:
#define ORDER 10 // Order of the square matrices A, B, and C
#define AVAL 3.0 // A elements are constant and equal to AVAL
#define BVAL 5.0 // B elements are constant and equal to BVAL
#define TOL (0.001) // tolerance used in floating point comparisons
#define DIM 2 // Max dim for NDRange
#define COUNT 1 // number of times to do each multiplication
#define SUCCESS 1
#define FAILURE 0
// Funciones Auxiliares
/*
 * Fills the three matrices with their start values:
 *   A (Ndim rows x Pdim columns) with AVAL,
 *   B (Pdim rows x Mdim columns) with BVAL,
 *   C (Ndim rows x Mdim columns) with 0.
 * Each matrix is indexed with its own column count as the row stride, so
 * every element is written exactly once even when the dimensions differ.
 */
void initmat(int Mdim, int Ndim, int Pdim, float *A, float *B, float *C)
{
    int i, j;
    /* Initialize matrices */
    for (i = 0; i < Ndim; i++)
        for (j = 0; j < Pdim; j++)
            A[i*Pdim+j] = AVAL;
    for (i = 0; i < Pdim; i++)
        for (j = 0; j < Mdim; j++)
            B[i*Mdim+j] = BVAL;
    for (i = 0; i < Ndim; i++)
        for (j = 0; j < Mdim; j++)
            C[i*Mdim+j] = 0.0f;
}
// Definicion de la funcion:
/*
 * Reads the whole file "kernel.cl" into a freshly allocated, NUL-terminated
 * buffer and prints progress messages along the way.
 *
 * Returns the buffer (the caller releases it with free()), or NULL when the
 * file cannot be opened, its size cannot be determined, or memory runs out.
 */
char * readKernel(void)
{
    long file_size;
    size_t source_length;
    size_t bytes_read;
    char *source;
    FILE *fp = fopen("kernel.cl", "r");
    if (fp == NULL)
    {
        printf("Cannot Open Kernel.cl\n");
        return NULL;
    }
    printf("Kernel.cl Opened\n");

    /* Determine the file size by seeking to the end. */
    if (fseek(fp, 0, SEEK_END) != 0 || (file_size = ftell(fp)) < 0)
    {
        printf("Cannot determine Kernel.cl length\n");
        fclose(fp);
        return NULL;
    }
    source_length = (size_t) file_size;
    if (source_length == 0)
    {
        printf("Kernel.cl is empty\n");
    }
    else
    {
        printf("Kernel.cl length: %zu bytes\n", source_length);
    }

    /* One extra zeroed byte keeps the text NUL-terminated. */
    source = (char*) calloc(source_length + 1, 1);
    if (source == NULL)
    {
        printf("Memory allocation failed");
        fclose(fp);
        return NULL;
    }
    fseek(fp, 0, SEEK_SET);
    /* A text-mode read may deliver fewer bytes than the file size. */
    bytes_read = fread(source, 1, source_length, fp);
    source[bytes_read] = '\0';
    fclose(fp);
    printf("Kernel.cl Read\n");
    return source;
}
int main(int argc, char **argv)
{
// Declare and iniciate data
float *A, *B, *C;
int Mdim, Ndim, Pdim;
int err, szA, szB, szC;
size_t global[DIM];
size_t local[DIM];
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in, b_in, c_out;
Ndim = ORDER;
Pdim = ORDER;
Mdim = ORDER;
szA = Ndim*Pdim;
szB = Pdim*Mdim;
szC = Ndim*Mdim;
A = (float *)malloc(szA*sizeof(float));
B = (float *)malloc(szB*sizeof(float));
C = (float *)malloc(szC*sizeof(float));
const char* C_elem_KernelSource =
"__kernel \n"
"void mmul( \n"
" const int Mdim, \n"
" const int Ndim, \n"
" const int Pdim, \n"
" __global float* A, \n"
" __global float* B, \n"
" __global float* C, \n"
" __local float* Bwrk) \n"
"{ \n"
" int k,j; \n"
" int i = get_global_id(0); \n"
" int iloc = get_local_id(0); \n"
" int nloc = get_local_size(0); \n"
" float Awrk[10]; \n"
" float tmp; \n"
" for (k=0; k<Pdim; k++) \n"
" Awrk[k] = A[i*Ndim+k]; \n"
" for (j=0; j<Mdim; j++){ \n"
" for (k=iloc; k<Pdim; k=k+nloc) \n"
" Bwrk[k] = B[k*Pdim+j]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" tmp = 0.0f; \n"
" for (k=0; k<Pdim; k++) \n"
" tmp += Awrk[k] * Bwrk[k]; \n"
" C[i*Ndim+j] += tmp; \n"
"} \n"
;
initmat(Mdim, Ndim, Pdim, A, B, C);
// Setup the plataform
cl_uint num_platforms;
if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS)
{
printf("Unable to get platform!\n");
}else{
printf("Plataformas Disponibles: %u \n", num_platforms);
}
//Identificador
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, &num_platforms);
printf("Plataformas creada\n");
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err==CL_SUCCESS){
printf("Device creado \n");
}else {
printf("Error %d \n", err);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
if (err==CL_SUCCESS){
printf("Contexto creado \n");
}else {
printf("Error creando contexto \n");
}
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (err==CL_SUCCESS){
printf("cola de comandos creadas \n");
}else {
printf("Error creando cola de comandos \n");
}
// Setup buffers and write A and B matrices to the device memory
a_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szA, NULL, NULL);
b_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szB, NULL, NULL);
c_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
// Build the program, define the kernel and setup arguments
program = clCreateProgramWithSource(context, 1, (const char **) &C_elem_KernelSource, NULL, &err);
if (err==CL_SUCCESS){
printf("programa creado \n");
}else {
printf("Error generado %d creando programa\n", err);
}
//Compila el programa en el dispositivo elegido
clBuildProgram(program, 1, &device_id, NULL, NULL, NULL );
if (err==CL_SUCCESS){
printf("programa compilado 1\n");
}else {
printf("Error generado %d compilando programa 1\n", err);
}
kernel = clCreateKernel(program, "mmul", &err);
if (err==CL_SUCCESS){
printf("Kernel creado \n");
}else {
printf("Error generado %d creando kernel\n", err);
}
err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err |= clSetKernelArg(kernel, 6, sizeof(float)*Pdim, NULL);
if (err==CL_SUCCESS){
printf("Argumentos del Kernel configurados \n");
}else {
printf("Error configurando argumentos del kernel \n");
}
//Run the kernel and collect results
// 1D ND Range set to dimensions of C matrix
//Local Dim set to 250 so number of work-groups match number of
//compute units (4 in this case) for our order 1000 matrices
//Pass local memory to kernels. This requires a change to the kernel
//argument list … a new call to clSetKernelArg is needed
printf("Encolando Kernel:\n");
global[0] = (size_t) Ndim; global[1] = (size_t) Mdim; local[0] = (size_t) 2;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, NULL);
if (err==CL_SUCCESS){
printf("Kernel enviado a device \n");
}else {
printf("Error enviando kernel a device \n");
}
clFinish(commands);
err = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, NULL, NULL );
//test_results(A, B, c_out);
}
Thanks
The main problem is that the open brace on line 112 has no matching closing brace:
" for (j=0; j<Mdim; j++){ \n"
Also note that the pointer declared on line 34 is used without initialization:
size_t *source_length;
On line 170, an err= should be added to the clBuildProgram() call so that the error checking works as intended. Then you can add logic to use clGetProgramBuildInfo() to get details in the case of a build fail.
Here is my attempt to write a opencl code to add 2 vectors
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
//24/12
//data structure platform, device, context,program, kernel, command queue
void main()
{
/////////////////////////////////////////////////////////////////////
//PLATFORM QUERY:
/////////////////////////////////////////////////////////////////////
//clGetPlatformIDs(num_entries, platforms, &num_platforms);
// two part: platform = NULL
// malloc and get platforms*
cl_uint num_platforms; //must be uint
cl_platform_id *platforms;
clGetPlatformIDs(5, NULL, &num_platforms);
printf("There are %d platforms \n", num_platforms);
platforms = (cl_platform_id*) malloc (num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(5, platforms, &num_platforms);
for(int i = 0; i < num_platforms; i++)
{
char name[40],vendor[40],version[40], profile[40],extensions[4096];
clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME, sizeof(name), &name, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(vendor), &version, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE, sizeof(vendor), &profile, NULL);
//clGetPlatformInfo(platforms[i],CL_PLATFORM_EXTENSIONS, sizeof(vendor), &extensions, NULL);
printf("Platform %d \n", i);
printf("Name %s \n", name);
printf("Vendor %s \n", vendor);
printf("Version %s \n", version);
printf("Profile %s \n", profile);
//printf("Extension %s \n", extensions);
printf("----------------------------------\n");
}
////////////////////////////////////////////////////////////////
//DEVICES QUERYING
////////////////////////////////////////////////////////////////
cl_device_id* devices;
cl_uint num_devices;
cl_device_fp_config flag ;
for(int i= 0; i< num_platforms; i++)
{
printf("Platform %d has:\n",i);
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 3, NULL, &num_devices);
devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
char name[40];
for(int j=0; j < num_devices; j++)
{
int err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(name),name,NULL);
if (err<0)
{
//printf("Error querying devices name\n");
}
else
{
printf("Device name %s \n", name);
}
err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(flag),&flag,NULL);
if (flag & CL_FP_DENORM)
{
printf("This device support denormalized number \n");
}
}
printf("-----------------------------------\n");
}
///////////////////////////////////////////////////////
//CONTEXT QUERYING AND CREATING
////////////////////////////////////////////////////////
//NOTE clCreateContext returns cl_context instead of errors
//REF_COUNT if very important in the future
//create context for GPU
cl_context context;
cl_uint ref_count;
cl_int err;
char name[40];
context= clCreateContext(NULL,1,&devices[0], NULL,NULL,&err);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Original reference count is %d \n",ref_count);
/*clRetainContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Incremented reference count is %d \n",ref_count);
clReleaseContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Decremented reference count is %d \n",ref_count);*/
////////////////////////////////////////////////////////
//Create programme
///////////////////////////////////////////////////////
size_t program_size;
err=0;
cl_program program;
char* program_buffer;
FILE* program_handle = fopen("kernel.cl","r");
//More recommendable than source code???
program_buffer = (char*)malloc(MAX_SOURCE_SIZE);
program_size = fread( program_buffer, 1, MAX_SOURCE_SIZE, program_handle);
fclose( program_handle );
program = clCreateProgramWithSource(context,1,(const char**) &program_buffer,
(size_t*)&program_size, &err);
////////////////////////////////////////////////////////
//Build Program
///////////////////////////////////////////////////////
//const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
char* program_log;
size_t log_size;
err= clBuildProgram(program, 1 , devices, NULL, NULL, NULL);
if(err < 0) //debug , printing log
{
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size+1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,
program_log,NULL);
printf("%s\n",program_log);
free(program_log);
//exit(1);
}
///////////////////////////////////////////////////////////////////////////////////
//create kernel
///////////////////////////////////////////////////////////////////////////////////
cl_uint num_kernels;
cl_kernel kernel;
char kernel_name[40];
kernel = clCreateKernel(program,"add",&err);
if (err<0)
{
perror("could not found any kernels\n");
}
//kernels = (cl_kernel*)malloc(num_kernels*sizeof(cl_kernel));
//clCreateKernelsInProgram(program, num_kernels, kernels, NULL);
///FOR REFERNECE
//for(int i=0; i<num_kernels; i++)
//{
clGetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,sizeof(kernel_name),kernel_name,NULL);
printf("Kernel function: %s \n",kernel_name);
//}
/////////////////////////////////////////////////////
//Create command queue
/////////////////////////////////////////////////////
cl_command_queue queue = clCreateCommandQueue(context, devices[0],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
if (err < 0)
{
printf("Couldn't create command queue \n");
exit(1);
}
clEnqueueTask(queue, kernel, 0, NULL, NULL);//only enqueue
//////////////////////////////////////////
unsigned int n= 1000;
int* h_a;
int* h_b;
int* h_c;
cl_mem d_a;
cl_mem d_b;
cl_mem d_c;
h_a = (int*) malloc(n*sizeof(int));
h_b = (int*) malloc(n*sizeof(int));
h_c = (int*) malloc(n*sizeof(int));
for(int i=0; i< n; i++)
{
h_a[i]= 1;//sinf(i)*sinf(i);
h_b[i]= 1;//cosf(i)*cosf(i);
}
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
err = clEnqueueWriteBuffer(queue,d_a,CL_TRUE,0,sizeof(h_a),h_a,0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue,d_b,CL_TRUE,0,sizeof(h_b),h_a,0, NULL, NULL);
//////set argument
err= clSetKernelArg(kernel,0,sizeof(cl_mem),&d_a);
err= clSetKernelArg(kernel,1,sizeof(cl_mem),&d_b);
err= clSetKernelArg(kernel,2,sizeof(cl_mem),&d_c);
err= clSetKernelArg(kernel,3,sizeof(unsigned int),&n);
///////////////
size_t globalsize, localsize;
localsize=64;
globalsize=ceil(n/(float)localsize)*localsize;
err= clEnqueueNDRangeKernel(queue,kernel,1, NULL,&globalsize,&localsize,0,NULL,NULL);
////////////////////////
clFinish(queue);
err=clEnqueueReadBuffer(queue, d_c,CL_TRUE, 0, sizeof(h_c), h_c, 0 , NULL, NULL);
for(int i = 0; i< n; i++)
{
printf(" h_c[%d] = %d \n", i, h_c[i]);
}
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseKernel(kernel);
free(h_a);
free(h_b);
free(h_c);
getchar();
}
and here is my kernel.cl
/* Element-wise sum c = a + b. Work-items whose global id is not below n do
 * nothing. */
__kernel void add(__global int * a, __global int *b, __global int* c, const unsigned n)
{
    int gid = get_global_id(0);
    if (gid >= n)
        return;
    c[gid] = a[gid] + b[gid];
}
With this, I only received garbage values , for example h_c[i]= -842150451 for all i.
Please help me to fix it. Thanks!
This statement is not correct :
sizeof(h_a)
Should be something like :
n * sizeof(int)
Indeed, h_a is just a pointer, so sizeof(h_a) is the size of a pointer (typically 4 or 8 bytes), not the size of the array it points to => you have space for only one or two items.
I've written the following code. I have a loop which iterates between two red and black kernels. In each iteration I call clEnqueueReadBuffer which I think is not efficient. Is there any other way to repeat calling kernels efficiently?
Thanks
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl
Utils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define DATA_SIZE (1048576)
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
#define LIMIT 100
#define BIG_RANGE LIMIT*4*100
#define EPS 1e-2
#define SQ 1024
#define A(i,j) A[i*SQ+j]
using namespace std;
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
char *kernel_names[] = {"Red","Black"};
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
float *A,*oldX,*newX,*b;
int main(int argc, char** argv) {
unsigned int count = DATA_SIZE;
int i,j;
clock_t start,end;
float *XX,*XXnew;
A = (float*)malloc(sizeof(float)*count);
newX = (float*)malloc(sizeof(float)*SQ);
oldX = (float*)malloc(sizeof(float)*SQ);
b = (float*)malloc(sizeof(float)*SQ);
XX = (float*)malloc(sizeof(float)*SQ);
float h=1.0f/SQ;
float xx[SQ];
for (i=0;i<SQ;i++){
XX[i] = 0.0f;
oldX[i]=0.0f;
xx[i] = 0.0f + (i+1)*h;
if (i != 0) b[i] = -2.0f*xx[i]; else b[i] = -2.0f*xx[i]-1.0f/(h*h)+1.0f/(2.0f*h);
for(j=0;j<SQ;j++) A(i,j) =0.0f;
A(i,i) = -2.0f/(h*h);
if (i!=SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h); else A(i,i+1) = 0.0f;
if (i != 0) A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h); else A(i,i-1) = 0.0f;
}
newX[0] = BIG_RANGE;
int cnt = 0;
CreateQueue();
CreateKernel();
CreateBuffer(count);
Kernel_Arg_Set(kernel_red ,count);
Kernel_Arg_Set(kernel_black,count);
end=0.0f;start =clock();cnt =0;
Enqueue_Write_Buffer(count);
while(norm(oldX,newX,SQ) > EPS && cnt<LIMIT){
Create_Work_Group(kernel_red, count);
Enqueue_Read_Buffer(count);
Create_Work_Group(kernel_black, count);
cnt++;
Enqueue_Read_Buffer(count);
}
clFinish(command);
Shutdown();
free(oldX);
free(newX);
free(XX);
free(XXnew);
return 0;
}
// Selects the first platform and its first GPU device, creates a context for
// that device and a profiling-enabled command queue, then enqueues a barrier.
// Any failure prints a message and terminates the program.
void CreateQueue(){
    err = clGetPlatformIDs(1, &platforms, NULL);
    if (err < 0) {
        perror("no platform");
        getchar();
        exit(1);
    }

    err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err < 0) {
        perror("no device");
        getchar();
        exit(1);
    }

    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (command == NULL) {
        printf("Error: Failed to create a command commands!\n");
        exit(1);
    }

    clEnqueueBarrier(command);
}
// Creates the three device buffers and fills each from its host array at
// creation time: A (count floats, read-only), X (SQ floats, read-write) and
// b (SQ floats, read-only). Exits if any of them could not be created.
void CreateBuffer(unsigned int count){
    const size_t matrix_bytes = sizeof(float) * count;
    const size_t vector_bytes = sizeof(float) * SQ;

    input_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, matrix_bytes, A, NULL);
    in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, vector_bytes, oldX, NULL);
    input_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, vector_bytes, b, NULL);

    const bool all_created = input_A && input_b && in_out_X;
    if (!all_created) {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }
}
void CreateKernel(){
FILE *fp;
size_t program_size;
string kernel_src;
fp = fopen("Red.cl", "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
kernel_src.resize(program_size + 1);
fseek(fp, 0, SEEK_SET);
fread(&kernel_src[0], program_size, 1, fp);
fclose(fp);
kernel_src[program_size] = '\0';
const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
if (!program1)
{
printf("clCreateProgramWithSource failed\n");
exit(1);
}
err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2*2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
kernel_red = clCreateKernel(program1, kernel_names[0], &err);
if (!kernel_red || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
kernel_black = clCreateKernel(program1, kernel_names[1], &err);
if (!kernel_black || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
}
// Enqueues `kernel` over a two-dimensional SQ x SQ range with 32 x 32
// work-groups and exits if the enqueue call reports an error.
// `count` is not used.
void Create_Work_Group(cl_kernel kernel, unsigned int count){
    const size_t global_range[] = {SQ, SQ, 0};
    const size_t group_shape[] = {32, 32, 0};

    err = clEnqueueNDRangeKernel(command, kernel, 2, NULL, global_range, group_shape, 0, NULL, NULL);
    if (err != 0) {
        printf("Error: Failed to execute kernel!\n");
        exit(1);
    }
}
// Binds the three device buffers to `kernel` as arguments 0 (A), 1 (X) and
// 2 (b). The individual status codes are OR-ed together; a non-zero result
// prints the combined code and exits. `count` is not used.
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
    cl_mem *arguments[] = { &input_A, &in_out_X, &input_b };

    err = 0;
    for (cl_uint slot = 0; slot < 3; ++slot) {
        err |= clSetKernelArg(kernel, slot, sizeof(cl_mem), arguments[slot]);
    }
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
}
// Blocking read of the SQ floats in the device buffer in_out_X into the host
// array oldX; exits with the error code on failure. `count` is not used.
void Enqueue_Read_Buffer(unsigned int count){
    const size_t vector_bytes = sizeof(float) * SQ;

    err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE, 0, vector_bytes, oldX, 0, NULL, NULL);
    if (err == CL_SUCCESS) {
        return;
    }
    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
}
void Enqueue_Write_Buffer(unsigned int count){
err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count, A, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ , b, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ ,oldX, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
}
What you do is quite inefficient. You can write the buffer only once, then enqueue as many kernels as you want, with the same buffer as their argument. Of course if you need to compute the norm, you need to read data back. I would suggest something like this:
Create an additional buffer for the norm; check at the beginning of every kernel what the norm is (just by reading its value); if it is smaller than threshold value, return immediately.
Create a new kernel which will compute the norm for you.
Enqueue tasks like:
write buffers,
kernels: { {red,black}*10, updateNorm}*10
read buffers.
The computation will run 10x, then the norm will be updated. In case it is already OK, the already enqueued computation kernels will return immediately. After the queue is finished, read the buffers back and check the norm on the CPU. If the norm is still not OK, enqueue the same batch of kernels again.
In the worst case, you will waste 9 real and 90 immediately returning kernel runs.