How to remove CL_INVALID_PLATFORM error in opencl code? - opencl

Doing simple matrix multiplication using OpenCL:
// Multiply two matrices A * B = C
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <oclUtils.h>
#define WA 3
#define HA 3
#define WB 3
#define HB 3
#define WC 3
#define HC 3
// Allocates a matrix with random float entries.
/*
 * Fill the first `size` elements of `data` with pseudo-random floats.
 *
 * data: destination buffer; must have room for at least `size` floats.
 * size: number of elements to write; nothing is written when size <= 0.
 *
 * Every value is rand() / (float)RAND_MAX, i.e. it lies in [0, 1]. The
 * sequence produced depends on the current rand() seed.
 */
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}
// Program main
main(int argc, char** argv)
// set seed for rand()
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// 5. Initialize OpenCL
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
/* Initialize OpenCL */
clGPUContext = clCreateContextFromType(0,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
cl_device_id *clDevices = (cl_device_id *)
errcode |= clGetContextInfo(clGPUContext,
clDevices, NULL);
//shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// Setup device memory
d_C = clCreateBuffer(clGPUContext,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
mem_size_B, h_B, &errcode);
// 6. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("",
"// My comment\n",
//shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
//shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// 7. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
//shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 3;
localWorkSize[1] = 3;
globalWorkSize[0] = 3;
globalWorkSize[1] = 3;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 8. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 9. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
// 10. clean up memory
In the above code I keep getting an error at this point:
/**********************/
/* Initialize OpenCL */
/**********************/
clGPUContext = clCreateContextFromType(0,
NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS);
The error code being returned is -32, which means CL_INVALID_PLATFORM.
How do I remove this error?
OS: Windows 7, 32 bit, NVIDIA GPU GeForce 610

The Nvidia drivers expect you to provide a non-NULL properties pointer as first argument to the clCreateContextFromType call.
The Khronos specification for clCreateContextFromType states that if NULL is passed for the properties parameter, the platform that is selected is implementation dependent. In case of Nvidia the choice seems to be that no platform at all is selected if a NULL pointer is passed. See clCreateContextFromType for more information.
On the other hand, this behavior is consistent with Issue #3 in the cl_khr_icd extension, which would apply if you are using OpenCL through the ICD, and which states:
3: How will the ICD handle a NULL cl_platform_id?
RESOLVED: The NULL platform is not supported by the ICD.
To pass the properties to clCreateContextFromType, first query the platforms with clGetPlatformIDs. Then construct a properties array with the desired platform ID and pass it to clCreateContextFromType. Something along the following lines should work with a C99 compliant compiler:
// query the number of platforms
cl_uint numPlatforms;
errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
shrCheckError(errcode, CL_SUCCESS);
// now get all the platform IDs
cl_platform_id platforms[numPlatforms];
errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
shrCheckError(errcode, CL_SUCCESS);
// set platform property - we just pick the first one
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};
clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);


Problems additionVector with OpenCL

I want to learn OpenCL, so I followed a tutorial that implements a simple vector addition.
I'm working on Ubuntu:
Distributor ID: Ubuntu
Description: Ubuntu 22.04.1 LTS
Release: 22.04
Codename: jammy
I have an RTX 3080 Ti, which is recognized by my system:
| NVIDIA-SMI 525.85.05 Driver Version: 525.85.05 CUDA Version: 12.0 |
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
| 0 NVIDIA GeForce ... Off | 00000000:02:00.0 On | N/A |
| 0% 54C P8 38W / 350W | 634MiB / 12288MiB | 2% Default |
| | | N/A |
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
| 0 N/A N/A 1766 G /usr/lib/xorg/Xorg 312MiB |
| 0 N/A N/A 2087 G /usr/bin/gnome-shell 105MiB |
| 0 N/A N/A 3343 G ...5/usr/lib/firefox/firefox 183MiB |
(This is the output of nvidia-smi.)
I installed OpenCL with apt-get install opencl-headers and cuda for OpenCL drivers.
Here is the code :
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#include <CL/cl.h>
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 10;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = i;
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
char str_buffer[1024];
fp = fopen("", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
And the code of the kernel :
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
I compile with : gcc main.c -o vectorAddition -l OpenCL
And the execution of vectorAddition give me this :
platform name : NVIDIA CUDA
platform vendor : NVIDIA Corporation
Device name : NVIDIA Corporation
0 + 0 = 0
1 + 1 = 0
2 + 2 = 0
3 + 3 = 0
4 + 4 = 0
5 + 5 = 0
6 + 6 = 0
7 + 7 = 0
8 + 8 = 0
9 + 9 = 0
I already read a post that is pretty much the same as mine:
But I think my clCreateBuffer calls are fine.
I put these lines in my code to make sure my GPU is recognized:
//Get the name of the platform and device
ret = clGetPlatformInfo(0, CL_PLATFORM_NAME, sizeof(str_buffer), &str_buffer, NULL);
printf("platform name : %s\n",str_buffer);
ret = clGetPlatformInfo(0, CL_PLATFORM_VENDOR, sizeof(str_buffer), &str_buffer, NULL);
printf("platform vendor : %s\n",str_buffer);
ret = clGetDeviceInfo(0, CL_DEVICE_NAME, sizeof(str_buffer), &str_buffer, NULL);
printf("Device name : %s\n",str_buffer);
In case anyone has the same issue: I found the solution. The problem is that the author of the tutorial set up the work-groups with these lines:
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,&global_item_size, &local_item_size, 0, NULL, NULL);
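You can get undefined behavior if LIST_SIZE (the global work size) is not a multiple of local_item_size. With LIST_SIZE = 10 and local_item_size = 64, clEnqueueNDRangeKernel typically fails with CL_INVALID_WORK_GROUP_SIZE, which goes unnoticed here because ret is never checked.
So you can either pass NULL instead of &local_item_size, or choose a multiple of 64 (64, 128, ...) for LIST_SIZE.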

OpenCL program works only for the multiple of itemsize

I'm new to OpenCL programming, and this is the problem I'm facing while executing a simple vector addition.
I have the following kernel code
#include <CL/cl.hpp>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
I have integrated gpu and amd gpus on my system. I'm trying to perform vector addition on my intel gpu and for which I have installed the intel opencl drivers (i7 3rd gen processor with hd graphics).
I have the below openCL code
std::vector<cl::Platform> platforms;
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
int i;
const int LIST_SIZE = 50;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id* platforms1 = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, platforms1, &ret_num_platforms);
platforms1= (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);
clGetPlatformIDs(ret_num_platforms, platforms1, NULL);
* Platform 0: Intel Graphics
* Platform 1 : AMD Graphics
ret = clGetDeviceIDs( platforms1[0], CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 16; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
return 0;
If LIST_SIZE is 50, it prints results only up to 48, that is 16*3. It only handles a multiple of local_item_size (not all of LIST_SIZE), and I'm not able to figure out why.
OpenCL kernels execute only for a multiple of the local thread block size (the local range, local_item_size in your code), which should not be smaller than 32 and must be a multiple of 2 (so it can be 32, 64, 128, 256, ...). If you set it to 16, half of the GPU will be idle at any time. global_item_size must be a multiple of local_item_size. You need at least 32 data items for the kernel to function and a lot more for it to yield good performance.
Also the part
#include <CL/cl.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
is not OpenCL C code and does not belong in the .cl source file. If it is not too lengthy, you can write the OpenCL C code directly in the .cpp file as a raw string:
const string kernel_code = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
char* source_str = kernel_code.c_str();

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (let's name it "+") with the associative property. Then you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I am learning OpenCL and trying to implement this approach to sum all the elements of an array. I am a total newbie in this technology, so the program might look somewhat weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
size_t gl = get_global_id (0);
size_t s = get_local_size (0);
int i;
float accum = 0;
for (i=0; i<s; i++) {
accum += input[s*gl+i];
output[gl] = accum;
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
/*
 * Return the current wall-clock time in seconds, with microsecond
 * resolution, as reported by gettimeofday().
 *
 * Returns 0.0 if gettimeofday() fails, so the caller never reads an
 * uninitialised timeval.
 */
double gettime (void)
{
    struct timeval tv;

    if (gettimeofday (&tv, NULL) != 0) {
        return 0.0;
    }
    return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
size = size/64;
tmp = output;
output = input;
input = tmp;
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
free (array);
free (array2);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
So I re-run the kernel until there is only one element left in the buffer. Is this the correct approach to computing the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
for(; event_index > 0; event_index--)
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

openCL Long Overflowing

Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
__kernel void collatz(__global int* in, __global int* out)
uint id = get_global_id(0);
unsigned long n = (unsigned long)id;
uint count = 0;
while (n > 1) {
if (n % 2 == 0) {
n = n / 2;
} else {
if(n == 1572066143) {
unsigned long test = n;
printf("BEFORE - %lu\n", n);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
n = (3 * n) + 1;
} else {
n = (3 * n) + 1;
count = count + 1;
out[id] = count;
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host-side and device-side values have different sizes. On the host, long can vary from 32 to 64 bits, depending on the platform. On the device, long is always 64 bits.
The printf() function, as defined in C, uses %ld to print long (host-side long) numbers. You are using printf in a kernel, so it could be that a C-like parser is used, and the variable is therefore printed as a 32-bit long.
Can you try printing it as %lld or as a floating point?

How to use clCreateProgramWithBinary in OpenCL?

I'm trying to just get a basic program to work using clCreateProgramWithBinary. This is so I know how to use it rather than a "true" application.
I see that one of the parameters is a list of binaries. How exactly would I go about creating a binary to test with? I have some test code which creates a program from source, builds and enqueues it. Is there a binary created at some point during this process which I can feed into clCreateProgramWithBinary?
Here is some of my code, just to give an idea of my overall flow. I've omitted comments and error checks for simplicity.
program = clCreateProgramWithSource(clctx, 1, &dumbkernelsource, NULL, &errcode);
errcode = clBuildProgram(program, env->num_devices, env->device, NULL, NULL, NULL);
mykernel = clCreateKernel(program, "flops", &errcode);
errcode = clGetKernelWorkGroupInfo(mykernel, *(env->device), CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
global = num_workgroups * local;
errcode = clEnqueueNDRangeKernel(commands, mykernel, 1, NULL, &global, &local, 0, NULL, NULL);
After you compile your program, you can get its binary code with clGetProgramInfo, and then save it to a file.
Example code (not tried to compile, but should be something along these lines):
program = clCreateProgramWithSource(clctx, 1, &dumbkernelsource, NULL, &errcode);
errcode = clBuildProgram(program, env->num_devices, env->device, NULL, NULL, NULL);
int number_of_binaries;
char **binary;
int *binary_sizes;
errcode = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, NULL, 0, &number_of_binaries);
binary_sizes = new int[number_of_binaries];
binary = new char*[number_of_binaries];
errcode = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, binary_sizes, number_of_binaries*sizeof(int), &number_of_binaries);
for (int i = 0; i < number_of_binaries; ++i) binary[i] = new char[binary_sizes[i]];
errcode = clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary, number_of_binaries*sizeof(char*), &number_of_binaries);
Minimal runnable example
Compile the embedded vector increment shader from CL C source, save the binary to a.bin, load the binary shader, and run it:
Assertions are done at the end of the program.
Ignore the CL C shader, load binary from a.bin, and run it:
./a.out 0
Compile and run with:
gcc -ggdb3 -std=c99 -Wall -Wextra a.c -lOpenCL && ./a.out
Tested in Ubuntu 16.10, NVIDIA NVS5400, driver 375.39.
GitHub upstream:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
const char *source =
"__kernel void kmain(__global int *out) {\n"
" out[get_global_id(0)]++;\n"
#define BIN_PATH "a.bin"
/*
 * Read the entire file at `path` into a newly allocated buffer.
 *
 * path:       file to read; opened in binary mode so the bytes are returned
 *             unmodified on every platform.
 * length_out: if non-NULL, receives the number of bytes read on success.
 *
 * Returns a malloc'd buffer the caller must free(), or NULL if the file
 * cannot be measured, allocated for, or fully read. The buffer is not
 * NUL-terminated. An unopenable file trips the assert, as before; in NDEBUG
 * builds it returns NULL instead.
 */
char* common_read_file(const char *path, long *length_out) {
    char *buffer = NULL;
    FILE *f;
    long length;

    f = fopen(path, "rb");
    assert(NULL != f);
    if (NULL == f) {
        return NULL;
    }
    if (fseek(f, 0, SEEK_END) != 0) {
        goto fail;
    }
    length = ftell(f);
    if (length < 0) {
        goto fail;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        goto fail;
    }
    /* Request at least one byte so an empty file still yields a non-NULL buffer. */
    buffer = malloc(length > 0 ? (size_t)length : 1);
    if (NULL == buffer) {
        goto fail;
    }
    if (fread(buffer, 1, (size_t)length, f) < (size_t)length) {
        goto fail;
    }
    fclose(f);
    if (NULL != length_out) {
        *length_out = length;
    }
    return buffer;

fail:
    /* Release everything acquired so far; free(NULL) is a no-op. */
    free(buffer);
    fclose(f);
    return NULL;
}
int main(int argc, char **argv) {
FILE *f;
char *binary;
cl_command_queue command_queue;
cl_context context;
cl_device_id device;
cl_int input[] = {1, 2}, errcode_ret, binary_status;
cl_kernel kernel, binary_kernel;
cl_mem buffer;
cl_platform_id platform;
cl_program program, binary_program;
const size_t global_work_size = sizeof(input) / sizeof(input[0]);
int use_cache;
long lenght;
size_t binary_size;
if (argc > 1) {
use_cache = !strcmp(argv[1], "0");
} else {
use_cache = 0;
/* Get the binary, and create a kernel with it. */
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
command_queue = clCreateCommandQueue(context, device, 0, NULL);
if (use_cache) {
binary = common_read_file(BIN_PATH, &lenght);
binary_size = lenght;
} else {
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
clBuildProgram(program, 1, &device, "", NULL, NULL);
kernel = clCreateKernel(program, "kmain", NULL);
clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
binary = malloc(binary_size);
clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size, &binary, NULL);
f = fopen(BIN_PATH, "w");
fwrite(binary, binary_size, 1, f);
binary_program = clCreateProgramWithBinary(
context, 1, &device, &binary_size,
(const unsigned char **)&binary, &binary_status, &errcode_ret
clBuildProgram(binary_program, 1, &device, NULL, NULL, NULL);
binary_kernel = clCreateKernel(binary_program, "kmain", &errcode_ret);
/* Run the kernel created from the binary. */
buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
clSetKernelArg(binary_kernel, 0, sizeof(buffer), &buffer);
clEnqueueNDRangeKernel(command_queue, binary_kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
/* Assertions. */
assert(input[0] == 2);
assert(input[1] == 3);
/* Cleanup. */
I highly recommend cat a.bin, which contains human readable (and editable) PTX assembly for this implementation.
The official OpenCL Programming Guide book has a nice example of this. There's also a Google code project, opencl-book-samples, which includes the code from the book. The example you're looking for is here.
