OpenCL clCreateKernel throws CL_INVALID_PROGRAM_EXECUTABLE - opencl

I'm new to OpenCL and have a problem with clCreateKernel: it fails with CL_INVALID_PROGRAM_EXECUTABLE. Could anybody help? The code is based on the last optimization in http://www.cs.bris.ac.uk/home/simonm/workshops/OpenCL_lecture3.pdf .
Here is the code:
#define ORDER 10 // Order of the square matrices A, B, and C
#define AVAL 3.0 // A elements are constant and equal to AVAL
#define BVAL 5.0 // B elements are constant and equal to BVAL
#define TOL (0.001) // tolerance used in floating point comparisons
#define DIM 2 // Max dim for NDRange
#define COUNT 1 // number of times to do each multiplication
#define SUCCESS 1
#define FAILURE 0
// Helper functions
/*
 * Fill the three row-major matrices with their initial values.
 *
 *   A is Ndim x Pdim and is filled with AVAL,
 *   B is Pdim x Mdim and is filled with BVAL,
 *   C is Ndim x Mdim and is zeroed.
 *
 * Each buffer must hold at least rows * columns floats. The row stride used
 * for indexing is the number of columns of the matrix being written, so every
 * element is touched exactly once even when the dimensions differ.
 */
void initmat(int Mdim, int Ndim, int Pdim, float *A, float *B, float *C)
{
    int i, j;

    /* A: Ndim rows of Pdim columns */
    for (i = 0; i < Ndim; i++)
        for (j = 0; j < Pdim; j++)
            A[i*Pdim+j] = AVAL;

    /* B: Pdim rows of Mdim columns */
    for (i = 0; i < Pdim; i++)
        for (j = 0; j < Mdim; j++)
            B[i*Mdim+j] = BVAL;

    /* C: Ndim rows of Mdim columns */
    for (i = 0; i < Ndim; i++)
        for (j = 0; j < Mdim; j++)
            C[i*Mdim+j] = 0.0f;
}
/*
 * Read the whole of "kernel.cl" (looked up relative to the current working
 * directory) into a freshly allocated, NUL-terminated buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL when the file cannot be opened, its length cannot be
 * determined, or the buffer cannot be allocated. An empty file yields an
 * empty string, not NULL.
 */
char * readKernel(void)
{
    FILE *fp = fopen("kernel.cl", "r");
    if (fp == NULL)
    {
        printf("Cannot Open Kernel.cl\n");
        return NULL;
    }
    printf("Kernel.cl Opened\n");

    /* Measure the file by seeking to its end, then rewind for reading. */
    long file_size = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        file_size = ftell(fp);
    }
    if (file_size < 0 || fseek(fp, 0, SEEK_SET) != 0)
    {
        printf("Cannot determine Kernel.cl length\n");
        fclose(fp);
        return NULL;
    }

    size_t source_length = (size_t) file_size;
    if (source_length == 0)
    {
        printf("Kernel.cl is empty\n");
    }
    else
    {
        printf("Kernel.cl length: %zu bytes\n", source_length);
    }

    /* One extra byte so the result is always a valid C string. */
    char *source = calloc(source_length + 1, 1);
    if (source == NULL)
    {
        printf("Memory allocation failed");
        fclose(fp);
        return NULL;
    }

    /* fread may deliver fewer bytes than requested; terminate at what arrived. */
    size_t bytes_read = fread(source, 1, source_length, fp);
    source[bytes_read] = '\0';
    fclose(fp);
    printf("Kernel.cl Read\n");
    return source;
}
/*
 * Host program: multiplies two ORDER x ORDER constant matrices on the first
 * GPU of the first OpenCL platform, using a kernel that keeps one row of A in
 * private memory and one column of B in local memory.
 *
 * Every OpenCL step prints a status line; failures are reported but, as in
 * the original flow, execution continues so that later messages are visible.
 */
int main(int argc, char **argv)
{
    // Declare and initialise data
    float *A, *B, *C;
    int Mdim, Ndim, Pdim;
    int err, szA, szB, szC;
    size_t global[DIM];
    size_t local[DIM];
    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program;
    cl_kernel kernel;
    cl_mem a_in, b_in, c_out;
    Ndim = ORDER;
    Pdim = ORDER;
    Mdim = ORDER;
    szA = Ndim*Pdim;
    szB = Pdim*Mdim;
    szC = Ndim*Mdim;
    A = (float *)malloc(szA*sizeof(float));
    B = (float *)malloc(szB*sizeof(float));
    C = (float *)malloc(szC*sizeof(float));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Memory allocation failed\n");
        free(A);
        free(B);
        free(C);
        return 1;
    }
    // Kernel source. Each work-item computes one row of C.
    //  - The j loop now has its closing brace; without it the program does
    //    not compile and clCreateKernel reports CL_INVALID_PROGRAM_EXECUTABLE.
    //  - C is assigned rather than accumulated, because c_out is a write-only
    //    buffer that is never initialised on the device.
    //  - A second barrier keeps fast work-items from overwriting Bwrk for the
    //    next column while slower ones are still reading the current one.
    const char* C_elem_KernelSource =
        "__kernel \n"
        "void mmul( \n"
        " const int Mdim, \n"
        " const int Ndim, \n"
        " const int Pdim, \n"
        " __global float* A, \n"
        " __global float* B, \n"
        " __global float* C, \n"
        " __local float* Bwrk) \n"
        "{ \n"
        " int k,j; \n"
        " int i = get_global_id(0); \n"
        " int iloc = get_local_id(0); \n"
        " int nloc = get_local_size(0); \n"
        " float Awrk[10]; \n"
        " float tmp; \n"
        " for (k=0; k<Pdim; k++) \n"
        "  Awrk[k] = A[i*Ndim+k]; \n"
        " for (j=0; j<Mdim; j++){ \n"
        "  for (k=iloc; k<Pdim; k=k+nloc) \n"
        "   Bwrk[k] = B[k*Pdim+j]; \n"
        "  barrier(CLK_LOCAL_MEM_FENCE); \n"
        "  tmp = 0.0f; \n"
        "  for (k=0; k<Pdim; k++) \n"
        "   tmp += Awrk[k] * Bwrk[k]; \n"
        "  C[i*Ndim+j] = tmp; \n"
        "  barrier(CLK_LOCAL_MEM_FENCE); \n"
        " } \n"
        "} \n"
        ;
    initmat(Mdim, Ndim, Pdim, A, B, C);
    // Set up the platform
    cl_uint num_platforms;
    if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS)
    {
        printf("Unable to get platform!\n");
    }else{
        printf("Plataformas Disponibles: %u \n", num_platforms);
    }
    // Platform identifier: take the first platform reported
    cl_platform_id platform_id;
    clGetPlatformIDs(1, &platform_id, &num_platforms);
    printf("Plataformas creada\n");
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err==CL_SUCCESS){
        printf("Device creado \n");
    }else {
        printf("Error %d \n", err);
    }
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
    if (err==CL_SUCCESS){
        printf("Contexto creado \n");
    }else {
        printf("Error creando contexto \n");
    }
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (err==CL_SUCCESS){
        printf("cola de comandos creadas \n");
    }else {
        printf("Error creando cola de comandos \n");
    }
    // Set up buffers and write the A and B matrices to device memory
    a_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szA, NULL, NULL);
    b_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szB, NULL, NULL);
    c_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
    err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
    // Build the program, define the kernel and set up its arguments
    program = clCreateProgramWithSource(context, 1, (const char **) &C_elem_KernelSource, NULL, &err);
    if (err==CL_SUCCESS){
        printf("programa creado \n");
    }else {
        printf("Error generado %d creando programa\n", err);
    }
    // Compile the program for the chosen device. The result must be stored in
    // err, otherwise the check below tests the stale value left by
    // clCreateProgramWithSource and a failed build goes unnoticed.
    // (On failure, clGetProgramBuildInfo can be queried for the build log.)
    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL );
    if (err==CL_SUCCESS){
        printf("programa compilado 1\n");
    }else {
        printf("Error generado %d compilando programa 1\n", err);
    }
    kernel = clCreateKernel(program, "mmul", &err);
    if (err==CL_SUCCESS){
        printf("Kernel creado \n");
    }else {
        printf("Error generado %d creando kernel\n", err);
    }
    err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
    err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
    err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
    err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
    err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
    // Argument 6 is the __local scratch column: a size with a NULL pointer
    err |= clSetKernelArg(kernel, 6, sizeof(float)*Pdim, NULL);
    if (err==CL_SUCCESS){
        printf("Argumentos del Kernel configurados \n");
    }else {
        printf("Error configurando argumentos del kernel \n");
    }
    // Run the kernel and collect the results.
    // The launch is one-dimensional: Ndim work-items (one per row of C) in
    // work-groups of 2. global[1] is filled in but not used by a 1-D launch.
    printf("Encolando Kernel:\n");
    global[0] = (size_t) Ndim; global[1] = (size_t) Mdim; local[0] = (size_t) 2;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, NULL);
    if (err==CL_SUCCESS){
        printf("Kernel enviado a device \n");
    }else {
        printf("Error enviando kernel a device \n");
    }
    clFinish(commands);
    err = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, NULL, NULL );
    //test_results(A, B, c_out);
    free(A);
    free(B);
    free(C);
    return 0;
}
Thanks

The main problem is that the open brace on line 112 has no matching closing brace:
" for (j=0; j<Mdim; j++){ \n"
Also note that the pointer declared on line 34 is used without initialization:
size_t *source_length;
On line 170, an err= should be added to the clBuildProgram() call so that the error checking works as intended. Then you can add logic to use clGetProgramBuildInfo() to get details in the case of a build fail.

Related

openCL wrong result

Could somebody explain to me why this happens?
I thought it would increment every value in my array.
#include <iostream>
#pragma comment(lib, "OpenCL.lib")
#include <CL/cl.h>
// OpenCL C source of the single kernel "add": each work-item increments the
// element of c selected by its global id.
const std::string source_str = R"(
__kernel void add(__global int* c) {
int i = get_global_id(0);
c[i]=c[i]+1;
})";
// Length of the kernel source in bytes.
size_t source_size = source_str.length();
// OpenCL handles shared by init() and KernelStart().
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
// Status of the most recent OpenCL call.
cl_int ret;
cl_context context;
cl_command_queue command_queue;
// Device buffer that holds the array while the kernel runs.
cl_mem a_mem_obj;
cl_program program;
cl_kernel kernel;
// Host array; allocated in main() with SIZE elements.
int* a;
// Number of elements in the host array and the device buffer.
#define SIZE 100
// ## You may add your own initialization routines here ##
// Creates the OpenCL objects stored in the file-level globals: platform,
// GPU device, context, command queue, device buffer, program and kernel.
// After every call a non-success status is printed followed by a step number,
// with no separator (so "-4913" means status -49 at step 13).
void init() {
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (ret != CL_SUCCESS)
        std::cout << ret << 1;
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1,
        &device_id, &ret_num_devices);
    if (ret != CL_SUCCESS)
        std::cout << ret << 2;
    // Create an OpenCL context
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS)
        std::cout << ret << 3;
    // Create a command queue
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != CL_SUCCESS)
        std::cout << ret << 4;
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
        SIZE * sizeof(int), NULL, &ret);
    if (ret != CL_SUCCESS)
        std::cout << ret << 6;
    // Create a program from the kernel source.
    // The API expects an array of C strings; the address of a std::string
    // object is not one, so take the character data explicitly.
    const char* source_text = source_str.c_str();
    program = clCreateProgramWithSource(context, 1,
        &source_text, &source_size, &ret);
    if (ret != CL_SUCCESS)
        std::cout << ret << 9;
    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS)
        std::cout << ret << 10;
    // Create the OpenCL kernel
    kernel = clCreateKernel(program, "add", &ret);
    if (ret != CL_SUCCESS)
        std::cout << ret << 11;
    // Set the arguments of the kernel. Argument indices start at 0 and the
    // kernel has a single parameter, so 0 is the only valid index.
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    if (ret != CL_SUCCESS)
        std::cout << ret << 13;
}
// Uploads the host array, runs the kernel over all SIZE elements and reads
// the result back into the same host array. A non-success status is printed
// followed by a step number, with no separator.
void KernelStart() {
    // Copy to the memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
        SIZE * sizeof(int), a, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
        std::cout << ret << 7;
    // Execute the OpenCL kernel on the list
    size_t static global_item_size = SIZE; // Process the entire list
    // An explicit work-group size must divide the global size evenly; 64 does
    // not divide 100 and is rejected with CL_INVALID_WORK_GROUP_SIZE. Passing
    // NULL lets the implementation choose a valid work-group size.
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
        &global_item_size, NULL, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
        std::cout << ret << 14;
    ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
        SIZE * sizeof(int), a, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
        std::cout << ret << 15;
}
// Fills the global array with ones, prints it, runs the kernel through
// init() and KernelStart(), then prints the array again.
int main() {
    // Writes every element back-to-back, with no separator.
    auto dump = []() {
        for (int* p = a; p != a + SIZE; ++p)
            std::cout << *p;
    };

    a = new int[SIZE];
    for (int* p = a; p != a + SIZE; ++p)
        *p = 1;

    dump();
    std::cout << std::endl;

    init();
    KernelStart();

    dump();
}
result:
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
-4913-54141111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
Your logging already shows you what's going on, albeit not particularly readably.
First Problem
Your program's output:
-4913
The corresponding code:
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&a_mem_obj);
if (ret != CL_SUCCESS)
std::cout << ret << 13;
-49 is CL_INVALID_ARG_INDEX. Kernel arguments are numbered from 0, your kernel has 1 argument, so the only valid index is 0.
Second Problem
Your program's output:
-5414
The code:
#define SIZE 100
…
// Execute the OpenCL kernel on the list
size_t static global_item_size = SIZE; // Process the entire lists
size_t static local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 14;
-54 corresponds to CL_INVALID_WORK_GROUP_SIZE.
There are 3 possible reasons specified:
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size or does not match the work-group size specified for kernel using the __attribute__((reqd_work_group_size(X, Y, Z))) qualifier in program source.
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of work-items in the work-group computed as local_work_size[0] *... local_work_size[work_dim - 1] is greater than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in the table of OpenCL Device Queries for clGetDeviceInfo.
CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the __attribute__((reqd_work_group_size(X, Y, Z))) qualifier is used to declare the work-group size for kernel in the program source.
Your local size is 64, your global size is 100. This means you're running into the first condition: you'll need to make sure your global size is an integer multiple of the local size.

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (let's name it "+") with the associative property. Then you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I am learning OpenCL and trying to implement this approach to sum all the elements of an array. I am a total newbie in this technology, so the program might look somewhat weird.
This is the kernel:
/*
 * Partial sum: each work-item adds up a run of consecutive input elements and
 * stores the total at its own global index in output. The run length equals
 * the work-group size, and the run for work-item g starts at g * run length.
 */
__kernel void reduce (__global float *input, __global float *output)
{
    const size_t item = get_global_id(0);
    const size_t chunk = get_local_size(0);
    __global const float *run = input + chunk * item;

    float total = 0;
    for (size_t k = 0; k < chunk; ++k)
        total += run[k];

    output[item] = total;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
/* Current time of day from gettimeofday(), as seconds with a microsecond
 * fraction, in a double. */
double gettime ()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = (double) now.tv_sec;
    const double fraction = 0.000001 * (double) now.tv_usec;
    return whole_seconds + fraction;
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run the kernel until there is only one element left in the buffer. Is this the correct approach to compute the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
// Count how many kernel launches the loop below will perform (size is
// divided by 64 per pass) so that one event slot can be allocated per launch.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// The first launch has no predecessor; every later launch waits on the
// event of the launch before it and records its own event.
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
// Swap the buffers: this pass's output is the next pass's input.
tmp = output;
output = input;
input = tmp;
event_index++;
}
// Single host synchronisation point, after all passes have been enqueued.
clFinish(command_queue);
// Release every event recorded above, last one first.
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
// Blocking read of the first element of the last-written buffer into array.
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

OpenCL: Can one kernel call the other kernel

Hi ,
I am trying to run the available convolution code in OpenCL.
I am having heterogeneous system with -
1) CPU
2) GPU
PFB my code base which is running in my system :
convolution.cl
// TODO: Add OpenCL kernel code here.
/*
 * 2-D convolution without padding: the work-item at (x, y) multiplies the
 * maskWidth x maskWidth window of input whose top-left corner is (x, y) by
 * the mask, element by element, and stores the sum at (x, y) in output.
 * Input rows are inputWidth elements long; output rows are as long as the
 * global size in dimension 0.
 */
__kernel
void convolve(
    const __global uint * const input,
    __constant uint * const mask,
    __global uint * const output,
    const int inputWidth,
    const int maskWidth)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    uint acc = 0;
    for (int dy = 0; dy < maskWidth; ++dy) {
        /* Start of the window's current row in the input and in the mask. */
        const __global uint *inRow = input + (row + dy) * inputWidth + col;
        __constant uint *maskRow = mask + dy * maskWidth;
        for (int dx = 0; dx < maskWidth; ++dx)
            acc += maskRow[dx] * inRow[dx];
    }

    output[row * get_global_size(0) + col] = acc;
}
and convolution.cpp -
//Convolution-Process of applying a 3×3 mask to an 8×8 input signal,resulting in a 6×6 output signal
#include "CL/cl.h"
#include "vector"
#include "iostream"
#include "time.h"
#include <fstream>
#include <sstream>
#include <string>
using namespace std;
// Constants
// Constants
// Input signal: 8 x 8 unsigned values. The array is declared
// [width][height]; both extents are 8 here.
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
{3, 1, 1, 4, 8, 2, 1, 3},
{4, 2, 1, 1, 2, 1, 2, 3},
{4, 4, 4, 4, 3, 2, 2, 2},
{9, 8, 3, 8, 9, 0, 0, 0},
{9, 3, 3, 9, 0, 0, 0, 0},
{0, 9, 0, 8, 0, 0, 0, 0},
{3, 0, 8, 8, 9, 4, 4, 4},
{5, 9, 8, 1, 8, 1, 1, 1}
};
// Output signal: 6 x 6, i.e. the input extent minus the mask extent plus one.
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];
// 3 x 3 mask: ones everywhere except the centre, which is zero.
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskWidth][maskHeight] =
{
{1, 1, 1},
{1, 0, 1},
{1, 1, 1},
};
// Terminates the program with EXIT_FAILURE when err is not CL_SUCCESS, after
// writing the name of the failed step and the status code to standard error.
// Returns normally when err is CL_SUCCESS.
inline void checkErr(cl_int err, const char * name)
{
    if (err == CL_SUCCESS)
        return;

    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
    exit(EXIT_FAILURE);
}
// Error callback registered with clCreateContext: prints the error text it is
// given and terminates the program. The other three parameters are not used.
void CL_CALLBACK contextCallback(
    const char * errInfo,
    const void * private_info,
    size_t cb,
    void * user_data)
{
    (void)private_info;
    (void)cb;
    (void)user_data;

    std::cout << "Error occurred during context use: ";
    std::cout << errInfo << std::endl;
    exit(EXIT_FAILURE);
}
// Host program: applies the 3x3 mask to the 8x8 input signal on the first
// platform that offers a GPU, using the "convolve" kernel from
// convolution.cl, then prints the elapsed time and the 6x6 result.
// Any failing OpenCL call ends the program through checkErr().
int main(int argc, char *argv[]){
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices = 0;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    // Enumerate the platforms: first the count, then the ids.
    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS),
        "clGetPlatformIDs");
    platformIDs = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    // Use the GPU devices of the first platform that has any.
    deviceIDs = NULL;
    cl_uint i;
    for (i = 0; i < numPlatforms; i++)
    {
        // Reset the count so a platform without GPUs cannot leave a stale
        // or indeterminate value behind.
        numDevices = 0;
        errNum = clGetDeviceIDs(
            platformIDs[i],
            CL_DEVICE_TYPE_GPU,
            0,
            NULL,
            &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        }
        else if (errNum == CL_SUCCESS && numDevices > 0)
        {
            deviceIDs = (cl_device_id *)malloc(
                sizeof(cl_device_id) * numDevices);
            errNum = clGetDeviceIDs(
                platformIDs[i],
                CL_DEVICE_TYPE_GPU,
                numDevices,
                &deviceIDs[0],
                NULL);
            checkErr(errNum, "clGetDeviceIDs");
            break;
        }
    }
    if (deviceIDs == NULL) {
        // The search above is for GPU devices, so say so.
        std::cout << "No GPU device found" << std::endl;
        exit(-1);
    }

    // i still indexes the platform that supplied the devices.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,(cl_context_properties)platformIDs[i], 0
    };
    context = clCreateContext(
        contextProperties, numDevices, deviceIDs,
        &contextCallback, NULL, &errNum);
    checkErr(errNum, "clCreateContext");

    // Load and build the kernel source.
    std::ifstream srcFile("convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1,
        "reading convolution.cl");
    std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));
    const char * src = srcProg.c_str();
    size_t length = srcProg.length();
    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
    checkErr(errNum, "clCreateProgramWithSource");
    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
    checkErr(errNum, "clBuildProgram");
    kernel = clCreateKernel(program, "convolve", &errNum);
    checkErr(errNum, "clCreateKernel");

    // Device buffers: input and mask are copied from host memory at creation.
    inputSignalBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
        static_cast<void *>(inputSignal), &errNum);
    checkErr(errNum, "clCreateBuffer(inputSignal)");
    maskBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * maskHeight * maskWidth,
        static_cast<void *>(mask), &errNum);
    checkErr(errNum, "clCreateBuffer(mask)");
    outputSignalBuffer = clCreateBuffer(
        context, CL_MEM_WRITE_ONLY,
        sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
        NULL, &errNum);
    checkErr(errNum, "clCreateBuffer(outputSignal)");
    queue = clCreateCommandQueue(
        context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(
        kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(
        kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(
        kernel, 4, sizeof(cl_uint), &maskWidth);
    checkErr(errNum, "clSetKernelArg");

    // The kernel reads get_global_id(0) as x and get_global_id(1) as y, so
    // the launch has to be two-dimensional: one work-item per output element.
    // (In a 1-D launch get_global_id(1) is always 0.)
    const size_t globalWorkSize[2] = { outputSignalWidth, outputSignalHeight };
    const size_t localWorkSize[2] = { 1, 1 };
    const clock_t start = clock();
    errNum = clEnqueueNDRangeKernel(
        queue,
        kernel,
        2,
        NULL,
        globalWorkSize,
        localWorkSize,
        0,
        NULL,
        NULL
        );
    checkErr(errNum, "clEnqueueNDRangeKernel");
    // Blocking read: returns once the kernel has finished and the data is back.
    errNum = clEnqueueReadBuffer(
        queue, outputSignalBuffer, CL_TRUE, 0,
        sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
        outputSignal, 0, NULL, NULL);
    checkErr(errNum, "clEnqueueReadBuffer");
    // Elapsed processor time for launch plus read-back, in milliseconds.
    const double elapsedMs =
        1000.0 * static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    std::cout << "Time in ms = " << elapsedMs << std::endl;

    // The kernel stores element (x, y) at y * width + x, so the row index
    // comes first when the flat buffer is viewed as a 2-D array.
    for (unsigned int y = 0; y < outputSignalHeight; y++)
    {
        for (unsigned int x = 0; x < outputSignalWidth; x++)
        {
            std::cout << outputSignal[y][x] << " ";
        }
        std::cout << std::endl;
    }

    free(deviceIDs);
    free(platformIDs);
    return 0;
}
Questions :
I am having below doubts-
1) When I use the device type CL_DEVICE_TYPE_GPU,
I get an execution time of 267 ms. When I use CL_DEVICE_TYPE_CPU, the execution time changes to 467 ms.
I want to know what the difference is between running the convolution code on a CPU without a GPU and on a CPU with a GPU (by selecting the device type CL_DEVICE_TYPE_CPU).
2) In the convolution.cl file there is a for loop that executes 3 times. Can I call another kernel from this kernel to do that operation?
I am asking because I am new to OpenCL coding and want to understand this.
Both CPU & GPU are OpenCL Devices. So, by choosing CL_DEVICE_TYPE_CPU, you are telling OpenCL runtime to compile kernel code to CPU assembler & run it on CPU. When you are choosing CL_DEVICE_TYPE_GPU, kernel code is compiled to GPU assembler & executed on your video card. Ability to change device type without re-writing source code is of the main OpenCL features. It doesn't matter, does your CPU have integrated GPU, and / or discrete GPU is installed, you just pick available Device & run kernel on it.
For OpenCL 1.2 & older you can't call kernel from kernel. Dynamic parallelism is implemented in OpenCL 2.0.
For the first question: you should vectorize the kernel so opencl can easily use SIMD feature of your CPU hence unlock 4x(or 8x) more compute units per core.
/*
 * Vectorised variant of the convolve kernel from the answer: the buffers are
 * accessed through uint8 pointers, so every load, multiply, add and store
 * operates on a vector of 8 unsigned values.
 *
 * NOTE(review): indexing a uint8 pointer moves in steps of one whole
 * 8-element vector, so input[idxIntmp + c] and mask[(r * maskWidth) + c] do
 * not address neighbouring scalars as they do in the scalar kernel. The host
 * data layout, inputWidth and the launch size presumably have to be adapted
 * to match - confirm before using this as a drop-in replacement.
 */
__kernel
void convolve(
const __global uint8 * const input, // uint8 fits AVX(AVX2?) and uint4 fits SSE(SSE3?)
__constant uint8 * const mask,
__global uint8 * const output,
const int inputWidth,
const int maskWidth){
const int x = get_global_id(0); // this is 1/8 size now
const int y = get_global_id(1); // this is 1/8 size now
uint8 sum = 0; // a vector of 8 unsigneds, every lane starts at 0
for (int r = 0; r < maskWidth; r++)
{
// index (in uint8 units) of the first element of window row r
const int idxIntmp = (y + r) * inputWidth + x;
for (int c = 0; c < maskWidth; c++)
{
sum += mask[(r * maskWidth) + c] * input[idxIntmp + c]; //8 issued per clock
// scalars get promoted when used in direct multiplication of addition.
}
}
// one 8-wide result per work-item; rows are get_global_size(0) vectors long
output[y * get_global_size(0) + x] = sum;
}
Don't forget to reduce the total number of work-items to 1/8 of the original (example: from 8k threads to 1k threads).
Then increase the work per thread, for example to 50 convolutions per thread, to raise the occupancy of the compute units, and work on local-memory optimizations (for the GPU) to get even better results, such as 5 ms per kernel.
On my AVX capable CPU, a simple matrix multiplication got speed up ratio of 2.4X going for 8-element vectorizations like this.
Running a kernel 3 times is not an issue if you offload enough work on it. If not, you should concatenate multiple kernels into a single one using some tricky algorithm.
If a profiler is not available at the moment, you can check GPU/CPU temperatures to get some idea of how close you are to the limits of hardware.
Play with number of local threads per work group. This can change performance as it lets more or less registers to be used per thread.

openCL simple add vector returns gargabe values

Here is my attempt to write a opencl code to add 2 vectors
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
//24/12
//data structure platform, device, context,program, kernel, command queue
void main()
{
/////////////////////////////////////////////////////////////////////
//PLATFORM QUERY:
/////////////////////////////////////////////////////////////////////
//clGetPlatformIDs(num_entries, platforms, &num_platforms);
// two part: platform = NULL
// malloc and get platforms*
cl_uint num_platforms; //must be uint
cl_platform_id *platforms;
clGetPlatformIDs(5, NULL, &num_platforms);
printf("There are %d platforms \n", num_platforms);
platforms = (cl_platform_id*) malloc (num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(5, platforms, &num_platforms);
for(int i = 0; i < num_platforms; i++)
{
char name[40],vendor[40],version[40], profile[40],extensions[4096];
clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME, sizeof(name), &name, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(vendor), &version, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE, sizeof(vendor), &profile, NULL);
//clGetPlatformInfo(platforms[i],CL_PLATFORM_EXTENSIONS, sizeof(vendor), &extensions, NULL);
printf("Platform %d \n", i);
printf("Name %s \n", name);
printf("Vendor %s \n", vendor);
printf("Version %s \n", version);
printf("Profile %s \n", profile);
//printf("Extension %s \n", extensions);
printf("----------------------------------\n");
}
////////////////////////////////////////////////////////////////
//DEVICES QUERYING
////////////////////////////////////////////////////////////////
cl_device_id* devices;
cl_uint num_devices;
cl_device_fp_config flag ;
for(int i= 0; i< num_platforms; i++)
{
printf("Platform %d has:\n",i);
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 3, NULL, &num_devices);
devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
char name[40];
for(int j=0; j < num_devices; j++)
{
int err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(name),name,NULL);
if (err<0)
{
//printf("Error querying devices name\n");
}
else
{
printf("Device name %s \n", name);
}
err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(flag),&flag,NULL);
if (flag & CL_FP_DENORM)
{
printf("This device support denormalized number \n");
}
}
printf("-----------------------------------\n");
}
///////////////////////////////////////////////////////
//CONTEXT QUERYING AND CREATING
////////////////////////////////////////////////////////
//NOTE clCreateContext returns cl_context instead of errors
//REF_COUNT if very important in the future
//create context for GPU
cl_context context;
cl_uint ref_count;
cl_int err;
char name[40];
context= clCreateContext(NULL,1,&devices[0], NULL,NULL,&err);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Original reference count is %d \n",ref_count);
/*clRetainContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Incremented reference count is %d \n",ref_count);
clReleaseContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Decremented reference count is %d \n",ref_count);*/
////////////////////////////////////////////////////////
//Create programme
///////////////////////////////////////////////////////
size_t program_size;
err=0;
cl_program program;
char* program_buffer;
FILE* program_handle = fopen("kernel.cl","r");
//More recommendable than source code???
program_buffer = (char*)malloc(MAX_SOURCE_SIZE);
program_size = fread( program_buffer, 1, MAX_SOURCE_SIZE, program_handle);
fclose( program_handle );
program = clCreateProgramWithSource(context,1,(const char**) &program_buffer,
(size_t*)&program_size, &err);
////////////////////////////////////////////////////////
//Build Program
///////////////////////////////////////////////////////
//const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
char* program_log;
size_t log_size;
err= clBuildProgram(program, 1 , devices, NULL, NULL, NULL);
if(err < 0) //debug , printing log
{
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size+1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,
program_log,NULL);
printf("%s\n",program_log);
free(program_log);
//exit(1);
}
///////////////////////////////////////////////////////////////////////////////////
//create kernel
///////////////////////////////////////////////////////////////////////////////////
cl_uint num_kernels;
cl_kernel kernel;
char kernel_name[40];
kernel = clCreateKernel(program,"add",&err);
if (err<0)
{
perror("could not found any kernels\n");
}
//kernels = (cl_kernel*)malloc(num_kernels*sizeof(cl_kernel));
//clCreateKernelsInProgram(program, num_kernels, kernels, NULL);
///FOR REFERNECE
//for(int i=0; i<num_kernels; i++)
//{
clGetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,sizeof(kernel_name),kernel_name,NULL);
printf("Kernel function: %s \n",kernel_name);
//}
/////////////////////////////////////////////////////
//Create command queue
/////////////////////////////////////////////////////
cl_command_queue queue = clCreateCommandQueue(context, devices[0],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
if (err < 0)
{
printf("Couldn't create command queue \n");
exit(1);
}
clEnqueueTask(queue, kernel, 0, NULL, NULL);//only enqueue
//////////////////////////////////////////
unsigned int n= 1000;
int* h_a;
int* h_b;
int* h_c;
cl_mem d_a;
cl_mem d_b;
cl_mem d_c;
h_a = (int*) malloc(n*sizeof(int));
h_b = (int*) malloc(n*sizeof(int));
h_c = (int*) malloc(n*sizeof(int));
for(int i=0; i< n; i++)
{
h_a[i]= 1;//sinf(i)*sinf(i);
h_b[i]= 1;//cosf(i)*cosf(i);
}
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
err = clEnqueueWriteBuffer(queue,d_a,CL_TRUE,0,sizeof(h_a),h_a,0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue,d_b,CL_TRUE,0,sizeof(h_b),h_a,0, NULL, NULL);
//////set argument
err= clSetKernelArg(kernel,0,sizeof(cl_mem),&d_a);
err= clSetKernelArg(kernel,1,sizeof(cl_mem),&d_b);
err= clSetKernelArg(kernel,2,sizeof(cl_mem),&d_c);
err= clSetKernelArg(kernel,3,sizeof(unsigned int),&n);
///////////////
size_t globalsize, localsize;
localsize=64;
globalsize=ceil(n/(float)localsize)*localsize;
err= clEnqueueNDRangeKernel(queue,kernel,1, NULL,&globalsize,&localsize,0,NULL,NULL);
////////////////////////
clFinish(queue);
err=clEnqueueReadBuffer(queue, d_c,CL_TRUE, 0, sizeof(h_c), h_c, 0 , NULL, NULL);
for(int i = 0; i< n; i++)
{
printf(" h_c[%d] = %d \n", i, h_c[i]);
}
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseKernel(kernel);
free(h_a);
free(h_b);
free(h_c);
getchar();
}
and here is my kernel.cl
/*
 * Element-wise integer addition: c[i] = a[i] + b[i] for every i < n.
 * Each work-item processes the element selected by its global id in
 * dimension 0.
 */
__kernel void add(__global int * a, __global int *b, __global int* c, const unsigned n)
{
    const int gid = get_global_id(0);

    /* Work-items whose id lies past the last element have nothing to do. */
    if (!(gid < n))
        return;

    c[gid] = a[gid] + b[gid];
}
With this, I only receive garbage values, for example h_c[i] = -842150451 for all i.
Please help me to fix it. Thanks!
This statement is not correct :
sizeof(h_a)
Should be something like :
n * sizeof(int)
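Indeed, h_a is just a pointer, so sizeof(h_a) is the size of the pointer itself (sizeof(int*), typically 4 or 8 bytes), not the size of the array it points to => you have space for only one or two items.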

Efficient Repeated calling of NDRangeKernel in OpenCL

I've written the following code. I have a loop which alternates between two kernels, red and black. In each iteration I call clEnqueueReadBuffer, which I think is not efficient. Is there any other way to call the kernels repeatedly in an efficient manner?
Thanks
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl
Utils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Number of elements in the dense matrix A (SQ * SQ = 1024 * 1024). */
#define DATA_SIZE (1048576)
/* NOTE(review): presumably the ns -> ms divisor for profiling timestamps; unused in the visible code. */
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
/* Upper bound on the number of red/black sweeps performed by main(). */
#define LIMIT 100
/* Sentinel stored in newX[0] so the first convergence test cannot succeed.
   Parenthesized so the expansion is safe inside larger expressions. */
#define BIG_RANGE (LIMIT*4*100)
/* Convergence threshold compared against norm(oldX, newX, SQ). */
#define EPS 1e-2
/* Order of the square system: A is SQ x SQ; X and b have SQ entries. */
#define SQ 1024
/* Row-major element access into the flat array A; arguments are
   parenthesized so expressions such as A(i, j+1) index correctly. */
#define A(i,j) A[(i)*SQ+(j)]
using namespace std;

/* OpenCL objects shared by every helper in this file. */
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;                      /* status of the most recent OpenCL call */
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X; /* device buffers for A, b and the iterate X */
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;

/* Options passed to clBuildProgram. */
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
/* Kernel entry points looked up in the built program. The pointers are const
   because string literals must not be bound to non-const char* in C++. */
const char *kernel_names[] = {"Red","Black"};

float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();

/* Host-side copies of the matrix, the two iterate vectors and the right-hand side. */
float *A,*oldX,*newX,*b;
/*
 * Builds a tridiagonal SQ x SQ system on the host (diagonal -2/h^2,
 * super-diagonal 1/h^2 + 1/(2h), sub-diagonal 1/h^2 - 1/(2h), with h = 1/SQ),
 * uploads it, and alternates the "Red" and "Black" kernels until
 * norm(oldX, newX, SQ) drops to EPS or LIMIT sweeps have run.
 *
 * Returns 0 on success; every helper exits the process on failure.
 */
int main(int argc, char** argv) {
    unsigned int count = DATA_SIZE;
    int i,j;
    const float h = 1.0f/SQ;

    A = (float*)malloc(sizeof(float)*count);
    newX = (float*)malloc(sizeof(float)*SQ);
    oldX = (float*)malloc(sizeof(float)*SQ);
    b = (float*)malloc(sizeof(float)*SQ);
    if (!A || !newX || !oldX || !b) {
        printf("Error: Failed to allocate host memory!\n");
        exit(1);
    }

    for (i=0;i<SQ;i++){
        const float x = 0.0f + (i+1)*h;

        oldX[i] = 0.0f;
        /* newX is read in full by norm(), so every entry must be defined. */
        newX[i] = 0.0f;

        if (i != 0) b[i] = -2.0f*x; else b[i] = -2.0f*x-1.0f/(h*h)+1.0f/(2.0f*h);

        for(j=0;j<SQ;j++) A(i,j) = 0.0f;
        A(i,i) = -2.0f/(h*h);
        /* The off-diagonal entries exist only inside the matrix. The row is
           already zeroed, so the first and last rows need no extra write;
           writing A(0,-1) or A(SQ-1,SQ) would land outside the allocation. */
        if (i != SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h);
        if (i != 0) A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h);
    }
    /* Large sentinel so the first convergence test fails and the loop is entered. */
    newX[0] = BIG_RANGE;

    int cnt = 0;
    CreateQueue();
    CreateKernel();
    CreateBuffer(count);
    Kernel_Arg_Set(kernel_red ,count);
    Kernel_Arg_Set(kernel_black,count);
    Enqueue_Write_Buffer(count);

    /* NOTE(review): newX is never updated after initialization, so this test
       compares the current iterate against the initial sentinel vector rather
       than the previous iterate -- confirm the intended convergence check. */
    while(norm(oldX,newX,SQ) > EPS && cnt<LIMIT){
        /* The queue is created in-order, so the black sweep cannot start
           before the red sweep has finished; no read-back is needed between
           them. One blocking read per iteration fetches the result for the
           host-side norm. */
        Create_Work_Group(kernel_red, count);
        Create_Work_Group(kernel_black, count);
        cnt++;
        Enqueue_Read_Buffer(count);
    }
    clFinish(command);
    Shutdown();

    free(A);
    free(b);
    free(oldX);
    free(newX);
    return 0;
}
void CreateQueue(){
err = clGetPlatformIDs(1, &platforms, NULL);
if(err<0){
perror("no platform");getchar();exit(1);}
err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device,NULL);
if(err<0){
perror("no device");getchar();exit(1);}
context = clCreateContext(NULL, 1, &device,NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");exit(1);}
command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (!command)
{
printf("Error: Failed to create a command commands!\n");
exit(1);
}
clEnqueueBarrier(command);
}
/*
 * Allocates the three device buffers and seeds each from its host array:
 * input_A (count floats, read-only), in_out_X (SQ floats, read-write) and
 * input_b (SQ floats, read-only). Exits if any allocation fails.
 */
void CreateBuffer(unsigned int count){
    const size_t matrix_bytes = sizeof(float) * count;
    const size_t vector_bytes = sizeof(float) * SQ;

    input_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             matrix_bytes, A, NULL);
    in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR,
                              vector_bytes, oldX, NULL);
    input_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             vector_bytes, b, NULL);

    if (input_A && input_b && in_out_X)
        return;

    printf("Error: Failed to allocate device memory!\n");
    exit(1);
}
void CreateKernel(){
FILE *fp;
size_t program_size;
string kernel_src;
fp = fopen("Red.cl", "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
kernel_src.resize(program_size + 1);
fseek(fp, 0, SEEK_SET);
fread(&kernel_src[0], program_size, 1, fp);
fclose(fp);
kernel_src[program_size] = '\0';
const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
if (!program1)
{
printf("clCreateProgramWithSource failed\n");
exit(1);
}
err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2*2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
kernel_red = clCreateKernel(program1, kernel_names[0], &err);
if (!kernel_red || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
kernel_black = clCreateKernel(program1, kernel_names[1], &err);
if (!kernel_black || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
}
/*
 * Enqueues `kernel` on the global command queue over a 2-D SQ x SQ range
 * with 32 x 32 work-groups. Exits the process if the enqueue is rejected.
 *
 * kernel: kernel object to launch; its arguments must already be set.
 * count:  unused; kept so all helpers share the same call shape.
 */
void Create_Work_Group(cl_kernel kernel, unsigned int count){
    /* work_dim is 2, so only two entries per array are read. */
    size_t global[] = {SQ,SQ};
    /* NOTE(review): 32*32 = 1024 work-items per group; confirm the target
       device's CL_DEVICE_MAX_WORK_GROUP_SIZE allows this. */
    size_t local[] = {32,32};

    err = clEnqueueNDRangeKernel(command, kernel, 2, NULL, global, local, 0, NULL,NULL);
    if (err != CL_SUCCESS)
    {
        /* Report the status code like the other helpers do, so the cause
           of a rejected launch is visible. */
        printf("Error: Failed to execute kernel! %d\n", err);
        exit(1);
    }
}
/*
 * Binds the shared device buffers to `kernel`:
 *   argument 0 -> input_A, argument 1 -> in_out_X, argument 2 -> input_b.
 * All three bindings are attempted; their status codes are OR-ed together
 * into the global `err`, and the process exits if the result is non-zero.
 * `count` is unused.
 */
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
    cl_mem *const buffers[] = { &input_A, &in_out_X, &input_b };

    err = CL_SUCCESS;
    for (cl_uint arg = 0; arg < 3; ++arg)
        err |= clSetKernelArg(kernel, arg, sizeof(cl_mem), buffers[arg]);

    if (err == CL_SUCCESS)
        return;

    printf("Error: Failed to set kernel arguments! %d\n", err);
    exit(1);
}
/*
 * Blocking read of the SQ-float iterate from the device buffer in_out_X into
 * the host array oldX. Exits the process on failure. `count` is unused.
 */
void Enqueue_Read_Buffer(unsigned int count){
    const size_t vector_bytes = sizeof(float) * SQ;

    err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE, 0, vector_bytes,
                              oldX, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        return;

    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
}
void Enqueue_Write_Buffer(unsigned int count){
err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count, A, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ , b, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ ,oldX, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
}
What you do is quite inefficient. You can write the buffer only once, then enqueue as many kernels as you want, with the same buffer as their argument. Of course if you need to compute the norm, you need to read data back. I would suggest something like this:
Create an additional buffer for the norm; check at the beginning of every kernel what the norm is (just by reading its value); if it is smaller than threshold value, return immediately.
Create a new kernel which will compute the norm for you.
Enqueue tasks like:
write buffers,
kernels: { {red,black}*10, updateNorm}*10
read buffers.
The computation will run 10x, then the norm will be updated. If the norm is already OK, the computation kernels that are already enqueued will return immediately. After the queue is finished, read the buffers back and check the norm on the CPU. If the norm is still not OK, enqueue the same batch of kernels again.
In the worst case, you will waste 9 real and 90 immediately returning kernel runs.

Resources