Transmission parameters in to the function in openCL - opencl

Have following kernel function:
private static String programSource =
"__kernel void sampleKernel(__global float *Y, __global float *param) "
+ "{ int index = get_global_id(0); "
+ " Y[index]=param[0]-Y[index]/param[1]-param[2]; "
+ "} ";
First argument "Y" works perfect, but second parameter "param" works non correct, I receive null values . Second parametr must be a array and consists from 3 cells.
Fragment of code with the transmission parameters:
float[] arr_params = new float[3];
arr_params[0] = (float) h_c;
arr_params[1] = (float) sy;
arr_params[2] = (float) dy;
//pointers
Pointer Pvy = Pointer.to(vy);
Pointer Parr_params = Pointer.to(arr_params);
cl_mem memObjects[] = new cl_mem[2];
memObjects[0] = clCreateBuffer(context,
CL_MEM_READ_WRITE,
Sizeof.cl_float * vy.length, Pvy, null);
memObjects[1] = clCreateBuffer(context,
CL_MEM_READ_ONLY,
Sizeof.cl_float * arr_params.length, Parr_params, null);
// Set the arguments for the kernel
clSetKernelArg(kernel, 0,
Sizeof.cl_mem, Pointer.to(memObjects[0]));
clSetKernelArg(kernel, 1,
Sizeof.cl_mem, Pointer.to(memObjects[1]));
// Set the work-item dimensions
long global_work_size[] = new long[]{vy.length};
long local_work_size[] = new long[]{1};
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
global_work_size, local_work_size, 0, null, null);
// Read the output data
clEnqueueReadBuffer(commandQueue, memObjects[0], CL_TRUE, 0, vy.length * Sizeof.cl_float, Pvy, 0, null, null);
// Release kernel, program, and memory objects
clReleaseMemObject(memObjects[0]);
clReleaseMemObject(memObjects[1]);

The second buffer is all zeros because, in the clCreateBuffer call, you haven't told OpenCL where to get the data. Use CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR.

Related

OpenCL GPU calculation wrong

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
global double *gamma,
global double *delnu,
global double *nu,
global double *del_air,
global double *f)
{
double pie = 4.0*atan(1.0);
int i = get_global_id(0);
int j = get_global_id(1);
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
__kernel void test(__global double *gamma,
__global double *inten,
__global double *delnu,
__global double *delair,
__global double *f,
const int lines)
{
double pie = 4.0*atan(1.0);
int j = get_global_id(0);
f[j] = 0;
for(i=0,i<lines;i++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id

openCL Long Overflowing

Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
__kernel void collatz(__global int* in, __global int* out)
{
uint id = get_global_id(0);
unsigned long n = (unsigned long)id;
uint count = 0;
while (n > 1) {
if (n % 2 == 0) {
n = n / 2;
} else {
if(n == 1572066143) {
unsigned long test = n;
printf("BEFORE - %lu\n", n);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
n = (3 * n) + 1;
} else {
n = (3 * n) + 1;
}
}
count = count + 1;
}
out[id] = count;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host side and device size values have different sizes. In host, long can vary from 32 to 64bits, depending on the platform. In device, long refers to 64bits only.
printf() function, as defined in C says that %ld is to print long (host side long) numbers. You are using printf in a kernel, so.... It could be that the C-like parser is used, therefore printing the variable as a 32bits long.
Can you try printing it as %lld or as a floating point?

Open global_work_size misunderstanding

I'm trying to understand a simple OpenCL example, which is vector addition. The kernel is the following:
__kernel void addVec(__global double* a, __global double* b, __global double* c)
{
size_t id = get_global_id(0);
c[id] = a[id] + b[id];
}
For example, my input arrays have a size of 1 million elements each.
In my host program, I set global_work_size to be exactly the size of the vectors input arrays (1 million).
But when i set it to a smaller value, for example 1000, it also works with this kernel!
I don't understand why the global_work_size can be lesser than the problem dimension, and still, the OpenCL program compute every elements of the input arrays.
Could someone clarify on this?
EDIT: here is the code where I copy the data:
size_t arraySize = 1000000;
const size_t global_work_size[1] = {512};
double *host_a = malloc(arraySize*sizeof(double));
double *host_b = malloc(arraySize*sizeof(double));
double *host_c = calloc(arraySize, sizeof(double));
...
// Create the input and output arrays in device memory for our calculation
device_a = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_b = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, arraySize*sizeof(double), NULL, NULL);
...
// Copy data set into the input array in device memory. [host --> device]
status = clEnqueueWriteBuffer(command_queue, device_a, CL_TRUE, 0, arraySize*sizeof(double), host_a, 0, NULL, NULL);
status |= clEnqueueWriteBuffer(command_queue, device_b, CL_TRUE, 0, arraySize*sizeof(double), host_b, 0, NULL, NULL);
...
// Copy-back the results from the device [host <-- device]
clEnqueueReadBuffer(command_queue, device_c, CL_TRUE, 0, arraySize*sizeof(double), host_c, 0, NULL, NULL );
...
printf("checking result validity ...\n");
for (size_t i=0; i<arraySize; ++i)
if(host_c[i] - 1 > 1e-6) // the array is supposed to be 1 everywhere
{
printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
break;
}
Thanks
Your test function doesn't look good, it will be met for any value < 1, it should be like this:
for (size_t i=0; i<arraySize; ++i){
cl_double val = host_c[i] - 1; // the array is supposed to be 1 everywhere
if((val > 1e-6) || (val < -1e-6))
{
printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
break;
}
}
Non initialized values in the GPU are likely to be 0, therefore meeting your condition.
Additionally, remember that if you run the program once with the full size, consecutive reads will still hold the proper processed data (even if you close and open the app again). Since the GPU memory is not cleaned after the buffer is created/destroyed.

Optimising Host to GPU transfer

I am offloading work to GPU using OpenCL (a variant of matrix multiplication). The matrix code itself works fantastically well, but the cost of moving data to GPU is prohibitive.
I've moved from using clEnqueueRead/clEnqueueWrite to memory mapped buffers as follows:
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
sizeof(char) * queryVector_size,
NULL, NULL);
checkErr(err,"Buf A");
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
sizeof(char) * segment_size,
NULL, NULL);
checkErr(err,"Buf B");
err = clSetKernelArg(ko_smat, 0, sizeof(cl_mem), &d_c);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 1, sizeof(cl_mem), &d_a);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 2, sizeof(cl_mem), &d_b);
checkErr(err,"Compute Kernel");
query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
checkErr(err,"Write A");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
checkErr(err,"Write B");
// code which initialises buffers using ptrs (segment_data and queryV)
err = clEnqueueUnmapMemObject(commands,
d_a,
query_vector, 0, NULL, NULL);
checkErr(err,"Unmap Buffer");
err = clEnqueueUnmapMemObject(commands,
d_b,
segment_data, 0, NULL, NULL);
checkErr(err,"Unmap Buff");
err = clEnqueueNDRangeKernel(commands, ko_smat, 2, NULL, globalWorkItems, localWorkItems, 0, NULL, NULL);
err = clFinish(commands);
checkErr(err, "Execute Kernel");
result = (char*) clEnqueueMapBuffer(commands, d_c, CL_TRUE,CL_MAP_WRITE, 0, sizeof(char) * result_size, 0, NULL, NULL, &err);
checkErr(err,"Write C");
printMatrix(result, result_row, result_col);
This code works fine when I use the ReadEnqueue/WriteEnqueue methods and intialise d_a, d_b, d_c through that, but when I use the MappedBuffers, result is 0 due to d_a and d_b being null
when running the kernel.
What is the appropriate way to map/unmap buffers?
EDIT:
the core problem seems to be from here
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0, NULL, NULL, &err);
// INITIALISE
printMatrix(segment_data, segment_length, segment_width);
// ALL GOOD
err = clEnqueueUnmapMemObject(commands,
d_b,
segment_data, 0, NULL, NULL);
checkErr(err,"Unmap Buff");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0\
, NULL, NULL, &err);
printMatrix(segment_data, segment_length, segment_width);
// ALL ZEROs again
The first printMatrix() returns the correct output, once I unmap it and remap it, segment_data becomes all 0s (it's initial value). I suspect I'm using an incorrect flag somewhere? I cant' figure out where though.
query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
checkErr(err,"Write A");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
checkErr(err,"Write B");
The buffers are mapped as CL_MAP_READ but writing to them. Unlike buffer creation, these flags do not take a device view of the memory, but a host view, so they should be mapped using the CL_MAP_WRITE flag otherwise any changes will just be discarded when its unmapped
From the OpenCL 1.2 spec:
5.4.3 Accessing mapped regions of a memory object
...
If a memory object is currently mapped for reading, the application must ensure that the memory object is unmapped before any enqueued kernels or commands that write to this memory object or any of its associated memory objects (sub-buffer or 1D image buffer objects) or its parent object (if the memory object is a sub-buffer or 1D image buffer object) begin execution; otherwise the behavior is undefined.
So, you need to map the results buffer after you've enqueued the kernel. Similarly, you need to unmap the input buffers before you enqueue the kernel. The timeline for mapping/unmapping buffers should be roughly as follows:
Create input buffers
Create output buffers
Map input buffers
Write input data
Unmap input buffers
Enqueue kernel
Map output buffers
Read output data
Unmap output buffers
Clearly the best way for speeding up your code is using mapped buffers. You can create the buffers using CL_MEM_ALLOC_HOST_PTR and this basically takes some transfer burden off the CPU by initiating DMA transfers.
Here is an example of using the mapped buffers:
// pointer to hold the result
int * host_ptr = malloc(size * sizeof(int));
d_mem = clCreateBuffer(context,CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,
size*sizeof(cl_int), NULL, &ret);
int * map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_WRITE,
0,size*sizeof(int),0,NULL,NULL,&ret);
// initialize data
for (i=0; i<size;i++) {
map_ptr[i] = i;
}
ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);
//Set OpenCL Kernel Parameters
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_mem);
size_t global_work[1] = { size };
//Execute OpenCL Kernel
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
global_work, NULL, 0, 0, NULL);
map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_READ,
0,size*sizeof(int),0,NULL,NULL,&ret);
// copy the data to result array
for (i=0; i<size;i++){
host_ptr[i] = map_ptr[i];
}
ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);
// cl finish etc
It is taken from this post.

clEnqueueReadBuffer return a void buffer

I am trying to implement an OpenCL project that should take in input two "grey" channels images and return a "grey" channel buffer
My OpenCL code:
__kernel void motion_detection(__global uchar* bgImage, __global uchar* motionImage,__global uchar* outputImage)
{
int i = get_global_id(0);
int tsh = 7;
uchar bgEl = bgImage[i];
uchar motionEl = motionImage[i];
float bg = convert_float(bgEl);
float mo = convert_float(motionEl);
float temp = fabs(bg-mo);
if(temp>tsh){temp=255.0;
}else
{temp=0.0;
}
outputImage[i] = convert_uchar(bg);
}
My Host:
// get width and height of input image
height = inputImage->rows;
width = inputImage->cols;
// get the pointer to pixel data
cl_uchar* inputImageData = inputImage->data;
cl_uchar* motionImageData = motionImage->data;
cl_uchar* outputImageData = outputImage->data;
After set up buffers:
// Create memory object for input Image
inputImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width * height * greyPixelSize,
inputImageData,
&status);
CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)");
motionImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width * height * greyPixelSize,
motionImageData,
&status);
CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)");
// Create memory objects for output Image
outputImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_WRITE,
width * height * greyPixelSize,
0,
&status);
Settting up arguments and executing OpenCL kernel:
// input buffer image
status = clSetKernelArg(
kernl,
0,
sizeof(cl_mem),
&inputImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");
status = clSetKernelArg(
kernl,
1,
sizeof(cl_mem),
&motionImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");
// outBuffer imager
status = clSetKernelArg(
kernl,
2,
sizeof(cl_mem),
&outputImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)");
status = clEnqueueNDRangeKernel(
commandQueue,
kernl,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
&ndrEvt);
clFinish(commandQueue);
Here is my problem, when this intruction is executed it returns outputImageData as "\0" and nothing else:
status = clEnqueueReadBuffer(
commandQueue,
outputImageBuffer,
CL_TRUE,
0,
width * height * greyPixelSize,
outputImageData,
0,
NULL,
NULL);
CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

Resources