OpenCL global_work_size misunderstanding

I'm trying to understand a simple OpenCL example, which is vector addition. The kernel is the following:
__kernel void addVec(__global double* a, __global double* b, __global double* c)
{
    size_t id = get_global_id(0);
    c[id] = a[id] + b[id];
}
For example, my input arrays have a size of 1 million elements each.
In my host program, I set global_work_size to exactly the size of the input arrays (1 million).
But when I set it to a smaller value, for example 1000, it still seems to work with this kernel!
I don't understand how global_work_size can be smaller than the problem size and the OpenCL program still computes every element of the input arrays.
Could someone clarify this?
EDIT: here is the code where I copy the data:
size_t arraySize = 1000000;
const size_t global_work_size[1] = {512};
double *host_a = malloc(arraySize*sizeof(double));
double *host_b = malloc(arraySize*sizeof(double));
double *host_c = calloc(arraySize, sizeof(double));
...
// Create the input and output arrays in device memory for our calculation
device_a = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_b = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, arraySize*sizeof(double), NULL, NULL);
...
// Copy data set into the input array in device memory. [host --> device]
status = clEnqueueWriteBuffer(command_queue, device_a, CL_TRUE, 0, arraySize*sizeof(double), host_a, 0, NULL, NULL);
status |= clEnqueueWriteBuffer(command_queue, device_b, CL_TRUE, 0, arraySize*sizeof(double), host_b, 0, NULL, NULL);
...
// Copy-back the results from the device [host <-- device]
clEnqueueReadBuffer(command_queue, device_c, CL_TRUE, 0, arraySize*sizeof(double), host_c, 0, NULL, NULL );
...
printf("checking result validity ...\n");
for (size_t i=0; i<arraySize; ++i)
    if(host_c[i] - 1 > 1e-6) // the array is supposed to be 1 everywhere
    {
        printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
        break;
    }
Thanks

Your validity check isn't right: it only flags values greater than 1 + 1e-6, so any value smaller than 1 (including 0) passes. It should be like this:
for (size_t i=0; i<arraySize; ++i){
    cl_double val = host_c[i] - 1; // the array is supposed to be 1 everywhere
    if((val > 1e-6) || (val < -1e-6))
    {
        printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
        break;
    }
}
Uninitialized values in GPU memory are likely to be 0, so they pass your flawed check.
Additionally, remember that if you run the program once with the full size, subsequent runs will still read back the previously processed data (even if you close and reopen the application), since GPU memory is not cleared when buffers are created or destroyed.
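For completeness, the usual way to decouple global_work_size from the array length is to round the global size up to a multiple of the work-group size and guard the kernel with a bounds check, so every element is processed exactly once. A minimal sketch of that pattern (the extra n argument and the local size of 64 are illustrative assumptions, not part of the original code):
__kernel void addVec(__global const double* a, __global const double* b, __global double* c, const uint n)
{
    size_t id = get_global_id(0);
    if (id < n)                         // work-items beyond n simply do nothing
        c[id] = a[id] + b[id];
}
and on the host side:
size_t local_size = 64;
size_t global_size = ((arraySize + local_size - 1) / local_size) * local_size; // round up to a multiple of local_size
cl_uint n = (cl_uint)arraySize;
clSetKernelArg(kernel, 3, sizeof(cl_uint), &n);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);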

Related

OpenCL kernel math outputs incorrect results

I am currently trying to implement an OpenCL kernel. The kernel is supposed to output a number of previously calculated elements divided by the total number of elements remapped to a value from 0 to 255.
The kernel runs in a single work group with 256 work items where LX is the local ID:
#define LX get_local_id(0)
kernel void reduceStatistic(global int *inout, int nr_workgroups, int nr_pixels)
{
    int i = 1;
    for (; i < nr_workgroups; i++)
    {
        inout[LX] += inout[LX + i * 256];
    }
    inout[LX] = (int)floor(((float)inout[LX] / (float)nr_pixels) * 256.0f);
}
The accumulation before the remapping operation is a cleanup step after a previous calculation on the same buffer.
The first element of inout after the cleanup is 17176 and nr_pixels is 160000, so the calculation above should give floor((17176 / 160000) * 256) = 27. The code, however, returns 6.
The relevant host-side code is as follows:
// nr_workgroups is of type int
cl_mem outputBuffer = clCreateBuffer(mgr->context, CL_MEM_READ_WRITE, nr_workgroups * 256 * sizeof(cl_int), NULL, NULL);
// another kernel writes into outputBuffer
// set kernel arguments
clSetKernelArg(mgr->reduceStatisticKernel, 0, sizeof(outputBuffer), &outputBuffer);
clSetKernelArg(mgr->reduceStatisticKernel, 1, sizeof(cl_int), &nr_workgroups);
clSetKernelArg(mgr->reduceStatisticKernel, 2, sizeof(cl_int), &imgSeqSize);
size_t global_work_size_statistics[1] = { 256 };
size_t local_work_size_statistics[1] = { 256 };
// run the kernel
clEnqueueNDRangeKernel(mgr->commandQueue, mgr->reduceStatisticKernel, 1, NULL, global_work_size_statistics, local_work_size_statistics, 0, NULL, NULL);
// read result
cl_int *reducedResult = new cl_int[256];
clEnqueueReadBuffer(mgr->commandQueue, outputBuffer, CL_TRUE, 0, 256 * sizeof(cl_int), reducedResult, 0, NULL, NULL);
Help much appreciated! (:
We established in the comments that the global buffer index calculation is wrong:
inout[LX] += inout[LX + i * 265];
----------^^^
Should be 256
Going out of range on a buffer leads to undefined behaviour, so this is always one of the prime culprits to look for.
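One way to catch this kind of bug early is to pass the buffer length into the kernel and guard the accesses; a minimal debugging sketch (the extra total_elems argument is an assumption for illustration, not part of the original code):
kernel void reduceStatistic(global int *inout, int nr_workgroups, int nr_pixels, int total_elems)
{
    for (int i = 1; i < nr_workgroups; i++)
    {
        int src = LX + i * 256;                 // stride must match the 256-int row per work-group
        if (src >= total_elems)
        {
            printf("out-of-range read at %d (buffer has %d ints)\n", src, total_elems);
            return;
        }
        inout[LX] += inout[src];
    }
    inout[LX] = (int)floor(((float)inout[LX] / (float)nr_pixels) * 256.0f);
}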

OpenCL: Call parameter type does not match function signature

I'm using a GPU with OpenCL 1.2 that supports #pragma OPENCL EXTENSION cl_khr_fp16 : enable. I wanted to check the performance improvement from changing the float precision from 32 to 16 bits. In my device kernel, I converted every float to half as shown below:
__kernel void copy_kernel(int N, __global half *X, __global half *Y)
{
    int i = get_global_id(0);
    if(i < N) Y[i] = X[i];
}
On the host side, I made the cl_mem buffers point to arrays of cl_half. The host program looks as shown below:
void copy(int N, cl_mem X, cl_mem Y)
{
    cl_kernel kernel = get_copy_kernel();
    cl_command_queue queue = cl.queue;
    cl_uint i = 0;
    cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
    cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
    cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y);
    check_error_cl(cl);
    size_t gsize = N;
    cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, NULL);
    check_error_cl(cl);
}
But while compiling the kernel, I get the below error:
Call parameter type does not match function signature!
%32 = load half addrspace(1)* %31, align 2
float %33 = call float #llvm.nvvm.mul.rn.f(half %32, half %19)
Broken module found, compilation terminated!
You are passing a half variable to the kernel, but the kernel expects a pointer to an array of halves.
If you want to pass an array of halves to the GPU, you still have to use a cl_mem object, which then contains the array of halves.
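As an illustration of that last point, a minimal host-side sketch (the names host_data, N, and err are assumptions for the example, not taken from the question):
cl_int err;
// Device buffer holding N half-precision values, copied from host memory.
cl_mem X = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                          N * sizeof(cl_half), host_data, &err);
// Pass the buffer object itself; inside the kernel it appears as __global half*.
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &X);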

OpenCL - using atomic reduction for double

I know atomic functions with OpenCL-1.x are not recommended but I just want to understand an atomic example.
The following kernel code is not working well; it produces random final values for the sum of all array values (a sum reduction):
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

void atom_add_double(volatile __local double *val, double delta)
{
    union {
        double f;
        ulong  i;
    } old, new;

    do
    {
        old.f = *val;
        new.f = old.f + delta;
    }
    while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
                       __local double *localInput,
                       __global double *finalSum
                     )
{
    uint lid = get_local_id(0);
    uint gid = get_global_id(0);
    uint localSize = get_local_size(0);
    uint groupid = get_group_id(0);

    local double partialSum;
    local double finalSumTemp;

    // Initialize sums
    if (lid==0)
    {
        partialSum = 0.0;
        finalSumTemp = 0.0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set in local memory
    int idx = groupid * localSize + lid;
    localInput[lid] = input[idx];

    // Compute atom_add into each workGroup
    barrier(CLK_LOCAL_MEM_FENCE);
    atom_add_double(&partialSum, localInput[lid]);
    // See and Check if barrier below is necessary
    barrier(CLK_LOCAL_MEM_FENCE);

    // Final sum of partialSums
    if (lid==0)
    {
        atom_add_double(&finalSumTemp, partialSum);
        *finalSum = finalSumTemp;
    }
}
The version with the global id strategy works fine, but the version above, which goes through local memory (shared memory), doesn't give the expected results (the value of *finalSum is random on each execution).
Here are the buffers and kernel arguments that I set in my host code:
// Write to buffers
ret = clEnqueueWriteBuffer(command_queue, inputBuffer, CL_TRUE, 0,
nWorkItems * sizeof(double), xInput, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, finalSumBuffer, CL_TRUE, 0,
sizeof(double), finalSumGPU, 0, NULL, NULL);
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&finalSumBuffer);
Finally, I read finalSumBuffer to get the sum value.
I think the issue is in the kernel code, but I can't find the error.
If anyone can see what's wrong, please let me know.
Thanks
UPDATE 1:
I have nearly managed to perform this reduction. Following the suggestions of huseyin tugrul buyukisik, I modified the kernel code like this:
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

void atom_add_double(volatile __local double *val, double delta)
{
    union {
        double d;
        ulong  i;
    } old, new;

    do
    {
        old.d = *val;
        new.d = old.d + delta;
    }
    while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
                       __local double *localInput,
                       __local double *partialSum,
                       __global double *finalSum
                     )
{
    uint lid = get_local_id(0);
    uint gid = get_global_id(0);
    uint localSize = get_local_size(0);
    uint groupid = get_group_id(0);

    // Initialize partial sums
    if (lid==0)
        partialSum[groupid] = 0.0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set in local memory
    int idx = groupid * localSize + lid;
    localInput[lid] = input[idx];

    // Compute atom_add into each workGroup
    barrier(CLK_LOCAL_MEM_FENCE);
    atom_add_double(&partialSum[groupid], localInput[lid]);
    // See and Check if barrier below is necessary
    barrier(CLK_LOCAL_MEM_FENCE);

    // Compute final sum
    if (lid==0)
        *finalSum += partialSum[groupid];
}
As huseyin said, I don't need atomic functions for the final sum of all partial sums.
So at the end I did:
// Compute final sum
if (lid==0)
    *finalSum += partialSum[groupid];
But unfortunately, the final sum doesn't give the expected value and is random (for example, with nWorkItems = 1024 and a work-group size of 16, I get random values on the order of 1e+3 to 1e+4 instead of the expected 5.248e+05).
Here are the setting of arguments into the host code :
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, nWorkGroups*sizeof(double), NULL);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&finalSumBuffer);
Can you see where my error is in the kernel code?
Thanks
Not an error but a logic issue:
atom_add_double(&finalSumTemp, partialSum);
only runs once per work-group (by the zero-local-indexed thread). So you are effectively just doing
finalSumTemp = partialSum
and atomics are not needed here.
There is a race condition on
*finalSum = finalSumTemp;
between work-groups, where each zero-indexed local thread writes to the same address. So this should either be an atomic addition (for learning purposes), or each work-group should write to a different cell so the values can be added on the host side, as in sum_group1 + sum_group2 + ... = total sum.
int idx = groupid * localSize + lid;
localInput[lid] = input[idx];
Here, using groupid is suspicious for multi-device summation, because each device has its own global range and work-group id indexing, so two devices could have the same group id for two different groups. Some device-related offset should be used when multiple devices are involved, such as:
idx= get_global_id(0) + deviceOffset[deviceId];
Also, if an atomic operation is unavoidable and it is performed exactly N times, it could be moved to a single thread (such as the 0-indexed thread) and looped N times in a second kernel (which is probably faster), unless the atomic operation's latency can be hidden by other means.
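To illustrate the "different cells" suggestion, a minimal race-free sketch: each work-group accumulates its own sum with the local atomic helper and writes it to its own slot of a global partialSums buffer (an assumed extra argument of nWorkGroups doubles), and the host adds the slots afterwards. This is only one possible layout, not the questioner's exact code:
__kernel void sumGPU(__global const double *input,
                     __local double *localInput,
                     __global double *partialSums)    // one slot per work-group
{
    uint lid = get_local_id(0);
    uint groupid = get_group_id(0);
    local double groupSum;

    if (lid == 0)
        groupSum = 0.0;
    barrier(CLK_LOCAL_MEM_FENCE);

    localInput[lid] = input[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    atom_add_double(&groupSum, localInput[lid]);      // local atomics only, as defined above
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0)
        partialSums[groupid] = groupSum;              // distinct cell per group: no cross-group race
}
// The host then reads back the nWorkGroups partial sums and adds them.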

OpenCL - Insert values in every n elements of an array

I have an array of 100 elements, and what I want to do is copy these 100 elements into every nth element of another array.
Let's say n was 3
The new array would have [val1 0 0 val2 0 0 val3 0 0 ...] after the values were copied to every nth element. Now, in OpenCL, I tried creating a pointer to hold the current index and simply adding n to it every time. However, the current index always keeps the same value. Below is the code I have.
__kernel void ddc(__global float *inputArray, __global float *outputArray, __const int interpolateFactor, __global int *currentIndex){
    int i = get_global_id(0);
    outputArray[currentIndex[0]] = inputArray[i];
    currentIndex[0] = currentIndex[0] + (interpolateFactor - 1);
    printf("index %i \n", currentIndex[0]);
}
Host code for the currentIndex part:
int *index;
index = (int*)malloc(2*sizeof(int));
index[0] = 0;
cl_mem currentIndex;
currentIndex = clCreateBuffer(
context,
CL_MEM_WRITE_ONLY,
2 * sizeof(int),
NULL,
&status);
status = clEnqueueWriteBuffer(
cmdQueue,
currentIndex,
CL_FALSE,
0,
2 * sizeof(int),
index,
0,
NULL,
NULL);
printf("Index enqueueWriteBuffer status: %i \n", status);
status |= clSetKernelArg(
kernel,
4,
sizeof(cl_mem),
&currentIndex);
printf("Kernel Arg currentIndex Factor status: %i \n", status);
If you are wondering why I am using an array with two elements, it's because I wasn't sure how to reference just a single variable, so I implemented it the same way I had the input and output arrays working. When I run the kernel with an interpolateFactor of 3, currentIndex always prints 2.
So if I understood correctly, what you want to do is store in currentIndex the next index that should be used. This will not work: the value will not instantly update for the other work-items. If you wanted to do it this way, you would have to execute all the work-items sequentially.
What you could do is
__kernel void ddc(__global float *inputArray, __global float *outputArray, __const int interpolateFactor, int start){
    int i = get_global_id(0);
    outputArray[start+i*(interpolateFactor-1)] = inputArray[i];
}
assuming you might want to start from some spot other than 0; otherwise you could just drop the start argument completely.
To get it working like that you do
int start = 0;
status |= clSetKernelArg(
kernel,
3, // This should be 3 right? Might have been the problem to begin with.
sizeof(int),
&start);
Hopefully this helps.
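And, for completeness, a hedged host-side sketch of launching that kernel (the buffer names, counts, and the zero-fill step are assumptions for the example, not taken from the question); zero-filling the output keeps the gaps at 0:
int start = 0;
size_t inCount = 100;
size_t outCount = inCount * interpolateFactor;        // generous upper bound for the strided indices
float *zeros = (float*)calloc(outCount, sizeof(float));
cl_mem outBuf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                               outCount * sizeof(float), zeros, &status);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outBuf);
status |= clSetKernelArg(kernel, 2, sizeof(int), &interpolateFactor);
status |= clSetKernelArg(kernel, 3, sizeof(int), &start);
size_t gsize = inCount;                               // one work-item per input element
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &gsize, NULL, 0, NULL, NULL);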

openCL Long Overflowing

Before I start: I am a C beginner and I am trying to do some OpenCL work, which might have been a mistake. Below is my kernel code:
__kernel void collatz(__global int* in, __global int* out)
{
    uint id = get_global_id(0);
    unsigned long n = (unsigned long)id;
    uint count = 0;
    while (n > 1) {
        if (n % 2 == 0) {
            n = n / 2;
        } else {
            if(n == 1572066143) {
                unsigned long test = n;
                printf("BEFORE - %lu\n", n);
                test = (3 * test) + 1;
                printf("AFTER - %lu\n", test);
                n = (3 * n) + 1;
            } else {
                n = (3 * n) + 1;
            }
        }
        count = count + 1;
    }
    out[id] = count;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is that if I create a new variable to store the same value as n, it then seems to work correctly:
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host-side and device-side values can have different sizes. On the host, long can vary from 32 to 64 bits depending on the platform; on the device, long always means 64 bits.
The printf() function, as defined in C, says that %ld prints long (host-side long) numbers. You are using printf in a kernel, so it could be that a C-like parser is used and the variable is printed as a 32-bit long.
Can you try printing it as %lld or as a floating point?
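One way to separate the arithmetic from the printf formatting is to force 64-bit operands with UL literals and print the result as two 32-bit halves, which does not depend on %lu support. A minimal diagnostic sketch (the printf explanation is an assumption, not a confirmed diagnosis); note that the 421231134 in the question is exactly the low 32 bits of the correct result 4716198430:
ulong test = 1572066143UL;
test = 3UL * test + 1UL;                 // all operands forced to 64-bit
// Expected: 4716198430, i.e. hi = 1, lo = 421231134
printf("hi=%u lo=%u\n", (uint)(test >> 32), (uint)(test & 0xFFFFFFFFUL));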
