OpenCL - using atomic reduction for double - opencl

I know atomic functions with OpenCL-1.x are not recommended but I just want to understand an atomic example.
The following kernel code is not working well, it produces random final values for the computation of sum of all array values (sum reduction) :
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
void atom_add_double(volatile __local double *val, double delta)
{
union {
double f;
ulong i;
} old, new;
do
{
old.f = *val;
new.f = old.f + delta;
}
while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
__local double *localInput,
__global double *finalSum
)
{
uint lid = get_local_id(0);
uint gid = get_global_id(0);
uint localSize = get_local_size(0);
uint groupid = get_group_id(0);
local double partialSum;
local double finalSumTemp;
// Initialize sums
if (lid==0)
{
partialSum = 0.0;
finalSumTemp = 0.0;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Set in local memory
int idx = groupid * localSize + lid;
localInput[lid] = input[idx];
// Compute atom_add into each workGroup
barrier(CLK_LOCAL_MEM_FENCE);
atom_add_double(&partialSum, localInput[lid]);
// See and Check if barrier below is necessary
barrier(CLK_LOCAL_MEM_FENCE);
// Final sum of partialSums
if (lid==0)
{
atom_add_double(&finalSumTemp, partialSum);
*finalSum = finalSumTemp;
}
}
The version with global id strategy works good but the version above, which passes by the using of local memory (shared memory), doesn't give the expected results (the value of *finalSum is random for each execution).
Here the Buffers and kernel args that I have put in my host code :
// Write to buffers
ret = clEnqueueWriteBuffer(command_queue, inputBuffer, CL_TRUE, 0,
nWorkItems * sizeof(double), xInput, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, finalSumBuffer, CL_TRUE, 0,
sizeof(double), finalSumGPU, 0, NULL, NULL);
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&finalSumBuffer);
and Finally, I read finalSumBuffer to get the sum value.
I think my issue comes rather from the kernel code but I can't find where is the error.
If anyone could see what's wrong, this would be nice to tell me.
Thanks
UPDATE 1 :
I nearly manage to perform this reduction. Following the propositions suggested by huseyin tugrul buyukisik, I have modified the kernel code like this :
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
void atom_add_double(volatile __local double *val, double delta)
{
union {
double d;
ulong i;
} old, new;
do
{
old.d = *val;
new.d = old.d + delta;
}
while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
__local double *localInput,
__local double *partialSum,
__global double *finalSum
)
{
uint lid = get_local_id(0);
uint gid = get_global_id(0);
uint localSize = get_local_size(0);
uint groupid = get_group_id(0);
// Initialize partial sums
if (lid==0)
partialSum[groupid] = 0.0;
barrier(CLK_LOCAL_MEM_FENCE);
// Set in local memory
int idx = groupid * localSize + lid;
localInput[lid] = input[idx];
// Compute atom_add into each workGroup
barrier(CLK_LOCAL_MEM_FENCE);
atom_add_double(&partialSum[groupid], localInput[lid]);
// See and Check if barrier below is necessary
barrier(CLK_LOCAL_MEM_FENCE);
// Compute final sum
if (lid==0)
*finalSum += partialSum[groupid];
}
As said huseyin , I don't need to use atomic functions for the final sum of all partial sums.
So I did at the end :
// Compute final sum
if (lid==0)
*finalSum += partialSum[groupid];
But unfortunately, the final sum doesn't give the value expected and the value is random (for example, with nwork-items = 1024 and size-WorkGroup = 16, I get random values in the order of [1e+3 - 1e+4] instead of 5.248e+05 expected.
Here are the setting of arguments into the host code :
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, nWorkGroups*sizeof(double), NULL);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&finalSumBuffer);
Could you see where is my error in the kernel code ?
Thanks

Not an error but logic issue:
atom_add_double(&finalSumTemp, partialSum);
is working only once per group (by zero-local-indexed thread).
So you are just doing
finalSumTemp = partialSum
so atomics here is not needed.
There is race condition for
*finalSum = finalSumTemp;
between workgroups where each zero-index local thread writes to same address. So this should be the atomic addition (for learning purposes) or could be written on different cells to be added on host side such as sum_group1+sum_group2+... = total sum.
int idx = groupid * localSize + lid;
localInput[lid] = input[idx];
here using groupid is suspicious for multi-device summation. Because each device has its own global range and workgroup id indexings so two device could have same group id values for two different groups. Some device related offset should be used when multiple devices are used. Such as:
idx= get_global_id(0) + deviceOffset[deviceId];
Also if atomic operation is inavoidable, and if exactly N times operated, it could be moved to a single thread(such as 0-indexed thread) and looped for N times(probably being faster) in a second kernel unless that atomic operation latency can't be hidden by other means.

Related

OpenCL: Call parameter type does not match function signature

I'm using #pragma OPENCL EXTENSION cl_khr_fp16 : enable supported GPU with OpenCL 1.2. I wanted to check the performance improvement by changin float precision from 32 to 16. In my device kernel, I converted all float to half like shown below:
__kernel void copy_kernel(int N, __global half *X, __global half *Y)
{
int i = get_global_id(0);
if(i < N) Y[i] = X[i];
}
In my host side, I made cl_mem point to array of cl_half. Host program looks as shown below:
void copy(int N, cl_mem X, cl_mem Y)
{
cl_kernel kernel = get_copy_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y);
check_error_cl(cl);
size_t gsize = N;
cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, NULL);
check_error_cl(cl);
}
But while compiling the kernel, I get the below error:
Call parameter type does not match function signature!
%32 = load half addrspace(1)* %31, align 2
float %33 = call float #llvm.nvvm.mul.rn.f(half %32, half %19)
Broken module found, compilation terminated!
You are passing a half variable to the kernel but the kernel expects a pointer to an array of halfs.
If you want to pass an array of halfs to the GPU you still have to use cl_mem objects which then contains the array of halfs.

Passing variables into kernels

I'm new to OpenCL and am reading the book OpenCL in Action. There is a simple problem that I don't understand it: how to pass values to and return them from kernels.
First of all, are we supposed to always pass arguments by address into kernels?
Then, I have two simple sample of kernels below. In the first one, while output is pointer as function parameter, in body of kernel we never used *output. While in the other kernel, *s1 and *s2 are used as function parameters and we actually assign value to *s1 and *s2 instead of s1 and s2. Can anyone tell me why in the first kernel the value is assigned to output (and not *output) while in the second kernel we have the value assigned to *s1 and *s2 (and not s1 and s2).
I looked at many resources to find a general way to pass and return values to and from kernels by I couldn't find any general rule.
Here is the kernels:
1:
__kernel void id_check(__global float *output) {
/* Access work-item/work-group information */
size_t global_id_0 = get_global_id(0);
size_t global_id_1 = get_global_id(1);
size_t global_size_0 = get_global_size(0);
size_t offset_0 = get_global_offset(0);
size_t offset_1 = get_global_offset(1);
size_t local_id_0 = get_local_id(0);
size_t local_id_1 = get_local_id(1);
/* Determine array index */
int index_0 = global_id_0 - offset_0;
int index_1 = global_id_1 - offset_1;
int index = index_1 * global_size_0 + index_0;
/* Set float data */
float f = global_id_0 * 10.0f + global_id_1 * 1.0f;
f += local_id_0 * 0.1f + local_id_1 * 0.01f;
output[index] = f;
}
2:
__kernel void select_test(__global float4 *s1,
__global uchar2 *s2) {
/* Execute select */
int4 mask1 = (int4)(-1, 0, -1, 0);
float4 input1 = (float4)(0.25f, 0.5f, 0.75f, 1.0f);
float4 input2 = (float4)(1.25f, 1.5f, 1.75f, 2.0f);
*s1 = select(input1, input2, mask1);
/* Execute bitselect */
uchar2 mask2 = (uchar2)(0xAA, 0x55);
uchar2 input3 = (uchar2)(0x0F, 0x0F);
uchar2 input4 = (uchar2)(0x33, 0x33);
*s2 = bitselect(input3, input4, mask2);
}
Your problem is not with OpenCL, is with C language itself. Please read a book on how C language works. It is a VERY basic question what you are asking.
When you have a pointer, (output, s1, s2) you can access it by many ways. output refers to the pointer (adress), *output refers to the value at the first element (or single element pointed by the pointer), and output[i] refers to the ith element value.
*output and output[0] are the same, as well as *(output+1) and output[1].

OpenCL: Inserting local atomic_inc to reduction kernel

I am trying to include a local atomic similar to that described by DarkZeros here within a working reduction kernel. The kernel finds a largest value within a set of points; the aim of the local atomic is to allow me to filter selected point_ids into an output array without any gaps.
At present when I use the local atomic to increment the addition to a local array the kernel runs but produces a wrong overall highest point. If the atomic line is commented out then a correct result returns.
What is going on here and how do I fix it?
Simplified kernel code:
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items, //points and index
__global int* output, __local float4* shared, const unsigned int n, //finding highest
__global int* filtered, __global const float2* tri_input, const unsigned int pass, //finding filtered
__global int* global_count //global count
){
//set everything up
const unsigned int group_id = get_global_id(0) / get_local_size(0);
const unsigned int local_id = get_local_id(0);
const unsigned int group_size = items;
const unsigned int group_stride = 2 * group_size;
const int local_stride = group_stride * group_size;
__local float4 *zeroIt = &shared[local_id];
zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;
volatile __local int local_count_set_1;
volatile __local int global_val_set_1;
volatile __local int filter_local[64];
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
int i = group_id * group_stride + local_id;
while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
int ia = input[i];
float4 a = dataSet[ia-1];
int ib = input[i + group_size];
float4 b = dataSet[ib-1];
//on the first pass kernel increment a local count
if(pass == 0){
filter_local[atomic_inc(&local_count_set_1)] = 1; //including this line causes an erroneous highest point result
//filter_local[local_id] = 1; //but including this line does not
//atomic_inc(&local_count_set_1); //and neither does this one
}
//find the highest of the pair
float4 result;
if(a.z>b.z) result = a;
else result = b;
//load up the previous highest result locally
float4 s = shared[local_id];
//if the previous highest beat this, stick, else twist
if(s.z>result.z){ result = s; }
shared[local_id] = result;
i += local_stride;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (group_size >= 512){
if (local_id < 256) {
__local float4 *a = &shared[local_id];
__local float4 *b = &shared[local_id+256];
if(b->z>a->z){ shared[local_id] = shared[local_id+256]; }
}}
//repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
//finally, return the filtered highest result of shared to the global level
barrier(CLK_LOCAL_MEM_FENCE);
if(local_id == 0){
__local float4 *v = &shared[0];
int send = v->w ;
output[group_id] = send+1;
}}
[UPDATE]: When the atomic_inc line is included the 'wrong' highest point result is always a point near the end of the test dataset. I'm guessing that this means that the atomic_inc is affecting a latter comparison, but I'm not sure exactly what or where yet.
[UPDATE]: Edited code to simplify/clarify/update with debugging tweaks. Still not working and it is driving me loopy.
Total face-palm moment. In the setup phase of the kernel there are the lines:
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
When these are split and the local_count_set_1 is included within the while loop, the error does not occur. i.e:
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
while (i < n){
if(local_id==0) local_count_set_1 = 0;
barrier(CLK_LOCAL_MEM_FENCE);
....
if(pass = 0){
filter_local[atomic_inc(&local_count_set_1)] = 1;
}
....
I'm hoping this fixes the issue // will update if not.
Aaaand that's a weekend I'll never get back.

Optimal workgroup size for sum reduction in OpenCL

I am using the following kernel for sum reduciton.
__kernel void reduce(__global float* input, __global float* output, __local float* sdata)
{
// load shared mem
unsigned int tid = get_local_id(0);
unsigned int bid = get_group_id(0);
unsigned int gid = get_global_id(0);
unsigned int localSize = get_local_size(0);
unsigned int stride = gid * 2;
sdata[tid] = input[stride] + input[stride + 1];
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s = localSize >> 2; s > 0; s >>= 1)
{
if(tid < s)
{
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if(tid == 0) output[bid] = sdata[0];
}
It works fine, but I don't know how to choose the optimal workgroup size or number of workgroups if I need more than one workgroup (for example if I want to calculate the sum of 1048576 elements). As far as I understand, the more workgroups I use, the more subresults I will get, which also means that I will need more global reductions at the end.
I've seen the answers to the general workgroup size question here. Are there any recommendations that concern reduction operations specifically?
This question is a possible duplicate of one I answered a while back:
What is the algorithm to determine optimal work group size and number of workgroup.
Experimentation will be the best way to know for sure for any given device.
Update:
I think you can safely stick to 1-dimensional work groups, as you have done in your sample code. On the host, you can try out the best values.
For each device:
1) query for CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.
2) loop over a few multiples and run the kernel with that group size. save the execution time for each test.
3) when you think you have an optimal value, hard code it into a new kernel for use with that specific device. This will give a further boost to performance. You can also eliminate your sdata parameter in the device-specific kernel.
//define your own context, kernel, queue here
int err;
size_t global_size; //set this somewhere to match your test data size
size_t preferred_size;
size_t max_group_size;
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), preferred_size, NULL);
//check err
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), max_group_size, NULL);
//check err
size_t test_size;
//your vars for hi-res timer go here
for (unsigned int i=preferred_size ; i<=max_group_size ; i+=preferred_size){
//reset timer
test_size = (size_t)i;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &test_size, 0, NULL, NULL);
if(err){
fail("Unable to enqueue kernel"); //implement your own fail function somewhere..
}else{
clfinish(queue);
//stop timer, save value
//output timer value and test_size
}
}
The device-specific kernel can look like this, except the first line should have your optimal value substituted:
#define LOCAL_SIZE 32
__kernel void reduce(__global float* input, __global float* output)
{
unsigned int tid = get_local_id(0);
unsigned int stride = get_global_id(0) * 2;
__local float sdata[LOCAL_SIZE];
sdata[tid] = input[stride] + input[stride + 1];
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int s = LOCAL_SIZE >> 2; s > 0; s >>= 1){
if(tid < s){
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(tid == 0) output[get_group_id(0)] = sdata[0];
}

substitutions for cl_khr_int64_base_atomics

I have an ATI Firepro V4800 graphics card which does not support cl_khr_int64_base_atomics. I am trying to adapt the RadixSort algo for long integers. The algo uses atomic_inc, the 64-bit of which is atom_inc, which I cannot use in the kernel. So, my question is, is there a piece of code which performs the same function as atomic_inc which can be used? The piece of kernel code is given below:
__kernel void histogram(__global uint* unsortedData,
__global uint* buckets,
uint shiftCount,
__local uint* sharedArray)
{
size_t localId = get_local_id(0);
size_t globalId = get_global_id(0);
size_t groupId = get_group_id(0);
size_t groupSize = get_local_size(0);
uint numGroups = get_global_size(0) / get_local_size(0);
// Initialize shared array to zero //
sharedArray[localId] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Calculate thread-histograms //
uint value = unsortedData[globalId];
value = value >> shiftCount & 0xFFU;
atomic_inc(sharedArray+value);
barrier(CLK_LOCAL_MEM_FENCE);
// Copy calculated histogram bin to global memory //
uint bucketPos = groupId * groupSize + localId ;
//uint bucketPos = localId * numGroups + groupId ;
buckets[bucketPos] = sharedArray[localId];
}
Any suggestions? Thank you.
Edit:
Another way for the same is given in this blogsite: http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html. This gives a very generic implementation of the Atomic Inc.
You could try something like this:
void atomInc64 (__local uint *counter)
{
uint old, carry;
old = atomic_inc (&counter [0]);
carry = old == 0xFFFFFFFF;
atomic_add (&counter [1], carry);
}
Where counter is an array of two 32-bit integers. While the two halves don't increment at exactly the same time, the total should be correct when the program completes.

Resources