OpenCL single work-item vs CPU

I've just started with OpenCL development and I'm trying to compare the execution time of the CPU against a single work-item on the GPU for a really simple operation that should take a while to execute. I would expect a single work-item to take about the same time as the CPU, or even longer. What I'm getting instead is a much faster execution for the single work-item, and it doesn't make sense to me.
This is the kernel code:
kernel void hello(global ulong *val) {
    size_t i = get_global_id(0);
    for (ulong k = 0; k < 100000; k++) {
        for (ulong j = 0; j < 1000000; j++) {
            val[i] += 1;
        }
    }
}
In the host code, I measure the time with an event:
cl_event event;
ret = clEnqueueTask(command_queue, kernel, 0, NULL, &event);
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, sizeof(cl_mem), val, 0, NULL, NULL);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
clWaitForEvents(1, &event);
cl_ulong time_start;
cl_ulong time_end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
double nanoSeconds = (double) (time_end - time_start);
printf("OpenCl Execution time is: %f milliseconds \n",nanoSeconds / 1000000.0);
The CPU runs C++ code that does the same thing as the kernel. The results are the same, but the CPU takes about 255 seconds to execute while the GPU takes only about 0.15 milliseconds!
Am I measuring the time wrong, or is there a reasonable explanation for this?

Related

Do I need more events when timing multiple work-items?

If I have more than one work-item executing some kernel code, do I need more events to track the execution time of each work-item?
I'm getting some strange results: 1 work-item takes about 4 seconds to execute, and 100 work-items also take about 4 seconds. I can't see how this is possible, since my Nvidia GeForce GT 525M only has 2 compute units, each with 48 processing elements. This leads me to believe that the event I pass as an argument to clEnqueueNDRangeKernel tracks only one work-item. Is that true, and if so, how can I get it to track all of the work-items?
This is what the Khronos user guide says about the event argument in clEnqueueNDRangeKernel:
event returns an event object that identifies this particular kernel execution instance
What is the meaning of "this particular kernel execution instance"? Isn't that a single work-item?
EDIT:
Relevant host code:
static const size_t numberOfWorkItems = 48;
const size_t globalWorkSize[] = { numberOfWorkItems, 0, 0 };
cl_event events;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, &events);
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, sizeof(cl_mem), val, 0, NULL, NULL);
clWaitForEvents(1, &events);
cl_ulong time_start;
cl_ulong time_end;
clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_start, NULL);
clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
double nanoSeconds = (double) (time_end - time_start);
printf("OpenCl Execution time is: %f milliseconds \n",nanoSeconds / 1000000.0);
printf("Result: %lu\n", val[0]);
Kernel code:
kernel void parallel_operation(__global ulong *val) {
    size_t i = get_global_id(0);
    int n = 48;
    local unsigned int result[48];
    for (int z = 0; z < n; z++) {
        result[z] = 0;
    }
    // here comes the long operation
    for (ulong k = 0; k < 2000; k++) {
        for (ulong j = 0; j < 10000; j++) {
            result[i] += (j * 3) % 5;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (i == 0) {
        for (int z = 1; z < n; z++) {
            result[0] += result[z];
        }
        *val = result[0];
    }
}
You are measuring the execution time of your entire kernel function, in other words the time between when the first work-item starts and when the last work-item finishes. As far as I know, there is no way to measure the execution time of a single work-item in OpenCL.
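As an illustration of what that single event covers, here is a minimal host-side sketch (my own, assuming the command queue was created with CL_QUEUE_PROFILING_ENABLE and that kernel and globalWorkSize are set up as in the question): the one event attached to clEnqueueNDRangeKernel spans the whole launch, and CL_PROFILING_COMMAND_START / CL_PROFILING_COMMAND_END bracket the execution of all work-items together.
/* Sketch: one profiling event per kernel launch, not per work-item. */
cl_event launch_event;
cl_int err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                    globalWorkSize, NULL, 0, NULL, &launch_event);
/* check err */
clWaitForEvents(1, &launch_event);

cl_ulong start = 0, end = 0;
/* COMMAND_START..COMMAND_END covers execution of all work-items in this launch;
   COMMAND_QUEUED..COMMAND_END also includes queueing and submission overhead. */
clGetEventProfilingInfo(launch_event, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(launch_event, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &end, NULL);
printf("Whole NDRange took %f ms\n", (double)(end - start) / 1e6);
clReleaseEvent(launch_event);
If you need finer-grained numbers, you can split the work across several clEnqueueNDRangeKernel calls, each with its own event, but within one launch the event always covers every work-item.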

OpenCL - using atomic reduction for double

I know atomic functions with OpenCL 1.x are not recommended, but I just want to understand an atomics example.
The following kernel code is not working well; it produces random final values for the sum of all array values (a sum reduction):
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

void atom_add_double(volatile __local double *val, double delta)
{
    union {
        double f;
        ulong i;
    } old, new;
    do
    {
        old.f = *val;
        new.f = old.f + delta;
    }
    while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
                       __local double *localInput,
                       __global double *finalSum
                     )
{
    uint lid = get_local_id(0);
    uint gid = get_global_id(0);
    uint localSize = get_local_size(0);
    uint groupid = get_group_id(0);
    local double partialSum;
    local double finalSumTemp;

    // Initialize sums
    if (lid==0)
    {
        partialSum = 0.0;
        finalSumTemp = 0.0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set in local memory
    int idx = groupid * localSize + lid;
    localInput[lid] = input[idx];

    // Compute atom_add into each workGroup
    barrier(CLK_LOCAL_MEM_FENCE);
    atom_add_double(&partialSum, localInput[lid]);
    // See and Check if barrier below is necessary
    barrier(CLK_LOCAL_MEM_FENCE);

    // Final sum of partialSums
    if (lid==0)
    {
        atom_add_double(&finalSumTemp, partialSum);
        *finalSum = finalSumTemp;
    }
}
The version using a global-id strategy works well, but the version above, which uses local (shared) memory, doesn't give the expected results (the value of *finalSum is random on each execution).
Here are the buffers and kernel arguments set in my host code:
// Write to buffers
ret = clEnqueueWriteBuffer(command_queue, inputBuffer, CL_TRUE, 0,
                           nWorkItems * sizeof(double), xInput, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, finalSumBuffer, CL_TRUE, 0,
                           sizeof(double), finalSumGPU, 0, NULL, NULL);

// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&finalSumBuffer);
Finally, I read finalSumBuffer to get the sum value.
I think the issue comes from the kernel code, but I can't find where the error is.
If anyone can see what's wrong, please let me know.
Thanks
UPDATE 1:
I have nearly managed to get this reduction working. Following the suggestions from huseyin tugrul buyukisik, I modified the kernel code like this:
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

void atom_add_double(volatile __local double *val, double delta)
{
    union {
        double d;
        ulong i;
    } old, new;
    do
    {
        old.d = *val;
        new.d = old.d + delta;
    }
    while (atom_cmpxchg((volatile __local ulong *)val, old.i, new.i) != old.i);
}
__kernel void sumGPU ( __global const double *input,
                       __local double *localInput,
                       __local double *partialSum,
                       __global double *finalSum
                     )
{
    uint lid = get_local_id(0);
    uint gid = get_global_id(0);
    uint localSize = get_local_size(0);
    uint groupid = get_group_id(0);

    // Initialize partial sums
    if (lid==0)
        partialSum[groupid] = 0.0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Set in local memory
    int idx = groupid * localSize + lid;
    localInput[lid] = input[idx];

    // Compute atom_add into each workGroup
    barrier(CLK_LOCAL_MEM_FENCE);
    atom_add_double(&partialSum[groupid], localInput[lid]);
    // See and Check if barrier below is necessary
    barrier(CLK_LOCAL_MEM_FENCE);

    // Compute final sum
    if (lid==0)
        *finalSum += partialSum[groupid];
}
As huseyin said, I don't need atomic functions for the final sum of all the partial sums.
So at the end I did:
// Compute final sum
if (lid==0)
    *finalSum += partialSum[groupid];
But unfortunately, the final sum is not the expected value and is random (for example, with nWorkItems = 1024 and a workgroup size of 16, I get random values on the order of 1e+3 to 1e+4 instead of the expected 5.248e+05).
Here is how the kernel arguments are set in the host code:
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
clSetKernelArg(kernel, 1, local_item_size*sizeof(double), NULL);
clSetKernelArg(kernel, 2, nWorkGroups*sizeof(double), NULL);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&finalSumBuffer);
Can you see where the error is in my kernel code?
Thanks
Not an error, but a logic issue:
atom_add_double(&finalSumTemp, partialSum);
is executed only once per group (by the thread with local id zero).
So you are effectively just doing
finalSumTemp = partialSum;
and the atomic is not needed there.
There is a race condition on
*finalSum = finalSumTemp;
between workgroups, because the zero-indexed local thread of every group writes to the same address. That write should either be an atomic addition (for learning purposes), or each group should write to a different cell and the cells should be added on the host side, as in sum_group1 + sum_group2 + ... = total sum.
int idx = groupid * localSize + lid;
localInput[lid] = input[idx];
Here, using groupid is suspicious for multi-device summation, because each device has its own global range and workgroup id numbering, so two devices could have the same group id for two different groups. A device-specific offset should be used when multiple devices are involved, such as:
idx= get_global_id(0) + deviceOffset[deviceId];
Also, if an atomic operation is unavoidable and is executed exactly N times, it can be moved to a single thread (such as the 0-indexed thread) and looped N times in a second kernel, which is probably faster, unless the latency of the atomic operation can be hidden by other means.
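To make the "write to different cells" suggestion concrete, here is a minimal kernel sketch (my own illustration, not the asker's code): each workgroup writes its partial sum to its own slot of a global buffer, here called partialSums, and the host adds the per-group results after reading the buffer back. No atomics are needed because no two groups touch the same cell.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void sumGPU_partial(__global const double *input,
                             __local double *localInput,
                             __global double *partialSums)
{
    uint lid = get_local_id(0);
    uint groupid = get_group_id(0);
    uint localSize = get_local_size(0);

    // Stage this group's slice of the input in local memory.
    localInput[lid] = input[groupid * localSize + lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // One work-item per group does a plain serial sum: no atomics needed,
    // and each group writes to its own cell, so there is no inter-group race.
    if (lid == 0) {
        double sum = 0.0;
        for (uint i = 0; i < localSize; i++)
            sum += localInput[i];
        partialSums[groupid] = sum;
    }
}
On the host, partialSums holds nWorkGroups doubles; reading it back and adding the entries gives the total (sum_group1 + sum_group2 + ...), or the serial loop can be replaced by a tree reduction as in the last question below.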

OpenCL strange behavior

Good day,
I think I've tried everything to figure out where the problem is, but I couldn't find it. I have the following host code:
cl_mem cl_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * sizeof(cl_uint), NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_distances);
cl_event event;
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_workers, &local_workers, 0, NULL, &event);
clWaitForEvents(1, &event);
And for a device:
__kernel void walk(__global uint *distance_results)
{
    uint global_size = get_global_size(0);
    uint local_size = get_local_size(0);
    uint global_id = get_global_id(0);
    uint group_id = get_group_id(0);
    uint local_id = get_local_id(0);

    for (uint step = 0; step < 500; step++) {
        if (local_id == 0) {
            distance_results[group_id] = 0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (uint n = global_id; n < 1000; n += global_size) {
            if (local_id == 0) {
                atomic_add(&distance_results[group_id], 1);
            }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (global_id == 0) {
            for (uint i = 0; i < (global_size / local_size); i++) {
                printf("step: %d; group: %d; data: %d\n", step, i, distance_results[i]);
            }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
So at each "step" I just add 1 to distance_results[group_id] from each group (1000 increments in total, spread across the groups), and then I read the results from the work-item with global_id == 0.
At each step I should have the following text:
step: 59; group: 0; data: 500
step: 59; group: 1; data: 500
But in fact many lines contain wrong data:
step: 4; group: 0; data: 500
step: 4; group: 1; data: 210
step: 5; group: 0; data: 500
step: 5; group: 1; data: 214
If I set global_workers to 1 and local_workers to 1 then everything is okay. But if I set global_workers to 2 and local_workers to 1 then I have this strange behavior.
Do you have any ideas why this can happen?
There are a couple of things going on here, but I think the core problem comes from a very common misunderstanding in OpenCL. This call:
barrier(CLK_GLOBAL_MEM_FENCE);
This is not a global barrier; it is a local barrier with a global memory fence. In other words, it still only synchronizes the work-items within a single work-group, not work-items in other work-groups.
The loop that prints the results will only have correct values for work-group 0, since it only runs in work-group 0. If you really want this code to work, the printing loop would have to be in a separate NDRange, with proper synchronization between the NDRanges.
The memory fence only controls which types of memory writes are committed to memory. In this case you want global fences for both barriers, since you are fencing global memory writes, not local memory writes.
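One way to apply the "separate NDRange" advice is to move the printing out of the kernel entirely and let kernel-launch boundaries provide the global synchronization. The host-side sketch below is only an illustration of that idea: the kernel name walk_step_kernel is hypothetical and is assumed to contain just the zeroing and atomic_add part of the original loop body, while the blocking read between launches guarantees every work-group has finished before the results are printed.
for (unsigned int step = 0; step < 500; step++) {
    cl_uint results[2];  /* one counter per group, matching the 2 * sizeof(cl_uint) buffer */

    clEnqueueNDRangeKernel(command_queue, walk_step_kernel, 1, NULL,
                           &global_workers, &local_workers, 0, NULL, NULL);
    /* Blocking read: all work-groups of this launch have finished before we print. */
    clEnqueueReadBuffer(command_queue, cl_distances, CL_TRUE, 0,
                        sizeof(results), results, 0, NULL, NULL);
    for (unsigned int g = 0; g < global_workers / local_workers; g++)
        printf("step: %u; group: %u; data: %u\n", step, g, results[g]);
}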

Optimal workgroup size for sum reduction in OpenCL

I am using the following kernel for sum reduction.
__kernel void reduce(__global float* input, __global float* output, __local float* sdata)
{
    // load shared mem
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);
    unsigned int stride = gid * 2;
    sdata[tid] = input[stride] + input[stride + 1];
    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    for(unsigned int s = localSize >> 2; s > 0; s >>= 1)
    {
        if(tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // write result for this block to global mem
    if(tid == 0) output[bid] = sdata[0];
}
It works fine, but I don't know how to choose the optimal workgroup size or number of workgroups if I need more than one workgroup (for example if I want to calculate the sum of 1048576 elements). As far as I understand, the more workgroups I use, the more subresults I will get, which also means that I will need more global reductions at the end.
I've seen the answers to the general workgroup size question here. Are there any recommendations that concern reduction operations specifically?
This question is a possible duplicate of one I answered a while back:
What is the algorithm to determine optimal work group size and number of workgroup.
Experimentation will be the best way to know for sure for any given device.
Update:
I think you can safely stick to 1-dimensional work-groups, as you have done in your sample code. On the host, you can experiment to find the best value.
For each device:
1) Query CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE for the kernel.
2) Loop over a few multiples of it and run the kernel with each group size. Save the execution time of each test.
3) When you think you have found an optimal value, hard-code it into a new kernel for use with that specific device. This gives a further performance boost, and you can also eliminate the sdata parameter in the device-specific kernel.
//define your own context, kernel, queue here
cl_int err;
size_t global_size; //set this somewhere to match your test data size
size_t preferred_size;
size_t max_group_size;

err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_size, NULL);
//check err
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, NULL);
//check err

size_t test_size;
//your vars for hi-res timer go here
for (size_t i = preferred_size; i <= max_group_size; i += preferred_size) {
    //reset timer
    test_size = i;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &test_size, 0, NULL, NULL);
    if (err) {
        fail("Unable to enqueue kernel"); //implement your own fail function somewhere..
    } else {
        clFinish(queue);
        //stop timer, save value
        //output timer value and test_size
    }
}
The device-specific kernel can look like this, except the first line should have your optimal value substituted:
#define LOCAL_SIZE 32
__kernel void reduce(__global float* input, __global float* output)
{
    unsigned int tid = get_local_id(0);
    unsigned int stride = get_global_id(0) * 2;
    __local float sdata[LOCAL_SIZE];

    sdata[tid] = input[stride] + input[stride + 1];
    barrier(CLK_LOCAL_MEM_FENCE);

    for(unsigned int s = LOCAL_SIZE >> 2; s > 0; s >>= 1){
        if(tid < s){
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(tid == 0) output[get_group_id(0)] = sdata[0];
}

OpenCL local_work_size issues

I have a vector of size n (not known before the main program is launched), and I want to experiment with work-group sizes. However, unless I set local_work_size to a number that exactly divides n, I only get 0 values in fprop below.
Kernel:
__kernel void palt(__global double *fprop, __global const double *fcoll,
                   __global const int *nn, const uint max_size)
{
    size_t l = get_global_id(0);
    if( l > max_size ) return;
    fprop[l] = fcoll[nn[l]];
}
Host code:
int block_sz_p = 128;
const int max_size = ns*imax;

// set the parameters for the propagation operator
errNum = clSetKernelArg(propagation_kernel, 0, sizeof(cl_mem), &fpd);
errNum |= clSetKernelArg(propagation_kernel, 1, sizeof(cl_mem), &fcd);
errNum |= clSetKernelArg(propagation_kernel, 2, sizeof(cl_mem), &nnd);
errNum |= clSetKernelArg(propagation_kernel, 3, sizeof(int), (void *) &max_size);
checkErr(errNum, "clSetKernelArg(propagation)");

// specify the work group size/dim
const size_t work_dim = 3;
const size_t global_work_size_propagation[] = {imax*ns, 1, 1};
const size_t local_work_size_propagation[] = {block_sz_p, 1, 1};

// propagation
clEnqueueNDRangeKernel(queue, propagation_kernel, work_dim, NULL,
                       global_work_size_propagation, local_work_size_propagation,
                       0, NULL, &event);
clWaitForEvents(1, &event);

clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &end, NULL);
tker2 = (end-start);
What's going on here?
You should check for CL errors: clEnqueueNDRangeKernel and other calls return an error code (others return the error code by reference).
I assume the problem occurs when the global work size isn't divisible by the local work size, which is not supported and generates a CL error:
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size or does not match the work-group size specified for kernel using the attribute((reqd_work_group_size(X, Y, Z))) qualifier in program source.
From the clEnqueueNDRangeKernel man page.
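A common workaround, sketched below as my own illustration rather than part of the answer, is to round the global size up to the next multiple of the chosen local size and rely on a bounds check inside the kernel (which palt already has); the round_up helper is hypothetical.
/* Pad the global size so it is a multiple of the local size; out-of-range
   work-items are discarded by the kernel's bounds check on max_size. */
static size_t round_up(size_t value, size_t multiple)
{
    size_t remainder = value % multiple;
    return (remainder == 0) ? value : value + (multiple - remainder);
}

/* usage, with n = imax * ns as in the question */
const size_t local_size  = 128;
const size_t global_size = round_up((size_t)(imax * ns), local_size);

clEnqueueNDRangeKernel(queue, propagation_kernel, 1, NULL,
                       &global_size, &local_size, 0, NULL, &event);
Note that if max_size is the element count, the guard in the kernel should probably be if (l >= max_size) return; rather than l > max_size, otherwise the work-item with id equal to max_size still writes one element past the end.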
