error CL_OUT_OF_RESOURCES while reading back data in host memory while using atomic function in opencl kernel - opencl

I am trying to use atomic functions in my OpenCL kernel. The multiple work-items (threads) I create all try to write to a single memory location in parallel, and I want them to execute that particular line of code serially. I have never used an atomic function before.
I found similar problems on many blogs and forums, and I am trying one of the suggested solutions, i.e. using two separate functions, 'acquire' and 'release', to lock and unlock a semaphore. I have enabled the necessary OpenCL extensions, all of which are supported by my device (NVIDIA GeForce GTX 630M).
My kernel execution configuration:
global_item_size = 8;
ret = clEnqueueNDRangeKernel(command_queue2, kernel2, 1, NULL, &global_item_size2, &local_item_size2, 0, NULL, NULL);
Here is my code: reducer.cl
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
/* One input record handled by the reducer kernel. */
typedef struct data
{
double dattr[10]; /* attribute values; the reducer kernel reads only dattr[0] and dattr[1] */
int d_id; /* presumably a record identifier -- not referenced by the reducer kernel */
int bestCent; /* presumably the index of the closest centroid (TODO confirm) -- not referenced by the reducer kernel */
}Data;
/* One output record written by the reducer kernel. */
typedef struct cent
{
double cattr[5]; /* attribute values; the reducer kernel writes only cattr[0] and cattr[1] */
int c_id; /* presumably a centroid identifier -- not referenced by the reducer kernel */
}Cent;
/*
 * Spin until this work-item has taken the lock word at *mutex.
 *
 * atom_xchg() stores 1 and hands back the value that was stored before; a
 * previous value greater than 0 means the lock was already held, so the
 * exchange is repeated until it returns a value that is not positive.
 *
 * NOTE(review): a blocking spin lock between work-items is only dependable
 * in implementation- and device-specific cases -- confirm on the target
 * device before relying on it.
 */
__global void acquire(__global int* mutex)
{
    while (atom_xchg(mutex, 1) > 0)
    {
        /* busy-wait: the lock is still held */
    }
}
/*
 * Unlock: atomically store 0 into *mutex so that a work-item spinning in
 * acquire() can take the lock.
 */
__global void release(__global int* mutex)
{
    (void)atom_xchg(mutex, 0); /* the returned previous value is deliberately discarded */
}
/*
 * Per-key reduction: every work-item adds the first two attributes of its
 * data record into sum[key][0..1]; work-items 0 and 1 then divide the sums by
 * countMobj[i] and store the result in centMobj[i].cattr[0..1].
 *
 * keyMobj   - for each work-item, the row index n of sum to accumulate into (read).
 * valueMobj - not referenced in this kernel.
 * dataMobj  - input records; only dattr[0] and dattr[1] are read.
 * centMobj  - output; cattr[0] and cattr[1] of elements 0 and 1 are written.
 * countMobj - divisor for each of the two rows of sum (read).
 * sumMobj   - not referenced in this kernel.
 * mutex     - lock word handed to acquire()/release().
 *
 * NOTE(review): sum is accumulated with += but is never given an initial
 * value in this kernel, and there is no barrier between the accumulation
 * loop and the averaging step -- both should be confirmed.
 */
__kernel void reducer(__global int *keyMobj, __global int *valueMobj,__global Data *dataMobj,__global Cent *centMobj,__global int *countMobj,__global double *sumMobj, __global int *mutex)
{
/* Accumulators and counts shared by the work-items of one work-group. */
__local double sum[2][2];
__local int cnt[2];
int i = get_global_id(0);
int n,j;
/* cnt is filled here but not read again anywhere in this kernel. */
if(i<2)
cnt[i] = countMobj[i];
/* NOTE(review): the write above targets __local memory while the fence flag names global memory -- confirm the intended flag. */
barrier(CLK_GLOBAL_MEM_FENCE);
/* n selects which row of sum this work-item contributes to. */
n = keyMobj[i];
for(j=0; j<2; j++)
{
barrier(CLK_GLOBAL_MEM_FENCE);
/* The lock is meant to make the read-modify-write of sum[n][j] exclusive. */
acquire(mutex);
sum[n][j] += dataMobj[i].dattr[j];
release(mutex);
}
/* Only work-items 0 and 1 turn the sums into averages and publish them. */
if(i<2)
{
for(j=0; j<2; j++)
{
sum[i][j] = sum[i][j]/countMobj[i];
centMobj[i].cattr[j] = sum[i][j];
}
}
}
Unfortunately, the solution does not seem to work for me. When I read centMobj back into host memory using
ret = clEnqueueReadBuffer(command_queue2, centMobj, CL_TRUE, 0, (sizeof(Cent) * 2), centNode, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue2, sumMobj, CL_TRUE, 0, (sizeof(double) * 2 * 2), sum, 0, NULL, NULL);
it is giving me error with error code = -5 (CL_OUT_OF_RESOURCES) for both centMobj and sumMobj.
I cannot tell whether the problem is in my atomic-function code or in reading the data back into host memory. If I am using the atomic functions incorrectly, please correct me.
Thank you in advance.

In OpenCL, synchronization between work items can be done only inside a work-group. Code trying to synchronize work-items across different work-groups may work in some very specific (and implementation/device dependent) cases, but will fail in the general case.
The solution is to either use atomics to serialize accesses to the same memory location (but without blocking any work item), or redesign the code differently.

Related

OpenCl cannot compile kernel with 'printf'

I am getting the error error: implicit declaration of function 'printf' is invalid in OpenCL when I try to build an OpenCL kernel. The kernel code is this
__kernel void conj_grad(int dim, int num_vals, __local float *r,
__local float *x, __local float* A_times_p, __local float *p,
__global int *rows, __global int *cols, __global float *A,
__global float *b, __global float *result) {
local float alpha, r_length, old_r_dot_r, new_r_dot_r;
local int iteration;
int id = get_local_id(0);
int start_index = -1;
int end_index = -1;
float Ap_dot_p;
printf("OpenCL Kernel ID: %d\n", id);
This gives me the error below
input.cl:14:4: error: implicit declaration of function 'printf' is invalid in OpenCL
input.cl:14:4: note: did you mean 'rint'?
/usr/include/clc/math/unary_decl.inc:1:39: note: 'rint' declared here
/usr/include/clc/math/rint.h:2:24: note: expanded from macro '__CLC_FUNCTION'
input.cl:46:45: warning: double precision constant requires cl_khr_fp64, casting to single precision
I am getting a negative return code from this function err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
I have already tried the solutions from these questions: "Using printf() in OpenCL kernel" and "printf function doesn't work in OpenCL kernel", and neither of them fixes it. When I try these solutions, I see a warning such as input.cl:1:26: warning: unknown OpenCL extension 'cl_amd_printf' - ignoring
Conclusion
It looks like my system does not support the printf extensions. The code below (borrowed from pmdj's answer) gives me the output shown after it. This looks like a classic case of why you should not depend on vendor-specific extensions to a standard.
#include <stdio.h>
#include <CL/cl.h>
int main(void) {
char ext_str[1024] = "";
size_t ext_str_len = sizeof(ext_str) - 1;
cl_device_id device_id;
cl_int err;
cl_platform_id platform;
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(ext_str), ext_str, &ext_str_len);
if(err < 0) {
perror("Couldn't get device info");
exit(1);
}
printf("CL extensions (%lu): '%s'\n", ext_str_len, ext_str);
return 0;
}
CL extensions (248): 'cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp64 cl_khr_fp16'
As you've found, printf isn't part of standard OpenCL, but some implementations offer extensions which enable it.
To check for extensions supported by your implementation, try something like this:
char ext_str[1024] = "";
size_t ext_str_len = sizeof(ext_str) - 1;
err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(ext_str), ext_str, &ext_str_len);
printf("CL extensions (%lu): '%s'\n", ext_str_len, ext_str);
If there aren't any cl_*_printf extensions listed, you are probably out of luck. If one is listed, you'll need to enable it as described in the other answers you linked. You might also want to check the specification for the specific extension yours supports in case it exhibits any particular quirks.

OpenCL channels dynamic indexing

I want to implement a systolic structure for matrix multiplication. My objective is to use a single kernel for every Processing Element so I will execute the same kernel from the host part multiple times.
To communicate between kernels I would like to use channels or pipes. The problem is that "channels extension does not support dynamic indexing into arrays of channel IDs". The number of kernels will depend on the size of the matrix so I will need some method to connect the channels to the corresponding kernels automatically.
Summarizing, I am looking for a method to create this functionality:
// Array of 32 channels, c0[0] .. c0[31], used by the producer and consumer kernels.
channel float c0[32];
// Producer: writes data_in[i] into channel c0[i] for i = 0..31.
// The channel is selected with the loop variable i; this is the indexing into
// the channel array that the question is asking how to achieve.
__kernel void producer (__global float * data_in){
for(int i=0; i<32; i++){
write_channel_altera(c0[i],data_in[i]);
}
}
// Consumer: reads one value from each channel c0[i], i = 0..31, and stores it
// in ret_buf[i]. The channel is again selected with the loop variable i.
__kernel void consumer (__global float * ret_buf){
for(int i=0; i<32; i++){
ret_buf[i]=read_channel_altera(c0[i]);
}
}
Thanks in advance!
OpenCL channels (Intel FPGA extension) do not support "true" dynamic
indexing, but you can work around this limitation in most cases by
using switch or #pragma unroll approach:
switch approach is described in Intel FPGA SDK for OpenCL Programming Guide:
channel int ch[WORKGROUP_SIZE];
/*
 * Each work-item reads from the channel whose index equals its global id.
 * The switch maps the run-time id onto channel indices that are written as
 * constants in the source, which is what the channels extension requires.
 */
__kernel void consumer() {
    int gid = get_global_id(0);
    int value = 0;
    switch(gid)
    {
        case 0: value = read_channel_intel(ch[0]); break;
        case 1: value = read_channel_intel(ch[1]); break;
        case 2: value = read_channel_intel(ch[2]); break;
        case 3: value = read_channel_intel(ch[3]); break;
        // ... one case per remaining channel index ...
        case WORKGROUP_SIZE-1: value = read_channel_intel(ch[WORKGROUP_SIZE-1]); break;
        default: break; // ids without a matching channel read nothing
    }
}
You can also use #pragma unroll if you have a loop over channels:
/*
 * Reads one value from every channel in ch into values[].
 * #pragma unroll asks the compiler to unroll the loop, so each read ends up
 * with a constant channel index instead of the loop variable.
 */
__kernel void consumer() {
    int values[WORKGROUP_SIZE];
    #pragma unroll
    for (int i = 0; i < WORKGROUP_SIZE; ++i) {
        values[i] = read_channel_intel(ch[i]);
    }
}
As far as I know, the maximum number of channels must be known well before the program is compiled for the board, because an FPGA cannot be programmed the way other computing systems are, with resources allocated on the fly. Once we know (at least) the maximum number, we can use
#pragma unroll
before we start the loop for reading/writing the channels

Swap memory pointers atomically on CUDA

I have two pointers in memory and I want to swap them atomically, but the atomic operations in CUDA support only integer types. Is there a way to do the following swap?
classA* a1 = malloc(...);
classA* a2 = malloc(...);
atomicSwap(a1,a2);
When writing device-side code...
While CUDA provides atomics, they can't cover multiple (possibly remote) memory locations at once.
To perform this swap, you will need to "protect" access to both these values with something like mutex, and have whoever wants to write values to them take a hold of the mutex for the duration of the critical section (like in C++'s host-side std::lock_guard). This can be done using CUDA's actual atomic facilities, e.g. compare-and-swap, and is the subject of this question:
Implementing a critical section in CUDA
A caveat to the above is mentioned by @RobertCrovella: if you can make do with, say, a pair of 32-bit offsets rather than a 64-bit pointer, and you store them in a 64-bit aligned struct, you can use compare-and-exchange on the whole struct to implement an atomic swap of the whole struct.
... but is it really device side code?
Your code actually doesn't look like something one would run on the device: Memory allocation is usually (though not always) done from the host side before you launch your kernel and do actual work. If you could make sure these alterations only happen on the host side (think CUDA events and callbacks), and that device-side code will not be interfered with by them - you can just use your plain vanilla C++ facilities for concurrent programming (like lock_guard I mentioned above).
I managed to get the behaviour I needed; it is not an atomic swap, but it is still safe. The context was a monotonic linked list working on both the CPU and the GPU:
// Overlays a pointer with an unsigned 64-bit integer so that the pointer value
// can be handed to integer atomics through the `address` member.
// NOTE(review): assumes T* and unsigned long long int have the same size -- confirm for the target platform.
template<typename T>
union readablePointer
{
T* ptr; // the pointer itself
unsigned long long int address; // the same storage viewed as an integer
};
// Linked list in which every node points to the node that was pushed before it.
// `start` is a sentinel marking the bottom of the list; `end.previous` refers
// to the most recently pushed node, or to `start` when the list is empty.
// On the device (__CUDA_ARCH__ defined) the links and the size are updated
// with atomicExch/atomicAdd/atomicSub; on the host plain assignments are used.
template<typename T>
struct LinkedList
{
    struct Node
    {
        T value;
        readablePointer<Node> previous; // link to the node pushed before this one
    };
    Node start;
    Node end;
    int size;

    // Resets the list to the empty state. Does not free any existing nodes.
    __host__ __device__ void initialize()
    {
        size = 0;
        start.previous.ptr = nullptr;
        end.previous.ptr = &start;
    }

    // Appends a copy of `value` as the new last element.
    __host__ __device__ void push_back(T value)
    {
        // malloc() takes only the byte count and returns the allocation.
        Node* node = static_cast<Node*>(malloc(sizeof(Node)));
        // Fail loudly rather than dereferencing a null pointer below.
        assert(node != nullptr);
        readablePointer<Node> nodePtr;
        nodePtr.ptr = node;
        nodePtr.ptr->value = value;
#ifdef __CUDA_ARCH__
        // Publish the new node as the tail and link it to the old tail in one exchange.
        nodePtr.ptr->previous.address = atomicExch(&end.previous.address, nodePtr.address);
        atomicAdd(&size,1);
#else
        nodePtr.ptr->previous.address = end.previous.address;
        end.previous.address = nodePtr.address;
        size += 1;
#endif
    }

    // Removes the last element and returns its value.
    // The list must not be empty (checked with assert).
    // NOTE(review): on the device the replacement link is read with a plain
    // load before the exchange, so concurrent pop_back/push_back calls may
    // interleave -- confirm this is acceptable for the intended usage.
    __host__ __device__ T pop_back()
    {
        assert(end.previous.ptr != &start);
        readablePointer<Node> lastNodePtr;
        lastNodePtr.ptr = nullptr;
#ifdef __CUDA_ARCH__
        lastNodePtr.address = atomicExch(&end.previous.address,end.previous.ptr->previous.address);
        atomicSub(&size,1);
#else
        lastNodePtr.address = end.previous.address;
        end.previous.address = end.previous.ptr->previous.address;
        size -= 1;
#endif
        T toReturn = lastNodePtr.ptr->value;
        free(lastNodePtr.ptr);
        return toReturn;
    }

    // Pops elements until the list is empty, freeing every node.
    __host__ __device__ void clear()
    {
        while(size > 0)
        {
            pop_back();
        }
    }
};

OpenCL corrupt input WIN32, valid on OSX Lion

I am having an issue with my OpenCL kernel. The input arguments are corrupt when they are passed to the kernel. What makes this strange is this same exact kernel executes flawlessly on mac osx. Once I started porting my code over to windows (windows 8 64-bit) I started having this issue.
I have provided an example using my camera struct. The x,y,z coordinates are defined as <0,0,200>. However, when they make it to my kernel they show as <0,-0.00132704, -0.00132704>.
I have a kernel that accepts two structs.
// Camera description passed to the ray_trace_scene kernel through a buffer.
// NOTE(review): the reported corruption is attributed in the answer to an
// alignment/packing mismatch; the size and alignment of cl_float3 on the host
// compiler versus the device should be confirmed.
typedef struct{
cl_float d;
cl_float3 eye; // the only member read by the slimmed-down kernel (eye.x, eye.y, eye.z)
cl_float3 lookat;
cl_float3 u;
cl_float3 v;
cl_float3 w;
cl_float3 up;
}rt_cl_camera;
// One output element of the ray_trace_scene kernel: three float components.
typedef struct {
float r;
float g;
float b;
} rt_cl_rgb;
I have slimmed down my kernel for the sake of testing. After tracking down the issue, I noticed that my input parameters were not coming over correctly. However, I have determined that my output is being passed back correctly.
// Debug kernel: every work-item whose global id is below pcount copies the
// camera's eye coordinates into its own element of output.
__kernel void ray_trace_scene( __global rt_cl_rgb* output,
                               __global rt_cl_camera* camera,
                               const unsigned int pcount)
{
    int pixel = get_global_id(0);
    // Work-items at or beyond pcount have nothing to write.
    if (!(pixel < pcount)) {
        return;
    }
    __global rt_cl_rgb* out = &output[pixel];
    out->r = camera->eye.x;
    out->g = camera->eye.y;
    out->b = camera->eye.z;
}
I am creating my input buffer with the following:
cl_mem cam_input;
cl_uint cam_error;
cam_input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(rt_cl_camera), NULL, &cam_error);
I am also checking to make sure my buffer was created successfully with
if (cam_error != CL_SUCCESS || !cam_input) {
throw std::runtime_error(CLERROR_FAILED_DEVBUFF);
}
I then write my data into my buffer with the following.
cl_uint err = 0;
err = clEnqueueWriteBuffer(commands, cam_input, CL_TRUE, 0, sizeof(rt_cl_camera), cam_ptr, 0, NULL, NULL);
if (err != CL_SUCCESS) {
throw std::runtime_error("Failed to write camera");
}
and finally binding my argument to the appropriate kernel argument slot. Please note that slot zero is used for my output.
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cam_input);
and checking that everything was successful..
if (err != CL_SUCCESS) {
throw std::runtime_error(CLERROR_FAILED_CMDARGS);
}
I am not receiving any error messages from openCL at any step of the process. Has anyone run into this? Any help is greatly appreciated.
side note - At each step of the way I am printing out my local variables to make sure they are correct and valid before I pass them over to the GPU.
Looks like an alignment/packing issue. Try using float4 instead of float3 in the struct, and move float d to the end.

__local atomic in opencl

About atomic access of __local variables:
I know it's slow to do global operations compared with local ones. In this sense I'd like to make atomic access of some variables.
I know I can do atomic operations in OpenCL:
// Program A:
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Every work-item atomically increments the first element of num.
__kernel void test(global int * num)
{
    atom_inc(num);
}
How do I share atomic data between work-items within a given work-group?
For example, I'd like to do something like this:
// Program B: (it doesn't work, just to show how I'd like it to be)
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Sketch of the desired behaviour: work-items whose global id is below
// numOperations atomically increment a counter held in __local memory.
// NOTE(review): the __local variable below reuses the name of the kernel
// parameter `num`, and it is never given an initial value before atom_inc().
__kernel void test(global int * num, const int numOperations)
{
__local int num;
if (get_global_id(0) < numOperations) {
atom_inc(&num);
}
}
In the end the num value should return: numOperations - 1;
Isn't this possible? If not how could I do it?
Typically, you have one work-item that initializes the shared (local) atomic variable, followed by a barrier. I.e., your kernel starts like this:
// Counter in __local memory, visible to all work-items of the work-group.
__local int sharedNum;
// Only the work-item with local id 0 sets the initial value.
if (get_local_id (0) == 0) {
sharedNum = 0;
}
// All work-items of the group wait here, so the initial value is in place
// before any of them increments the counter.
barrier (CLK_LOCAL_MEM_FENCE);
// Now, you can use sharedNum
while (is_work_left ()) {
atomic_inc (&sharedNum);
}
There's not much magic to it -- all items in a work-group can see the same local variables, so you can just access it as usual.

Resources