I want to pass a structure to an OpenCL kernel. The structure is:
/* Bundle of three typed pointers, one per element type. */
struct test
{
    int   *x;   /* points to int data */
    float *y;   /* points to float data */
    char  *z;   /* points to char data */
};
and the memory allocation and initialization is like
/* Allocate 100 ints, 50 floats and 25 chars, then fill each array. */
struct test t;
t.x = (int *) malloc(100 * sizeof *t.x);
t.y = (float *) malloc(50 * sizeof *t.y);
t.z = (char *) malloc(25 * sizeof *t.z);

/* z: every element is 'a'. */
for (i = 0; i < 25; i++)
    t.z[i] = 'a';

/* y: element i holds the value i. */
for (i = 0; i < 50; i++)
    t.y[i] = i;

/* x: element i holds the value i; filled last so that i ends at 100. */
for (i = 0; i < 100; i++)
    t.x[i] = i;
Can I pass such a structure to an OpenCL kernel?
You can pass such a structure, but it will be pointless because x, y and z point to different memory regions. Each of these memory buffers must be transferred on its own.
Instead of a structure, it's better to allocate the memory on the host side and pass each buffer to the kernel as a separate kernel argument.
Related
I would like to use the local/shared memory optimization to reduce global memory access, so I basically have this function
// Walks def_data_length elements of `data`, reading each one straight from
// global memory, and returns x_min. The code that declares and updates x_min
// is elided ("// ..."), as are the remaining parameters.
float __attribute__((always_inline)) test_unoptimized(const global float* data, ...) {
// ...
for(uint j=0; j<def_data_length; j++) {
const float x = data[j]; // one global-memory read per element
// do some computation with x, like finding the minimum value ...
}
// ...
return x_min;
}
and do the usual local/shared memory optimization on it:
// Walks def_data_length elements of `data` in tiles of def_ws elements and
// returns x_min (the code that declares and updates x_min is elided).
//
// For each tile, every work-item copies one element from global memory into
// the local array cache_x; after a barrier, every work-item iterates over the
// cached tile. A second barrier keeps the next tile's stores from overwriting
// values that other work-items are still reading.
//
// Both barriers must be reached by every work-item of the work-group, so all
// work-items of a group have to call this function together.
// NOTE(review): indexing cache_x by get_local_id(0) assumes the work-group
// size in dimension 0 does not exceed def_ws - confirm at the launch site.
float __attribute__((always_inline)) test_optimized(const global float* data, ...) {
	// ...
	const uint lid = get_local_id(0); // shared memory optimization (only works with first ray)
	local float cache_x[def_ws];
	for(uint j=0; j<def_data_length; j+=def_ws) {
		// The last tile may be partial. Skip the load instead of reading past
		// the end of data; the inner loop below never reads those slots.
		if(j+lid<def_data_length) {
			cache_x[lid] = data[j+lid];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		#pragma unroll
		for(uint k=0; k<min(def_ws, def_data_length-j); k++) {
			const float x = cache_x[k];
			// do some computation with x, like finding the minimum value ...
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	// ...
	return x_min;
}
Now the difficulty is that test_optimized is called in the kernel only in one of two possible if/else branches. If only some threads in a workgroup execute the else-branch, all other threads must not choose the if-branch for the local memory optimization in test_optimized to work. So I created a workaround: The condition for each thread in the workgroup is atomic_or-ed into an integer and then the integer, which is the same for all threads, is checked for branching. This ensures that, if 1 or more threads in the thread block choose the else-branch, all the others do too.
// Makes the whole work-group take the same branch before calling
// test_optimized, which contains work-group barriers.
//
// Each work-item ORs its own `condition` into the local variable
// condition_any. After the second barrier every work-item of the group sees
// the same value, so either all of them return early or all of them enter the
// else-part.
kernel void test_kernel(const global float* data, global float* result...) {
	const uint n = get_global_id(0);
	// ...
	const bool condition = ...; // here I get some condition based on the thread ID n and global data
	local uint condition_any; // make sure all threads within a workgroup are in the if/else part
	condition_any = 0u;
	barrier(CLK_LOCAL_MEM_FENCE); // the reset must be visible before any atomic_or
	atomic_or(&condition_any, condition);
	barrier(CLK_LOCAL_MEM_FENCE); // all contributions are in before the value is tested
	if(condition_any==0u) {
		// if-part is very short
		// Store through the pointer: assigning to `result` itself would only
		// change the local copy of the argument and write nothing.
		result[n] = 0;
		return;
	} else {
		// else-part calls test_optimized function
		const float x_min = test_optimized(data, ...);
		// Work-items that only came along for the barriers still output 0.
		result[n] = condition ? x_min : 0;
	}
}
The above code works flawlessly and is about 25% faster than with the test_unoptimized function. But atomically jamming a bit into the same local memory from all threads in the workgroup seems a bit like a hack to me and it only runs efficiently for small workgroup size (def_ws) 32, 64 or 128, but not 256 or greater.
Is this trick used in other codes and does it have a name?
If not: Is there a better way to do it?
With OpenCL 1.2 or older, I don't think there's a way to do this any faster. (I'm not aware of any relevant vendor extensions, but check your implementation's list for anything promising.)
With OpenCL 2.0+, you can use workgroup functions, in this case specifically work_group_any() for this sort of thing.
I created a buffer on the OpenCL device (a GPU), and from the host I need to know the global on-device pointer address so that I can put that on-device address in another buffer so that the kernel can then read from that buffer that contains the address of the first buffer so that then it can access the contents of that buffer.
If that's confusing here's what I'm trying to do: I create a generic floats-containing buffer representing a 2D image, then from the host I create a todo list of all the things my kernel needs to draw, which lines, which circles, which images... So from that list the kernel has to know where to find that image, but the reference to that image cannot be passed as a kernel argument, because that kernel might draw no image, or a thousand different images, all depending on what the list says, so it has to be referenced in that buffer that serves as a todo list for my kernel.
The awkward way I've done it so far:
To do so I tried making a function that calls a kernel after the creation of the image buffer that gets the buffer and returns the global on-device address as a ulong in another buffer, then the host stores that value in a 64-bit integer, like this:
/*
 * Returns the address a kernel sees for the global buffer `buf`, as a 64-bit
 * integer, or 0 if the address could not be obtained.
 *
 * A helper kernel run with a single work-item casts its buffer argument to
 * ulong and stores it in a one-element device buffer, which is then read back
 * to the host with a blocking read.
 *
 * The program, kernel and result buffer are created on the first call and kept
 * in static storage, so they belong to the context of the clctx passed on that
 * first call.
 */
uint64_t get_clmem_device_address(clctx_t *clctx, cl_mem buf)
{
	const char kernel_source[] =
"kernel void get_global_ptr_address(global void *ptr, global ulong *devaddr) \n"
"{ \n"
" *devaddr = (ulong) ptr; \n"
"} \n";

	static int init=1;
	static cl_program program;
	static cl_kernel kernel;
	static cl_mem ret_buffer;
	const size_t global_work_size[1] = { 1 };	// one work-item is enough
	uint64_t devaddr = 0;
	cl_int ret;

	if (init)
	{
		init=0;

		// NOTE(review): the return conventions of these two project helpers
		// are not visible here; a failed setup is detected below through
		// `kernel` still being NULL - confirm the helpers leave it untouched
		// on failure.
		ret = build_cl_program(clctx, &program, kernel_source);
		ret = create_cl_kernel(clctx, program, &kernel, "get_global_ptr_address");

		ret_buffer = clCreateBuffer(clctx->context, CL_MEM_WRITE_ONLY, sizeof(devaddr), NULL, &ret);
		if (ret != CL_SUCCESS)
			ret_buffer = NULL;
	}

	if (kernel==NULL || ret_buffer==NULL)
		return 0;

	// Run the kernel
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
	if (ret != CL_SUCCESS)
		return 0;

	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &ret_buffer);
	if (ret != CL_SUCCESS)
		return 0;

	ret = clEnqueueNDRangeKernel(clctx->command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
		return 0;

	// Blocking read: returns once the value has been copied into devaddr, so
	// no separate clFlush/clFinish is needed.
	// NOTE(review): relies on the queue executing commands in order, so that
	// the read runs after the kernel.
	ret = clEnqueueReadBuffer(clctx->command_queue, ret_buffer, CL_TRUE, 0, sizeof(devaddr), &devaddr, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
		return 0;

	return devaddr;
}
Apparently this works (it does return a number, although it's hard to know if it's correct), but then I put this devaddr (a 64-bit integer on the host) in the todo list buffer that the kernel uses to know what to do, and then if necessary (according to the list) the kernel calls the function below, le here being a pointer to the relevant entry in the todo list, and the 64-bit address being the first element:
// Adds the image pixel under the current work-item to pv and returns the sum;
// pv is returned unchanged when the work-item lies outside the image.
//
// Fields read from the list entry `le`:
//   first 8 bytes (le[0..1]) - device address of the image, stored as a ulong
//   le[2], le[3]             - image dimensions in x and y
float4 blit_sprite(global uint *le, float4 pv)
{
	const int2 pos = (int2) (get_global_id(0), get_global_id(1));

	// Reinterpret the start of the entry as the 64-bit address, then turn that
	// integer back into a global pointer to the pixel data.
	const ulong addr = *((global ulong *) le);
	global float4 *pixels = (global float4 *) addr;

	int2 dim;
	dim.x = le[2];
	dim.y = le[3];

	// Nothing to add outside the image bounds.
	if (pos.x >= dim.x || pos.y >= dim.y)
		return pv;

	// Row-major lookup: dim.x pixels per row.
	return pv + pixels[pos.y * dim.x + pos.x];
}
but big surprise this doesn't work, it gives me a CL_OUT_OF_RESOURCES error, which I assume means my im pointer isn't valid. Actually it works, it didn't work when I used two different contexts. But it's still pretty unwieldy.
Is there a less weird way to do what I want to do?
OpenCL standard doesn't guarantee that memory objects will not be physically reallocated between kernel calls. So, original Device-side address is valid only within single kernel NDRange. That's one of the reasons why OpenCL memory objects are represented on Host side as transparent structure pointers.
Though, you can save offset to memory object's first byte in 1st kernel and pass it to 2nd kernel. Every time you launch your kernel, you will obtain actual Device-side address within your kernel & increment it by saved shift value. That would be perfectly "legal".
When passing buffers as arguments to OpenCL kernels, will the address of a buffer as seen by the kernel code remain the same for the same buffer?
I used the code below to check, and it seems that the addresses are indeed the same. However, I can't find anything in the standard to guarantee this.
import pyopencl as cl
import numpy as np
# Kernel that stores the address of its first argument, cast to long, in *out.
_GET_ADDR_SRC = """
__kernel void
get_addr(__global const int *in, __global long *out)
{
*out = (long)in;
}
"""


def _query_device_address(ctx, queue, buf, out_buf):
    """Build the get_addr kernel, run it on `buf`, and return the address it reports.

    The program is rebuilt on every call, so two calls use two independent
    program/kernel objects. `out_buf` receives the address on the device and is
    copied back into a one-element int64 array.
    """
    prg = cl.Program(ctx, _GET_ADDR_SRC).build()
    knl = prg.get_addr
    knl.set_args(buf, out_buf)
    cl.enqueue_task(queue, knl)
    b = np.empty([1], dtype=np.int64)
    cl.enqueue_copy(queue, b, out_buf).wait()
    return b[0]


def main():
    """Print the device address of one buffer as seen by two separately built kernels."""
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    buf = cl.Buffer(ctx, mf.READ_ONLY, 1000)
    buf2 = cl.Buffer(ctx, mf.READ_WRITE, 8)  # 8 bytes: one long for the address

    # Same buffer, two independent builds and launches; compare the two values printed.
    print(_query_device_address(ctx, queue, buf, buf2))
    print(_query_device_address(ctx, queue, buf, buf2))


if __name__ == '__main__':
    main()
The use case is that I am running a simulation using OpenCL which has many parameters (arrays). In order to avoid passing these arrays around as arguments, I store them in a struct and pass a pointer to the struct around instead. Since this struct will be used many times (and by all work items), I would like to avoid filling it in on every run of every kernel, and would like to know whether the pointers will change between different runs/work items.
It is not guaranteed for OpenCL 1.x. This is why it is unsafe to store pointers in buffers. The runtime is allowed to move the allocation for each kernel launch. There is no guarantee that it will move it, and of course it is reasonable to expect that the buffer will not often need to move so it isn't surprising that you'd see the result you see. If you allocate a lot more buffers and cycle through them to force the runtime to move them around you will be more likely to see the issue.
For OpenCL 2.0 the shared virtual memory feature guarantees this by definition: the address couldn't be shared if it kept changing.
I have allocated a buffer on the device:
cl_mem buff;
I want to pass this buffer plus an offset to my kernel
i.e.
buff + offset;
I find that this is not allowed. If I instead pass buff into my kernel and then
calculate the offset buffer inside the kernel, then this is fine. But it adds a needless calculation to each kernel run.
So, I get that the device memory space is different than the host, so I can't do simple pointer arithmetic. But, is there a way of taking an address to a device memory buffer,
calculating an offset, and passing this offset buffer into the kernel?
I think this may be possible with clCreateSubBuffer, but the offset needs to be aligned to the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN, and this is not always possible for my kernel.
Using clCreateSubBuffer
If the offset can be calculated statically, pass it as a macro when building the program that contains your kernel:
Assuming you are using C++
// Build the compiler option string "-D offset=<value>" that defines the macro
// `offset` for the OpenCL program.
std::string macro;
std::stringstream ss;
// e.g. let it be 2^10
std::size_t offset = 1024;
ss << offset; // format the number as decimal text
macro = "-D offset=";
macro += ss.str();
...
// When building the program, pass the string as the build options argument
clBuildProgram(..., macro.c_str(), ...);
//Inside your Kernel macro "offset" is defined
void __kenel my(
__global const uchar* data)
{
__global const uchar* data_with_shift = data + offset;
return;
}
Though, calculations inside the kernel are extremely cheap, so Marco13 gave you good advice.
I have a kernel which I am running on a NVidia GTX 680 that increased in execution time when switching from using global memory to local memory.
My kernel which is part of a finite element ray tracer now loads each element into local memory before processing. The data for each element is stored in a struct fastTriangle which has the following definition :
/*
 * One triangle stored as sixteen consecutive floats, in four groups of four
 * (c*, n*, u*, v*).
 */
typedef struct fastTriangle {
    /* c group */
    float cx;
    float cy;
    float cz;
    float cw;
    /* n group */
    float nx;
    float ny;
    float nz;
    float nd;
    /* u group */
    float ux;
    float uy;
    float uz;
    float ud;
    /* v group */
    float vx;
    float vy;
    float vz;
    float vd;
} fastTriangle;
I pass an array of these objects to the kernel, which is written as follows (I have removed the irrelevant code for brevity):
// For every object i, copies objects[i] into triangles[0], then for every
// sample/ray tests all objects in turn, staging each one in triangles[1] via
// async_work_group_copy. Hits are counted per object in the local array x_res
// with atomic_inc. Large parts of the body are elided in this listing
// (wsDir, objectNo and the Vector type come from code that is not shown).
__kernel void testGPU(int n_samples, const int n_objects, global const fastTriangle *objects, __local int *x_res, __global int *hits) {
// Get gid, lid, and lsize
// Set up random number generator and thread variables
// Local storage for the two triangles being processed
__local fastTriangle triangles[2];
for(int i = 0; i < n_objects; i++) { // Fire ray from each object
// Copy objects[i] into triangles[0] as sizeof(fastTriangle)/sizeof(float) floats
event_t evt = async_work_group_copy((local float*)&triangles[0], (global float*)&objects[i],sizeof(fastTriangle)/sizeof(float),0);
//Initialise local memory x_res to 0's
barrier(CLK_LOCAL_MEM_FENCE);
wait_group_events(1, &evt);
Vector wsNormal = { triangles[0].cw*triangles[0].nx, triangles[0].cw*triangles[0].ny, triangles[0].cw*triangles[0].nz};
for(int j = 0; j < n_samples; j+= 4) {
// generate a float4 of random numbers here (rands)
for(int v = 0; v < 4; v++) { // For each ray in ray packet
//load the first object to be intersected
evt = async_work_group_copy((local float*)&triangles[1], (global float*)&objects[0],sizeof(fastTriangle)/sizeof(float),0);
// Some initialising code and calculate ray here
// Should have ray fully specified at this point;
for(int w = 0; w < n_objects; w++) { // Check for intersection against each object
// Wait for the copy of object w into triangles[1] to complete
wait_group_events(1, &evt);
// Check for intersection against object w
float det = wsDir.x*triangles[1].nx + wsDir.y*triangles[1].ny + wsDir.z*triangles[1].nz;
float dett = triangles[1].nd - (triangles[0].cx*triangles[1].nx + triangles[0].cy*triangles[1].ny + triangles[0].cz*triangles[1].nz);
float detpx = det*triangles[0].cx + dett*wsDir.x;
float detpy = det*triangles[0].cy + dett*wsDir.y;
float detpz = det*triangles[0].cz + dett*wsDir.z;
float detu = detpx*triangles[1].ux + detpy*triangles[1].uy + detpz*triangles[1].uz + det*triangles[1].ud;
float detv = detpx*triangles[1].vx + detpy*triangles[1].vy + detpz*triangles[1].vz + det*triangles[1].vd;
// Interleaving the copy of the next triangle
// NOTE(review): on the last iteration (w == n_objects-1) this reads objects[n_objects],
// one element past the end if `objects` holds exactly n_objects entries - confirm.
// NOTE(review): no barrier separates the reads of triangles[1] above from this copy
// overwriting it; check whether other work-items can still be reading the old triangle.
evt = async_work_group_copy((local float*)&triangles[1], (global float*)&objects[w+1],sizeof(fastTriangle)/sizeof(float),0);
// Complete intersection calculations
} // end for each object intersected
// objectNo is set by the elided intersection code; -1 is treated as "no hit" here
if(objectNo != -1) atomic_inc(&x_res[objectNo]);
} // end for sub rays
} // end for each ray
barrier(CLK_LOCAL_MEM_FENCE);
// Add all the local x_res to global array hits
barrier(CLK_GLOBAL_MEM_FENCE);
} // end for each object
}
When I first wrote this kernel I did not buffer each object in local memory and instead just accessed it from global memory, i.e. instead of triangles[0].cx I would use objects[i].cx
When setting out to optimise I switched to using local memory as listed above, but then observed an execution run time increase of around 25%.
Why would performance be worse when using local memory to buffer the objects instead of directly accessing them in global memory?
It really depends on your program if local memory helps you to run faster. There are two things to consider when using local memory:
you have additional computation when copying the data from global to local and from local to global again.
I see that you call "barrier(...)" three times; these barriers are performance killers. All work-items in a work-group have to wait at the barrier for all the others. This way the parallelism is hindered and the tasks don't run independently any more.
Local memory is great when you read data lots of times in your computation. But the fast reads and writes need to get you more performance gain than the copying and synchronizing takes.