How to pass parameters to ocl kernel using pyopencl?

How to pass parameters to ocl kernel using pyopencl? - opencl

How can I pass parameters from pyopencl so that they are treated in the .cl file as preprocessor defines?
Meaning:
foo.cl
# define LIMIT 12
typedef struct {
uint i[LIMIT];
} foomatic;
turns to
foo_nodefs.cl
typedef struct {
uint i[LIMIT]; // python script passing LIMIT to set it
} foomatic;
Thanks,
John

Edit: extending the answer, making it maximally detailed.
There are two ways to do that:
(metaprogramming) Add your preprocessor directives directly to the string with the source code, or even run your own preprocessor using some templating engine.
import pyopencl as cl
import numpy
import numpy.linalg as la
# Host-side inputs: two random single-precision vectors of equal length.
a = numpy.random.rand(50000).astype(numpy.float32)
b = numpy.random.rand(50000).astype(numpy.float32)

# OpenCL context and the command queue all work is submitted to.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Device buffers: the two inputs are copied from the host arrays at creation,
# the output buffer is only allocated (same byte size as the inputs).
mf = cl.mem_flags
input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
a_buf = cl.Buffer(ctx, input_flags, hostbuf=a)
b_buf = cl.Buffer(ctx, input_flags, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

# Preprocessor definitions that get prepended to the kernel source text.
defines = """
#define AXIS 0
#define COEFF 1
"""

# Kernel source; AXIS and COEFF are resolved by the #define lines above.
kernel_body = """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int gid = get_global_id(AXIS);
c[gid] = a[gid] + b[gid] + COEFF;
}
"""
prg = cl.Program(ctx, defines + kernel_body).build()

# Launch one work-item per element and let the runtime pick the local size.
prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)

# Read the result back and compare it with the same sum computed on the host.
a_plus_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_plus_b, dest_buf)
error_norm = la.norm(a_plus_b - (a+b+1))
result_norm = la.norm(a_plus_b)
print(error_norm, result_norm)
(C-way) use options keyword of Program.build to pass build options directly to clBuildProgram():
import pyopencl as cl
import numpy
import numpy.linalg as la
# Host-side inputs: two random single-precision vectors of equal length.
a = numpy.random.rand(50000).astype(numpy.float32)
b = numpy.random.rand(50000).astype(numpy.float32)

# OpenCL context and the command queue all work is submitted to.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Device buffers: the two inputs are copied from the host arrays at creation,
# the output buffer is only allocated (same byte size as the inputs).
mf = cl.mem_flags
input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
a_buf = cl.Buffer(ctx, input_flags, hostbuf=a)
b_buf = cl.Buffer(ctx, input_flags, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

# Kernel source; AXIS and COEFF are not defined in the text itself but are
# supplied as -D options when the program is built.
kernel_src = """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int gid = get_global_id(AXIS);
c[gid] = a[gid] + b[gid] + COEFF;
}
"""
build_options = ['-D', 'AXIS=0', '-D', 'COEFF=1']
prg = cl.Program(ctx, kernel_src).build(options=build_options)

# Launch one work-item per element and let the runtime pick the local size.
prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)

# Read the result back and compare it with the same sum computed on the host.
a_plus_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_plus_b, dest_buf)
error_norm = la.norm(a_plus_b - (a+b+1))
result_norm = la.norm(a_plus_b)
print(error_norm, result_norm)
(I have used the modified source code from the main page of the PyOpenCL docs. Tested on pyopencl 2013.1.)

Related

Is it possible to run a 4 dimensional work item in pyopencl?

I have a pyopencl based code which runs perfectly fine for 3-dimensional work groups, but when moving to 4-dimensional work groups, it breaks down with the error:
pyopencl._cl.LogicError: clEnqueueNDRangeKernel failed: INVALID_WORK_DIMENSION
Digging around, I found this answer to another question, which implies that OpenCL in fact allows higher-dimensional work groups.
So my question is if it is possible to change this setting in pyopencl. From this other answer elsewhere, I understand that pyopencl immediately inputs the dimensions, but given the error I have, I think there must be some issue.
This is a minimal sample code to replicate this error.
The code works well for the first kernel function, it breaks down on the second one.
import pyopencl as cl
import numpy as np
# Minimal reproduction: launch a 3-dimensional kernel, then a 4-dimensional one.
# Create an OpenCL context and a command queue on it.
context = cl.create_some_context()
queue = cl.CommandQueue(context)
# Source for two kernels that each write the work-item's flattened global
# index into `output`: fun3d flattens three global ids, fun4d flattens four.
kernel_code = """
__kernel void fun3d( __global double *output)
{
size_t i = get_global_id(0);
size_t j = get_global_id(1);
size_t k = get_global_id(2);
size_t I = get_global_size(0);
size_t J = get_global_size(1);
#
size_t idx = k*J*I + j*I + i;
#
output[idx] = idx;
}
__kernel void fun4d( __global double *output)
{
size_t i = get_global_id(0);
size_t j = get_global_id(1);
size_t k = get_global_id(2);
size_t l = get_global_id(3);
size_t I = get_global_size(0);
size_t J = get_global_size(1);
size_t K = get_global_size(2);
#
size_t idx = l*K*J*I + k*J*I + j*I + i;
#
output[idx] = idx;
}
"""
program = cl.Program(context, kernel_code).build()
# Global work size along each of the (up to) four dimensions.
I = 2
J = 3
K = 4
L = 5
# 3D launch: one work-item per element of the I*J*K output array, followed by
# a copy of the device buffer back into the host array.
output3d = np.zeros((I*J*K)).astype(np.float64)
cl_output3d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output3d.nbytes)
program.fun3d(queue, (I,J,K), None, cl_output3d)
cl.enqueue_copy(queue, output3d, cl_output3d)
queue.finish()
# Debugging aid: opens an interactive console that can see the script's variables.
import code; code.interact(local=dict(globals(), **locals()))
# 4d attempt
# NOTE(review): this launch passes a 4-element global size. According to the
# error message quoted with this snippet, it fails with INVALID_WORK_DIMENSION
# when the device supports at most 3 work-item dimensions.
output4d = np.zeros((I*J*K*L)).astype(np.float64)
cl_output4d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output4d.nbytes)
program.fun4d(queue, (I,J,K,L), None, cl_output4d)
cl.enqueue_copy(queue, output4d, cl_output4d)
queue.finish()

Trying to specify more dimensions than the implementation supports is not going to work.
The maximum number of supported dimensions can be queried via CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS or in terminal, for example:
$ clinfo | grep dim
Max work item dimensions 3

OpenCL event system not waiting

I have a simple openCL Kernel, called Test_Kernel.cl that reads from an image.
// Program-scope sampler: unnormalized coordinates, clamped addressing and
// nearest-neighbour filtering.
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// Reads one texel of `image` at the calling work-item's global position.
// NOTE: the value read is stored in a local variable and is neither returned
// nor written anywhere.
void read(image3d_t image)
{
// Global dimension 0 is used as z and global dimension 2 as x.
int z = get_global_id(0);
int y = get_global_id(1);
int x = get_global_id(2);
// Take the first component of the float4 returned by read_imagef.
float value = read_imagef(image,sampler,(float4)(x,y,z,0)).s0;
}
// Kernel entry point: forwards its image argument to read().
__kernel void test(image3d_t d_testdata)
{
read(d_testdata);
}
And the corresponding pyopencl file to copy a numpy array on the device, to be able to read from it:
import pyopencl as cl
import numpy as np
# Set up the context, the device and two command queues on the same context.
platform = cl.get_platforms()[0]
devs = platform.get_devices()
device1 = devs[0]
ctx = cl.Context([device1])
queue = cl.CommandQueue(ctx)
queue2 = cl.CommandQueue(ctx)
# Define the test data: a C-ordered float32 array of shape (2, 64, 64).
h_testdata = np.arange(4096*2).reshape((2,64,64)).astype(np.float32,order='C')
mf = cl.mem_flags
# Build the kernel from the source file.
# NOTE(review): the name `f` is bound to the file object here and rebound to a
# function by the `def f()` below; the file is never explicitly closed.
f = open('Test_Kernel.cl', 'r')
fstr = "".join(f.readlines())
prg = cl.Program(ctx, fstr).build()
test_knl = prg.test
# Creates the device image, enqueues a non-blocking host-to-image copy on
# `queue`, then enqueues the kernel on `queue2` with the copy's event in wait_for.
# NOTE(review): h_testdata.shape, i.e. (2, 64, 64), is passed unchanged as the
# image shape and as the copy region, while the kernel runs over (64, 64, 64) --
# confirm the intended axis order and extents.
# NOTE(review): the event is produced on `queue` and consumed on `queue2`; check
# whether `queue` must be flushed for cross-queue waits (TODO confirm against
# the OpenCL specification).
def f():
d_testdata = cl.Image(ctx, mf.READ_ONLY, cl.ImageFormat(cl.channel_order.INTENSITY,cl.channel_type.FLOAT),h_testdata.shape)
wev1=cl.enqueue_copy(queue, d_testdata, h_testdata, is_blocking = False, origin = (0,0,0), region = h_testdata.shape)
test_knl.set_args(d_testdata)
cl.enqueue_nd_range_kernel(queue2,test_knl,(64,64,64),None,wait_for=[wev1])
f()
As the program flow will get more complex further down the road, I am using events and multiple queues to take care of ordering and synchronisation. However, in my profiler I can see that the kernel sometimes executes BEFORE the copy event. What am I doing wrong?

Use of variables in kernel function using OpenCL

My code is as follows.
* host code******
//some declarations
/* Handles for the two device buffers that are later passed to the kernel. */
cl_mem Curr_domain = NULL;
cl_mem dMobj = NULL;
// `check` and `AvgBlk` were created as arrays of struct domainBlock (not shown)
// Now create the buffers for the above data
/* Read-only buffer of dCount blocks that uses the host memory at AvgBlk. */
dMobj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,
dCount*sizeof(struct domainBlock), AvgBlk, &ret);
/* Read-write buffer of dCount blocks.
 * NOTE(review): a host pointer (check) is passed here without
 * CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR -- confirm against the
 * clCreateBuffer documentation that this combination is accepted; `ret` is
 * not inspected in the code shown. */
Curr_domain = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount *
sizeof(struct domainBlock), check, &ret);
//passed these 2 objects to the kernel
******* kernel code******
// Kernel taking two __global pointers to struct domainBlock (the struct
// definition is not part of this snippet).
// As shown, the body only reassigns the kernel's own Curr_domain pointer on
// the first loop iteration; no buffer contents are read or written.
__kernel void calculateRms( __global struct domainBlock* dMobj,
__global struct domainBlock* Curr_domain )
{
// l and i are declared but not used in the code shown.
int l = get_global_id(0);
int i=0;
int iType=0;
// Eight passes, one per iType value 0..7.
for(iType =0; iType<8;iType++)
{
if(iType==0)
{
// Pointer assignment only: makes Curr_domain refer to the dMobj buffer.
Curr_domain = dMobj;
}
}
}
This gives me a build failure error!
But if I initialize Curr_domain = dMobj; before the loop, it doesn't. However, I want to do this 6 more times. So how can I fix this issue?
I'm using Intel(R) HD Graphics 4000 for executing this program.

substitutions for cl_khr_int64_base_atomics

I have an ATI FirePro V4800 graphics card which does not support cl_khr_int64_base_atomics. I am trying to adapt the RadixSort algo for long integers. The algo uses atomic_inc, the 64-bit version of which is atom_inc, which I cannot use in the kernel. So my question is: is there a piece of code which performs the same function as atomic_inc and which can be used instead? The relevant piece of kernel code is given below:
// Per-work-group histogram of one 8-bit digit of the input values.
//   unsortedData : input values, one per work-item (indexed by global id)
//   buckets      : output; each group writes its counters at
//                  groupId * groupSize + localId
//   shiftCount   : right shift applied before the low 8 bits are taken
//   sharedArray  : local-memory counters, one per bin
// NOTE(review): bins range over 0..255 but only sharedArray[localId] is
// zeroed and copied out, so this presumably assumes a local size of 256 and a
// sharedArray of at least 256 entries -- TODO confirm against the host code.
__kernel void histogram(__global uint* unsortedData,
__global uint* buckets,
uint shiftCount,
__local uint* sharedArray)
{
size_t localId = get_local_id(0);
size_t globalId = get_global_id(0);
size_t groupId = get_group_id(0);
size_t groupSize = get_local_size(0);
// Only referenced by the commented-out bucketPos variant below.
uint numGroups = get_global_size(0) / get_local_size(0);
// Initialize shared array to zero //
sharedArray[localId] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Calculate thread-histograms //
uint value = unsortedData[globalId];
// >> binds tighter than &, so this is (value >> shiftCount) & 0xFF.
value = value >> shiftCount & 0xFFU;
// Count this value in its bin; atomic because several work-items may hit the same bin.
atomic_inc(sharedArray+value);
barrier(CLK_LOCAL_MEM_FENCE);
// Copy calculated histogram bin to global memory //
uint bucketPos = groupId * groupSize + localId ;
//uint bucketPos = localId * numGroups + groupId ;
buckets[bucketPos] = sharedArray[localId];
}
Any suggestions? Thank you.
Edit:
Another way to do the same is given in this blog post: http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html. It gives a very generic implementation of an atomic increment.

You could try something like this:
/*
 * Increment a 64-bit counter stored as two 32-bit words in local memory.
 * counter[0] holds the low word and counter[1] the high word.
 *
 * The two words are updated by separate atomic operations, so the pair is
 * not updated as a single atomic unit.
 */
void atomInc64 (__local uint *counter)
{
    __local uint *low_word = counter;
    __local uint *high_word = counter + 1;

    /* atomic_inc returns the value the low word held before the increment. */
    const uint previous = atomic_inc (low_word);

    /* The low word wrapped around exactly when it was all ones before. */
    const uint carry = (previous == 0xFFFFFFFF) ? 1 : 0;

    /* Add the carry (0 or 1) to the high word. */
    atomic_add (high_word, carry);
}
Where counter is an array of two 32-bit integers. While the two halves don't increment at exactly the same time, the total should be correct when the program completes.

How to pass an array of vectors in pyOpenCL

I'm moving a simulation into pyOpenCL and can't get my data access to work. I'm trying to supply a 1D array of vectors (well, actually several, but the example I've included just used one).
Currently, several vectors are copied over just fine, but then the data is simply not what I supplied.
I don't think I've posted here before, so apologies if any of the formatting/presentation is wrong. Also, I've just stripped out all the simulation code, so I realise this code is currently not actually doing anything, I just want to get the buffer passing correct.
Thanks in advance.
The kernel (kertest.py):
# OpenCL source template for kernel k1. The tokens "xdim" and "ydim" in the
# two #define lines are placeholders that the host script replaces textually
# with the grid dimensions before building the program.
# In the kernel, work-items whose doubled local id and doubled group id are
# both 0 print spins[i].x for every i in [0, X*Y*3).
step1 = """
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define X xdim
#define Y ydim
__kernel void k1(__global float3 *spins,
__local float3 *tile)
{
ushort lid = 2 * get_local_id(0);
ushort group = 2 * get_group_id(0);
ushort num = get_num_groups(0);
int lim = X*Y*3;
for (ushort i = 0; i < lim; i++)
{
if (lid == 0 && group == 0)
{
printf("%f :: %d\\n", spins[i].x, i);
}
}
}"""
The code itself (gputest.py):
import kertest as k2D
import numpy as np
import pyopencl as cl
class GPU_MC2DSim():
    """Host-side driver that uploads a grid of 3-component vectors to OpenCL.

    Attributes:
        x, y: the extents exactly as passed to the constructor.
        xdim, ydim: integer grid extents; ``xdim`` is the larger of the two
            inputs, and both are rounded up to the next even number.
        M: ``float32`` array of shape ``(xdim * ydim, 3)`` whose columns are
            initialised to 1.0, 2.0 and 3.0 respectively.
    """

    def __init__(self, x, y):
        """Derive the grid extents from *x* and *y* and build the host array."""
        self.x = x
        self.y = y
        # The larger input becomes xdim, the smaller one ydim.
        if x >= y:
            self.xdim = int(self.x)
            self.ydim = int(self.y)
        else:
            self.xdim = int(self.y)
            self.ydim = int(self.x)
        # Round odd extents up to the next even number.
        if self.xdim % 2 != 0:
            self.xdim += 1
        if self.ydim % 2 != 0:
            self.ydim += 1
        # One row per grid site; columns hold 1.0, 2.0 and 3.0.
        self.M = np.ones((self.xdim * self.ydim, 3)).astype(np.float32)
        self.M[:, 1] += 1.0
        self.M[:, 2] += 2.0
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(self.M)

    def simulate(self):
        """Upload ``M``, build kernel ``k1`` for this grid size and run it once.

        Blocks until the kernel has finished. Always returns ``None``.
        """
        ctx = cl.create_some_context()
        q = cl.CommandQueue(ctx)
        mf = cl.mem_flags
        # Pass buffer: device copy of M, initialised from the host array.
        M_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.M)
        # Insert kernel parameters into a local copy of the template, so the
        # placeholders in k2D.step1 stay available for other grid sizes.
        params = {
            "xdim": "%d" % (self.xdim),
            "ydim": "%d" % (self.ydim),
        }
        source = k2D.step1
        for name, value in params.items():
            source = source.replace(name, value)
        # Compile kernel:
        program = cl.Program(ctx, source).build()
        locmem = cl.LocalMemory(self.xdim * 4 * 4)
        # Floor division keeps the work sizes integral on Python 3 as well.
        global_size = ((self.xdim * self.ydim) // 4,)
        local_size = (self.xdim // 2,)
        program.k1(q, global_size, local_size, M_buf, locmem).wait()
        return None
# Driver: build a simulation for a 4 x 4 grid and run the kernel once.
xdim = 4
ydim = 4
sim = GPU_MC2DSim(xdim, ydim)
sim.simulate()

Your code for copying the data to the device is just fine. However, your kernel has at least two problems:
float3 values are expected to be 16-byte aligned, as per OpenCL 1.2 Spec, 6.1.5:
For 3-component vector data types, the size of the data type is 4 * sizeof(component). This means that a 3-component vector data type will be aligned to a 4 * sizeof(component) boundary. The vload3 and vstore3 built-in functions can be used to read and write, respectively, 3-component vector data types from an array of packed scalar data type.
The values you upload to the devices are not properly aligned for the kernel to read float3 values directly.
Your limit calculation int lim = X*Y*3; is slightly off. You are already trying to read from an array of float3, so the *3 is superfluous.
The solution to both problems is simple: as stated in the spec, you should use vload3 to load from an array of floats:
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define X xdim
#define Y ydim
/*
 * Debug kernel: prints every 3-component vector stored in `spins`.
 *
 * spins : packed array of floats, three consecutive floats per vector
 *         (X*Y vectors in total); read with vload3, so no 16-byte float3
 *         alignment is required of the host data.
 * tile  : local scratch memory; not used by this kernel.
 *
 * X and Y expand to the xdim / ydim tokens above, which must be replaced
 * with the actual grid dimensions before the program is built.
 */
__kernel void k1(__global float *spins,
                 __local float3 *tile)
{
    /* Only the first work-item of the first work-group prints. Comparing the
     * ids directly avoids truncating doubled ids to ushort, which could wrap
     * to 0 for large ids. */
    if (get_local_id(0) != 0 || get_group_id(0) != 0)
        return;

    /* Number of vectors, not floats: vload3 indexes in units of 3 floats. */
    int lim = X*Y;

    /* int counter: a ushort counter wraps at 65535 and would never reach a
     * larger lim. */
    for (int i = 0; i < lim; i++)
    {
        float3 vec = vload3(i, spins);
        printf("(%f, %f, %f) :: %d\\n", vec.x, vec.y, vec.z, i);
    }
}