I have a simple OpenCL kernel, called Test_Kernel.cl, that reads from an image.
// Sampler shared by every image read in this program: unnormalized
// coordinates, CLAMP addressing, nearest-neighbour filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE
                        | CLK_ADDRESS_CLAMP
                        | CLK_FILTER_NEAREST;

// Sample the texel addressed by this work-item's global id.
// Global dimension 0 is used as z, dimension 1 as y and dimension 2 as x.
void read(image3d_t image)
{
    const int slice = get_global_id(0);
    const int row = get_global_id(1);
    const int col = get_global_id(2);
    const float4 coord = (float4)(col, row, slice, 0);

    // First component of the sampled texel; it is not used any further here.
    float value = read_imagef(image, sampler, coord).x;
}

// Kernel entry point: every work-item performs one image read.
__kernel void test(image3d_t d_testdata)
{
    read(d_testdata);
}
And the corresponding PyOpenCL script, which copies a NumPy array to the device so that the kernel can read from it:
import pyopencl as cl
import numpy as np
# Setting up contexts, devices and queues.
platform = cl.get_platforms()[0]
devs = platform.get_devices()
device1 = devs[0]
ctx = cl.Context([device1])
# Two queues on the same context: `queue` carries the host-to-device copy,
# `queue2` carries the kernel launch. Ordering between them is expressed
# only through the event passed in `wait_for` below.
queue = cl.CommandQueue(ctx)
queue2 = cl.CommandQueue(ctx)

# Defining testdata.
h_testdata = np.arange(4096*2).reshape((2, 64, 64)).astype(np.float32, order='C')
mf = cl.mem_flags

# Building the kernel. The context manager closes the source file again, and
# the handle no longer shares the name `f` with the function defined below.
with open('Test_Kernel.cl', 'r') as kernel_file:
    fstr = kernel_file.read()
prg = cl.Program(ctx, fstr).build()
test_knl = prg.test


def f():
    """Copy ``h_testdata`` into a device image, then launch ``test`` on it.

    The copy is enqueued non-blocking on ``queue``; the kernel is enqueued on
    ``queue2`` and lists the copy's event in ``wait_for``.
    """
    d_testdata = cl.Image(
        ctx,
        mf.READ_ONLY,
        cl.ImageFormat(cl.channel_order.INTENSITY, cl.channel_type.FLOAT),
        h_testdata.shape,
    )
    wev1 = cl.enqueue_copy(
        queue,
        d_testdata,
        h_testdata,
        is_blocking=False,
        origin=(0, 0, 0),
        region=h_testdata.shape,
    )
    # NOTE(review): wev1 is produced on `queue` but waited on from `queue2`.
    # The clFlush documentation asks for the producing queue to be flushed in
    # that situation -- confirm whether a `queue.flush()` is needed here.
    test_knl.set_args(d_testdata)
    cl.enqueue_nd_range_kernel(queue2, test_knl, (64, 64, 64), None, wait_for=[wev1])


f()
As the program flow will get more complex further down the road, I am using events and multiple queues to take care of ordering and synchronisation. However, in my profiler I can see that the kernel sometimes executes BEFORE the copy event. What am I doing wrong?
Related
I have a pyopencl based code which runs perfectly fine for 3-dimensional work groups, but when moving to 4-dimensional work groups, it breaks down with the error:
pyopencl._cl.LogicError: clEnqueueNDRangeKernel failed: INVALID_WORK_DIMENSION
Digging around, I found this answer to another question, which implies that OpenCL in fact allows higher-dimensional work groups.
So my question is whether it is possible to change this setting in pyopencl. From this other answer elsewhere, I understand that pyopencl passes the dimensions on directly, but given the error I get, I think there must be some issue.
This is a minimal sample code to replicate this error.
The code works well for the first kernel function, it breaks down on the second one.
import pyopencl as cl
import numpy as np
# Minimal reproduction: launch the same kind of kernel first over a 3-D and
# then over a 4-D global range.
context = cl.create_some_context()
queue = cl.CommandQueue(context)
# OpenCL source for both kernels. Each work-item flattens its global id into
# a linear index and stores that index in output[].
kernel_code = """
__kernel void fun3d( __global double *output)
{
size_t i = get_global_id(0);
size_t j = get_global_id(1);
size_t k = get_global_id(2);
size_t I = get_global_size(0);
size_t J = get_global_size(1);
#
size_t idx = k*J*I + j*I + i;
#
output[idx] = idx;
}
__kernel void fun4d( __global double *output)
{
size_t i = get_global_id(0);
size_t j = get_global_id(1);
size_t k = get_global_id(2);
size_t l = get_global_id(3);
size_t I = get_global_size(0);
size_t J = get_global_size(1);
size_t K = get_global_size(2);
#
size_t idx = l*K*J*I + k*J*I + j*I + i;
#
output[idx] = idx;
}
"""
program = cl.Program(context, kernel_code).build()
# Extent of each global-range dimension.
I = 2
J = 3
K = 4
L = 5
# 3-D launch: one float64 output slot per work-item, global size (I, J, K),
# local size left to the implementation (None).
output3d = np.zeros((I*J*K)).astype(np.float64)
cl_output3d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output3d.nbytes)
program.fun3d(queue, (I,J,K), None, cl_output3d)
cl.enqueue_copy(queue, output3d, cl_output3d)
queue.finish()
# Opens an interactive console on the current names; the 4-D attempt below
# runs only after that console returns.
import code; code.interact(local=dict(globals(), **locals()))
# 4d attempt: same pattern with a 4-tuple global size. This is the launch
# reported above as failing with INVALID_WORK_DIMENSION.
output4d = np.zeros((I*J*K*L)).astype(np.float64)
cl_output4d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, output4d.nbytes)
program.fun4d(queue, (I,J,K,L), None, cl_output4d)
cl.enqueue_copy(queue, output4d, cl_output4d)
queue.finish()
Trying to specify more dimensions than the implementation supports is not going to work.
The maximum number of supported dimensions can be queried via CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, or in a terminal, for example:
$ clinfo | grep dim
Max work item dimensions 3
I am trying to write an OpenCL kernel that uses OpenCL pipes. The kernel code is given below.
// Kernel body fragment (the kernel signature is not shown). For every input
// element handled by this work-item it looks up hashTable/projection and
// writes the results into pipes c0 and c1 through work-group reservations.
uint tid = get_global_id(0);
uint numWorkItems = get_global_size(0);
int i;
// rid is not used anywhere in the visible fragment.
int rid;
int temp = 0, temp1 = 0;
int val;
int szgr = get_local_size(0);
int lid = get_local_id(0);
// Each work-item starts at its global id (offset by start_index) and advances
// by the total number of work-items until rLen is reached.
// NOTE(review): work-items can therefore leave the loop after a different
// number of iterations than their group neighbours -- see the note on the
// work-group reservation calls below.
for(i = tid + start_index; i < rLen; i = i + numWorkItems){
temp = 0;
val = input[i];
temp = hashTable[val - 1];
// temp1 is only refreshed when temp is non-zero; otherwise it keeps the
// value from an earlier iteration (it is not reset like temp).
if(temp){
temp1 = projection[val - 1];
}
// Reserve szgr packets in c0 for the whole work-group, retrying until the
// reservation id is valid.
// NOTE(review): work_group_reserve_write_pipe has to be reached by every
// work-item of the group with the same arguments. Work-items that have
// already left the outer loop no longer reach it -- confirm that the trip
// count is uniform within a work-group.
reserve_id_t rid1 = work_group_reserve_write_pipe(c0, szgr);
while(is_valid_reserve_id(rid1) == false){
rid1 = work_group_reserve_write_pipe(c0, szgr);
}
if(is_valid_reserve_id(rid1))
{
// Each work-item writes temp at index lid of the reservation, then the
// reservation is committed.
write_pipe(c0,rid1,lid, &temp);
work_group_commit_write_pipe(c0, rid1);
}
// Same reserve / write / commit sequence for temp1 on pipe c1.
reserve_id_t rid2 = work_group_reserve_write_pipe(c1, szgr);
while(is_valid_reserve_id(rid2) == false){
rid2 = work_group_reserve_write_pipe(c1, szgr);
}
if(is_valid_reserve_id(rid2))
{
write_pipe(c1,rid2,lid, &temp1);
work_group_commit_write_pipe(c1, rid2);
}
}
But the work_group_reserve_write_pipe function always fails, and because of this the kernel hangs in the while loop. If I remove the while loop, the code doesn't hang, but nothing is written to the pipe. Can someone tell me why this is happening?
The pipe is declared as a __write_only pipe.
About work_group_reserve_write_pipe:
This built-in function must be encountered by all work-items in a
work-group executing the kernel with the same argument values;
otherwise the behavior is undefined.
The loop starts from tid + start_index, so after some loop iterations some work-items no longer hit this instruction. The while loop runs into the same undefined behaviour.
I have a recursive FFT algorithm for multiplying polynomials and I need to parallelize it with OpenMP. After some research and a few attempts I got to this:
// Entry point of the FFT-based polynomial multiplication. Only the evaluation
// step is shown: it opens the parallel region and starts the transform of
// each input polynomial.
// NOTE(review): `n` is not declared in this snippet and no value is returned
// here; both presumably live in the elided part -- confirm.
Complex * multiply(Complex *p1, Complex *p2)
{
#pragma omp parallel
{
//evaluate p1
// `single nowait`: one thread of the team runs this call, the other threads
// do not wait at the end of the construct.
#pragma omp single nowait
pFFT(n,p1,1);
// Second single construct for p2, again without a barrier at its end.
#pragma omp single nowait
pFFT(n,p2,1);
}
//...multiply part etc
}
// Recursive transform that overwrites pol in place: split pol into its
// even- and odd-indexed coefficients, transform both halves as OpenMP tasks,
// then combine the halves back into pol.
//   deg  - number of coefficients in pol (assumes a power of two -- TODO confirm)
//   pol  - input coefficients, overwritten with the result
//   sign - multiplies the imaginary part of the root wn
void pFFT(int deg, Complex *pol,int sign)
{
// Recursion ends at a single coefficient.
if( deg == 1)
return;
//divide polynomial into two parts with even and odd coefficients
Complex *even = new Complex [deg/2];
Complex *odd = new Complex [deg/2];
for(int i = 0;i<deg/2;i++)
{
even[i] = pol[2*i];
odd[i] = pol[2*i+1];
}
// Each half is handed to its own task; taskwait blocks until both child
// tasks have finished before the halves are combined.
// NOTE(review): there is no cutoff, so tasks are created at every recursion
// level down to deg == 2.
#pragma omp task
pFFT(deg/2,even,sign);
#pragma omp task
pFFT(deg/2,odd,sign);
#pragma omp taskwait
//wn = n-th root of unity
// pcos/psin are indexed by lg2(deg); presumably precomputed cosine/sine
// tables defined elsewhere -- confirm.
int x = lg2(deg);
Complex wn;
wn.re = pcos[x];
wn.im = sign*psin[x];
// w starts at 1 and is multiplied by wn after every output pair.
Complex w;
w.re = 1;
w.im = 0;
// The result is written over the input array.
Complex *ret = pol;
Complex product;
// deg == 2 is handled on its own: the loop below has bound deg/2-1 == 0 and
// would not execute for it.
if(deg==2)
{
product = mul(odd,&w);
ret[0].re = even[0].re+product.re;
ret[0].im = even[0].im+product.im;
ret[1].re = even[0].re-product.re;
ret[1].im = even[0].im-product.im;
}
else
// Combine step, two indices per iteration: ret[i] = even[i] + w*odd[i] and
// ret[i+deg/2] = even[i] - w*odd[i], then the same for i+1.
for(int i = 0;i<deg/2-1;i+=2)
{
product = mul(odd+i,&w);
ret[i].re = even[i].re+product.re;
ret[i].im = even[i].im+product.im;
ret[i+deg/2].re = even[i].re-product.re;
ret[i+deg/2].im = even[i].im-product.im;
w = mul(&w,&wn);
product = mul(odd+i+1,&w);
ret[i+1].re = even[i+1].re+product.re;
ret[i+1].im = even[i+1].im+product.im;
ret[i+1+deg/2].re = even[i+1].re-product.re;
ret[i+1+deg/2].im = even[i+1].im-product.im;
w = mul(&w,&wn);
}
// Release the temporary halves allocated above.
delete[] even;
delete[] odd;
}
But the code is even slower than the sequential version; the only speed-up I can get is when I remove the tasks and just let 2 threads compute each polynomial simultaneously. I understand that there are a lot of memory operations, but still, is there something I can/should do? Thank you.
You should stop parallelizing earlier. Try with sizes 16, 64, 256. Stopping at size 2 creates too many small tasks with relatively gigantic overheads.
How can I pass parameters that will be treated as preprocessor defines in the .cl file when using pyopencl?
Meaning:
foo.cl
/* Number of entries in foomatic.i. */
#define LIMIT 12

/* Record holding a fixed-size array of LIMIT unsigned ints. */
typedef struct {
    uint i[LIMIT];
} foomatic;
turns to
foo_nodefs.cl
/* Same record as before, but LIMIT is not defined in this file. */
typedef struct {
    uint i[LIMIT]; // python script passing LIMIT to set it
} foomatic;
Thanks,
John
Edit: extending the answer, making it maximally detailed.
There are two ways to do that:
(metaprogramming) Add your preprocessor directives directly to the string with the source code, or even run your own preprocessor using some templating engine.
import pyopencl as cl
import numpy
import numpy.linalg as la
# Host inputs: two independent random float32 vectors of equal length.
a = numpy.random.rand(50000).astype(numpy.float32)
b = numpy.random.rand(50000).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Device buffers: the inputs are read-only copies of the host arrays, the
# destination is write-only and has the byte size of one input.
mf = cl.mem_flags
input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
a_buf = cl.Buffer(ctx, input_flags, hostbuf=a)
b_buf = cl.Buffer(ctx, input_flags, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

# Metaprogramming variant: the #define lines are plain text glued in front of
# the kernel source before it is compiled.
defines = """
#define AXIS 0
#define COEFF 1
"""
kernel_source = """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int gid = get_global_id(AXIS);
c[gid] = a[gid] + b[gid] + COEFF;
}
"""
prg = cl.Program(ctx, defines + kernel_source).build()

# One work-item per element; local size is left to the implementation.
prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)

a_plus_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_plus_b, dest_buf)

# Norm of the difference to the host reference (COEFF is 1), then the norm of
# the device result itself.
residual_norm = la.norm(a_plus_b - (a + b + 1))
result_norm = la.norm(a_plus_b)
print(residual_norm, result_norm)
(C-way) Use the options keyword of Program.build to pass build options directly to clBuildProgram():
import pyopencl as cl
import numpy
import numpy.linalg as la
# Host inputs: two independent random float32 vectors of equal length.
a = numpy.random.rand(50000).astype(numpy.float32)
b = numpy.random.rand(50000).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Device buffers: the inputs are read-only copies of the host arrays, the
# destination is write-only and has the byte size of one input.
mf = cl.mem_flags
input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR
a_buf = cl.Buffer(ctx, input_flags, hostbuf=a)
b_buf = cl.Buffer(ctx, input_flags, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

# The kernel source leaves AXIS and COEFF undefined; they are supplied to the
# OpenCL compiler as -D build options instead.
kernel_source = """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int gid = get_global_id(AXIS);
c[gid] = a[gid] + b[gid] + COEFF;
}
"""
build_options = ['-D', 'AXIS=0', '-D', 'COEFF=1']
prg = cl.Program(ctx, kernel_source).build(options=build_options)

# One work-item per element; local size is left to the implementation.
prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)

a_plus_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_plus_b, dest_buf)

# Norm of the difference to the host reference (COEFF is 1), then the norm of
# the device result itself.
residual_norm = la.norm(a_plus_b - (a + b + 1))
result_norm = la.norm(a_plus_b)
print(residual_norm, result_norm)
(I have used the modified source code from the main page of the PyOpenCL docs. Tested on pyopencl 2013.1.)
I'm moving a simulation into pyOpenCL and can't get my data access to work. I'm trying to supply a 1D array of vectors (well, actually several, but the example I've included just used one).
Currently, several vectors are copied over just fine, but then the data is simply not what I supplied.
I don't think I've posted here before, so apologies if any of the formatting/presentation is wrong. Also, I've just stripped out all the simulation code, so I realise this code is currently not actually doing anything, I just want to get the buffer passing correct.
Thanks in advance.
The kernel (kertest.py):
# OpenCL source for kernel `k1`, kept as a Python string so that the host
# code can substitute the literal tokens "xdim" and "ydim" before compiling.
# The doubled backslash in the printf format becomes "\n" in the OpenCL text.
# In the kernel, the work-item with local id 0 in group 0 prints spins[i].x
# for every i below X*Y*3; `num` and `tile` are not used.
step1 = """
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define X xdim
#define Y ydim
__kernel void k1(__global float3 *spins,
__local float3 *tile)
{
ushort lid = 2 * get_local_id(0);
ushort group = 2 * get_group_id(0);
ushort num = get_num_groups(0);
int lim = X*Y*3;
for (ushort i = 0; i < lim; i++)
{
if (lid == 0 && group == 0)
{
printf("%f :: %d\\n", spins[i].x, i);
}
}
}"""
The code itself (gputest.py):
import kertest as k2D
import numpy as np
import pyopencl as cl
class GPU_MC2DSim():
    """Host-side driver that uploads an array of 3-vectors and runs ``k1``.

    ``xdim`` is the larger and ``ydim`` the smaller of the two requested
    extents, each rounded up to an even number. ``M`` holds one row per
    ``xdim * ydim`` entry, with three float32 components each.
    """

    def __init__(self, x, y):
        """Store the requested extents and build the initial array ``M``.

        Args:
            x: first requested extent.
            y: second requested extent.
        """
        self.x = x
        self.y = y
        # The larger extent always becomes xdim.
        if x >= y:
            self.xdim, self.ydim = int(self.x), int(self.y)
        else:
            self.xdim, self.ydim = int(self.y), int(self.x)
        # Round odd extents up to the next even number.
        self.xdim += self.xdim % 2
        self.ydim += self.ydim % 2
        # Every row starts out as (1, 2, 3).
        self.M = np.ones((self.xdim*self.ydim, 3)).astype(np.float32)
        self.M[:, 1] += 1.0
        self.M[:, 2] += 2.0
        # Call form of print: valid on Python 2 and Python 3 alike.
        print(self.M)

    def simulate(self):
        """Copy ``M`` to the device, compile ``k1`` and run it once.

        Returns:
            None.
        """
        ctx = cl.create_some_context()
        q = cl.CommandQueue(ctx)
        mf = cl.mem_flags
        # Pass buffer:
        M_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.M)
        # Insert kernel parameters into a local copy of the source, so the
        # template in k2D.step1 stays intact for later instances that use
        # different dimensions.
        params = {
            "xdim": "%d" % (self.xdim),
            "ydim": "%d" % (self.ydim),
        }
        source = k2D.step1
        for name in params:
            source = source.replace(name, params[name])
        # Compile kernel:
        step1 = cl.Program(ctx, source).build()
        # xdim * 16 bytes of local memory -- presumably one 16-byte float3
        # slot per entry of `tile`; TODO confirm against the kernel.
        locmem = cl.LocalMemory(self.xdim*4*4)
        # Floor division keeps the work sizes integral on Python 3 as well;
        # both extents are even, so nothing is truncated.
        global_size = ((self.xdim*self.ydim)//4,)
        local_size = (self.xdim//2,)
        step1.k1(q, global_size, local_size, M_buf, locmem).wait()
        return None
# Build one simulation with both requested extents set to 4 and run it.
xdim = ydim = 4
sim = GPU_MC2DSim(xdim, ydim)
sim.simulate()
Your code for copying the data to the device is just fine. However, your kernel has at least two problems:
float3 values are expected to be 16-byte aligned, as per OpenCL 1.2 Spec, 6.1.5:
For 3-component vector data types, the size of the data type is 4 * sizeof(component). This means that a 3-component vector data type will be aligned to a 4 * sizeof(component) boundary. The vload3 and vstore3 built-in functions can be used to read and write, respectively, 3-component vector data types from an array of packed scalar data type.
The values you upload to the device are tightly packed (three floats, i.e. 12 bytes, per vector), so they are not properly aligned for the kernel to read float3 values directly.
Your limit calculation int lim = X*Y*3; is slightly off. You are already trying to read from an array of float3, so the *3 is superfluous.
The solution to both problems is simple: as stated in the spec, you should use vload3 to load from an array of floats:
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define X xdim
#define Y ydim
// Debug kernel: one work-item prints every 3-vector stored in `spins`.
//   spins - packed floats, three consecutive values per vector (X*Y vectors)
//   tile  - local scratch memory; kept in the signature, not used here
// The printf format uses a doubled backslash because this source is meant to
// sit inside a Python string literal.
__kernel void k1(__global float *spins,
                 __local float3 *tile)
{
    // Only the first work-item of the first group prints. Comparing the ids
    // directly avoids truncating them to ushort first.
    if (get_local_id(0) != 0 || get_group_id(0) != 0)
        return;

    const int lim = X*Y;
    // An int counter cannot wrap before reaching lim, unlike a ushort one
    // when X*Y exceeds 65535.
    for (int i = 0; i < lim; i++)
    {
        // vload3 reads three consecutive floats starting at spins + 3*i, so
        // it works on packed data without the 16-byte float3 alignment.
        float3 vec = vload3(i, spins);
        printf("(%f, %f, %f) :: %d\\n", vec.x, vec.y, vec.z, i);
    }
}