OpenCL function calls

I'm working on an OpenCL kernel that loads up some points, decides which is the highest, and returns it. All good there, but I want to add a calculation before the highest-point evaluation. This compares the point to a pair of lines. I have it written and working to a degree, as follows:
size_t i = group_id * group_stride + local_id;
while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
int ia = LOAD_GLOBAL_I1(input, i);
float4 a = LOAD_GLOBAL_F4(dataSet, ia);
int ib = LOAD_GLOBAL_I1(input, i + group_size);
float4 b = LOAD_GLOBAL_F4(dataSet, ib);
//pre-assess the points relative to lines
if(pass == 0){
float px = a.x;
float py = a.y;
int checkAnswer;
//want to write this section as a function
float x1 = tri_input[0].x; float y1 = tri_input[0].y;
float x2 = tri_input[2].x; float y2 = tri_input[2].y;
float check = sign((x1-x2) * (py-y1) - (y2-y1) * (px-x1));
if(check != tri_input[3].x){ //point is outside line 1
checkAnswer = 1;
}
else{
x1 = tri_input[2].x; y1 = tri_input[2].y;
x2 = tri_input[1].x; y2 = tri_input[1].y;
check = sign((x1-x2)*(py-y1) - (y2-y1)*(px-x1));
if(check != tri_input[3].y){ //point is outside line 2
checkAnswer = 2;
}
else{
checkAnswer = 0; //point is within both lines
}}}
//later use the checkAnswer result to change the following
//find the highest of the pair
float4 result;
if(a.z>b.z) result = a;
else result = b;
//load up the previous highest result locally
float4 s = LOAD_LOCAL_F4(shared, local_id);
//if the previous highest beat this, stick, else twist
if(s.z>result.z){ STORE_LOCAL_F4(shared, local_id, s);}
else{ STORE_LOCAL_F4(shared, local_id, result);}
i += local_stride;
}
What I would like to do is call the line check twice as a function, i.e. the code becomes:
size_t i = group_id * group_stride + local_id;
while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
int ia = LOAD_GLOBAL_I1(input, i);
float4 a = LOAD_GLOBAL_F4(dataSet, ia);
int ib = LOAD_GLOBAL_I1(input, i + group_size);
float4 b = LOAD_GLOBAL_F4(dataSet, ib);
//pre-assess the points relative to lines
if(pass == 0){
float px = a.x;
float py = a.y;
int checkA = pointCheck( px, py, tri_input);
px = b.x;
py = b.y;
int checkB = pointCheck( px, py, tri_input);
}
//later use the checkAnswer result to change the following
//find the highest of the pair
float4 result;
if(a.z>b.z) result = a;
else result = b;
//load up the previous highest result locally
float4 s = LOAD_LOCAL_F4(shared, local_id);
//if the previous highest beat this, stick, else twist
if(s.z>result.z){ STORE_LOCAL_F4(shared, local_id, s);}
else{ STORE_LOCAL_F4(shared, local_id, result);}
i += local_stride;
}
In this instance the function is:
int pointCheck( float *px, float *py, float2 *testLines){
float x1 = testLines[0].x; float y1 = testLines[0].y;
float x2 = testLines[2].x; float y2 = testLines[2].y;
float check = sign((x1-x2) * (py-y1) - (y2-y1) * (px-x1));
if(check != testLines[3].x){ //point is outside line 1
return 1;
}
else{
x1 = testLines[2].x; y1 = testLines[2].y;
x2 = testLines[1].x; y2 = testLines[1].y;
check = sign((x1-x2)*(py-y1) - (y2-y1)*(px-x1));
if(check != testLines[3].y){ //point is outside line 2
return 2;
}
else{
return 0; //point is within both lines
}}}
Whilst the longhand version runs fine and returns a normal 'highest point' result, the function version returns an erroneous result (not detecting the highest point I have hidden in the data set). It produces a wrong result even though the function's return value is not yet used for anything.
What am I doing wrong?
S
[Update]:
This revised function works as far as the commented-out line, then hangs on something:
int pointCheck(float4 *P, float2 *testLines){
float2 *l0 = &testLines[0];
float2 *l1 = &testLines[1];
float2 *l2 = &testLines[2];
float2 *l3 = &testLines[3];
float x1 = l0->x; float y1 = l0->y;
float x2 = l2->x; float y2 = l2->y;
float pX = P->x; float pY = P->y;
float c1 = l3->x; float c2 = l3->y;
//float check = sign((x1-x2) * (pY-y1) - (y2-y1) * (pX-x1)); //seems to be a problem with sign
// if(check != c1){ //point is outside line 1
// return 1;
// }
// else{
// x1 = l2->x; y1 = l2->y;
// x2 = l1->x; y2 = l1->y;
// check = sign((x1-x2) * (pY-y1) - (y2-y1) * (pX-x1));
// if(check != c2){ //point is outside line 2
// return 2;
// }
// else{
// return 0; //point is within both lines
// }}
}

One immediate issue is how you pass the parameters to the called function:
int checkA = pointCheck( px, py, tri_input);
whereas the function itself expects pointers for px and py. You should instead call the function as:
int checkA = pointCheck(&px, &py, tri_input);
It is surprising that OpenCL does not give build errors for this kernel.
In my experience, some OpenCL compilers do not like multiple return statements in a single function. Try saving the return value into a local variable and using a single return statement at the end of the function. OpenCL does not support real function calls; it inlines all functions directly into the kernel. A good practice is therefore to mark all non-__kernel functions as inline and treat them as such, i.e. make it easier for the compiler to inline your function by not using multiple return statements.
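Putting both suggestions together, a minimal sketch of the helper (untested, and assuming tri_input is declared as a __constant float2* in the kernel; adjust the address-space qualifier to match your code) passes the coordinates by value and uses a single return:
inline int pointCheck(float px, float py, __constant float2 *testLines)
{
    int answer = 0;                     // assume the point is within both lines
    float x1 = testLines[0].x, y1 = testLines[0].y;
    float x2 = testLines[2].x, y2 = testLines[2].y;
    float check = sign((x1 - x2) * (py - y1) - (y2 - y1) * (px - x1));
    if (check != testLines[3].x) {
        answer = 1;                     // point is outside line 1
    } else {
        x1 = testLines[2].x; y1 = testLines[2].y;
        x2 = testLines[1].x; y2 = testLines[1].y;
        check = sign((x1 - x2) * (py - y1) - (y2 - y1) * (px - x1));
        if (check != testLines[3].y) {
            answer = 2;                 // point is outside line 2
        }
    }
    return answer;
}
The call site then simplifies to int checkA = pointCheck(a.x, a.y, tri_input); with no pointers involved.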

Related

Processing 3 improving intensive math calculation

I wrote a very simple sketch to simulate the interference of two planar waves.
The problem is that it seems a little too intensive for the CPU (moreover, Processing uses only one core) and I get only 1 or 2 fps.
Any idea how to improve this sketch?
float x0;
float y0;
float x1;
float y1;
float x2;
float y2;
int t = 0;
void setup() {
//noLoop();
frameRate(30);
size(400, 400, P2D);
x0 = width/2;
y0 = height/2;
x1 = width/4;
y1 = height/2;
x2 = width * 3/4;
y2 = height / 2;
}
void draw() {
background(0);
for (int x = 0; x <= width; x++) {
for (int y = 0; y <= height; y++) {
float d1 = dist(x1, y1, x, y);
float d2 = dist(x2, y2, x, y);
float factorA = 20;
float factorB = 80;
float wave1 = (1 + (sin(TWO_PI * d1/factorA + t)))/2 * exp(-d1/factorB);
float wave2 = (1 + (sin(TWO_PI * d2/factorA + t)))/2 * exp(-d2/factorB);
stroke( (wave1 + wave2) *255);
point(x, y);
}
}
t--; //Wave propagation
//saveFrame("wave-##.png");
}
As Kevin suggested, using point() isn't the most efficient method, since each call goes through beginShape(), vertex() and endShape(). You might be better off using pixels.
Additionally, the nested loops can be written as a single loop, and dist(), which uses a square root behind the scenes, can be avoided (you can use the squared distance with correspondingly larger factors).
Here's a version using these:
float x1;
float y1;
float x2;
float y2;
int t = 0;
//using larger factors to use squared distance below instead of dist()/sqrt()
float factorA = 20*200;
float factorB = 80*200;
void setup() {
//noLoop();
frameRate(30);
size(400, 400);
x1 = width/4;
y1 = height/2;
x2 = width * 3/4;
y2 = height / 2;
//use pixels, not points()
loadPixels();
}
void draw() {
for (int i = 0; i < pixels.length; i++) {
int x = i % width;
int y = i / width; // row index (width == height here, so i / height gives the same result)
float dx1 = x1-x;
float dy1 = y1-y;
float dx2 = x2-x;
float dy2 = y2-y;
//squared distance
float d1 = dx1*dx1+dy1*dy1;//dist(x1, y1, x, y);
float d2 = dx2*dx2+dy2*dy2;//dist(x2, y2, x, y);
float wave1 = (1 + (sin(TWO_PI * d1/factorA + t))) * 0.5 * exp(-d1/factorB);
float wave2 = (1 + (sin(TWO_PI * d2/factorA + t))) * 0.5 * exp(-d2/factorB);
pixels[i] = color((wave1 + wave2) *255);
}
updatePixels();
text((int)frameRate+"fps",10,15);
// endShape();
t--; //Wave propagation
//saveFrame("wave-##.png");
}
This can be sped up further using lookup tables for the more time-consuming functions such as sin() and exp().
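As a rough illustration of the lookup-table idea (a generic C sketch with a hypothetical table size, not Processing code), a sine table trades the transcendental call for an array read:
#include <math.h>

#define LUT_SIZE 1024                       /* hypothetical table resolution */
#define TWO_PI_F 6.28318530718f

static float sinLut[LUT_SIZE];

/* fill the table once, before the render loop */
void initSinLut(void) {
    for (int i = 0; i < LUT_SIZE; i++)
        sinLut[i] = sinf(TWO_PI_F * (float)i / LUT_SIZE);
}

/* approximate sin(phase) by wrapping the phase into one period and indexing the table */
float lutSin(float phase) {
    float turns = phase / TWO_PI_F;
    turns -= floorf(turns);                 /* keep only the fractional part of the turn */
    int idx = (int)(turns * LUT_SIZE);
    if (idx >= LUT_SIZE) idx = 0;           /* guard against rounding up to the table size */
    return sinLut[idx];
}
The same pattern works for exp() as long as you clamp its input to a bounded range.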
You can see a rough (numbers need to be tweaked) preview running even in javascript:
var x1;
var y1;
var x2;
var y2;
var t = 0;
var factorA = 20*200;
var factorB = 80*200;
var numPixels;
var scaledWidth;
function setup() {
createCanvas(400, 400);
fill(255);
frameRate(30);
x1 = width /4;
y1 = height /2;
x2 = width * 3/4;
y2 = height / 2;
loadPixels();
numPixels = (width * height) * pixelDensity();
scaledWidth = width * pixelDensity();
}
function draw() {
for (var i = 0, j = 0; i < numPixels; i++, j += 4) {
var x = i % scaledWidth;
var y = floor(i / scaledWidth);
var dx1 = x1 - x;
var dy1 = y1 - y;
var dx2 = x2 - x;
var dy2 = y2 - y;
var d1 = (dx1 * dx1) + (dy1 * dy1);//dist(x1, y1, x, y);
var d2 = (dx2 * dx2) + (dy2 * dy2);//dist(x2, y2, x, y);
var wave1 = (1 + (sin(TWO_PI * d1 / factorA + t))) * 0.5 * exp(-d1 / factorB);
var wave2 = (1 + (sin(TWO_PI * d2 / factorA + t))) * 0.5 * exp(-d2 / factorB);
var gray = (wave1 + wave2) * 255;
pixels[j] = pixels[j+1] = pixels[j+2] = gray;
pixels[j+3] = 255;
}
updatePixels();
text(frameRate().toFixed(2)+"fps",10,15);
t--; //Wave propagation
}
<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.0.0/p5.min.js"></script>
Because you're using math to synthesise the image, it may make more sense to write this as a GLSL shader. Be sure to check out the PShader tutorial for more info.
Update:
Here's a GLSL version; the code is less hacky and a lot more readable:
float t = 0;
float factorA = 0.20;
float factorB = 0.80;
PShader waves;
void setup() {
size(400, 400, P2D);
noStroke();
waves = loadShader("waves.glsl");
waves.set("resolution", float(width), float(height));
waves.set("factorA",factorA);
waves.set("factorB",factorB);
waves.set("pt1",-0.5,0.0);
waves.set("pt2",0.75,0.0);
}
void draw() {
t++;
waves.set("t",t);
shader(waves);
rect(0, 0, width, height);
}
void mouseDragged(){
float x = map(mouseX,0,width,-1.0,1.0);
float y = map(mouseY,0,height,1.0,-1.0);
println(x,y);
if(keyPressed) waves.set("pt2",x,y);
else waves.set("pt1",x,y);
}
void keyPressed(){
float amount = 0.05;
if(keyCode == UP) factorA += amount;
if(keyCode == DOWN) factorA -= amount;
if(keyCode == LEFT) factorB -= amount;
if(keyCode == RIGHT) factorB += amount;
waves.set("factorA",factorA);
waves.set("factorB",factorB);
println(factorA,factorB);
}
And the waves.glsl:
#define PROCESSING_COLOR_SHADER
uniform vec2 pt1;
uniform vec2 pt2;
uniform float t;
uniform float factorA;
uniform float factorB;
const float TWO_PI = 6.283185307179586;
uniform vec2 resolution;
uniform float time;
void main(void) {
vec2 p = -1.0 + 2.0 * gl_FragCoord.xy / resolution.xy;
float d1 = distance(pt1,p);
float d2 = distance(pt2,p);
float wave1 = (1.0 + (sin(TWO_PI * d1/factorA + t))) * 0.5 * exp(-d1/factorB);
float wave2 = (1.0 + (sin(TWO_PI * d2/factorA + t))) * 0.5 * exp(-d2/factorB);
float gray = wave1 + wave2;
gl_FragColor=vec4(gray,gray,gray,1.0);
}
You can drag to position the first point, and hold down a key while dragging to position the second point.
Additionally, use the UP/DOWN and LEFT/RIGHT keys to change factorA and factorB. The results look interesting.
Also, you can grab a bit of code from this answer to save frames using Threads (I recommend saving uncompressed).
Option 1: Pre-render your sketch.
This seems to be a static repeating pattern, so you can pre-render it by running the animation ahead of time and saving each frame to an image. I see that you already had a call to saveFrame() in there. Once you have the images saved, you can then load them into a new sketch and play them one frame at a time. It shouldn't require very many images, since it seems to repeat itself pretty quickly. Think of an animated gif that loops forever.
Option 2: Decrease the resolution of your sketch.
Do you really need pixel-perfect 400x400 resolution? Can you maybe draw to an image that's 100x100 and scale up?
Or you could decrease the resolution of your for loops by incrementing by more than 1:
for (int x = 0; x <= width; x+=2) {
for (int y = 0; y <= height; y+=2) {
You could play with how much you increase and then use the strokeWeight() or rect() function to draw larger pixels.
Option 3: Decrease the time resolution of your sketch.
Instead of moving by 1 pixel every 1 frame, what if you move by 5 pixels every 5 frames? Speed your animation up, but only move it every X frames; that way the overall speed appears to be the same. You can use the modulo operator along with the frameCount variable to only do something every X frames. Note that you'd still want to keep the overall framerate of your sketch at 30 or 60, but you'd only change the animation every X frames.
Option 4: Simplify your animation.
Do you really need to calculate every single pixel? If all you want to show is a series of circles that increase in size, there are much easier ways to do that. Calling the ellipse() function is much faster than calling the point() function a bunch of times. You can use other functions to create the blurry effect without calling point() half a million times every second (which is how often you're trying to call it).
Option 5: Refactor your code.
If all else fails, then you're going to have to refactor your code. Most of your program's time is being spent in the point() function. You can prove this by drawing an ellipse at mouseX, mouseY at the end of the draw() function and comparing the performance of that when you comment out the call to point() inside your nested for loops.
Computers aren't magic, so calling the point() function half a million times every second isn't free. You're going to have to decrease that number somehow, either by taking one (or more than one) of the above options, or by refactoring your code in some other way.
How you do that really depends on your actual goals, which you haven't stated. If you're just trying to render this animation, then pre-rendering it will work fine. If you need to have user interaction with it, then maybe something like decreasing the resolution will work. You're going to have to sacrifice something, and it's really up to you what that is.

Different results on GPU & CPU when more than 8 work items per group

I'm new to OpenCL. As my first project I tried to write code that checks intersections between many polylines and a single polygon.
I'm running the code on both CPU and GPU and get different results.
First I passed NULL as the local work size when calling clEnqueueNDRangeKernel:
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, NULL, 2, &evtCalcBounds, &evtKernel);
After trying many things I saw that if I pass 1 as the local size it works correctly and returns the same results on the CPU and GPU.
size_t local = 1;
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, &local, 2, &evtCalcBounds, &evtKernel);
I played a bit more and found that the CPU returns incorrect results when I run the kernel with a local size of 8 or more (for some reason).
I'm not using any local memory, just globals and privates.
I didn't add the code at first because I think it is irrelevant to the problem (note that with a local size of 1 it works correctly), and it is long. If it is needed, I will try to simplify it.
The code flow goes like this:
I have the polyline coordinates stored in one big buffer and the single polygon in another. In addition, I'm providing another buffer with a single int that holds the current result count. All buffers are __global arguments.
In the kernel I'm simply checking for intersections between all the lines of polyline[get_global_id(0)] and the lines of the polygon. If there is one,
I'm using atomic_inc on the result count. There is no reading and writing to the same buffer, and no barriers or memory fences; the atomic_inc is the only thread-safe mechanism I'm using.
-- UPDATE --
Added my code:
I know that I could maybe make better use of OpenCL functions for calculating some of the vectors, but for now I'm simply converting code from my old single-threaded CPU program to CL, so that is not my concern now.
bool isPointInPolygon(float x, float y, __global float* polygon) {
bool blnInside = false;
uint length = convert_uint(polygon[4]);
int s = 5;
uint j = length - 1;
for (uint i = 0; i < length; j = i++) {
uint realIdx = s + i * 2;
uint realInvIdx = s + j * 2;
if (((polygon[realIdx + 1] > y) != (polygon[realInvIdx + 1] > y)) &&
(x < (polygon[realInvIdx] - polygon[realIdx]) * (y - polygon[realIdx + 1]) / (polygon[realInvIdx + 1] - polygon[realIdx + 1]) + polygon[realIdx]))
blnInside = !blnInside;
}
return blnInside;
}
bool isRectanglesIntersected(float p_dblMinX1, float p_dblMinY1,
float p_dblMaxX1, float p_dblMaxY1,
float p_dblMinX2, float p_dblMinY2,
float p_dblMaxX2, float p_dblMaxY2) {
bool blnResult = true;
if (p_dblMinX1 > p_dblMaxX2 ||
p_dblMaxX1 < p_dblMinX2 ||
p_dblMinY1 > p_dblMaxY2 ||
p_dblMaxY1 < p_dblMinY2) {
blnResult = false;
}
return blnResult;
}
bool isLinesIntersects(
double Ax, double Ay,
double Bx, double By,
double Cx, double Cy,
double Dx, double Dy) {
double distAB, theCos, theSin, newX, ABpos;
// Fail if either line is undefined.
if (Ax == Bx && Ay == By || Cx == Dx && Cy == Dy)
return false;
// (1) Translate the system so that point A is on the origin.
Bx -= Ax; By -= Ay;
Cx -= Ax; Cy -= Ay;
Dx -= Ax; Dy -= Ay;
// Discover the length of segment A-B.
distAB = sqrt(Bx*Bx + By*By);
// (2) Rotate the system so that point B is on the positive X axis.
theCos = Bx / distAB;
theSin = By / distAB;
newX = Cx*theCos + Cy*theSin;
Cy = Cy*theCos - Cx*theSin; Cx = newX;
newX = Dx*theCos + Dy*theSin;
Dy = Dy*theCos - Dx*theSin; Dx = newX;
// Fail if the lines are parallel.
return (Cy != Dy);
}
bool isPolygonInersectsPolyline(__global float* polygon, __global float* polylines, uint startIdx) {
uint polylineLength = convert_uint(polylines[startIdx]);
uint start = startIdx + 1;
float x1 = polylines[start];
float y1 = polylines[start + 1];
float x2;
float y2;
int polygonLength = convert_uint(polygon[4]);
int polygonLength2 = polygonLength * 2;
int startPolygonIdx = 5;
for (int currPolyineIdx = 0; currPolyineIdx < polylineLength - 1; currPolyineIdx++)
{
x2 = polylines[start + (currPolyineIdx*2) + 2];
y2 = polylines[start + (currPolyineIdx*2) + 3];
float polyX1 = polygon[0];
float polyY1 = polygon[1];
for (int currPolygonIdx = 0; currPolygonIdx < polygonLength; ++currPolygonIdx)
{
float polyX2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 2) % polygonLength2];
float polyY2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 3) % polygonLength2];
if (isLinesIntersects(x1, y1, x2, y2, polyX1, polyY1, polyX2, polyY2)) {
return true;
}
polyX1 = polyX2;
polyY1 = polyY2;
}
x1 = x2;
y1 = y2;
}
// No intersection found till now so we check containing
return isPointInPolygon(x1, y1, polygon);
}
__kernel void calcIntersections(__global float* polylines, // My flat points array - [pntCount, x,y,x,y,...., pntCount, x,y,... ]
__global float* pBounds, // The rectangle bounds of each polyline - set of 4 values [top, left, bottom, right....]
__global uint* pStarts, // The start index of each polyline in the polylines array
__global float* polygon, // The polygon i want to intersect with - first 4 items are the rectangle bounds [top, left, bottom, right, pntCount, x,y,x,y,x,y....]
__global float* output, // Result array for saving the intersections polylines indices
__global uint* resCount) // The result count
{
int i = get_global_id(0);
uint start = convert_uint(pStarts[i]);
if (isRectanglesIntersected(pBounds[i * 4], pBounds[i * 4 + 1], pBounds[i * 4 + 2], pBounds[i * 4 + 3],
polygon[0], polygon[1], polygon[2], polygon[3])) {
if (isPolygonInersectsPolyline(polygon, polylines, start)){
int oldVal = atomic_inc(resCount);
output[oldVal] = i;
}
}
}
Can anyone explain this to me?

Higher radix (or better) formulation for Stockham FFT

Background
I've implemented this algorithm from Microsoft Research for a radix-2 FFT (Stockham auto sort) using OpenCL.
I use floating point textures (256 columns x N rows) for input and output in the kernel, because I will need to sample at non-integral points and I thought it better to delegate that to the texture sampling hardware. Note that my FFTs are always of 256-point sequences (every row in my texture). At this point, my N is 16384 or 32768, depending on the GPU I'm using and the max 2D texture size allowed.
I also need to perform the FFT of 4 real-valued sequences at once, so the kernel performs FFT(a, b, c, d) as FFT(a + ib, c + id), from which I can extract the 4 complex sequences later using an O(n) algorithm. I can elaborate on this if someone wishes, but I don't believe it falls within the scope of this question.
Kernel Source
const sampler_t fftSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void FFT_Stockham(read_only image2d_t input, write_only image2d_t output, int fftSize, int size)
{
int x = get_global_id(0);
int y = get_global_id(1);
int b = floor(x / convert_float(fftSize)) * (fftSize / 2);
int offset = x % (fftSize / 2);
int x0 = b + offset;
int x1 = x0 + (size / 2);
float4 val0 = read_imagef(input, fftSampler, (int2)(x0, y));
float4 val1 = read_imagef(input, fftSampler, (int2)(x1, y));
float angle = -6.283185f * (convert_float(x) / convert_float(fftSize));
// TODO: Convert the two calculations below into lookups from a __constant buffer
float tA = native_cos(angle);
float tB = native_sin(angle);
float4 coeffs1 = (float4)(tA, tB, tA, tB);
float4 coeffs2 = (float4)(-tB, tA, -tB, tA);
float4 result = val0 + coeffs1 * val1.xxzz + coeffs2 * val1.yyww;
write_imagef(output, (int2)(x, y), result);
}
The host code simply invokes this kernel log2(256) = 8 times, ping-ponging the input and output textures.
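For reference, the ping-pong loop is along these lines (a rough C fragment with hypothetical names; error checking omitted):
cl_mem ping = texA, pong = texB;            /* two 256 x N float4 images created earlier */
int size = 256;
for (int fftSize = 2; fftSize <= size; fftSize <<= 1) {   /* log2(256) = 8 passes */
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &ping);      /* input image  */
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &pong);      /* output image */
    clSetKernelArg(kernel, 2, sizeof(int), &fftSize);
    clSetKernelArg(kernel, 3, sizeof(int), &size);
    size_t global[2] = { 256, N };                          /* N = number of rows */
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
    cl_mem tmp = ping; ping = pong; pong = tmp;             /* ping-pong the textures */
}
clFinish(queue);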
Note: I tried removing the native_cos and native_sin to see if that impacted timing, but it doesn't seem to change things by very much. Not the factor I'm looking for, in any case.
Access pattern
Knowing that I am probably memory-bandwidth bound, here is the memory access pattern (per-row) for my radix-2 FFT.
X0 - element 1 to combine (read)
X1 - element 2 to combine (read)
X - element to write to (write)
Question
So my question is: can someone help me with, or point me toward, a higher-radix formulation for this algorithm? I ask because most FFT implementations are optimized for large sizes and single real- or complex-valued sequences. Their kernel generators are also very case dependent and break down quickly when I try to muck with their internals.
Are there other options better than simply going to a radix-8 or 16 kernel?
Some of my constraints are: I have to use OpenCL (no cuFFT). I also cannot use clAmdFft from ACML for this purpose. It would be nice to also talk about CPU optimizations (this kernel SUCKS big time on the CPU), but getting it to run in fewer iterations on the GPU is my main use-case.
Thanks in advance for reading through all this and trying to help!
I tried several versions, but the one with the best performance on CPU and GPU was a radix-16 kernel for my specific case.
Here is the kernel for reference. It was taken from Eric Bainville's (most excellent) website and used with full attribution.
// #define M_PI 3.14159265358979f
//Global size is x.Length/2, Scale = 1 for direct, 1/N to inverse (iFFT)
__kernel void ConjugateAndScale(__global float4* x, const float Scale)
{
int i = get_global_id(0);
float temp = Scale;
float4 t = (float4)(temp, -temp, temp, -temp);
x[i] *= t;
}
// Return a*EXP(-I*PI*1/2) = a*(-I)
float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
// Return a^2
float2 sqr_1(float2 a)
{ return (float2)(a.x*a.x-a.y*a.y,2.0f*a.x*a.y); }
// Return the 2x DFT2 of the four complex numbers in A
// If A=(a,b,c,d) then return (a',b',c',d') where (a',c')=DFT2(a,c)
// and (b',d')=DFT2(b,d).
float8 dft2_4(float8 a) { return (float8)(a.lo+a.hi,a.lo-a.hi); }
// Return the DFT of 4 complex numbers in A
float8 dft4_4(float8 a)
{
// 2x DFT2
float8 x = dft2_4(a);
// Shuffle, twiddle, and 2x DFT2
return dft2_4((float8)(x.lo.lo,x.hi.lo,x.lo.hi,mul_p1q2(x.hi.hi)));
}
// Complex product, multiply vectors of complex numbers
#define MUL_RE(a,b) (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a,b) (a.even*b.odd + a.odd*b.even)
float2 mul_1(float2 a, float2 b)
{ float2 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_1_F4(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_2(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
// Return the DFT2 of the two complex numbers in vector A
float4 dft2_2(float4 a) { return (float4)(a.lo+a.hi,a.lo-a.hi); }
// Return cos(alpha)+I*sin(alpha) (3 variants)
float2 exp_alpha_1(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
//cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float2)(cs,sn);
}
// Return cos(alpha)+I*sin(alpha) (3 variants)
float4 exp_alpha_1_F4(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
// cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float4)(cs,sn,cs,sn);
}
// mul_p*q*(a) returns a*EXP(-I*PI*P/Q)
#define mul_p0q1(a) (a)
#define mul_p0q2 mul_p0q1
//float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
__constant float SQRT_1_2 = 0.707106781186548; // cos(Pi/4)
#define mul_p0q4 mul_p0q2
float2 mul_p1q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y); }
#define mul_p2q4 mul_p1q2
float2 mul_p3q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y); }
__constant float COS_8 = 0.923879532511287; // cos(Pi/8)
__constant float SIN_8 = 0.382683432365089; // sin(Pi/8)
#define mul_p0q8 mul_p0q4
float2 mul_p1q8(float2 a) { return mul_1((float2)(COS_8,-SIN_8),a); }
#define mul_p2q8 mul_p1q4
float2 mul_p3q8(float2 a) { return mul_1((float2)(SIN_8,-COS_8),a); }
#define mul_p4q8 mul_p2q4
float2 mul_p5q8(float2 a) { return mul_1((float2)(-SIN_8,-COS_8),a); }
#define mul_p6q8 mul_p3q4
float2 mul_p7q8(float2 a) { return mul_1((float2)(-COS_8,-SIN_8),a); }
// Compute in-place DFT2 and twiddle
#define DFT2_TWIDDLE(a,b,t) { float2 tmp = t(a-b); a += b; b = tmp; }
// T = N/16 = number of threads.
// P is the length of input sub-sequences, 1,16,256,...,N/16.
__kernel void FFT_Radix16(__global const float4 * x, __global float4 * y, int pp)
{
int p = pp;
int t = get_global_size(0); // number of threads
int i = get_global_id(0); // current thread
////// y[i] = 2*x[i];
////// return;
int k = i & (p-1); // index in input sequence, in 0..P-1
// Inputs indices are I+{0,..,15}*T
x += i;
// Output indices are J+{0,..,15}*P, where
// J is I with four 0 bits inserted at bit log2(P)
y += ((i-k)<<4) + k;
// Load
float4 u[16];
for (int m=0;m<16;m++) u[m] = x[m*t];
// Twiddle, twiddling factors are exp(_I*PI*{0,..,15}*K/4P)
float alpha = -M_PI*(float)k/(float)(8*p);
for (int m=1;m<16;m++) u[m] = mul_1_F4(exp_alpha_1_F4(m * alpha), u[m]);
// 8x in-place DFT2 and twiddle (1)
DFT2_TWIDDLE(u[0].lo,u[8].lo,mul_p0q8);
DFT2_TWIDDLE(u[0].hi,u[8].hi,mul_p0q8);
DFT2_TWIDDLE(u[1].lo,u[9].lo,mul_p1q8);
DFT2_TWIDDLE(u[1].hi,u[9].hi,mul_p1q8);
DFT2_TWIDDLE(u[2].lo,u[10].lo,mul_p2q8);
DFT2_TWIDDLE(u[2].hi,u[10].hi,mul_p2q8);
DFT2_TWIDDLE(u[3].lo,u[11].lo,mul_p3q8);
DFT2_TWIDDLE(u[3].hi,u[11].hi,mul_p3q8);
DFT2_TWIDDLE(u[4].lo,u[12].lo,mul_p4q8);
DFT2_TWIDDLE(u[4].hi,u[12].hi,mul_p4q8);
DFT2_TWIDDLE(u[5].lo,u[13].lo,mul_p5q8);
DFT2_TWIDDLE(u[5].hi,u[13].hi,mul_p5q8);
DFT2_TWIDDLE(u[6].lo,u[14].lo,mul_p6q8);
DFT2_TWIDDLE(u[6].hi,u[14].hi,mul_p6q8);
DFT2_TWIDDLE(u[7].lo,u[15].lo,mul_p7q8);
DFT2_TWIDDLE(u[7].hi,u[15].hi,mul_p7q8);
// 8x in-place DFT2 and twiddle (2)
DFT2_TWIDDLE(u[0].lo,u[4].lo,mul_p0q4);
DFT2_TWIDDLE(u[0].hi,u[4].hi,mul_p0q4);
DFT2_TWIDDLE(u[1].lo,u[5].lo,mul_p1q4);
DFT2_TWIDDLE(u[1].hi,u[5].hi,mul_p1q4);
DFT2_TWIDDLE(u[2].lo,u[6].lo,mul_p2q4);
DFT2_TWIDDLE(u[2].hi,u[6].hi,mul_p2q4);
DFT2_TWIDDLE(u[3].lo,u[7].lo,mul_p3q4);
DFT2_TWIDDLE(u[3].hi,u[7].hi,mul_p3q4);
DFT2_TWIDDLE(u[8].lo,u[12].lo,mul_p0q4);
DFT2_TWIDDLE(u[8].hi,u[12].hi,mul_p0q4);
DFT2_TWIDDLE(u[9].lo,u[13].lo,mul_p1q4);
DFT2_TWIDDLE(u[9].hi,u[13].hi,mul_p1q4);
DFT2_TWIDDLE(u[10].lo,u[14].lo,mul_p2q4);
DFT2_TWIDDLE(u[10].hi,u[14].hi,mul_p2q4);
DFT2_TWIDDLE(u[11].lo,u[15].lo,mul_p3q4);
DFT2_TWIDDLE(u[11].hi,u[15].hi,mul_p3q4);
// 8x in-place DFT2 and twiddle (3)
DFT2_TWIDDLE(u[0].lo,u[2].lo,mul_p0q2);
DFT2_TWIDDLE(u[0].hi,u[2].hi,mul_p0q2);
DFT2_TWIDDLE(u[1].lo,u[3].lo,mul_p1q2);
DFT2_TWIDDLE(u[1].hi,u[3].hi,mul_p1q2);
DFT2_TWIDDLE(u[4].lo,u[6].lo,mul_p0q2);
DFT2_TWIDDLE(u[4].hi,u[6].hi,mul_p0q2);
DFT2_TWIDDLE(u[5].lo,u[7].lo,mul_p1q2);
DFT2_TWIDDLE(u[5].hi,u[7].hi,mul_p1q2);
DFT2_TWIDDLE(u[8].lo,u[10].lo,mul_p0q2);
DFT2_TWIDDLE(u[8].hi,u[10].hi,mul_p0q2);
DFT2_TWIDDLE(u[9].lo,u[11].lo,mul_p1q2);
DFT2_TWIDDLE(u[9].hi,u[11].hi,mul_p1q2);
DFT2_TWIDDLE(u[12].lo,u[14].lo,mul_p0q2);
DFT2_TWIDDLE(u[12].hi,u[14].hi,mul_p0q2);
DFT2_TWIDDLE(u[13].lo,u[15].lo,mul_p1q2);
DFT2_TWIDDLE(u[13].hi,u[15].hi,mul_p1q2);
// 8x DFT2 and store (reverse binary permutation)
y[0] = u[0] + u[1];
y[p] = u[8] + u[9];
y[2*p] = u[4] + u[5];
y[3*p] = u[12] + u[13];
y[4*p] = u[2] + u[3];
y[5*p] = u[10] + u[11];
y[6*p] = u[6] + u[7];
y[7*p] = u[14] + u[15];
y[8*p] = u[0] - u[1];
y[9*p] = u[8] - u[9];
y[10*p] = u[4] - u[5];
y[11*p] = u[12] - u[13];
y[12*p] = u[2] - u[3];
y[13*p] = u[10] - u[11];
y[14*p] = u[6] - u[7];
y[15*p] = u[14] - u[15];
}
Note that I have modified the kernel to perform the FFT of 2 complex-valued sequences at once instead of one. Also, since I only need the FFT of 256 elements at a time in a much larger sequence, I perform only 2 runs of this kernel (16 x 16 = 256), which leaves me with 256-length DFTs in the larger array.
Here's some of the relevant host code as well.
var ev = new[] { new Cl.Event() };
var pEv = new[] { new Cl.Event() };
int fftSize = 1;
int iter = 0;
int n = distributionSize >> 5;
while (fftSize <= n)
{
Cl.SetKernelArg(fftKernel, 0, memA);
Cl.SetKernelArg(fftKernel, 1, memB);
Cl.SetKernelArg(fftKernel, 2, fftSize);
Cl.EnqueueNDRangeKernel(commandQueue, fftKernel, 1, null, globalWorkgroupSize, localWorkgroupSize,
(uint)(iter == 0 ? 0 : 1),
iter == 0 ? null : pEv,
out ev[0]).Check();
if (iter > 0)
pEv[0].Dispose();
Swap(ref ev, ref pEv);
Swap(ref memA, ref memB); // ping-pong
fftSize = fftSize << 4;
iter++;
Cl.Finish(commandQueue);
}
Swap(ref memA, ref memB);
Hope this helps someone!

OpenCL traversal kernel - further optimization

Currently, I have an OpenCL kernel for BVH ray traversal, shown below. I'd be glad if someone had some pointers on optimizing this quite large kernel.
The thing is, I'm running this code with a SAH BVH and I'd like to get performance similar to Timo Aila's traversal in his paper (Understanding the Efficiency of Ray Traversal on GPUs). Of course, his code uses a SplitBVH (which I might consider using in place of the SAH BVH, but in my opinion it has really slow build times). But I'm asking about traversal, not the BVH (also, I've so far worked only with scenes where a SplitBVH won't give you much advantage over a SAH BVH).
First of all, here is what I have so far (standard while-while traversal kernel).
__constant sampler_t sampler = CLK_FILTER_NEAREST;
// Inline definition of horizontal max
inline float max4(float a, float b, float c, float d)
{
return max(max(max(a, b), c), d);
}
// Inline definition of horizontal min
inline float min4(float a, float b, float c, float d)
{
return min(min(min(a, b), c), d);
}
// Traversal kernel
__kernel void traverse( __read_only image2d_t nodes,
__global const float4* triangles,
__global const float4* rays,
__global float4* result,
const int num,
const int w,
const int h)
{
// Ray index
int idx = get_global_id(0);
if(idx < num)
{
// Stack
int todo[32];
int todoOffset = 0;
// Current node
int nodeNum = 0;
float tmin = 0.0f;
float depth = 2e30f;
// Fetch ray origin, direction and compute invdirection
float4 origin = rays[2 * idx + 0];
float4 direction = rays[2 * idx + 1];
float4 invdir = native_recip(direction);
float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f);
// Traversal loop
while(true)
{
// Fetch node information
int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
// While node isn't leaf
while(specs.z == 0)
{
// Fetch child bounding boxes
float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));
// Test ray against child bounding boxes
float oodx = origin.x * invdir.x;
float oody = origin.y * invdir.y;
float oodz = origin.z * invdir.z;
float c0lox = n0xy.x * invdir.x - oodx;
float c0hix = n0xy.y * invdir.x - oodx;
float c0loy = n0xy.z * invdir.y - oody;
float c0hiy = n0xy.w * invdir.y - oody;
float c0loz = nz.x * invdir.z - oodz;
float c0hiz = nz.y * invdir.z - oodz;
float c1loz = nz.z * invdir.z - oodz;
float c1hiz = nz.w * invdir.z - oodz;
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth);
float c1lox = n1xy.x * invdir.x - oodx;
float c1hix = n1xy.y * invdir.x - oodx;
float c1loy = n1xy.z * invdir.y - oody;
float c1hiy = n1xy.w * invdir.y - oody;
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth);
bool traverseChild0 = (c0max >= c0min);
bool traverseChild1 = (c1max >= c1min);
nodeNum = specs.x;
int nodeAbove = specs.y;
// We hit just one out of the 2 children
if(traverseChild0 != traverseChild1)
{
if(traverseChild1)
{
nodeNum = nodeAbove;
}
}
// We hit either both or none
else
{
// If we hit none, pop node from stack (or exit traversal, if stack is empty)
if (!traverseChild0)
{
if(todoOffset == 0)
{
break;
}
nodeNum = todo[--todoOffset];
}
// If we hit both
else
{
// Sort them (so nearest goes 1st, further 2nd)
if(c1min < c0min)
{
unsigned int tmp = nodeNum;
nodeNum = nodeAbove;
nodeAbove = tmp;
}
// Push further on stack
todo[todoOffset++] = nodeAbove;
}
}
// Fetch next node information
nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
}
// If node is leaf & has some primitives
if(specs.z > 0)
{
// Loop through primitives & perform intersection with them (Woop triangles)
for(int i = specs.x; i < specs.y; i++)
{
// Fetch first point from global memory
float4 v0 = triangles[i * 4 + 0];
float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z;
float i_z = 1.0f / (direction.x * v0.x + direction.y * v0.y + direction.z * v0.z);
float t = o_z * i_z;
if(t > 0.0f && t < depth)
{
// Fetch second point from global memory
float4 v1 = triangles[i * 4 + 1];
float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z;
float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z;
float u = o_x + t * d_x;
if(u >= 0.0f && u <= 1.0f)
{
// Fetch third point from global memory
float4 v2 = triangles[i * 4 + 2];
float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z;
float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z;
float v = o_y + t * d_y;
if(v >= 0.0f && u + v <= 1.0f)
{
// We got successful hit, store the information
depth = t;
temp.x = u;
temp.y = v;
temp.z = t;
temp.w = as_float(i);
}
}
}
}
}
// Pop node from stack (if empty, finish traversal)
if(todoOffset == 0)
{
break;
}
nodeNum = todo[--todoOffset];
}
// Store the ray traversal result in global memory
result[idx] = temp;
}
}
The first question of the day is: how could one write his persistent while-while and speculative while-while kernels in OpenCL?
Regarding persistent while-while, do I get it right that I actually just start the kernel with a global work size equal to the local work size, and that both of these numbers should be equal to the warp/wavefront size of the GPU?
I get that with CUDA the persistent thread implementation looks like this:
do
{
volatile int& jobIndexBase = nextJobArray[threadIndex.y];
if(threadIndex.x == 0)
{
jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE);
}
index = jobIndexBase + threadIndex.x;
if(index >= totalJobs)
return;
/* Perform work for task numbered 'index' */
}
while(true);
What would the equivalent look like in OpenCL? I know I'll have to put some barriers in there; I also know that one should go right after the point where I atomically add WARP_SIZE to warpCounter.
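My rough, untested guess at the OpenCL equivalent is something like this (assuming one work-group per wavefront, a __global uint *warpCounter zeroed by the host, and totalJobs passed as a kernel argument):
__local uint jobIndexBase;                  // shared by the whole work-group
uint lid = get_local_id(0);
bool done = false;
while (!done)
{
    if (lid == 0)
        jobIndexBase = atomic_add(warpCounter, (uint)get_local_size(0));
    barrier(CLK_LOCAL_MEM_FENCE);           // make jobIndexBase visible to every work-item
    uint index = jobIndexBase + lid;
    if (index < totalJobs)
    {
        /* Perform work for task numbered 'index' */
    }
    done = (jobIndexBase >= totalJobs);     // uniform across the group, so the loop exit is uniform
    barrier(CLK_LOCAL_MEM_FENCE);           // don't let work-item 0 overwrite jobIndexBase too early
}
Is that roughly the right shape, or am I missing something?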
Regarding speculative traversal: I don't really have any idea how this should be implemented in OpenCL, so any hints are welcome. I also have no idea where to put the barriers (putting them around a simulated __any results in a driver crash).
If you made it here, thanks for reading & any hints, answers, etc. are welcome!
An optimization you can do is to use vector variables and the fused multiply-add function (fma) to speed up your setup math. As for the rest of the kernel, it is slow because it is branchy. If you can make assumptions about the input data, you might be able to reduce the execution time by reducing the code branches. I have not checked the float4 swizzles (the .xxyy and .x/.y/.z/.w after the float4 variables), so just check those.
float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));
float4 oodf4 = -origin * invdir;
float4 c0xyf4 = fma(n0xy, invdir.xxyy, oodf4.xxyy);
float4 c0zc1z = fma(nz, invdir.zzzz, oodf4.zzzz);
float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.x, c0zc1z.y), tmin);
float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.x, c0zc1z.y), depth);
float4 c1xy = fma(n1xy, invdir.xxyy, oodf4.xxyy);
float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin);
float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);

Using Recursion for 3D Array Manipulation -- Causing StackOverflow (not Infinite!)

I posted a question yesterday about a similar issue, but I have coded up something a little different and now have a different problem. Here is my code that is causing a StackOverflowError.
** Note that the 3D grid array is upwards of 1 million elements and can reach around 64 million elements (it stores enums).
** Also note that this is not infinite recursion. On small data sets, this algorithm works fine.
Is this likely caused by the extreme depth of recursion? How do I handle it (this is an essential part of my algorithm!)? I've done some research and have heard of using a queue, even for just massive for-loops.
What will reduce the likelihood of causing a stack overflow?
Thank you!
/**
* Fills all void cells in the 3D grid of Atom.
*
* @param x
* The starting x coordinate
* @param y
* The starting y coordinate
* @param z
* The starting z coordinate
*/
private void fillAllVoidCells(int x, int y, int z)
{
// Base case -- If not BLOATED_ATOM, BOUNDING_BOX,
// or VOID then must be a cavity (only 4 CellType
// enum types.
if ((grid[x][y][z] == CellType.BLOATED_ATOM)
|| grid[x][y][z] == CellType.BOUNDING_BOX
|| grid[x][y][z] == CellType.VOID)
{
// Pop off runtime stack
return;
}
else
{
// Set to void then check all surrounding cells.
grid[x][y][z] = CellType.VOID;
fillAllVoidCells(x + 1, y, z); // right
fillAllVoidCells(x - 1, y, z); // left
fillAllVoidCells(x, y + 1, z); // in front
fillAllVoidCells(x, y - 1, z); // behind
fillAllVoidCells(x, y, z + 1); // above
fillAllVoidCells(x, y, z - 1); // below
}
}
===== EDIT ====== New Method Implemented Using a Stack (per Roee Gavirel's help)
Would this be a correct implementation?
// ----------------------------------------------------------
/**
* Fills all void cells in the 3D grid of Atom.
*
* @param x
* The starting x coordinate
* @param y
* The starting y coordinate
* @param z
* The starting z coordinate
*/
private void fillAllVoidCells(int x, int y, int z)
{
Point p = new Point(x, y, z);
stack.push(p);
while (!stack.isEmpty())
p = stack.top();
stack.pop();
// Base case -- If not BLOATED_ATOM, BOUNDING_BOX,
// or VOID then must be a cavity (only 4 CellType
// enum types.
CellType state = grid[p.x][p.y][p.z];
if ((state == CellType.BLOATED_ATOM) || state == CellType.BOUNDING_BOX
|| state == CellType.VOID)
{
return;
}
else
{
// Set to void then check all surrounding cells.
grid[p.x][p.y][p.z] = CellType.VOID;
Point tempP = p;
tempP.x = p.x - 1;
stack.push(tempP);
tempP.x = p.x + 1;
stack.push(tempP);
tempP.x = p.x; // return to original x coordinate
tempP.y = p.y - 1;
stack.push(tempP);
tempP.y = p.y + 1;
stack.push(tempP);
tempP.y = p.y; // return to original y coordiante
tempP.z = p.z - 1;
stack.push(tempP);
tempP.z = p.z + 1;
stack.push(tempP);
tempP.z = p.z; // return to original z coordinate
}
}
This is most likely what causes the overflow. What you can (and should) do to avoid it is to use your own stack for the data and avoid recursion.
In your case:
1. Have a stack of points (x, y, z), initialized with the point you originally called fillAllVoidCells with.
2. While the stack is not empty, pop a point and do your checks on it.
3. If it's a cavity, mark it and add the surrounding points to the stack.
==EDIT==
Something like this:
struct point {
int x,y,z;
};
private void fillAllVoidCells(int x, int y, int z)
{
std::list<point> Ps;
point p;
p.x = x;
p.y = y;
p.z = z;
Ps.push_back(p);
while (!Ps.empty())
{
p = Ps.back();
Ps.pop_back();
// Base case -- If not BLOATED_ATOM, BOUNDING_BOX,
// or VOID then it must be a cavity (only 4 CellType
// enum types).
auto state = grid[p.x][p.y][p.z];
if ((state == CellType.BLOATED_ATOM)
|| state == CellType.BOUNDING_BOX
|| state == CellType.VOID)
{
continue;
}
else
{
// Set to void then push all surrounding cells.
grid[p.x][p.y][p.z] = CellType.VOID;
point tempP = p;
tempP.x = p.x - 1;
Ps.push_back(tempP);
tempP.x = p.x + 1;
Ps.push_back(tempP);
tempP.x = p.x; // restore x before varying y
tempP.y = p.y - 1;
Ps.push_back(tempP);
tempP.y = p.y + 1;
Ps.push_back(tempP);
tempP.y = p.y; // restore y before varying z
tempP.z = p.z - 1;
Ps.push_back(tempP);
tempP.z = p.z + 1;
Ps.push_back(tempP);
}
}
}
