Mandelbrot set using OpenCL - opencl

Trying to use the same code (sort of) as what I have used when running using TBB (threading building blocks).
I don't have a great deal of experience with OpenCL, but I think most of the main code is correct. I believe the errors are in the .cl file, where it does the math.
Here is my mandelbrot code in TBB:
Mandelbrot TBB
Here is my code in OpenCL
Mandelbrot OpenCL
Any help would be greatly appreciated.

I changed the code in the kernel, and it ran fine. My new kernel code is the following:
// Mandelbrot kernel: each work-item iterates Z = Z^2 + C for one pixel and
// writes one colour to outputImage. (The kernel is still named voronoiL, but
// nothing below computes a Voronoi diagram or uses local memory.)
// NOTE(review): in this listing the block has no braces, `iteration` is never
// modified after its initialisation, and the `else if(col3)` near the end has
// no preceding `if` -- some statements appear to be missing; confirm against
// the working kernel before reusing this text.
kernel void voronoiL(write_only image2d_t outputImage)
// pixel coordinates of this work-item and the size of the global range
int x = get_global_id(0);
int y = get_global_id(1);
int w = get_global_size(0);
int h = get_global_size(1);
// default colour: opaque black
float4 result = (float4)(0.0f,0.0f,0.0f,1.0f);
// region of the complex plane mapped onto the image; MaxIm is derived from
// the real range and h/w
float MinRe = -2.0f;
float MaxRe = 1.0f;
float MinIm = -1.5f;
float MaxIm = MinIm+(MaxRe-MinRe)*h/w;
// complex-plane step per pixel in each direction
float Re_factor = (MaxRe-MinRe)/(w-1);
float Im_factor = (MaxIm-MinIm)/(h-1);
// iteration limit (declared as float, so the comparisons below are done in float)
float MaxIterations = 50;
// imaginary part of C for this pixel (row 0 maps to MaxIm)
float c_im = MaxIm - y*Im_factor;
// real part of C for this pixel
float c_re = MinRe + x*Re_factor;
// Z starts at C
float Z_re = c_re, Z_im = c_im;
// classification flags: still bounded / escaped in the first half of the
// iterations (col2) / escaped in the second half (col3)
bool isInside = true;
bool col2 = false;
bool col3 = false;
int iteration =0;
for(int n=0; n<MaxIterations; n++)
// squares of the real and imaginary parts of Z
float Z_re2 = Z_re*Z_re, Z_im2 = Z_im*Z_im;
// escape test: |Z|^2 > 4, i.e. |Z| > 2
if(Z_re2 + Z_im2 > 4)
if(n >= 0 && n <= (MaxIterations/2-1))
col2 = true;
isInside = false;
else if(n >= MaxIterations/2 && n <= MaxIterations-1)
col3 = true;
isInside = false;
// Z = Z^2 + C; the imaginary part is updated first because it needs the old Z_re
Z_im = 2*Z_re*Z_im + c_im;
Z_re = Z_re2 - Z_im2 + c_re;
// colour selection from the flags set above
result = (float4)(iteration*0.05f,0.0f, 0.0f, 1.0f);
else if(col3)
// NOTE(review): 255 is used as a float colour component here while the other
// components are small fractions -- confirm the intended range
result = (float4)(255, iteration*0.05f, iteration*0.05f, 1.0f);
else if(isInside)
result = (float4)(0.0f, 0.0f, 0.0f, 1.0f);
write_imagef(outputImage, (int2)(x, y), result);
You can also find it here:

See this link. It was developed by Eric Bainville. The CPU code, both native and with OpenCL, is not optimal (it does not use SSE/AVX), but I think the GPU code may be good. For the CPU you can speed up the code quite a bit by using AVX and operating on eight pixels at once.


Magnetometer biases issue

I am using an LIS3MDL magnetometer in my quadcopter project to compensate for gyroscope drift. Unfortunately, I'm having problems, probably with the calibration.
I've recorded the max and min values (strangely, they span 14 bits instead of 16) and calculated the biases like this:
biases[i] = (MAX_VALUES[i]+MIN_VALUES[i])/2;
(where i represent each of 3 axis).
I've subtracted the biases from the raw values, x = (double)new_x-biases[0]; (etc.), and then wanted to calculate the heading like this:
heading = atan2(x,y);
heading += declinationAngle;
where declination angle is calculated.
The results are angles (converted from radians with heading*(180/M_PI)), and the value does change when I rotate the quad about the yaw axis, BUT it also changes when I rotate it about the roll and pitch axes. I want to achieve a stable yaw value that does not change when I rotate the object about the other axes. Maybe some kind of fusion with the accelerometer?
I am not sure where I've made a mistake in my calculations...
Whole class:
// Wrapper around an LIS3MDL magnetometer: applies hard-coded min/max
// calibration (per-axis bias and scale) to the raw readings and derives a
// heading from the corrected x/y components.
// NOTE(review): this listing has lost most closing braces and possibly some
// statements (e.g. no call that refreshes mag.m is visible anywhere below).
class Magnetometer {
// corrected readings; declared int although Update() assigns double
// expressions to them, so the fractional part is dropped
int x=0,y=0,z=0;
LIS3MDL mag;
// per-axis extremes collected by Calibrate()
int running_min[3] = {32767, 32767, 32767}, running_max[3] = {-32768, -32768, -32768};
double mag_norm = 0.0;
// magnetic declination in radians, set in Init()
double declinationAngle = 0.0;
// last heading in radians, set by CalculateHeading()
double heading=0.0;
// hard-coded per-axis calibration extremes (x, y, z)
const int MAX_VALUES[3] = {3014,3439,10246};
const int MIN_VALUES[3] = {-4746, -4110, 492};
// per-axis offset and scale correction, filled in by Init()
double biases[3] = {0.0};
double scales[3] = {0.0};
double avg_scale = 0.0;
ButterworthDLPF xyz_filter[3];
// NOTE(review): a flag initialised with `true` but declared double
double DLPF_ON = true;
const float sample_rate = MAG_SAMPLE_RATE;
const float cutoff_freq = 4.0;
Magnetometer() {}
// Computes declination, biases and scales, starts the sensor and the filters.
void Init() {
// 6 degrees 8 minutes, converted from degrees to radians
declinationAngle = (6.0 + (8.0 / 60.0)) / (180.0 / M_PI);
for(int i=0; i<3; i++) {
// bias = midpoint of the extremes, scale = half of their span.
// NOTE(review): both operands are int, so the division by 2 truncates
// before the result is stored in the double (e.g. the y-axis midpoint
// -671/2 becomes -335, not -335.5)
biases[i] = (MAX_VALUES[i]+MIN_VALUES[i])/2;
scales[i] = (MAX_VALUES[i]-MIN_VALUES[i])/2;
// mean half-span over the three axes; each axis is then scaled towards it
avg_scale = (scales[0]+scales[1]+scales[2])/3.0;
for(int i=0; i<3; i++) scales[i] = avg_scale / scales[i];
Serial.println("Turning on magnetometer. . .");
if(!mag.init()) {
Serial.println("Failed to detect magnetometer!");
for(int i=0; i<3; i++) xyz_filter[i].Init(sample_rate, cutoff_freq);
Serial.println("9DOF readdy!");
// Endless loop that tracks the per-axis minimum and maximum of mag.m and
// prints them (max x y z, then min x y z) on every pass.
void Calibrate() {
while(true) {;
if(running_max[0]<mag.m.x) running_max[0] = mag.m.x;
if(running_max[1]<mag.m.y) running_max[1] = mag.m.y;
if(running_max[2]<mag.m.z) running_max[2] = mag.m.z;
if(running_min[0]>mag.m.x) running_min[0] = mag.m.x;
if(running_min[1]>mag.m.y) running_min[1] = mag.m.y;
if(running_min[2]>mag.m.z) running_min[2] = mag.m.z;
Serial.println((String)running_max[0]+" "+(String)running_max[1]+" "+(String)running_max[2]+ " "+(String)running_min[0] +" "+(String)running_min[1]+" "+(String)running_min[2]);
// Applies bias and scale to the current raw reading and stores it in x/y/z.
// The commented-out variant takes the values from the low-pass filters instead.
void Update(){;
/*x = ((double)xyz_filter[0].getData()-biases[0])*scales[0];
y = ((double)xyz_filter[1].getData()-biases[1])*scales[1];
z = ((double)xyz_filter[2].getData()-biases[2])*scales[2];*/
x = ((double)mag.m.x-biases[0])*scales[0];
y = ((double)mag.m.y-biases[1])*scales[1];
z = ((double)mag.m.z-biases[2])*scales[2];
// Heading (radians) from the corrected x and y components plus declination.
// Only x and y are used; z and the sensor's roll/pitch do not enter the
// calculation, and the wrap to [0, 2*PI) is commented out.
void CalculateHeading() {
heading = atan2(y,x);
heading += declinationAngle;
//if(heading<0) heading += 2*PI;
//else if(heading>2*PI) heading -= 2*PI;
// Returns the heading computed by the last CalculateHeading() call, in radians.
double GetHeading() {return heading;}
// Prints the corrected x/y/z values (despite the name, not the raw ones),
// optionally prefixed with axis labels.
void ShowRawValues(bool names=false) {
if(names) Serial.print("X: "+(String)x+" Y: "+ (String)y+ " Z: " + (String)z);
else Serial.print((String)x+" "+ (String)y+ " " + (String)z);

Different results GPU & CPU when more than one 8 work items per group

I'm new to OpenCL. As my first project I tried to write code that checks for intersections between many polylines and a single polygon.
I'm running the code on both the CPU and the GPU, and I get different results.
First I sent NULL as local parameter when called clEnqueueNDRangeKernel.
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, null, 2, &evtCalcBounds, &evtKernel);
After trying many things I saw that if I pass 1 as the local size it works correctly and returns the same results for the CPU and the GPU.
size_t local = 1;
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, &local, 2, &evtCalcBounds, &evtKernel);
I played a bit more and found that the CPU returns wrong results when I run the kernel with a local size of 8 or more (for some reason).
I'm not using any local memory, just globals and privates.
I didn't add the code because I think it is irrelevant to the problem (note that with a single work-item per work group it works correctly), and it is long. If it is needed, I will try to simplify it.
The code flow is going like this:
I have the polyline coordinates stored in one big buffer and the single polygon in another. In addition, I'm providing another buffer with a single int that holds the current result count. All buffers are __global arguments.
In the kernel I'm simply checking for intersections between all the segments of "polyline[get_global_id(0)]" and the segments of the polygon. If one is found,
I'm using atomic_inc on the result count. Nothing reads and writes the same buffer, and there are no barriers or memory fences; the atomic_inc is the only thread-safety mechanism I'm using.
-- UPDATE --
Added my code:
I know that I could probably make better use of OpenCL functions for some of the vector calculations, but for now I'm simply converting code from my old single-threaded CPU program to CL, so this is not my concern right now.
// Even-odd (crossing number) point-in-polygon test.
//
// polygon layout as read here: polygon[4] holds the vertex count and the
// vertices follow from index 5 as consecutive x,y pairs.
// Returns true when the horizontal ray from (x, y) crosses the polygon
// outline an odd number of times.
bool isPointInPolygon(float x, float y, __global float* polygon) {
    const uint firstVertex = 5;
    const uint length = convert_uint(polygon[4]);
    bool blnInside = false;

    // j trails i by one vertex and starts at the last vertex, so every edge
    // (j -> i), including the closing one, is visited exactly once. With
    // length == 0 the loop body never runs, so the wrapped j is never used.
    for (uint i = 0, j = length - 1; i < length; j = i++) {
        const float xi = polygon[firstVertex + i * 2];
        const float yi = polygon[firstVertex + i * 2 + 1];
        const float xj = polygon[firstVertex + j * 2];
        const float yj = polygon[firstVertex + j * 2 + 1];

        // The first condition guarantees yi != yj, so the division is safe.
        if (((yi > y) != (yj > y)) &&
            (x < (xj - xi) * (y - yi) / (yj - yi) + xi)) {
            blnInside = !blnInside;
        }
    }
    return blnInside;
}
// Axis-aligned rectangle overlap test.
// Each rectangle is given as (minX, minY, maxX, maxY). Rectangles that only
// touch along an edge or at a corner count as intersecting.
bool isRectanglesIntersected(float p_dblMinX1, float p_dblMinY1,
                             float p_dblMaxX1, float p_dblMaxY1,
                             float p_dblMinX2, float p_dblMinY2,
                             float p_dblMaxX2, float p_dblMaxY2) {
    // The rectangles are disjoint exactly when one lies strictly to one side
    // of the other on either axis.
    return !(p_dblMinX1 > p_dblMaxX2 ||
             p_dblMaxX1 < p_dblMinX2 ||
             p_dblMinY1 > p_dblMaxY2 ||
             p_dblMaxY1 < p_dblMinY2);
}
// Tests whether the infinite line through A-B and the infinite line through
// C-D intersect, i.e. whether they are both well defined and not parallel.
//
// NOTE: this does not check that the intersection point lies within the two
// segments; lines whose segments are far apart still return true.
// NOTE(review): the parameters are double; in OpenCL that requires
// double-precision support on the device -- confirm for the target GPUs.
bool isLinesIntersects(
    double Ax, double Ay,
    double Bx, double By,
    double Cx, double Cy,
    double Dx, double Dy) {
    // Fail if either line is undefined (its two points coincide).
    if ((Ax == Bx && Ay == By) || (Cx == Dx && Cy == Dy))
        return false;

    // (1) Translate the system so that point A is on the origin.
    Bx -= Ax; By -= Ay;
    Cx -= Ax; Cy -= Ay;
    Dx -= Ax; Dy -= Ay;

    // Length of A-B and the rotation that puts B on the positive X axis.
    double distAB = sqrt(Bx*Bx + By*By);
    double theCos = Bx / distAB;
    double theSin = By / distAB;

    // (2) Only the rotated Y coordinates of C and D are needed: the lines are
    // parallel exactly when C and D end up at the same height above A-B.
    double rotatedCy = Cy*theCos - Cx*theSin;
    double rotatedDy = Dy*theCos - Dx*theSin;

    return rotatedCy != rotatedDy;
}
// Tests one polyline against the polygon.
//
// polylines layout at startIdx: [pointCount, x, y, x, y, ...].
// polygon layout: indices 0..3 hold the bounding rectangle, polygon[4] the
// vertex count, and the vertices follow from index 5 as x,y pairs.
//
// Returns true as soon as any polyline segment intersects any polygon edge
// (according to isLinesIntersects); otherwise returns whether the last
// polyline point lies inside the polygon.
bool isPolygonInersectsPolyline(__global float* polygon, __global float* polylines, uint startIdx) {
    const uint firstPolygonVertex = 5;
    const uint polylineLength = convert_uint(polylines[startIdx]);
    const uint start = startIdx + 1;
    const uint polygonLength = convert_uint(polygon[4]);
    const uint polygonLength2 = polygonLength * 2;

    // A polygon without vertices has no edges and contains no point.
    if (polygonLength == 0)
        return false;

    float x1 = polylines[start];
    float y1 = polylines[start + 1];

    // "seg + 1 < polylineLength" instead of "seg < polylineLength - 1": the
    // count is unsigned, so the subtraction would wrap around for a count of 0.
    for (uint seg = 0; seg + 1 < polylineLength; ++seg) {
        const float x2 = polylines[start + seg * 2 + 2];
        const float y2 = polylines[start + seg * 2 + 3];

        // Start the edge walk at the first polygon vertex. Indices 0 and 1
        // belong to the bounding rectangle, not to the outline.
        float polyX1 = polygon[firstPolygonVertex];
        float polyY1 = polygon[firstPolygonVertex + 1];

        for (uint v = 0; v < polygonLength; ++v) {
            // The modulo wraps the last edge back to the first vertex.
            const float polyX2 = polygon[firstPolygonVertex + (v * 2 + 2) % polygonLength2];
            const float polyY2 = polygon[firstPolygonVertex + (v * 2 + 3) % polygonLength2];

            if (isLinesIntersects(x1, y1, x2, y2, polyX1, polyY1, polyX2, polyY2)) {
                return true;
            }

            polyX1 = polyX2;
            polyY1 = polyY2;
        }

        x1 = x2;
        y1 = y2;
    }

    // No intersection found till now so we check containing
    return isPointInPolygon(x1, y1, polygon);
}
// One work-item per polyline: if the polyline's bounding rectangle overlaps
// the polygon's and the detailed test succeeds, the polyline index is appended
// to output and resCount is incremented.
//
// resCount is only ever incremented here, so it has to hold 0 before launch.
// Slots in output are handed out in the order in which work-items reach the
// atomic_inc, so the ORDER of the stored indices depends on scheduling and can
// differ between devices, work-group sizes and runs; compare results as a set.
__kernel void calcIntersections(__global float* polylines, // My flat points array - [pntCount, x,y,x,y,...., pntCount, x,y,... ]
                                __global float* pBounds,   // The rectangle bounds of each polyline - set of 4 values [top, left, bottom, right....]
                                __global uint* pStarts,    // The start index of each polyline in the polylines array
                                __global float* polygon,   // The polygon i want to intersect with - first 4 items are the rectangle bounds [top, left, bottom, right, pntCount, x,y,x,y,x,y....]
                                __global float* output,    // Result array for saving the intersections polylines indices
                                __global uint* resCount)   // The result count
{
    int i = get_global_id(0);
    uint start = pStarts[i]; // already a uint, no conversion needed

    // Cheap rectangle rejection first, detailed test only for candidates.
    if (isRectanglesIntersected(pBounds[i * 4], pBounds[i * 4 + 1], pBounds[i * 4 + 2], pBounds[i * 4 + 3],
                                polygon[0], polygon[1], polygon[2], polygon[3]) &&
        isPolygonInersectsPolyline(polygon, polylines, start)) {
        // atomic_inc returns the value before the increment, which gives this
        // work-item a slot of its own.
        uint slot = atomic_inc(resCount);
        output[slot] = i;
    }
}
Can anyone explain it to me ?

Using a distance sensor in Processing to control the attributes of shapes

I'm trying to make a program that will use the readings it gets from a distance sensor to control the attributes of circles (size, xy and colour). To do this I'm trying to make it record the current value and apply that to the value when you press the relevant key (Eg. press 's' and it changes the size to whatever the distance was at that point). - Ideally I'd like the circle to change whatever field is next dynamically as you move your hand over the sensor, but that seems a bit beyond me.
I've tried to do as much as I can, but everything I'm not sure of I've commented out. Any tips or advice? I'm really not sure what I'm doing when it comes to classes and constructors.
EDIT: When I run the code, nothing happens.
import processing.serial.*;
int xpos, ypos, s, r, g, b;
Circle circle;
int shapeSize, distance;
String comPortString;
Serial myPort;
void setup(){
size(displayWidth,displayHeight); //Use entire screen size.
//Open the serial port for communication with the Arduino
myPort = new Serial(this, "/dev/cu.usbmodem1411", 9600);
myPort.bufferUntil('\n'); // Trigger a SerialEvent on new line
void draw(){
delay(50); //Delay used to refresh screen
// Serial callback: reads one line from the port, parses it as an integer and
// maps it to a screen distance stored in the sketch-level `distance`.
// NOTE(review): closing braces are missing in this listing, and the final
// `distance = 0;` appears without the condition described in the comment above
// it -- as shown it would always overwrite the mapped value.
void serialEvent(Serial cPort){
// NOTE(review): the result of `new String(...)` is never null, so the check
// below cannot fail; a null return from readBytesUntil would have to be
// tested before constructing the String
comPortString = (new String(cPort.readBytesUntil('\n')));
if(comPortString != null) {
/* Use the distance received by the Arduino to modify the y position
of the first square (others will follow). Should match the
code settings on the Arduino. In this case 200 is the maximum
distance expected. The distance is then mapped to a value
between 1 and the height of your screen */
// NOTE(review): the string is parsed without trimming; presumably it still
// carries the line terminator, which Integer.parseInt rejects -- confirm
distance = int(map(Integer.parseInt(comPortString),1,200,1,height));
/*If computer receives a negative number (-1), then the
sensor is reporting an "out of range" error. Convert all
of these to a distance of 0. */
distance = 0;
void keyPressed()
// N for new circle (and keep old one)
if((key == 'N') || (key == 'n')) {
circle = new Circle(1,1,1,1,1,1);
//r - change red
if((key == 'R') || (key == 'r')) {
float red = map(distance, 0, 700, 0, 255);
r = int(red);
println("r " + r);
//g - change green
if((key == 'G') || (key == 'g')) {
float green = map(distance, 0, 700, 0, 255);
g = int(green);
println("g " + g);
//b - change blue
if((key == 'B') || (key == 'b')) {
float blue = map(distance, 0, 700, 0, 255);
b = int(blue);
println("b " + b);
//S - change Size
if((key == 'S') || (key == 's')) {
s = distance;
println("s " + s);
//X - change x pos
if((key == 'X') || (key == 'x')) {
xpos = distance;
println("x " + xpos);
//y - change y pos
if((key == 'Y') || (key == 'y')) {
ypos = distance;
println("y " + ypos);
// Circle as posted in the question. The class declares no fields of its own.
// NOTE(review): closing braces are missing in this listing.
class Circle {
// Draws an ellipse of diameter s at (xpos, ypos) once, at construction time;
// none of the arguments are stored.
Circle(int xpos, int ypos, int s, int r, int g, int b){
ellipse(xpos, ypos, s, s);
// NOTE(review): the value returned by color() is discarded, so this line has
// no visible effect on the ellipse drawn above
color(r, g, b);
// With no xpos field in the class, this returns the sketch-level xpos.
int getX(){
return xpos;
// With no ypos field in the class, this returns the sketch-level ypos.
int getY(){
return ypos;
I would split this into steps/tasks:
Connecting to the Arduino
Reading values from Arduino
Mapping read values
Controlling mapping
You've got the Arduino part pretty much there, but things look messy when trying to map read values to the circle on screen.
For now, for simplicity reasons, let's ignore classes and focus on simply drawing a single ellipse with x,y,size,r,g,b properties.
To get rid of jitter you should update the ellipse property continuously, not just when a key is pressed. On the key event you should simply change which property gets updated.
You could use extra variables to keep track of what ellipse properties you're updating.
Here's a refactored version of the code based on the points above:
import processing.serial.*;
int xpos,ypos,s,r,g,b;
int distance;
int propertyID = 0;//keep track of what property should be updated on distance
int PROP_XPOS = 0;
int PROP_YPOS = 1;
int PROP_S = 2;
int PROP_R = 3;
int PROP_G = 4;
int PROP_B = 5;
void setup(){
//setup some defaults to see something on screen
xpos = ypos = 200;
s = 20;
r = g = b = 127;
//initialize arduino - search for port based on OSX name
String[] portNames = Serial.list();
for(int i = 0 ; i < portNames.length; i++){
Serial arduino = new Serial(this,portNames[i],9600);
}catch(Exception e){
void showSerialError(){
System.err.println("Error connecting to Arduino!\nPlease check the USB port");
void draw(){
void serialEvent(Serial arduino){
String rawString = arduino.readString();//fetch raw string
if(rawString != null){
String trimmedString = rawString.trim();//trim the raw string
int rawDistance = int(trimmedString);//convert to integer
distance = (int)map(rawDistance,1,200,1,height);
updatePropsOnDistance();//continously update circle properties
void updatePropsOnDistance(){
if(propertyID == PROP_XPOS) xpos = distance;
if(propertyID == PROP_YPOS) ypos = distance;
if(propertyID == PROP_S) s = distance;
if(propertyID == PROP_R) r = distance;
if(propertyID == PROP_G) g = distance;
if(propertyID == PROP_B) b = distance;
void keyReleased(){//only change what proprty changes on key press
if(key == 'x' || key == 'X') propertyID = PROP_XPOS;
if(key == 'y' || key == 'Y') propertyID = PROP_YPOS;
if(key == 's' || key == 'S') propertyID = PROP_S;
if(key == 'r' || key == 'R') propertyID = PROP_R;
if(key == 'g' || key == 'G') propertyID = PROP_G;
if(key == 'b' || key == 'B') propertyID = PROP_B;
//usually a good idea to test - in this case use mouseY instead of distance sensor
void mouseDragged(){
distance = mouseY;
If this makes sense, it can easily be encapsulated in a class.
We could use an array to store those properties, but if something like props[0] for x, props[1] for y, etc. is harder to read, you could use an IntDict, which allows you to index values by a String instead of an integer index (so you can do props["x"] instead of props[0]).
Here's an encapsulated version of the code:
import processing.serial.*;
Circle circle = new Circle();
void setup(){
//initialize arduino - search for port based on OSX name
String[] portNames = Serial.list();
for(int i = 0 ; i < portNames.length; i++){
Serial arduino = new Serial(this,portNames[i],9600);
}catch(Exception e){
void showSerialError(){
System.err.println("Error connecting to Arduino!\nPlease check the USB port");
void draw(){
void serialEvent(Serial arduino){
String rawString = arduino.readString();
if(rawString != null){
String trimmedString = rawString.trim();
int rawDistance = int(trimmedString);
int distance = (int)map(rawDistance,1,200,1,height);
void keyReleased(){
circle.setUpdateProperty(key+"");//update the circle property based on what key gets pressed. the +"" is a quick way to make a String from the char
//usually a good idea to test - in this case use mouseY instead of distance sensor
void mouseDragged(){
class Circle{
//an IntDict (integer dictionary) is an associative array where instead of accessing values by an integer index (e.g. array[0]
//you access them by a String index (e.g. array["name"])
IntDict properties = new IntDict();
String updateProperty = "x";//property to update
void draw(){
void setUpdateProperty(String prop){
if(properties.hasKey(prop)) updateProperty = prop;
println("circle does not contain property: " + prop+"\navailable properties:");
void update(int value){
In both examples you can test the distance value by dragging your mouse on the Y axis.
Regarding the HC-SR04 sensor, you can find code on the Arduino Playground to get the distance in cm. I haven't used the sensor myself yet, but I notice other people have had some issues with it, so it's worth checking this post as well. If you want to roll your own Arduino code, no problem, you can use the HC-SR04 datasheet (pdf link) to get the formula:
Formula: uS / 58 = centimeters or uS / 148 =inch; or: the range = high
level time * velocity (340M/S) / 2; we suggest to use over 60ms
measurement cycle, in order to prevent trigger signal to the echo
It's important to get accurate values (you'll avoid jitter when using these to draw in Processing). Additionally you can use easing or a moving average.
Here's a basic moving average example:
int historySize = 25;//remember a number of past values
int[] x = new int[historySize];
int[] y = new int[historySize];
void setup(){
void draw(){
//draw original trails in red
//compute moving average
float avgX = average(x,mouseX);
float avgY = average(y,mouseY);
//draw moving average in green
void mouseReleased(){
float average(int[] values,int newValue){
//shift elements by 1, from the last to the 2nd: count backwards
float total = 0;
int size = values.length;
for(int i = size-1; i > 0; i--){//count backwards
values[i] = values[i-1];//copy previous value into current
total += values[i];//add values to total
values[0] = newValue;//add the newest value at the start of the list
total += values[0];//add the latest value to the total
return (float)total/size;//return the average

Optimizing kernel shuffled keys code - OpenCL

I have just started getting into OpenCL and am going through the basics of writing kernel code. I have written a kernel for calculating shuffled keys for an array of points. For N points, the shuffled keys are calculated three bits at a time, where the x-bit at depth $d$ ($0 \le d < \text{level}$ in the kernel below) is given as:
$$x_d = \begin{cases} 0 & \text{if } p.x < C_d.x \\ 1 & \text{otherwise} \end{cases}$$
The Shuffled xyz key is given as:
The Kernel code written is given below. The point is inputted in a column major format.
__constant float3 boundsOffsetTable[8] = {
// Returns x with bit `position` set (0 = least significant bit).
uint setBit(uint x,unsigned char position)
{
    // 1u keeps the shift unsigned, so bit 31 can be set without overflowing a
    // signed int.
    return x | (1u << position);
}
// Question version of the shuffled-key kernel: one work-item per point, three
// key bits (x, y, z) per level.
// point is read column-major: x at [i], y at [size+i], z at [2*size+i].
// NOTE(review): braces are missing in this listing.
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt;
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
// clear this point's key in global memory
code[i] = 0;
float3 newCenter;
float newRadius;
// level 0: one bit per axis, set when the point lies above the root center.
// NOTE(review): `code` is the pointer parameter, not code[i]; these lines pass
// the pointer to setBit and assign the result back to the pointer
if(pt.x>center.x) code = setBit(code,0);
if(pt.y>center.y) code = setBit(code,1);
if(pt.z>center.z) code = setBit(code,2);
for(int l = 1;l<level;l++)
// this inner `i` hides the work-item index declared above
for(int i=0;i<8;i++)
// candidate child cell; center and radius themselves are not modified in the
// lines shown, so every level evaluates the same eight cells
newRadius = radius *0.5;
// NOTE(review): spelled boundOffsetTable here, while the table declared above
// this kernel is named boundsOffsetTable
newCenter = center + boundOffsetTable[i]*radius;
// is the point strictly inside this child cell?
if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
// bits 3*l .. 3*l+2 hold the x/y/z decisions of level l
if(pt.x>newCenter.x) code = setBit(code,3*l);
if(pt.y>newCenter.y) code = setBit(code,3*l+1);
if(pt.z>newCenter.z) code = setBit(code,3*l+2);
It works but I just wanted to ask if I am missing something in the code and if there is an way to optimize the code.
Try this kernel:
// Computes the shuffled key of one point per work-item.
//
// point:  column-major coordinates -- x at [i], y at [size+i], z at [2*size+i]
// code:   receives the key of point i
// level:  number of levels; level l contributes bits 3*l (x), 3*l+1 (y), 3*l+2 (z)
// center, radius: root cell
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    float3 pt = (float3)(point[i], point[size+i], point[2*size+i]);

    // Build the key in a private variable and write it to global memory once.
    uint res = 0;

    // Level 0: compare against the root center.
    if(pt.x>center.x) res = setBit(res,0);
    if(pt.y>center.y) res = setBit(res,1);
    if(pt.z>center.z) res = setBit(res,2);

    // radius is never modified below, so the child radius is loop-invariant.
    // NOTE(review): center/radius are not refined from one level to the next,
    // so every level tests the same eight child cells -- confirm this is intended.
    const float newRadius = radius * 0.5f;

    for(int l = 1;l<level;l++)
    {
        // `child` instead of a second `i`, so the work-item index is not shadowed
        for(int child=0;child<8;child++)
        {
            // uses the table under its declared name, boundsOffsetTable
            const float3 newCenter = center + boundsOffsetTable[child]*radius;

            // Only the child cell that strictly contains the point contributes bits.
            if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x &&
               newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y &&
               newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
            {
                if(pt.x>newCenter.x) res = setBit(res,3*l);
                if(pt.y>newCenter.y) res = setBit(res,3*l+1);
                if(pt.z>newCenter.z) res = setBit(res,3*l+2);
            }
        }
    }

    //Save the result
    code[i] = res;
}
Rules to optimize:
Avoid global memory (you were using "code" directly from global memory; I changed that). You should see a 3x increase in performance now.
Avoid ifs; use "select" instead where possible. (See the OpenCL documentation.)
Use more memory inside the kernel. You don't need to operate at bit level. Operating at int level would be better and could avoid a huge number of calls to "setBit". Then you can construct your result at the end.
Another interesting thing is that, since you are operating in 3D, you can just use float3 variables and compute the distances with OpenCL operators. This can increase your performance quite a lot, but it also requires a complete rewrite of your kernel.

Higher radix (or better) formulation for Stockham FFT

I've implemented this algorithm from Microsoft Research for a radix-2 FFT (Stockham auto sort) using OpenCL.
I use floating point textures (256 cols X N rows) for input and output in the kernel, because I will need to sample at non-integral points and I thought it better to delegate that to the texture sampling hardware. Note that my FFTs are always of 256-point sequences (every row in my texture). At this point, my N is 16384 or 32768 depending on the GPU i'm using and the max 2D texture size allowed.
I also need to perform the FFT of 4 real-valued sequences at once, so the kernel performs the FFT(a, b, c, d) as FFT(a + ib, c + id) from which I can extract the 4 complex sequences out later using an O(n) algorithm. I can elaborate on this if someone wishes - but I don't believe it falls in the scope of this question.
Kernel Source
// One radix-2 Stockham pass. Each work-item (x, y) reads two texels of row y
// from `input`, combines them with a twiddle factor and writes texel (x, y)
// of `output`.
//
// fftSize: sub-transform length of this pass (must be at least 2, since
//          fftSize / 2 is used as a modulus)
// size:    row length; the second operand is read size / 2 texels after the first
//
// Each float4 is treated as two complex values (xy and zw) that receive the
// same twiddle factor.
__kernel void FFT_Stockham(read_only image2d_t input, write_only image2d_t output, int fftSize, int size)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int halfFft = fftSize / 2;
    // x is never negative, so integer division gives the same block index as
    // floor(x / (float)fftSize) without the round trip through float.
    const int b = (x / fftSize) * halfFft;
    const int offset = x % halfFft;
    const int x0 = b + offset;
    const int x1 = x0 + (size / 2);

    const float4 val0 = read_imagef(input, fftSampler, (int2)(x0, y));
    const float4 val1 = read_imagef(input, fftSampler, (int2)(x1, y));

    // twiddle angle: -2*pi * x / fftSize
    const float angle = -6.283185f * (convert_float(x) / convert_float(fftSize));
    // TODO: Convert the two calculations below into lookups from a __constant buffer
    const float tA = native_cos(angle);
    const float tB = native_sin(angle);

    // Complex multiply (tA + i*tB) * val1 for both packed values:
    //   re = tA*re1 - tB*im1,  im = tB*re1 + tA*im1
    const float4 coeffs1 = (float4)(tA, tB, tA, tB);
    const float4 coeffs2 = (float4)(-tB, tA, -tB, tA);
    const float4 result = val0 + coeffs1 * val1.xxzz + coeffs2 * val1.yyww;

    write_imagef(output, (int2)(x, y), result);
}
The host code simply invokes this kernel log2(256) times, ping-ponging the input and output textures.
Note: I tried removing the native_cos and native_sin to see if that impacted timing, but it doesn't seem to change things by very much. Not the factor I'm looking for, in any case.
Access pattern
Knowing that I am probably memory-bandwidth bound, here is the memory access pattern (per-row) for my radix-2 FFT.
X0 - element 1 to combine (read)
X1 - element 2 to combine (read)
X - element to write to (write)
So my question is - can someone help me with/point me toward a higher-radix formulation for this algorithm? I ask because most FFTs are optimized for large cases and single real/complex valued sequences. Their kernel generators are also very case dependent and break down quickly when I try to muck with their internals.
Are there other options better than simply going to a radix-8 or 16 kernel?
Some of my constraints are - I have to use OpenCL (no cuFFT). I also cannot use clAmdFft from ACML for this purpose. It would be nice to also talk about CPU optimizations (this kernel SUCKS big time on the CPU) - but getting it to run in fewer iterations on the GPU is my main use-case.
Thanks in advance for reading through all this and trying to help!
I tried several versions, but the one with the best performance on CPU and GPU was a radix-16 kernel for my specific case.
Here is the kernel for reference. It was taken from Eric Bainville's (most excellent) website and used with full attribution.
// #define M_PI 3.14159265358979f
//Global size is x.Length/2, Scale = 1 for direct, 1/N to inverse (iFFT)
__kernel void ConjugateAndScale(__global float4* x, const float Scale)
{
    const int i = get_global_id(0);
    // Multiplying by (s, -s, s, -s) scales every component and flips the sign
    // of components 1 and 3 -- the imaginary parts when each float4 holds two
    // complex values as (re, im, re, im).
    x[i] *= (float4)(Scale, -Scale, Scale, -Scale);
}
// Return a*EXP(-I*PI*1/2) = a*(-I)
float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
// Return a^2
float2 sqr_1(float2 a)
{ return (float2)(a.x*a.x-a.y*a.y,2.0f*a.x*a.y); }
// Return the 2x DFT2 of the four complex numbers in A
// If A=(a,b,c,d) then return (a',b',c',d') where (a',c')=DFT2(a,c)
// and (b',d')=DFT2(b,d).
float8 dft2_4(float8 a) { return (float8)(a.lo+a.hi,a.lo-a.hi); }
// Return the DFT of 4 complex numbers in A
float8 dft4_4(float8 a)
{
    // First stage: 2x DFT2 on the pairs (a0,a2) and (a1,a3).
    const float8 x = dft2_4(a);

    // Reorder to (x0, x2, x1, -i*x3): the last term gets its twiddle factor
    // before the second stage.
    const float8 shuffled = (float8)(x.lo.lo, x.hi.lo, x.lo.hi, mul_p1q2(x.hi.hi));

    // Second stage: 2x DFT2 on the reordered values.
    return dft2_4(shuffled);
}
// Complex product, multiply vectors of complex numbers
#define MUL_RE(a,b) (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a,b) (a.even*b.odd + a.odd*b.even)
float2 mul_1(float2 a, float2 b)
{ float2 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_1_F4(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_2(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
// Return the DFT2 of the two complex numbers in vector A
float4 dft2_2(float4 a) { return (float4)(a.lo+a.hi,a.lo-a.hi); }
// Return cos(alpha)+I*sin(alpha) (3 variants)
float2 exp_alpha_1(float alpha)
{
    // Alternative implementations, kept for experimentation:
    //   sn = sincos(alpha,&cs);                          // sincos
    //   cs = native_cos(alpha); sn = native_sin(alpha);  // native sin+cos
    const float cs = cos(alpha); // real part
    const float sn = sin(alpha); // imaginary part
    return (float2)(cs,sn);
}
// Return cos(alpha)+I*sin(alpha) (3 variants)
float4 exp_alpha_1_F4(float alpha)
{
    // Alternative implementations, kept for experimentation:
    //   sn = sincos(alpha,&cs);                          // sincos
    //   cs = native_cos(alpha); sn = native_sin(alpha);  // native sin+cos
    const float cs = cos(alpha); // real part
    const float sn = sin(alpha); // imaginary part
    // The same factor twice, for the two complex values packed in a float4.
    return (float4)(cs,sn,cs,sn);
}
// mul_p*q*(a) returns a*EXP(-I*PI*P/Q)
#define mul_p0q1(a) (a)
#define mul_p0q2 mul_p0q1
//float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
__constant float SQRT_1_2 = 0.707106781186548; // cos(Pi/4)
#define mul_p0q4 mul_p0q2
float2 mul_p1q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y); }
#define mul_p2q4 mul_p1q2
float2 mul_p3q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y); }
__constant float COS_8 = 0.923879532511287; // cos(Pi/8)
__constant float SIN_8 = 0.382683432365089; // sin(Pi/8)
#define mul_p0q8 mul_p0q4
float2 mul_p1q8(float2 a) { return mul_1((float2)(COS_8,-SIN_8),a); }
#define mul_p2q8 mul_p1q4
float2 mul_p3q8(float2 a) { return mul_1((float2)(SIN_8,-COS_8),a); }
#define mul_p4q8 mul_p2q4
float2 mul_p5q8(float2 a) { return mul_1((float2)(-SIN_8,-COS_8),a); }
#define mul_p6q8 mul_p3q4
float2 mul_p7q8(float2 a) { return mul_1((float2)(-COS_8,-SIN_8),a); }
// Compute in-place DFT2 and twiddle
#define DFT2_TWIDDLE(a,b,t) { float2 tmp = t(a-b); a += b; b = tmp; }
// T = N/16 = number of threads.
// P is the length of input sub-sequences, 1,16,256,...,N/16.
// One radix-16 pass: each work-item loads 16 float4 inputs spaced t apart,
// applies twiddle factors and stores 16 outputs spaced p apart.
// NOTE(review): braces are missing in this listing, and the three in-place
// "DFT2 and twiddle" stages announced by the comments below have no
// statements here -- as shown, only the twiddle step and the final
// DFT2/store are present.
__kernel void FFT_Radix16(__global const float4 * x, __global float4 * y, int pp)
int p = pp;
int t = get_global_size(0); // number of threads
int i = get_global_id(0); // current thread
////// y[i] = 2*x[i];
////// return;
// the mask below only yields i mod p when p is a power of two
int k = i & (p-1); // index in input sequence, in 0..P-1
// Inputs indices are I+{0,..,15}*T
x += i;
// Output indices are J+{0,..,15}*P, where
// J is I with four 0 bits inserted at bit log2(P)
y += ((i-k)<<4) + k;
// Load
float4 u[16];
for (int m=0;m<16;m++) u[m] = x[m*t];
// Twiddle: u[m] is multiplied by exp(-I*PI*m*K/(8*P)) for m = 1..15
// (m = 0 would be a factor of 1 and is skipped).
// NOTE(review): the `#define M_PI` at the top of this listing is commented
// out, so M_PI must come from elsewhere -- presumably the double-precision
// built-in; confirm, or use the float constant M_PI_F
float alpha = -M_PI*(float)k/(float)(8*p);
for (int m=1;m<16;m++) u[m] = mul_1_F4(exp_alpha_1_F4(m * alpha), u[m]);
// 8x in-place DFT2 and twiddle (1)
// 8x in-place DFT2 and twiddle (2)
// 8x in-place DFT2 and twiddle (3)
// 8x DFT2 and store (reverse binary permutation)
y[0] = u[0] + u[1];
y[p] = u[8] + u[9];
y[2*p] = u[4] + u[5];
y[3*p] = u[12] + u[13];
y[4*p] = u[2] + u[3];
y[5*p] = u[10] + u[11];
y[6*p] = u[6] + u[7];
y[7*p] = u[14] + u[15];
y[8*p] = u[0] - u[1];
y[9*p] = u[8] - u[9];
y[10*p] = u[4] - u[5];
y[11*p] = u[12] - u[13];
y[12*p] = u[2] - u[3];
y[13*p] = u[10] - u[11];
y[14*p] = u[6] - u[7];
y[15*p] = u[14] - u[15];
Note that I have modified the kernel to perform the FFT of 2 complex-valued sequences at once instead of one. Also, since I only need the FFT of 256 elements at a time in a much larger sequence, I perform only 2 runs of this kernel, which leaves me with 256-length DFTs in the larger array.
Here's some of the relevant host code as well.
// Host side: enqueues the FFT kernel once per pass, multiplying fftSize by 16
// each time and swapping the two buffers between passes.
// NOTE(review): the loop braces are missing in this listing and `iter` is
// never incremented in the lines shown, so the extent of the loop body and
// the event chaining cannot be confirmed from here.
// ev receives the event of the pass being enqueued, pEv holds the previous one
var ev = new[] { new Cl.Event() };
var pEv = new[] { new Cl.Event() };
int fftSize = 1;
int iter = 0;
int n = distributionSize >> 5;
while (fftSize <= n)
// arguments: input buffer, output buffer, current sub-sequence length
Cl.SetKernelArg(fftKernel, 0, memA);
Cl.SetKernelArg(fftKernel, 1, memB);
Cl.SetKernelArg(fftKernel, 2, fftSize);
// the first pass has no wait list; later passes wait on pEv
Cl.EnqueueNDRangeKernel(commandQueue, fftKernel, 1, null, globalWorkgroupSize, localWorkgroupSize,
(uint)(iter == 0 ? 0 : 1),
iter == 0 ? null : pEv,
out ev[0]).Check();
if (iter > 0)
Swap(ref ev, ref pEv);
Swap(ref memA, ref memB); // ping-pong
// next pass works on sub-sequences 16 times as long
fftSize = fftSize << 4;
Swap(ref memA, ref memB);
Hope this helps someone!
