Related
I'm doing a control engineering project, implementing a PID motor position control for automatic antenna tracking system. The system contain a dc motor, absolute encoder, and a motor driver.
Everything work as expected, but one thing. The motor cannot stop at set point value near 0 degree (350 - 359, 0 - 10 degree). The used code:
#include <PID_v1.h>
int RPWM = 5;
int LPWM = 6;
int L_EN = 7;
int R_EN = 8;
boolean pin_state[10];
byte input_pin[] = {1, 2, 3, 4, 9, 10, 11, 12, 13};
int dec_position = 0;
int dc = 0;
double kp = 50, ki = 45, kd = 2;
double input = 0, output = 0, setpoint = 0;
volatile long encoderPos = 0;
PID myPID(&input, &output, &setpoint, kp, ki, kd, DIRECT);
void setup() {
pinMode(L_EN, OUTPUT);
pinMode(R_EN, OUTPUT);
pinMode(RPWM, OUTPUT);
pinMode(LPWM, OUTPUT);
for (byte i = 0; i < 9; i++) {
pinMode(input_pin[i], INPUT);
}
TCCR1B = TCCR1B & 0b11111000 | 1;
myPID.SetMode(AUTOMATIC);
myPID.SetSampleTime(1);
myPID.SetOutputLimits(-255, 255);
digitalWrite(L_EN, HIGH);
digitalWrite(R_EN, HIGH);
}
void loop() {
if (Serial.available() > 0) {
String baca = Serial.readString();
setpoint = baca.toInt();
}
ReadEncoder();
input = dc;
myPID.Compute();
pwmOut(output);
}
void pwmOut(int out) {
if (out > 0) {
analogWrite(RPWM, out);//Sets speed variable via PWM
}
else {
analogWrite(LPWM, abs(out));//Sets speed variable via PWM
}
}
void ReadEncoder() {
// FOR READING ENCODER POSITION, GIVING 0-359 OUTPUT CORRESPOND TO THE ENCODER POSITION
for (byte i = 0; i < 9; i++) {
pin_state[i] = !(digitalRead(input_pin[i]));
}
dec_position = (pin_state[8] * 256) + (pin_state[7] * 128) + (pin_state[6] * 64) + (pin_state[5] * 32) + (pin_state[4] * 16) + (pin_state[3] * 8) + (pin_state[2] * 4) + (pin_state[1] * 2) + pin_state[0];
dc = map(dec_position, 0, 500, 0, 360);
}
When the set point is a value between 10 - 350 the sytem worked well. But when it is not, the motor never stop rotating.
I know the problem is due to a little position overshoot cause the encoder to read a very large error.
For instance, if the setpoint is 0 degree, the motor rotate to reach it. Motor rotation is slowing down as its "now" position approaching 0 degree, but the system is not overshoot free. Therefore, even 1 degree overshoot cause the error value is -359 (set point - now position) and the motor rotate again to reach the desired position.
Need help how to overcome this problem. Sorry for bad english.
Here's the solution
double error;
if (SP>PV) {
if (abs(SP-PV) < abs(-360 + SP - PV)) error = SP - PV;
else error = -360 + SP - PV;
}
else{
if(abs(SP-PV)< abs(360 - PV + SP)) error = SP - PV;
else error = 360 - SP + PV;
}
Instead off simple present value minus the set point for error. The code above return shortest path from present value to set point.
i didn't read your code yet. However, to reach "set point" (SV) you should give for "present value" (PV) an error allowance (EA)
For example: EA = SV - PV.
If EA = (-2,+2)degree then it reach "now position"
And, you should not use degree for angle, you should convert it to position (calculate by pulse)
Hope that concept can help you.
I was working on a project where I get analog values from a resistive touchscreen and turn them into intersection points.
Here is an example:
Here is my code for the data collection using an Arduino Uno and construction of the points using tool called processing.
#define side1 2
#define side2 3
#define side3 4
#define side4 5
#define contact A0
void setup() {
pinMode(contact, INPUT);
pinMode(side1, OUTPUT);
pinMode(side2, OUTPUT);
pinMode(side3, OUTPUT);
pinMode(side4, OUTPUT);
Serial.begin(9600);
}
void loop() {
int sensorValue1;
int sensorValue2;
int sensorValue3;
int sensorValue4;
// SENSOR VALUE 1:
digitalWrite(side1, LOW);
digitalWrite(side2, HIGH);
digitalWrite(side3, HIGH);
digitalWrite(side4, HIGH);
delay(5);
for (int i = 0; i < 10; i++){
sensorValue1 = analogRead(contact);
}
// SENSOR VALUE 2:
digitalWrite(side2, LOW);
digitalWrite(side3, HIGH);
digitalWrite(side4, HIGH);
digitalWrite(side1, HIGH);
delay(5);
for (int i = 0; i < 10; i++){
sensorValue2 = analogRead(contact);
}
// SENSOR VALUE 3:
digitalWrite(side3, LOW);
digitalWrite(side2, HIGH);
digitalWrite(side4, HIGH);
digitalWrite(side1, HIGH);
delay(5);
for (int i = 0; i < 10; i++){
sensorValue3 = analogRead(contact);
}
// SENSOR VALUE 2:
digitalWrite(side4, LOW);
digitalWrite(side3, HIGH);
digitalWrite(side2, HIGH);
digitalWrite(side1, HIGH);
delay(5);
for (int i = 0; i < 10; i++){
sensorValue4 = analogRead(contact);
}
Serial.print(sensorValue1);
Serial.print(",");
Serial.print(sensorValue2);
Serial.print(",");
Serial.print(sensorValue3);
Serial.print(",");
Serial.print(sensorValue4);
Serial.println();
}
This is the Processing code for the construction of the graph.
import processing.serial.*;
Serial myPort; // The serial port
int maxNumberOfSensors = 4;
float[] sensorValues = new float[maxNumberOfSensors];
float sensorValueX;
float sensorValueX1;
float sensorValueY;
float sensorValueY1;
int scaleValue = 2;
void setup () {
size(600, 600); // set up the window to whatever size you want
//println(Serial.list()); // List all the available serial ports
String portName = "COM5";
myPort = new Serial(this, portName, 9600);
myPort.clear();
myPort.bufferUntil('\n'); // don't generate a serialEvent() until you get a newline (\n) byte
background(255); // set inital background
smooth(); // turn on antialiasing
}
void draw () {
//background(255);
//noFill();
fill(100,100,100,100);
ellipse(height,0, scaleValue*sensorValues[0], scaleValue*sensorValues[0]);
ellipse(0,width, scaleValue*sensorValues[1], scaleValue*sensorValues[1]);
ellipse(height,width, scaleValue*sensorValues[2], scaleValue*sensorValues[2]);
ellipse(0,0, scaleValue*sensorValues[3], scaleValue*sensorValues[3]);
//ellipse(sensorValueY, sensorValueX, 10,10);
//println(sensorValueY,sensorValueX);
sensorValueX = ((sensorValues[3]*sensorValues[3])-(sensorValues[2]*sensorValues[2])+600*600)/2000;
sensorValueX1 = ((sensorValues[0]*sensorValues[0])-(sensorValues[1]*sensorValues[1])+600*600)/2000;
sensorValueY = ((sensorValues[3]*sensorValues[3])-(sensorValues[2]*sensorValues[2])+(600*600))/2000;
sensorValueY1 = ((sensorValues[1]*sensorValues[1])-(sensorValues[0]*sensorValues[0])+(600*600))/2000;
line(0, scaleValue*sensorValueX, height,scaleValue* sensorValueX);
line(scaleValue*sensorValueY, 0, scaleValue*sensorValueY, width);
ellipse(scaleValue*sensorValueY, scaleValue*sensorValueX, 20,20);
line(0, scaleValue*sensorValueX1, height,scaleValue* sensorValueX1);
line(scaleValue*sensorValueY1, 0, scaleValue*sensorValueY1, width);
ellipse(scaleValue*sensorValueY1, scaleValue*sensorValueX1, 20,20);
println(scaleValue*sensorValueX,scaleValue*sensorValueY);
}
void serialEvent (Serial myPort) {
String inString = myPort.readStringUntil('\n'); // get the ASCII string
if (inString != null) { // if it's not empty
inString = trim(inString); // trim off any whitespace
int incomingValues[] = int(split(inString, ",")); // convert to an array of ints
if (incomingValues.length <= maxNumberOfSensors && incomingValues.length > 0) {
for (int i = 0; i < incomingValues.length; i++) {
// map the incoming values (0 to 1023) to an appropriate gray-scale range (0-255):
sensorValues[i] = map(incomingValues[i], 0, 1023, 0, width);
//println(incomingValues[i]+ " " + sensorValues[i]);
}
}
}
}
I was wondering how I could convert the intersection of those points to a coordinate? Example: in the image, I showed you, I set the parameters for the dimensions to be (600,600). Is it possible to change that intersection are to a coordinate value? Currently, my code is printing out coordinates however they are diagonals such at the x and y values are equal. I want the coordinates of x and y to have different quantities so that I can get coordinates for different sides in the square. Can somebody help?
By reading your code I'm assuming that you know the position of all n sensors and the distance from each n sensor to a target. So what you're essentially trying to do is trilateration (as mentioned by Nico Schertler). In other words determining a relative position based on the distance between n points.
Just a quick definition note in case of confusion:
Triangulation = Working with angles
Trilateration = Working with distances
Trilateration requires at least 3 points and distances.
1 sensor gives you the distance the target is away from the sensor
2 sensors gives you 2 possible locations the target can be
3 sensors tells you which of the 2 locations the target is at
The first solution that probably comes to mind is calculating the intersections
between 3 sensors treating them as circles. Given that there might be some error in the distances this means that the circles might not always intersect. Which rules out this solution.
The following code has all been done in Processing.
I took the liberty of making a class Sensor.
class Sensor {
public PVector p; // position
public float d; // distance from sensor to target (radius of the circle)
public Sensor(float x, float y) {
this.p = new PVector(x, y);
this.d = 0;
}
}
Now to calculate and approximate the intersection point between the sensors/circles, do the following:
PVector trilateration(Sensor s1, Sensor s2, Sensor s3) {
PVector s = PVector.sub(s2.p, s1.p).div(PVector.sub(s2.p, s1.p).mag());
float a = s.dot(PVector.sub(s3.p, s1.p));
PVector t = PVector.sub(s3.p, s1.p).sub(PVector.mult(s, a)).div(PVector.sub(s3.p, s1.p).sub(PVector.mult(s, a)).mag());
float b = t.dot(PVector.sub(s3.p, s1.p));
float c = PVector.sub(s2.p, s1.p).mag();
float x = (sq(s1.d) - sq(s2.d) + sq(c)) / (c * 2);
float y = ((sq(s1.d) - sq(s3.d) + sq(a) + sq(b)) / (b * 2)) - ((a / b) * x);
s.mult(x);
t.mult(y);
return PVector.add(s1.p, s).add(t);
}
Where s1, s2, s3 is any of your 3 sensors, do the following to calculate the the intersection point between the given sensors:
PVector target = trilateration(s1, s2, s3);
While it is possible to calculate the intersection between any amount of sensors. It becomes more and more complex the more sensors you want to include. Especially since you're doing it yourself. If you're able to use external Java libraries, then it would be a lot easier.
If you're able to use external Java libraries, then I highly recommend using com.lemmingapex.trilateration. Then you'd be able to calculate the intersection point between 4 sensors by doing:
Considering s1, s2, s3, s4 as instances of the previously mentioned class Sensor.
double[][] positions = new double[][] { { s1.x, s1.y }, { s2.x, s2.y }, { s3.x, s3.y }, { s4.x, s4.y } };
double[] distances = new double[] { s1.d, s2.d, s3.d, s4.d };
NonLinearLeastSquaresSolver solver = new NonLinearLeastSquaresSolver(
new TrilaterationFunction(positions, distances),
new LevenbergMarquardtOptimizer());
Optimum optimum = solver.solve();
double[] target = optimum.getPoint().toArray();
double x = target[0];
double y = target[1];
The following examples, are examples of the trilateration() method I wrote and not an example of the library above.
Example 1 - No Sensor Error
The 3 big circles being any 3 sensors and the single red circle being the approximated point.
Example 2 - With Sensor Error
The 3 big circles being any 3 sensors and the single red circle being the approximated point.
What you need to compute is the point that it nearest to the a set of circles,
let denote their centers by (x1,y1), (x2,y2), (x3,y3), (x4,y4) and their radii by r1,r2,r3,r4.
You want to find (x,y) that minimizes
F(x,y) = Sum_i [ square( d2( (x,y), (xi,yi)) - ri) ]
This can be achieved by using Newton's algorithm. Newton's algorithm works from an "initial guess" (let's say at the center of the screen), improved iteratively by solving a series of linear systems (in this case, with 2 variables, easy to solve).
M P = -G
where M is the (2x2) matrix of the second order derivatives of F with respect to x and y (called the Hessian), and G the vector of the first order derivatives of F with
respect to x and y (the gradient). This gives the "update" vector P, that tells how to move the coordinates:
Then (x,y) is updated by x = x + Px, y = y + Py, and so on and so forth (recompute M and G, solve for P, update x and y, recompute M and G, solve for P, update x and y). In your case it will probably converge in a handful of iterations.
Since you got two variables only, the 2x2 linear solve is trivial, and the expression of F and its derivatives is simple, thus you can implement it without needing an external library.
Note1: the Levenberg-Marquardt algorithm mentioned in the other answer is a variant of Newton's algorithm (specialized for sum of squares, like here, and that neglects some terms, and that regularizes the matrix M by adding small numbers to its diagonal coefficients). More on this here.
Note2: a simple gradient descent will also probably work (a bit simpler to implement, since it only uses first order derivatives), but given that you only got two variables to implement, the 2x2 linear solve is trivial, so Newton is probably worth it (requires a much much smaller number of iterations for convergence, may be critial if your system is interactive).
I'm new in open cl. And tried as my first work to write code that checks intersection between many polylines to single polygon.
I'm running the code in both cpu and gpu.. and get different results.
First I sent NULL as local parameter when called clEnqueueNDRangeKernel.
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, null, 2, &evtCalcBounds, &evtKernel);
After trying many things i saw that if i send 1 as local it is working good. and returning the same results for the cpu and gpu.
size_t local = 1;
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, &local, 2, &evtCalcBounds, &evtKernel);
Played abit more and found that the cpu returns false result when i run the kernel with local 8 or more (for some reason).
I'm not using any local memory, just globals and privates.
I didn't added the code because i think it is irrelevant to the problem (note that for single work group it is working good), and it is long. If it is needed, i will try to simplify it.
The code flow is going like this:
I have polylines coordinates stored in a big buffer. and the single polygon in another. In addition i'm providing another buffer with single int that holds the current results count. All buffers are __global arguments.
In the kernel i'm simply checking intersection between all the lines of the "polyline[get_global(0)]" with the lines of the polygon. If true,
i'm using atomic_inc for the results count. There is no read and write memory from the same buffer, no barriers or mem fences,... the atomic_inc is the only thread safe mechanism i'm using.
-- UPDATE --
Added my code:
I know that i can maybe have better use of open cl functions for calculating some vectors, but for now, i'm simply convert code from my old regular CPU single threaded program to CL. so this is not my concern now.
bool isPointInPolygon(float x, float y, __global float* polygon) {
bool blnInside = false;
uint length = convert_uint(polygon[4]);
int s = 5;
uint j = length - 1;
for (uint i = 0; i < length; j = i++) {
uint realIdx = s + i * 2;
uint realInvIdx = s + j * 2;
if (((polygon[realIdx + 1] > y) != (polygon[realInvIdx + 1] > y)) &&
(x < (polygon[realInvIdx] - polygon[realIdx]) * (y - polygon[realIdx + 1]) / (polygon[realInvIdx + 1] - polygon[realIdx + 1]) + polygon[realIdx]))
blnInside = !blnInside;
}
return blnInside;
}
bool isRectanglesIntersected(float p_dblMinX1, float p_dblMinY1,
float p_dblMaxX1, float p_dblMaxY1,
float p_dblMinX2, float p_dblMinY2,
float p_dblMaxX2, float p_dblMaxY2) {
bool blnResult = true;
if (p_dblMinX1 > p_dblMaxX2 ||
p_dblMaxX1 < p_dblMinX2 ||
p_dblMinY1 > p_dblMaxY2 ||
p_dblMaxY1 < p_dblMinY2) {
blnResult = false;
}
return blnResult;
}
bool isLinesIntersects(
double Ax, double Ay,
double Bx, double By,
double Cx, double Cy,
double Dx, double Dy) {
double distAB, theCos, theSin, newX, ABpos;
// Fail if either line is undefined.
if (Ax == Bx && Ay == By || Cx == Dx && Cy == Dy)
return false;
// (1) Translate the system so that point A is on the origin.
Bx -= Ax; By -= Ay;
Cx -= Ax; Cy -= Ay;
Dx -= Ax; Dy -= Ay;
// Discover the length of segment A-B.
distAB = sqrt(Bx*Bx + By*By);
// (2) Rotate the system so that point B is on the positive X axis.
theCos = Bx / distAB;
theSin = By / distAB;
newX = Cx*theCos + Cy*theSin;
Cy = Cy*theCos - Cx*theSin; Cx = newX;
newX = Dx*theCos + Dy*theSin;
Dy = Dy*theCos - Dx*theSin; Dx = newX;
// Fail if the lines are parallel.
return (Cy != Dy);
}
bool isPolygonInersectsPolyline(__global float* polygon, __global float* polylines, uint startIdx) {
uint polylineLength = convert_uint(polylines[startIdx]);
uint start = startIdx + 1;
float x1 = polylines[start];
float y1 = polylines[start + 1];
float x2;
float y2;
int polygonLength = convert_uint(polygon[4]);
int polygonLength2 = polygonLength * 2;
int startPolygonIdx = 5;
for (int currPolyineIdx = 0; currPolyineIdx < polylineLength - 1; currPolyineIdx++)
{
x2 = polylines[start + (currPolyineIdx*2) + 2];
y2 = polylines[start + (currPolyineIdx*2) + 3];
float polyX1 = polygon[0];
float polyY1 = polygon[1];
for (int currPolygonIdx = 0; currPolygonIdx < polygonLength; ++currPolygonIdx)
{
float polyX2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 2) % polygonLength2];
float polyY2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 3) % polygonLength2];
if (isLinesIntersects(x1, y1, x2, y2, polyX1, polyY1, polyX2, polyY2)) {
return true;
}
polyX1 = polyX2;
polyY1 = polyY2;
}
x1 = x2;
y1 = y2;
}
// No intersection found till now so we check containing
return isPointInPolygon(x1, y1, polygon);
}
__kernel void calcIntersections(__global float* polylines, // My flat points array - [pntCount, x,y,x,y,...., pntCount, x,y,... ]
__global float* pBounds, // The rectangle bounds of each polyline - set of 4 values [top, left, bottom, right....]
__global uint* pStarts, // The start index of each polyline in the polylines array
__global float* polygon, // The polygon i want to intersect with - first 4 items are the rectangle bounds [top, left, bottom, right, pntCount, x,y,x,y,x,y....]
__global float* output, // Result array for saving the intersections polylines indices
__global uint* resCount) // The result count
{
int i = get_global_id(0);
uint start = convert_uint(pStarts[i]);
if (isRectanglesIntersected(pBounds[i * 4], pBounds[i * 4 + 1], pBounds[i * 4 + 2], pBounds[i * 4 + 3],
polygon[0], polygon[1], polygon[2], polygon[3])) {
if (isPolygonInersectsPolyline(polygon, polylines, start)){
int oldVal = atomic_inc(resCount);
output[oldVal] = i;
}
}
}
Can anyone explain it to me ?
I am trying to implement an angular constraint into a simple verlet integration based 2D physic engine. This is the code I am currently using:
int indexA = physicAngularConstraint[i].indexA;
int indexB = physicAngularConstraint[i].indexB;
int indexC = physicAngularConstraint[i].indexC;
CGPoint e = CGPointSubtract(physicParticle[indexB].pos,physicParticle[indexA].pos);
CGPoint f = CGPointSubtract(physicParticle[indexC].pos,physicParticle[indexB].pos);
float dot = CGPointDot(e, f);
float cross = CGPointCross(e, f);
float angle = atan2f(cross, dot);
float da = (angle < physicAngularConstraint[i].minAngle)? angle - physicAngularConstraint[i].minAngle : (angle > physicAngularConstraint[i].maxAngle)? angle - physicAngularConstraint[i].maxAngle : 0.0f;
if (da != 0.0f)
{
physicParticle[indexA].pos = CGPointRotate(physicParticle[indexA].pos,
physicParticle[indexB].pos, da);
physicParticle[indexC].pos = CGPointRotate(physicParticle[indexC].pos,
physicParticle[indexB].pos, -da);
}
The CGPointRotate function looks like this:
CGPoint CGPointRotate(CGPoint pt, CGPoint center, float angle)
{
CGPoint ret;
pt = CGPointSubtract(pt, center);
float co = cosf(angle);
float si = sinf(angle);
ret.x = pt.x * co - pt.y * si;
ret.y = pt.x * si + pt.y * co;
ret = CGPointAdd(ret, center);
return ret;
}
I am testing this implementation with a row of particles which are connected through distant constraints. Without the angular constraints they act like a rope. I am trying to give the "rope" some stiffness through the angular constraints. But my implementation above is gaining energy and blowing up the system after some millisecs. Why does this constrain implementation is gaining energie?
Background
I've implemented this algorithm from Microsoft Research for a radix-2 FFT (Stockham auto sort) using OpenCL.
I use floating point textures (256 cols X N rows) for input and output in the kernel, because I will need to sample at non-integral points and I thought it better to delegate that to the texture sampling hardware. Note that my FFTs are always of 256-point sequences (every row in my texture). At this point, my N is 16384 or 32768 depending on the GPU i'm using and the max 2D texture size allowed.
I also need to perform the FFT of 4 real-valued sequences at once, so the kernel performs the FFT(a, b, c, d) as FFT(a + ib, c + id) from which I can extract the 4 complex sequences out later using an O(n) algorithm. I can elaborate on this if someone wishes - but I don't believe it falls in the scope of this question.
Kernel Source
const sampler_t fftSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void FFT_Stockham(read_only image2d_t input, write_only image2d_t output, int fftSize, int size)
{
int x = get_global_id(0);
int y = get_global_id(1);
int b = floor(x / convert_float(fftSize)) * (fftSize / 2);
int offset = x % (fftSize / 2);
int x0 = b + offset;
int x1 = x0 + (size / 2);
float4 val0 = read_imagef(input, fftSampler, (int2)(x0, y));
float4 val1 = read_imagef(input, fftSampler, (int2)(x1, y));
float angle = -6.283185f * (convert_float(x) / convert_float(fftSize));
// TODO: Convert the two calculations below into lookups from a __constant buffer
float tA = native_cos(angle);
float tB = native_sin(angle);
float4 coeffs1 = (float4)(tA, tB, tA, tB);
float4 coeffs2 = (float4)(-tB, tA, -tB, tA);
float4 result = val0 + coeffs1 * val1.xxzz + coeffs2 * val1.yyww;
write_imagef(output, (int2)(x, y), result);
}
The host code simply invokes this kernel log2(256) times, ping-ponging the input and output textures.
Note: I tried removing the native_cos and native_sin to see if that impacted timing, but it doesn't seem to change things by very much. Not the factor I'm looking for, in any case.
Access pattern
Knowing that I am probably memory-bandwidth bound, here is the memory access pattern (per-row) for my radix-2 FFT.
X0 - element 1 to combine (read)
X1 - element 2 to combine (read)
X - element to write to (write)
Question
So my question is - can someone help me with/point me toward a higher-radix formulation for this algorithm? I ask because most FFTs are optimized for large cases and single real/complex valued sequences. Their kernel generators are also very case dependent and break down quickly when I try to muck with their internals.
Are there other options better than simply going to a radix-8 or 16 kernel?
Some of my constraints are - I have to use OpenCL (no cuFFT). I also cannot use clAmdFft from ACML for this purpose. It would be nice to also talk about CPU optimizations (this kernel SUCKS big time on the CPU) - but getting it to run in fewer iterations on the GPU is my main use-case.
Thanks in advance for reading through all this and trying to help!
I tried several versions, but the one with the best performance on CPU and GPU was a radix-16 kernel for my specific case.
Here is the kernel for reference. It was taken from Eric Bainville's (most excellent) website and used with full attribution.
// #define M_PI 3.14159265358979f
//Global size is x.Length/2, Scale = 1 for direct, 1/N to inverse (iFFT)
__kernel void ConjugateAndScale(__global float4* x, const float Scale)
{
int i = get_global_id(0);
float temp = Scale;
float4 t = (float4)(temp, -temp, temp, -temp);
x[i] *= t;
}
// Return a*EXP(-I*PI*1/2) = a*(-I)
float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
// Return a^2
float2 sqr_1(float2 a)
{ return (float2)(a.x*a.x-a.y*a.y,2.0f*a.x*a.y); }
// Return the 2x DFT2 of the four complex numbers in A
// If A=(a,b,c,d) then return (a',b',c',d') where (a',c')=DFT2(a,c)
// and (b',d')=DFT2(b,d).
float8 dft2_4(float8 a) { return (float8)(a.lo+a.hi,a.lo-a.hi); }
// Return the DFT of 4 complex numbers in A
float8 dft4_4(float8 a)
{
// 2x DFT2
float8 x = dft2_4(a);
// Shuffle, twiddle, and 2x DFT2
return dft2_4((float8)(x.lo.lo,x.hi.lo,x.lo.hi,mul_p1q2(x.hi.hi)));
}
// Complex product, multiply vectors of complex numbers
#define MUL_RE(a,b) (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a,b) (a.even*b.odd + a.odd*b.even)
float2 mul_1(float2 a, float2 b)
{ float2 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_1_F4(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_2(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
// Return the DFT2 of the two complex numbers in vector A
float4 dft2_2(float4 a) { return (float4)(a.lo+a.hi,a.lo-a.hi); }
// Return cos(alpha)+I*sin(alpha) (3 variants)
float2 exp_alpha_1(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
//cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float2)(cs,sn);
}
// Return cos(alpha)+I*sin(alpha) (3 variants)
float4 exp_alpha_1_F4(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
// cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float4)(cs,sn,cs,sn);
}
// mul_p*q*(a) returns a*EXP(-I*PI*P/Q)
#define mul_p0q1(a) (a)
#define mul_p0q2 mul_p0q1
//float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
__constant float SQRT_1_2 = 0.707106781186548; // cos(Pi/4)
#define mul_p0q4 mul_p0q2
float2 mul_p1q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y); }
#define mul_p2q4 mul_p1q2
float2 mul_p3q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y); }
__constant float COS_8 = 0.923879532511287; // cos(Pi/8)
__constant float SIN_8 = 0.382683432365089; // sin(Pi/8)
#define mul_p0q8 mul_p0q4
float2 mul_p1q8(float2 a) { return mul_1((float2)(COS_8,-SIN_8),a); }
#define mul_p2q8 mul_p1q4
float2 mul_p3q8(float2 a) { return mul_1((float2)(SIN_8,-COS_8),a); }
#define mul_p4q8 mul_p2q4
float2 mul_p5q8(float2 a) { return mul_1((float2)(-SIN_8,-COS_8),a); }
#define mul_p6q8 mul_p3q4
float2 mul_p7q8(float2 a) { return mul_1((float2)(-COS_8,-SIN_8),a); }
// Compute in-place DFT2 and twiddle
#define DFT2_TWIDDLE(a,b,t) { float2 tmp = t(a-b); a += b; b = tmp; }
// T = N/16 = number of threads.
// P is the length of input sub-sequences, 1,16,256,...,N/16.
__kernel void FFT_Radix16(__global const float4 * x, __global float4 * y, int pp)
{
int p = pp;
int t = get_global_size(0); // number of threads
int i = get_global_id(0); // current thread
////// y[i] = 2*x[i];
////// return;
int k = i & (p-1); // index in input sequence, in 0..P-1
// Inputs indices are I+{0,..,15}*T
x += i;
// Output indices are J+{0,..,15}*P, where
// J is I with four 0 bits inserted at bit log2(P)
y += ((i-k)<<4) + k;
// Load
float4 u[16];
for (int m=0;m<16;m++) u[m] = x[m*t];
// Twiddle, twiddling factors are exp(_I*PI*{0,..,15}*K/4P)
float alpha = -M_PI*(float)k/(float)(8*p);
for (int m=1;m<16;m++) u[m] = mul_1_F4(exp_alpha_1_F4(m * alpha), u[m]);
// 8x in-place DFT2 and twiddle (1)
DFT2_TWIDDLE(u[0].lo,u[8].lo,mul_p0q8);
DFT2_TWIDDLE(u[0].hi,u[8].hi,mul_p0q8);
DFT2_TWIDDLE(u[1].lo,u[9].lo,mul_p1q8);
DFT2_TWIDDLE(u[1].hi,u[9].hi,mul_p1q8);
DFT2_TWIDDLE(u[2].lo,u[10].lo,mul_p2q8);
DFT2_TWIDDLE(u[2].hi,u[10].hi,mul_p2q8);
DFT2_TWIDDLE(u[3].lo,u[11].lo,mul_p3q8);
DFT2_TWIDDLE(u[3].hi,u[11].hi,mul_p3q8);
DFT2_TWIDDLE(u[4].lo,u[12].lo,mul_p4q8);
DFT2_TWIDDLE(u[4].hi,u[12].hi,mul_p4q8);
DFT2_TWIDDLE(u[5].lo,u[13].lo,mul_p5q8);
DFT2_TWIDDLE(u[5].hi,u[13].hi,mul_p5q8);
DFT2_TWIDDLE(u[6].lo,u[14].lo,mul_p6q8);
DFT2_TWIDDLE(u[6].hi,u[14].hi,mul_p6q8);
DFT2_TWIDDLE(u[7].lo,u[15].lo,mul_p7q8);
DFT2_TWIDDLE(u[7].hi,u[15].hi,mul_p7q8);
// 8x in-place DFT2 and twiddle (2)
DFT2_TWIDDLE(u[0].lo,u[4].lo,mul_p0q4);
DFT2_TWIDDLE(u[0].hi,u[4].hi,mul_p0q4);
DFT2_TWIDDLE(u[1].lo,u[5].lo,mul_p1q4);
DFT2_TWIDDLE(u[1].hi,u[5].hi,mul_p1q4);
DFT2_TWIDDLE(u[2].lo,u[6].lo,mul_p2q4);
DFT2_TWIDDLE(u[2].hi,u[6].hi,mul_p2q4);
DFT2_TWIDDLE(u[3].lo,u[7].lo,mul_p3q4);
DFT2_TWIDDLE(u[3].hi,u[7].hi,mul_p3q4);
DFT2_TWIDDLE(u[8].lo,u[12].lo,mul_p0q4);
DFT2_TWIDDLE(u[8].hi,u[12].hi,mul_p0q4);
DFT2_TWIDDLE(u[9].lo,u[13].lo,mul_p1q4);
DFT2_TWIDDLE(u[9].hi,u[13].hi,mul_p1q4);
DFT2_TWIDDLE(u[10].lo,u[14].lo,mul_p2q4);
DFT2_TWIDDLE(u[10].hi,u[14].hi,mul_p2q4);
DFT2_TWIDDLE(u[11].lo,u[15].lo,mul_p3q4);
DFT2_TWIDDLE(u[11].hi,u[15].hi,mul_p3q4);
// 8x in-place DFT2 and twiddle (3)
DFT2_TWIDDLE(u[0].lo,u[2].lo,mul_p0q2);
DFT2_TWIDDLE(u[0].hi,u[2].hi,mul_p0q2);
DFT2_TWIDDLE(u[1].lo,u[3].lo,mul_p1q2);
DFT2_TWIDDLE(u[1].hi,u[3].hi,mul_p1q2);
DFT2_TWIDDLE(u[4].lo,u[6].lo,mul_p0q2);
DFT2_TWIDDLE(u[4].hi,u[6].hi,mul_p0q2);
DFT2_TWIDDLE(u[5].lo,u[7].lo,mul_p1q2);
DFT2_TWIDDLE(u[5].hi,u[7].hi,mul_p1q2);
DFT2_TWIDDLE(u[8].lo,u[10].lo,mul_p0q2);
DFT2_TWIDDLE(u[8].hi,u[10].hi,mul_p0q2);
DFT2_TWIDDLE(u[9].lo,u[11].lo,mul_p1q2);
DFT2_TWIDDLE(u[9].hi,u[11].hi,mul_p1q2);
DFT2_TWIDDLE(u[12].lo,u[14].lo,mul_p0q2);
DFT2_TWIDDLE(u[12].hi,u[14].hi,mul_p0q2);
DFT2_TWIDDLE(u[13].lo,u[15].lo,mul_p1q2);
DFT2_TWIDDLE(u[13].hi,u[15].hi,mul_p1q2);
// 8x DFT2 and store (reverse binary permutation)
y[0] = u[0] + u[1];
y[p] = u[8] + u[9];
y[2*p] = u[4] + u[5];
y[3*p] = u[12] + u[13];
y[4*p] = u[2] + u[3];
y[5*p] = u[10] + u[11];
y[6*p] = u[6] + u[7];
y[7*p] = u[14] + u[15];
y[8*p] = u[0] - u[1];
y[9*p] = u[8] - u[9];
y[10*p] = u[4] - u[5];
y[11*p] = u[12] - u[13];
y[12*p] = u[2] - u[3];
y[13*p] = u[10] - u[11];
y[14*p] = u[6] - u[7];
y[15*p] = u[14] - u[15];
}
Note that I have modified the kernel to perform the FFT of 2 complex-valued sequences at once instead of one. Also, since I only need the FFT of 256 elements at a time in a much larger sequence, I perform only 2 runs of this kernel, which leaves me with 256-length DFTs in the larger array.
Here's some of the relevant host code as well.
var ev = new[] { new Cl.Event() };
var pEv = new[] { new Cl.Event() };
int fftSize = 1;
int iter = 0;
int n = distributionSize >> 5;
while (fftSize <= n)
{
Cl.SetKernelArg(fftKernel, 0, memA);
Cl.SetKernelArg(fftKernel, 1, memB);
Cl.SetKernelArg(fftKernel, 2, fftSize);
Cl.EnqueueNDRangeKernel(commandQueue, fftKernel, 1, null, globalWorkgroupSize, localWorkgroupSize,
(uint)(iter == 0 ? 0 : 1),
iter == 0 ? null : pEv,
out ev[0]).Check();
if (iter > 0)
pEv[0].Dispose();
Swap(ref ev, ref pEv);
Swap(ref memA, ref memB); // ping-pong
fftSize = fftSize << 4;
iter++;
Cl.Finish(commandQueue);
}
Swap(ref memA, ref memB);
Hope this helps someone!