MPI program runtime error MPI_GATHER, qsub mpijobparallel - runtime-error

I am trying to run this fast fourier implementation code. It compiles fine but gives this error at runtime. I have no idea about the error or what it means. Can anyone help me out?
I compiled and run the program by:
mpicc -o exec test.c
./exec
CODE:
This is the code that I found on GITHUB. Its the parallel version of fast fourier algorithm.
#include <stdio.h>
#include <mpi.h> //To use MPI
#include <complex.h> //to use complex numbers
#include <math.h> //for cos() and sin()
#include "timer.h" //to use timer
#define PI 3.14159265
#define bigN 16384 //Problem Size
#define howmanytimesavg 3
int main()
{
int my_rank,comm_sz;
MPI_Init(NULL,NULL); //start MPI
MPI_Comm_size(MPI_COMM_WORLD,&comm_sz); ///how many processes are we
using?
MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); //which process is this?
double start,finish;
double avgtime = 0;
FILE *outfile;
int h;
if(my_rank == 0) //if process 0 open outfile
{
outfile = fopen("ParallelVersionOutput.txt", "w"); //open from current
directory
}
for(h = 0; h < howmanytimesavg; h++) //loop to run multiple times for AVG
time.
{
if(my_rank == 0) //If it's process 0 starts timer
{
start = MPI_Wtime();
}
int i,k,n,j; //Basic loop variables
double complex evenpart[(bigN / comm_sz / 2)]; //array to save the data
for EVENHALF
double complex oddpart[(bigN / comm_sz / 2)]; //array to save the data
for ODDHALF
double complex evenpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array
to save the data for EVENHALF
double complex oddpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array
to save the data for ODDHALF
double storeKsumreal[bigN]; //store the K real variable so we can abuse
symmerty
double storeKsumimag[bigN]; //store the K imaginary variable so we can
abuse symmerty
double subtable[(bigN / comm_sz)][3]; //Each process owns a subtable
from the table below
double table[bigN][3] = //TABLE of numbers to use
{
0,3.6,2.6, //n, Real,Imaginary CREATES TABLE
1,2.9,6.3,
2,5.6,4.0,
3,4.8,9.1,
4,3.3,0.4,
5,5.9,4.8,
6,5.0,2.6,
7,4.3,4.1,
};
if(bigN > 8) //Everything after row 8 is all 0's
{
for(i = 8; i < bigN; i++)
{
table[i][0] = i;
for(j = 1; j < 3;j++)
{
table[i][j] = 0.0; //set to 0.0
}
}
}
int sendandrecvct = (bigN / comm_sz) * 3; //how much to send and
recieve??
MPI_Scatter(table,sendandrecvct,MPI_DOUBLE,subtable,sendandrecvct,MPI_DOUBLE,0,MPI_COMM_WORLD); //scatter the table to subtables
for (k = 0; k < bigN / 2; k++) //K coeffiencet Loop
{
/* Variables used for the computation */
double sumrealeven = 0.0; //sum of real numbers for even
double sumimageven = 0.0; //sum of imaginary numbers for even
double sumrealodd = 0.0; //sum of real numbers for odd
double sumimagodd = 0.0; //sum of imaginary numbers for odd
for(i = 0; i < (bigN/comm_sz)/2; i++) //Sigma loop EVEN and ODD
{
double factoreven , factorodd = 0.0;
int shiftevenonnonzeroP = my_rank * subtable[2*i][0]; //used to shift index numbers for correct results for EVEN.
int shiftoddonnonzeroP = my_rank * subtable[2*i + 1][0]; //used to shift index numbers for correct results for ODD.
/* -------- EVEN PART -------- */
double realeven = subtable[2*i][1]; //Access table for real number at spot 2i
double complex imaginaryeven = subtable[2*i][2]; //Access table for imaginary number at spot 2i
double complex componeeven = (realeven + imaginaryeven * I); //Create the first component from table
if(my_rank == 0) //if proc 0, dont use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((2*i)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((shiftevenonnonzeroP)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoeven = (cos(factoreven) - (sin(factoreven)*I)); //Create the second component
evenpart[i] = (componeeven * comptwoeven); //store in the evenpart array
/* -------- ODD PART -------- */
double realodd = subtable[2*i + 1][1]; //Access table for real number at spot 2i+1
double complex imaginaryodd = subtable[2*i + 1][2]; //Access table for imaginary number at spot 2i+1
double complex componeodd = (realodd + imaginaryodd * I); //Create the first component from table
if (my_rank == 0)//if proc 0, dont use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((2*i+1)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((shiftoddonnonzeroP)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoodd = (cos(factorodd) - (sin(factorodd)*I));//Create the second component
oddpart[i] = (componeodd * comptwoodd); //store in the oddpart array
}
/*Process ZERO gathers the even and odd part arrays and creates a evenpartmaster and oddpartmaster array*/
MPI_Gather(evenpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,evenpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
MPI_Gather(oddpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,oddpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
if(my_rank == 0)
{
for(i = 0; i < (bigN / comm_sz / 2) * comm_sz; i++) //loop to sum the EVEN and ODD parts
{
sumrealeven += creal(evenpartmaster[i]); //sums the realpart of the even half
sumimageven += cimag(evenpartmaster[i]); //sums the imaginarypart of the even half
sumrealodd += creal(oddpartmaster[i]); //sums the realpart of the odd half
sumimagodd += cimag(oddpartmaster[i]); //sums the imaginary part of the odd half
}
storeKsumreal[k] = sumrealeven + sumrealodd; //add the calculated reals from even and odd
storeKsumimag[k] = sumimageven + sumimagodd; //add the calculated imaginary from even and odd
storeKsumreal[k + bigN/2] = sumrealeven - sumrealodd; //ABUSE symmetry Xkreal + N/2 = Evenk - OddK
storeKsumimag[k + bigN/2] = sumimageven - sumimagodd; //ABUSE symmetry Xkimag + N/2 = Evenk - OddK
if(k <= 10) //Do the first 10 K's
{
if(k == 0)
{
fprintf(outfile," \n\n TOTAL PROCESSED SAMPLES : %d\n",bigN);
}
fprintf(outfile,"================================\n");
fprintf(outfile,"XR[%d]: %.4f XI[%d]: %.4f \n",k,storeKsumreal[k],k,storeKsumimag[k]);
fprintf(outfile,"================================\n");
}
}
}
if(my_rank == 0)
{
GET_TIME(finish); //stop timer
double timeElapsed = finish-start; //Time for that iteration
avgtime = avgtime + timeElapsed; //AVG the time
fprintf(outfile,"Time Elaspsed on Iteration %d: %f Seconds\n", (h+1),timeElapsed);
}
}
if(my_rank == 0)
{
avgtime = avgtime / howmanytimesavg; //get avg time
fprintf(outfile,"\nAverage Time Elaspsed: %f Seconds", avgtime);
fclose(outfile); //CLOSE file ONLY proc 0 can.
}
MPI_Barrier(MPI_COMM_WORLD); //wait to all proccesses to catch up before finalize
MPI_Finalize(); //End MPI
return 0;
}
ERROR:
Fatal error in PMPI_Gather: Invalid datatype, error stack:
PMPI_Gather(904): MPI_Gather(sbuf=0x7fffb62799a0, scount=8192,
MPI_DATATYPE_NULL, rbuf=0x7fffb6239980, rcount=8192, MPI_DATATYPE_NULL,
root=0, MPI_COMM_WORLD) failed
PMPI_Gather(815): Datatype for argument sendtype is a null datatype
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=537490947
:
system msg for write_line failure : Bad file descriptor

There is no MPI_DATATYPE_NULL in your code, but you only use MPI_DOUBLE_COMPLEX. Note the latter type is a Fortran datatype, and using it in C is not correct strictly speaking.
My guess is that MPI_DOUBLE_COMPLEX is causing the issue (type not defined or not initialized because you invoked the C version of MPI_Init()).
You can obviously rewrite your code in Fortran, or use your own derived datatype for a C double complex number.
Meanwhile, I suggest you write simple C and Fortran helloworld programs that use MPI_DOUBLE_COMPLEX (MPI_Bcast() of one element for example) to confirm the issue is with MPI_DOUBLE_COMPLEX and is restricted to C or not.

Related

Arduino - How to convert double to HEX format

I have an arudino code where I get some temperature reading:
double c1 = device.readCelsius();
Serial.println(c1);
The output is for example: 26.23
What I need is to get this converted to 2623 and then to HEX value so I get: 0x0A3F
Any clue?
I guess your float values always get numbers up to two decimal. So, you can just multiply the value which you read from sensor with a 100.
decimalValue = 100 * c1
And then you can use this small code for converting the decimal value to HEX.
Thanks to GeeksforGeeks
You can find the full tutorial here
// C++ program to convert a decimal
// number to hexadecimal number
#include <iostream>
using namespace std;
// function to convert decimal to hexadecimal
void decToHexa(int n)
{
// char array to store hexadecimal number
char hexaDeciNum[100];
// counter for hexadecimal number array
int i = 0;
while (n != 0) {
// temporary variable to store remainder
int temp = 0;
// storing remainder in temp variable.
temp = n % 16;
// check if temp < 10
if (temp < 10) {
hexaDeciNum[i] = temp + 48;
i++;
}
else {
hexaDeciNum[i] = temp + 55;
i++;
}
n = n / 16;
}
// printing hexadecimal number array in reverse order
for (int j = i - 1; j >= 0; j--)
cout << hexaDeciNum[j];
}
// Driver program to test above function
int main()
{
int n = 2545;
decToHexa(n);
return 0;
}

MPI Scatterv : How to deal with the root process?

The thing I am still not too certain about is what happens with the root process in MPI Scatter / Scatterv.
If I divide an array as I try in my code, do I need to include the root process in the number of receivers (hence making the sendcounts of size nproc) or is it excluded?
In my example code for Matrix Multiplication, I still get an error by one of the processes running into aberrant behaviour, terminating the program prematurely:
void readMatrix();
double StartTime;
int rank, nproc, proc;
//double matrix_A[N_ROWS][N_COLS];
double **matrix_A;
//double matrix_B[N_ROWS][N_COLS];
double **matrix_B;
//double matrix_C[N_ROWS][N_COLS];
double **matrix_C;
int low_bound = 0; //low bound of the number of rows of each process
int upper_bound = 0; //upper bound of the number of rows of [A] of each process
int portion = 0; //portion of the number of rows of [A] of each process
int main (int argc, char *argv[]) {
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
matrix_A = (double **)malloc(N_ROWS * sizeof(double*));
for(int i = 0; i < N_ROWS; i++) matrix_A[i] = (double *)malloc(N_COLS * sizeof(double));
matrix_B = (double **)malloc(N_ROWS * sizeof(double*));
for(int i = 0; i < N_ROWS; i++) matrix_B[i] = (double *)malloc(N_COLS * sizeof(double));
matrix_C = (double **)malloc(N_ROWS * sizeof(double*));
for(int i = 0; i < N_ROWS; i++) matrix_C[i] = (double *)malloc(N_COLS * sizeof(double));
int *counts = new int[nproc](); // array to hold number of items to be sent to each process
// -------------------> If we have more than one process, we can distribute the work through scatterv
if (nproc > 1) {
// -------------------> Process 0 initalizes matrices and scatters the portions of the [A] Matrix
if (rank==0) {
readMatrix();
}
StartTime = MPI_Wtime();
int counter = 0;
for (int proc = 0; proc < nproc; proc++) {
counts[proc] = N_ROWS / nproc ;
counter += N_ROWS / nproc ;
}
counter = N_ROWS - counter;
counts[nproc-1] = counter;
//set bounds for each process
low_bound = rank*(N_ROWS/nproc);
portion = counts[rank];
upper_bound = low_bound + portion;
printf("I am process %i and my lower bound is %i and my portion is %i and my upper bound is %i \n",rank,low_bound, portion,upper_bound);
//scatter the work among the processes
int *displs = new int[nproc]();
displs[0] = 0;
for (int proc = 1; proc < nproc; proc++) displs[proc] = displs[proc-1] + (N_ROWS/nproc);
MPI_Scatterv(matrix_A, counts, displs, MPI_DOUBLE, &matrix_A[low_bound][0], portion, MPI_DOUBLE, 0, MPI_COMM_WORLD);
//broadcast [B] to all the slaves
MPI_Bcast(&matrix_B, N_ROWS*N_COLS, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// -------------------> Everybody does their work
for (int i = low_bound; i < upper_bound; i++) {//iterate through a given set of rows of [A]
for (int j = 0; j < N_COLS; j++) {//iterate through columns of [B]
for (int k = 0; k < N_ROWS; k++) {//iterate through rows of [B]
matrix_C[i][j] += (matrix_A[i][k] * matrix_B[k][j]);
}
}
}
// -------------------> Process 0 gathers the work
MPI_Gatherv(&matrix_C[low_bound][0],portion,MPI_DOUBLE,matrix_C,counts,displs,MPI_DOUBLE,0,MPI_COMM_WORLD);
}
...
The root process also takes place in the receiver side. If you are not interested in that, just set sendcounts[root] = 0.
See MPI_Scatterv for specific information on which values you have to pass exactly.
However, take care of what you are doing. I strongly suggest that you change the way you allocate your matrix as a one-dimensional array, using a single malloc like this:
double* matrix = (double*) malloc( N_ROWS * N_COLS * sizeof(double) );
If you still want to use a two-dimensional array, then you may need to define your types as a MPI derived datatype.
The datatype you are passing is not valid if you want to send more than a row in a single MPI transfer.
With MPI_DOUBLE you are telling MPI that the buffer contains a contiguous array of count MPI_DOUBLE values.
Since you are allocating a two-dimensional array using multiple malloc calls, then your data is not contiguous.

Arduino: float function returns inf

I have a function (shown below) that I need some advice on. The function returns the slope of a line which is fit (via the least squares method) to n data points. To give you a context, my project is a barometric pressure based altimeter which uses this function to determine velocity based on the n most recent altitude-time pairs. These altitude-time pairs are stored in 2 global arrays(times[] and alts[]).
My problem is not that this method doesn't work. It usually does. But sometimes I will run the altimeter and this function will return the value 'inf' interspersed with a bunch of other wrong values (I have also seen 'NaN' but that is more rare). There are a few areas of suspicion I have at this point but I would like a fresh perspective. Here is some further contextual information that may or may not be of use:
I am using interrupts for a quadrature encoder
The times[] array is of type unsigned long
The alts[] array is of type float
n is a const int, in this case n = 9
On the ATMEGA328 a double is the same as a float.. Arduino-double
float velF() { // uses the last n data points, fits a line to them,
// and uses the slope of that line as the velocity at that moment
float sumTY = 0, sumT = 0, sumY = 0, sumT2 = 0;
for (int i = 0; i < n; i++) {
sumTY += (float)times[i] * alts[i] / 1000;
sumT += (float)times[i] / 1000;
sumY += alts[i];
sumT2 += (float)times[i] * times[i] / 1000000;
}
return (n*sumTY - sumT*sumY) / (n*sumT2 - sumT*sumT);
}
Any help or advice would be greatly appreciated!
Code is certainly performing division by zero.
For a variety of reasons, n*sumT2 - sumT*sumT will be zero. #John Bollinger In most of these cases, the top (dividend) of the division will also be zero and a return value of zero would be acceptable.
float velF(void) {
float sumTY = 0, sumT = 0, sumY = 0, sumT2 = 0;
for (size_t i = 0; i < n; i++) {
// insure values are reasoable
assert(alts[i] >= ALT_MIN && alts[i] <= ALT_MAX);
assert(times[i] >= TIME_MIN && times[i] <= TIME_MAX);
sumTY += (float)times[i] * alts[i] / 1000;
sumT += (float)times[i] / 1000;
sumY += alts[i];
sumT2 += (float)times[i] * times[i] / 1000000;
}
float d = n*sumT2 - sumT*sumT;
if (d == 0) return 0;
return (n*sumTY - sumT*sumY) / d;
}
Side note: could factor out the division for improved accuracy and speed. Suggest performing the last calculation as double.
float velF(void) {
float sumTY = 0, sumT = 0, sumY = 0, sumT2 = 0;
for (size_t i = 0; i < n; i++) {
float tf = (float) times[i];
sumTY += tf * alts[i];
sumT += tf;
sumY += alts[i];
sumT2 += tf * tf;
}
double nd = n;
double sumTd = sumT;
double d = nd*sumT2 - sumTd*sumTd;
if (d == 0) return 0;
return (nd*sumTY - sumTd*sumY)*1000 / d;
}

why fundamental Frequency and magnitude are not null when microphone is off?

I would like to make real time audio processing with Qt and display the spectrum using FFTW3.
What I've done in steps:
I capture any sound from computer device and fill it into the buffer.
I assign sound samples to double array
I compute the fundamental frequency.
when I'm display the fundamental frequency and Magnetitude when the microphone is on but no signal(silence) , the fundamental frequency is not what I expected , the code don't always return zero , sometimes the code returns 1500Hz,2000hz as frequency
and when the microphone is off (mute) the code don't return zero as fundamamental frequency but returns a number between 0 and 9000Hz. Any help woulbd be appreciated
here is my code
QByteArray *buffer;
QAudioInput *audioInput;
audioInput = new QAudioInput(format, this);
//Check the number of samples in input buffer
qint64 len = audioInput->bytesReady();
//Limit sample size
if(len > 4096)
len = 4096;
//Read sound samples from input device to buffer
qint64 l = input->read(buffer.data(), len);
int input_size= BufferSize;
int output_size = input_size; //input_size/2+1;
fftw_plan p3;
double in[output_size];
fftw_complex out[output_size];
short *outdata = (short*)m_buffer.data();// assign sample into short array
int data_size = size_t(outdata);
int data_size1 = sizeof(outdata);
int count = 0;
double w = 0;
for(int i(chanelNumber); i < output_size/2; i= i + 2) //fill array in
{
w= 0.5 * (1 - cos(2*M_PI*i/output_size)); // Hann Windows
double x = 0;
if(i < data_size){
x = outdata[i];
}
if(count < output_size){
in[count] = x;// fill Array In with sample from buffer
count++;
}
}
for(int i=count; i<output_size; i++){
in[i] = 0;
}
p3 = fftw_plan_dft_r2c_1d(output_size, in, out, FFTW_ESTIMATE);// create Plan
fftw_execute(p3);// FFT
for (int i = 0; i < (output_size/2); i++) {
long peak=0;
double Amplitudemax=0;
double r1 = out[i][0] * out[i][0];
double im1 = out[i][3] * out[i][4];
double t1 = r1 + im1;
//double t = 20*log(sqrt(t1));
double t = sqrt(t1)/(double)(output_size/2);
double f = (double)i*8000 / ((double)output_size/2);
if(Magnitude > AmplitudeMax)
{
AmplitudeMax = Magnitude;
Peak =2* i;
}
}
fftw_destroy_plan(p3);
return Peak*(static_cast<double>(8000)/output_Size);
What you think is silence might contain some small amount of noise. The FFT of random noise will also appear random, and thus have a random magnitude peak. But it is possible that noise might come from equipment or electronics in the environment (fans, flyback transformers, etc.), or the power supply to your ADC or mic, thus showing some frequency biases.
If the noise level is low enough, normally one checks the level of the magnitude peak, compares it against a threshold, and cuts off frequency estimation reporting below this threshold.

Higher radix (or better) formulation for Stockham FFT

Background
I've implemented this algorithm from Microsoft Research for a radix-2 FFT (Stockham auto sort) using OpenCL.
I use floating point textures (256 cols X N rows) for input and output in the kernel, because I will need to sample at non-integral points and I thought it better to delegate that to the texture sampling hardware. Note that my FFTs are always of 256-point sequences (every row in my texture). At this point, my N is 16384 or 32768 depending on the GPU i'm using and the max 2D texture size allowed.
I also need to perform the FFT of 4 real-valued sequences at once, so the kernel performs the FFT(a, b, c, d) as FFT(a + ib, c + id) from which I can extract the 4 complex sequences out later using an O(n) algorithm. I can elaborate on this if someone wishes - but I don't believe it falls in the scope of this question.
Kernel Source
const sampler_t fftSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void FFT_Stockham(read_only image2d_t input, write_only image2d_t output, int fftSize, int size)
{
int x = get_global_id(0);
int y = get_global_id(1);
int b = floor(x / convert_float(fftSize)) * (fftSize / 2);
int offset = x % (fftSize / 2);
int x0 = b + offset;
int x1 = x0 + (size / 2);
float4 val0 = read_imagef(input, fftSampler, (int2)(x0, y));
float4 val1 = read_imagef(input, fftSampler, (int2)(x1, y));
float angle = -6.283185f * (convert_float(x) / convert_float(fftSize));
// TODO: Convert the two calculations below into lookups from a __constant buffer
float tA = native_cos(angle);
float tB = native_sin(angle);
float4 coeffs1 = (float4)(tA, tB, tA, tB);
float4 coeffs2 = (float4)(-tB, tA, -tB, tA);
float4 result = val0 + coeffs1 * val1.xxzz + coeffs2 * val1.yyww;
write_imagef(output, (int2)(x, y), result);
}
The host code simply invokes this kernel log2(256) times, ping-ponging the input and output textures.
Note: I tried removing the native_cos and native_sin to see if that impacted timing, but it doesn't seem to change things by very much. Not the factor I'm looking for, in any case.
Access pattern
Knowing that I am probably memory-bandwidth bound, here is the memory access pattern (per-row) for my radix-2 FFT.
X0 - element 1 to combine (read)
X1 - element 2 to combine (read)
X - element to write to (write)
Question
So my question is - can someone help me with/point me toward a higher-radix formulation for this algorithm? I ask because most FFTs are optimized for large cases and single real/complex valued sequences. Their kernel generators are also very case dependent and break down quickly when I try to muck with their internals.
Are there other options better than simply going to a radix-8 or 16 kernel?
Some of my constraints are - I have to use OpenCL (no cuFFT). I also cannot use clAmdFft from ACML for this purpose. It would be nice to also talk about CPU optimizations (this kernel SUCKS big time on the CPU) - but getting it to run in fewer iterations on the GPU is my main use-case.
Thanks in advance for reading through all this and trying to help!
I tried several versions, but the one with the best performance on CPU and GPU was a radix-16 kernel for my specific case.
Here is the kernel for reference. It was taken from Eric Bainville's (most excellent) website and used with full attribution.
// #define M_PI 3.14159265358979f
//Global size is x.Length/2, Scale = 1 for direct, 1/N to inverse (iFFT)
__kernel void ConjugateAndScale(__global float4* x, const float Scale)
{
int i = get_global_id(0);
float temp = Scale;
float4 t = (float4)(temp, -temp, temp, -temp);
x[i] *= t;
}
// Return a*EXP(-I*PI*1/2) = a*(-I)
float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
// Return a^2
float2 sqr_1(float2 a)
{ return (float2)(a.x*a.x-a.y*a.y,2.0f*a.x*a.y); }
// Return the 2x DFT2 of the four complex numbers in A
// If A=(a,b,c,d) then return (a',b',c',d') where (a',c')=DFT2(a,c)
// and (b',d')=DFT2(b,d).
float8 dft2_4(float8 a) { return (float8)(a.lo+a.hi,a.lo-a.hi); }
// Return the DFT of 4 complex numbers in A
float8 dft4_4(float8 a)
{
// 2x DFT2
float8 x = dft2_4(a);
// Shuffle, twiddle, and 2x DFT2
return dft2_4((float8)(x.lo.lo,x.hi.lo,x.lo.hi,mul_p1q2(x.hi.hi)));
}
// Complex product, multiply vectors of complex numbers
#define MUL_RE(a,b) (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a,b) (a.even*b.odd + a.odd*b.even)
float2 mul_1(float2 a, float2 b)
{ float2 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_1_F4(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
float4 mul_2(float4 a, float4 b)
{ float4 x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x; }
// Return the DFT2 of the two complex numbers in vector A
float4 dft2_2(float4 a) { return (float4)(a.lo+a.hi,a.lo-a.hi); }
// Return cos(alpha)+I*sin(alpha) (3 variants)
float2 exp_alpha_1(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
//cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float2)(cs,sn);
}
// Return cos(alpha)+I*sin(alpha) (3 variants)
float4 exp_alpha_1_F4(float alpha)
{
float cs,sn;
// sn = sincos(alpha,&cs); // sincos
// cs = native_cos(alpha); sn = native_sin(alpha); // native sin+cos
cs = cos(alpha); sn = sin(alpha); // sin+cos
return (float4)(cs,sn,cs,sn);
}
// mul_p*q*(a) returns a*EXP(-I*PI*P/Q)
#define mul_p0q1(a) (a)
#define mul_p0q2 mul_p0q1
//float2 mul_p1q2(float2 a) { return (float2)(a.y,-a.x); }
__constant float SQRT_1_2 = 0.707106781186548; // cos(Pi/4)
#define mul_p0q4 mul_p0q2
float2 mul_p1q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(a.x+a.y,-a.x+a.y); }
#define mul_p2q4 mul_p1q2
float2 mul_p3q4(float2 a) { return (float2)(SQRT_1_2)*(float2)(-a.x+a.y,-a.x-a.y); }
__constant float COS_8 = 0.923879532511287; // cos(Pi/8)
__constant float SIN_8 = 0.382683432365089; // sin(Pi/8)
#define mul_p0q8 mul_p0q4
float2 mul_p1q8(float2 a) { return mul_1((float2)(COS_8,-SIN_8),a); }
#define mul_p2q8 mul_p1q4
float2 mul_p3q8(float2 a) { return mul_1((float2)(SIN_8,-COS_8),a); }
#define mul_p4q8 mul_p2q4
float2 mul_p5q8(float2 a) { return mul_1((float2)(-SIN_8,-COS_8),a); }
#define mul_p6q8 mul_p3q4
float2 mul_p7q8(float2 a) { return mul_1((float2)(-COS_8,-SIN_8),a); }
// Compute in-place DFT2 and twiddle
#define DFT2_TWIDDLE(a,b,t) { float2 tmp = t(a-b); a += b; b = tmp; }
// T = N/16 = number of threads.
// P is the length of input sub-sequences, 1,16,256,...,N/16.
__kernel void FFT_Radix16(__global const float4 * x, __global float4 * y, int pp)
{
int p = pp;
int t = get_global_size(0); // number of threads
int i = get_global_id(0); // current thread
////// y[i] = 2*x[i];
////// return;
int k = i & (p-1); // index in input sequence, in 0..P-1
// Inputs indices are I+{0,..,15}*T
x += i;
// Output indices are J+{0,..,15}*P, where
// J is I with four 0 bits inserted at bit log2(P)
y += ((i-k)<<4) + k;
// Load
float4 u[16];
for (int m=0;m<16;m++) u[m] = x[m*t];
// Twiddle, twiddling factors are exp(_I*PI*{0,..,15}*K/4P)
float alpha = -M_PI*(float)k/(float)(8*p);
for (int m=1;m<16;m++) u[m] = mul_1_F4(exp_alpha_1_F4(m * alpha), u[m]);
// 8x in-place DFT2 and twiddle (1)
DFT2_TWIDDLE(u[0].lo,u[8].lo,mul_p0q8);
DFT2_TWIDDLE(u[0].hi,u[8].hi,mul_p0q8);
DFT2_TWIDDLE(u[1].lo,u[9].lo,mul_p1q8);
DFT2_TWIDDLE(u[1].hi,u[9].hi,mul_p1q8);
DFT2_TWIDDLE(u[2].lo,u[10].lo,mul_p2q8);
DFT2_TWIDDLE(u[2].hi,u[10].hi,mul_p2q8);
DFT2_TWIDDLE(u[3].lo,u[11].lo,mul_p3q8);
DFT2_TWIDDLE(u[3].hi,u[11].hi,mul_p3q8);
DFT2_TWIDDLE(u[4].lo,u[12].lo,mul_p4q8);
DFT2_TWIDDLE(u[4].hi,u[12].hi,mul_p4q8);
DFT2_TWIDDLE(u[5].lo,u[13].lo,mul_p5q8);
DFT2_TWIDDLE(u[5].hi,u[13].hi,mul_p5q8);
DFT2_TWIDDLE(u[6].lo,u[14].lo,mul_p6q8);
DFT2_TWIDDLE(u[6].hi,u[14].hi,mul_p6q8);
DFT2_TWIDDLE(u[7].lo,u[15].lo,mul_p7q8);
DFT2_TWIDDLE(u[7].hi,u[15].hi,mul_p7q8);
// 8x in-place DFT2 and twiddle (2)
DFT2_TWIDDLE(u[0].lo,u[4].lo,mul_p0q4);
DFT2_TWIDDLE(u[0].hi,u[4].hi,mul_p0q4);
DFT2_TWIDDLE(u[1].lo,u[5].lo,mul_p1q4);
DFT2_TWIDDLE(u[1].hi,u[5].hi,mul_p1q4);
DFT2_TWIDDLE(u[2].lo,u[6].lo,mul_p2q4);
DFT2_TWIDDLE(u[2].hi,u[6].hi,mul_p2q4);
DFT2_TWIDDLE(u[3].lo,u[7].lo,mul_p3q4);
DFT2_TWIDDLE(u[3].hi,u[7].hi,mul_p3q4);
DFT2_TWIDDLE(u[8].lo,u[12].lo,mul_p0q4);
DFT2_TWIDDLE(u[8].hi,u[12].hi,mul_p0q4);
DFT2_TWIDDLE(u[9].lo,u[13].lo,mul_p1q4);
DFT2_TWIDDLE(u[9].hi,u[13].hi,mul_p1q4);
DFT2_TWIDDLE(u[10].lo,u[14].lo,mul_p2q4);
DFT2_TWIDDLE(u[10].hi,u[14].hi,mul_p2q4);
DFT2_TWIDDLE(u[11].lo,u[15].lo,mul_p3q4);
DFT2_TWIDDLE(u[11].hi,u[15].hi,mul_p3q4);
// 8x in-place DFT2 and twiddle (3)
DFT2_TWIDDLE(u[0].lo,u[2].lo,mul_p0q2);
DFT2_TWIDDLE(u[0].hi,u[2].hi,mul_p0q2);
DFT2_TWIDDLE(u[1].lo,u[3].lo,mul_p1q2);
DFT2_TWIDDLE(u[1].hi,u[3].hi,mul_p1q2);
DFT2_TWIDDLE(u[4].lo,u[6].lo,mul_p0q2);
DFT2_TWIDDLE(u[4].hi,u[6].hi,mul_p0q2);
DFT2_TWIDDLE(u[5].lo,u[7].lo,mul_p1q2);
DFT2_TWIDDLE(u[5].hi,u[7].hi,mul_p1q2);
DFT2_TWIDDLE(u[8].lo,u[10].lo,mul_p0q2);
DFT2_TWIDDLE(u[8].hi,u[10].hi,mul_p0q2);
DFT2_TWIDDLE(u[9].lo,u[11].lo,mul_p1q2);
DFT2_TWIDDLE(u[9].hi,u[11].hi,mul_p1q2);
DFT2_TWIDDLE(u[12].lo,u[14].lo,mul_p0q2);
DFT2_TWIDDLE(u[12].hi,u[14].hi,mul_p0q2);
DFT2_TWIDDLE(u[13].lo,u[15].lo,mul_p1q2);
DFT2_TWIDDLE(u[13].hi,u[15].hi,mul_p1q2);
// 8x DFT2 and store (reverse binary permutation)
y[0] = u[0] + u[1];
y[p] = u[8] + u[9];
y[2*p] = u[4] + u[5];
y[3*p] = u[12] + u[13];
y[4*p] = u[2] + u[3];
y[5*p] = u[10] + u[11];
y[6*p] = u[6] + u[7];
y[7*p] = u[14] + u[15];
y[8*p] = u[0] - u[1];
y[9*p] = u[8] - u[9];
y[10*p] = u[4] - u[5];
y[11*p] = u[12] - u[13];
y[12*p] = u[2] - u[3];
y[13*p] = u[10] - u[11];
y[14*p] = u[6] - u[7];
y[15*p] = u[14] - u[15];
}
Note that I have modified the kernel to perform the FFT of 2 complex-valued sequences at once instead of one. Also, since I only need the FFT of 256 elements at a time in a much larger sequence, I perform only 2 runs of this kernel, which leaves me with 256-length DFTs in the larger array.
Here's some of the relevant host code as well.
var ev = new[] { new Cl.Event() };
var pEv = new[] { new Cl.Event() };
int fftSize = 1;
int iter = 0;
int n = distributionSize >> 5;
while (fftSize <= n)
{
Cl.SetKernelArg(fftKernel, 0, memA);
Cl.SetKernelArg(fftKernel, 1, memB);
Cl.SetKernelArg(fftKernel, 2, fftSize);
Cl.EnqueueNDRangeKernel(commandQueue, fftKernel, 1, null, globalWorkgroupSize, localWorkgroupSize,
(uint)(iter == 0 ? 0 : 1),
iter == 0 ? null : pEv,
out ev[0]).Check();
if (iter > 0)
pEv[0].Dispose();
Swap(ref ev, ref pEv);
Swap(ref memA, ref memB); // ping-pong
fftSize = fftSize << 4;
iter++;
Cl.Finish(commandQueue);
}
Swap(ref memA, ref memB);
Hope this helps someone!

Resources