Locking not working with OpenCL - opencl

I'm stuck on an issue in my OpenCL code where I try to synchronize inside a kernel:
__kernel void pdiffs (__global const long2 *inData, __global const long2 *inData2, __global long2 *outData) {
    long2 diffSum = 0;
    uint idx0 = get_local_size(0)*get_group_id(0);
    for (uint idx=idx0; idx<idx0+get_local_size(0); idx += 1) {
        diffSum += inData[idx] - inData2[idx];
    }
    outData[get_group_id(0)] = diffSum;
    printf("%d %d %d %d/%d\n", get_group_id(0), get_num_groups(0), get_local_size(0), diffSum.x, diffSum.y);
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    if (get_group_id(0) == 0) {
        for (size_t i = 1; i < get_num_groups(0); i++){
            outData[0] += outData[i];
            printf("v(%d): %d/%d\n", i, outData[i].x, outData[i].y);
        }
    }
}
(I know that this piece of code is simply bad...)
I assumed the barrier would synchronize the individual groups so that the values in outData are defined. But my trace shows that some differences are not calculated and stay zero (I set up my data so that every group should return the value 1, yet some print as 0). It also makes a difference whether the printf statements are present or not: without printf, even more of the differences come out wrong.
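For reference, here is a minimal sketch (my own illustration, not code from the post; the kernel name pdiffs_partial is made up) of the pattern the answers further down point to: barrier() only synchronizes work-items inside one work-group, so the cross-group sum has to happen on the host or in a second kernel launched afterwards.

__kernel void pdiffs_partial (__global const long2 *inData, __global const long2 *inData2, __global long2 *outData) {
    long2 diffSum = (long2)(0, 0);
    uint idx0 = get_local_size(0) * get_group_id(0);
    for (uint idx = idx0; idx < idx0 + get_local_size(0); idx++)
        diffSum += inData[idx] - inData2[idx];
    if (get_local_id(0) == 0)                 // one writer per work-group is enough here
        outData[get_group_id(0)] = diffSum;
    // Final step: add up outData[0 .. get_num_groups(0)-1] on the host,
    // or in a second kernel enqueued after this one has completed.
}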

Related

MPI program runtime error MPI_GATHER, qsub mpijobparallel

I am trying to run this fast Fourier transform implementation. It compiles fine but gives the error below at runtime. I have no idea what the error means. Can anyone help me out?
I compiled and ran the program with:
mpicc -o exec test.c
./exec
CODE:
This is the code that I found on GitHub. It's the parallel version of the fast Fourier algorithm.
#include <stdio.h>
#include <mpi.h> //To use MPI
#include <complex.h> //to use complex numbers
#include <math.h> //for cos() and sin()
#include "timer.h" //to use timer
#define PI 3.14159265
#define bigN 16384 //Problem Size
#define howmanytimesavg 3
int main()
{
int my_rank,comm_sz;
MPI_Init(NULL,NULL); //start MPI
MPI_Comm_size(MPI_COMM_WORLD,&comm_sz); //how many processes are we using?
MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); //which process is this?
double start,finish;
double avgtime = 0;
FILE *outfile;
int h;
if(my_rank == 0) //if process 0 open outfile
{
outfile = fopen("ParallelVersionOutput.txt", "w"); //open from current directory
}
for(h = 0; h < howmanytimesavg; h++) //loop to run multiple times for AVG time.
{
if(my_rank == 0) //If it's process 0 starts timer
{
start = MPI_Wtime();
}
int i,k,n,j; //Basic loop variables
double complex evenpart[(bigN / comm_sz / 2)]; //array to save the data for EVENHALF
double complex oddpart[(bigN / comm_sz / 2)]; //array to save the data for ODDHALF
double complex evenpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array to save the data for EVENHALF
double complex oddpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array to save the data for ODDHALF
double storeKsumreal[bigN]; //store the K real variable so we can abuse symmetry
double storeKsumimag[bigN]; //store the K imaginary variable so we can abuse symmetry
double subtable[(bigN / comm_sz)][3]; //Each process owns a subtable from the table below
double table[bigN][3] = //TABLE of numbers to use
{
0,3.6,2.6, //n, Real,Imaginary CREATES TABLE
1,2.9,6.3,
2,5.6,4.0,
3,4.8,9.1,
4,3.3,0.4,
5,5.9,4.8,
6,5.0,2.6,
7,4.3,4.1,
};
if(bigN > 8) //Everything after row 8 is all 0's
{
for(i = 8; i < bigN; i++)
{
table[i][0] = i;
for(j = 1; j < 3;j++)
{
table[i][j] = 0.0; //set to 0.0
}
}
}
int sendandrecvct = (bigN / comm_sz) * 3; //how much to send and receive?
MPI_Scatter(table,sendandrecvct,MPI_DOUBLE,subtable,sendandrecvct,MPI_DOUBLE,0,MPI_COMM_WORLD); //scatter the table to subtables
for (k = 0; k < bigN / 2; k++) //K coeffiencet Loop
{
/* Variables used for the computation */
double sumrealeven = 0.0; //sum of real numbers for even
double sumimageven = 0.0; //sum of imaginary numbers for even
double sumrealodd = 0.0; //sum of real numbers for odd
double sumimagodd = 0.0; //sum of imaginary numbers for odd
for(i = 0; i < (bigN/comm_sz)/2; i++) //Sigma loop EVEN and ODD
{
double factoreven , factorodd = 0.0;
int shiftevenonnonzeroP = my_rank * subtable[2*i][0]; //used to shift index numbers for correct results for EVEN.
int shiftoddonnonzeroP = my_rank * subtable[2*i + 1][0]; //used to shift index numbers for correct results for ODD.
/* -------- EVEN PART -------- */
double realeven = subtable[2*i][1]; //Access table for real number at spot 2i
double complex imaginaryeven = subtable[2*i][2]; //Access table for imaginary number at spot 2i
double complex componeeven = (realeven + imaginaryeven * I); //Create the first component from table
if(my_rank == 0) //if proc 0, dont use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((2*i)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((shiftevenonnonzeroP)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoeven = (cos(factoreven) - (sin(factoreven)*I)); //Create the second component
evenpart[i] = (componeeven * comptwoeven); //store in the evenpart array
/* -------- ODD PART -------- */
double realodd = subtable[2*i + 1][1]; //Access table for real number at spot 2i+1
double complex imaginaryodd = subtable[2*i + 1][2]; //Access table for imaginary number at spot 2i+1
double complex componeodd = (realodd + imaginaryodd * I); //Create the first component from table
if (my_rank == 0)//if proc 0, dont use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((2*i+1)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((shiftoddonnonzeroP)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoodd = (cos(factorodd) - (sin(factorodd)*I));//Create the second component
oddpart[i] = (componeodd * comptwoodd); //store in the oddpart array
}
/*Process ZERO gathers the even and odd part arrays and creates a evenpartmaster and oddpartmaster array*/
MPI_Gather(evenpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,evenpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
MPI_Gather(oddpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,oddpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
if(my_rank == 0)
{
for(i = 0; i < (bigN / comm_sz / 2) * comm_sz; i++) //loop to sum the EVEN and ODD parts
{
sumrealeven += creal(evenpartmaster[i]); //sums the realpart of the even half
sumimageven += cimag(evenpartmaster[i]); //sums the imaginarypart of the even half
sumrealodd += creal(oddpartmaster[i]); //sums the realpart of the odd half
sumimagodd += cimag(oddpartmaster[i]); //sums the imaginary part of the odd half
}
storeKsumreal[k] = sumrealeven + sumrealodd; //add the calculated reals from even and odd
storeKsumimag[k] = sumimageven + sumimagodd; //add the calculated imaginary from even and odd
storeKsumreal[k + bigN/2] = sumrealeven - sumrealodd; //ABUSE symmetry Xkreal + N/2 = Evenk - OddK
storeKsumimag[k + bigN/2] = sumimageven - sumimagodd; //ABUSE symmetry Xkimag + N/2 = Evenk - OddK
if(k <= 10) //Do the first 10 K's
{
if(k == 0)
{
fprintf(outfile," \n\n TOTAL PROCESSED SAMPLES : %d\n",bigN);
}
fprintf(outfile,"================================\n");
fprintf(outfile,"XR[%d]: %.4f XI[%d]: %.4f \n",k,storeKsumreal[k],k,storeKsumimag[k]);
fprintf(outfile,"================================\n");
}
}
}
if(my_rank == 0)
{
GET_TIME(finish); //stop timer
double timeElapsed = finish-start; //Time for that iteration
avgtime = avgtime + timeElapsed; //AVG the time
fprintf(outfile,"Time Elaspsed on Iteration %d: %f Seconds\n", (h+1),timeElapsed);
}
}
if(my_rank == 0)
{
avgtime = avgtime / howmanytimesavg; //get avg time
fprintf(outfile,"\nAverage Time Elaspsed: %f Seconds", avgtime);
fclose(outfile); //CLOSE file ONLY proc 0 can.
}
MPI_Barrier(MPI_COMM_WORLD); //wait to all proccesses to catch up before finalize
MPI_Finalize(); //End MPI
return 0;
}
ERROR:
Fatal error in PMPI_Gather: Invalid datatype, error stack:
PMPI_Gather(904): MPI_Gather(sbuf=0x7fffb62799a0, scount=8192,
MPI_DATATYPE_NULL, rbuf=0x7fffb6239980, rcount=8192, MPI_DATATYPE_NULL,
root=0, MPI_COMM_WORLD) failed
PMPI_Gather(815): Datatype for argument sendtype is a null datatype
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=537490947
:
system msg for write_line failure : Bad file descriptor
There is no MPI_DATATYPE_NULL in your code; the only datatype you pass to MPI_Gather() is MPI_DOUBLE_COMPLEX. Note that the latter is a Fortran datatype, and strictly speaking, using it from C is not correct.
My guess is that MPI_DOUBLE_COMPLEX is causing the issue (the type is not defined, or not initialized, because you invoked the C version of MPI_Init()).
You can obviously rewrite your code in Fortran, or use your own derived datatype for a C double complex number.
Meanwhile, I suggest you write simple C and Fortran hello-world programs that use MPI_DOUBLE_COMPLEX (an MPI_Bcast() of one element, for example) to confirm whether the issue really is MPI_DOUBLE_COMPLEX and whether it is restricted to C.
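If you go the derived-datatype route, a minimal sketch (my own illustration, not code from the question or answer) that declares a C-side equivalent of double complex with MPI_Type_contiguous and broadcasts one element, as suggested above, could look like this:

#include <stdio.h>
#include <complex.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* A C double complex is laid out as two contiguous doubles,
       so a contiguous type of two MPI_DOUBLEs matches it. */
    MPI_Datatype mpi_dcomplex;
    MPI_Type_contiguous(2, MPI_DOUBLE, &mpi_dcomplex);
    MPI_Type_commit(&mpi_dcomplex);

    double complex z = (rank == 0) ? (3.6 + 2.6 * I) : 0.0;
    MPI_Bcast(&z, 1, mpi_dcomplex, 0, MPI_COMM_WORLD);
    printf("rank %d received %.1f%+.1fi\n", rank, creal(z), cimag(z));

    MPI_Type_free(&mpi_dcomplex);
    MPI_Finalize();
    return 0;
}

The same mpi_dcomplex handle could then replace MPI_DOUBLE_COMPLEX in the two MPI_Gather() calls.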

OpenCL undefined behavior in parallel reduction algorithm

I am working on a simple parallel reduction algorithm to find the minimum value in an array and am coming across some interesting undefined behavior in my algorithm. I am running Intel's OpenCL 1.2 on Ubuntu 16.04.
The following kernel is what I am trying to run; it currently gives me the wrong answer:
__kernel void Find_Min(int arraySize, __global double* scratch_arr, __global double* value_arr, __global double* min_arr){
    const int index = get_global_id(0);
    int length = (int)sqrt((double)arraySize);
    int start = index*length;
    double min_val = INFINITY;
    for(int i=start; i<start+length && i < arraySize; i++){
        if(value_arr[i] < min_val)
            min_val = value_arr[i];
    }
    scratch_arr[index] = min_val;
    barrier(CLK_GLOBAL_MEM_FENCE);
    if(index == 0){
        double totalMin = min_val;
        for(int i=1; i<length; i++){
            if(scratch_arr[i] < totalMin)
                totalMin = scratch_arr[i];
        }
        min_arr[0] = totalMin;
    }
}
When I put in the array {0,-1,-2,-3,-4,-5,-6,-7,-8}, it ends up returning -2.
Here is where the undefined behavior comes in. When I run the following kernel with a printf statement before the barrier I get the right answer (-8):
__kernel void Find_Min(int arraySize, __global double* scratch_arr, __global double* value_arr, __global double* min_arr){
    const int index = get_global_id(0);
    int length = (int)sqrt((double)arraySize);
    int start = index*length;
    double min_val = INFINITY;
    for(int i=start; i<start+length && i < arraySize; i++){
        if(value_arr[i] < min_val)
            min_val = value_arr[i];
    }
    scratch_arr[index] = min_val;
    printf("setting scratch[%i] to %f\n", index, min_val);
    barrier(CLK_GLOBAL_MEM_FENCE);
    if(index == 0){
        double totalMin = min_val;
        for(int i=1; i<length; i++){
            if(scratch_arr[i] < totalMin)
                totalMin = scratch_arr[i];
        }
        min_arr[0] = totalMin;
    }
}
The only thing I can think of is that I am using the barrier command incorrectly, and that the printf merely delays the kernel in a way that happens to synchronize the work-items so they all finish before the final reduction step. Without the printf, work-item 0 executes the final reduction before the other work-items are finished.
Does anyone else have any suggestions or tips on how to debug this issue?
Thanks in advance!!
The problem was that the kernel was being launched with one work-item per work-group, and barriers only synchronize work-items within the same work-group. See this response to a similar question: Open CL no synchronization despite barrier
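For illustration, a hedged host-side sketch of the fix (variable names such as queue and find_min_kernel are mine, not from the post): pass an explicit local work size so that the work-items sharing scratch_arr really are in one work-group.

/* Give clEnqueueNDRangeKernel a local size equal to the global size so every
   work-item lands in the same work-group; barrier() and scratch_arr then
   behave as intended. The size must stay within the limit reported by
   CL_DEVICE_MAX_WORK_GROUP_SIZE / CL_KERNEL_WORK_GROUP_SIZE. */
size_t global_size = 3;           /* e.g. sqrt(arraySize) work-items for a 9-element array */
size_t local_size  = global_size; /* all of them in a single work-group */
cl_int err = clEnqueueNDRangeKernel(queue, find_min_kernel,
                                    1,             /* work_dim           */
                                    NULL,          /* global_work_offset */
                                    &global_size,
                                    &local_size,   /* previously left to the runtime */
                                    0, NULL, NULL);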

CL_DEVICE_NOT_AVAILABLE on casting int to float

I can't seem to find any good info anywhere for what I've run into. I've written a bit of code for a Kohonen SOM in OpenCL, on an iMac with an ATI Radeon HD 6770M. I'm choosing the GPU device for the context. A single line in my code causes a CL_DEVICE_NOT_AVAILABLE error. If I comment it out, the code compiles fine; with it, and with the variations I've tried, I consistently get the error.
Here's the code; the offending line is marked with the comment "// THIS line, only, causes CL_DEVICE_NOT_AVAILABLE !!!".
I'm hoping one of you guys has run into this at some point, as I'm a little baffled. convert_float(diff) did not work for me.
There are bound to be computational errors, as I haven't gotten beyond the essential complete-compile step, so feel free to ignore or point those out. Either way, I'm really just trying to get beyond the compile.
inline float _calc_sample_distance(__global float* weights, ulong startIdx, uint nodeWidth, __constant float* sample) {
    float accum = 0.0f;
    float diff = 0.0f;
    uint i = 0;
    for(i = 0; i<nodeWidth; i++) {
        diff = weights[startIdx+i] - sample[i];
        accum += pow(diff,2);
    }
    accum = pow(accum, .5f);
    return accum;
}

inline void _calc_coords(uint dimCount, __constant uint* dimSizes, size_t offset, uint* thisCoords) {
    // reversed so, processed as xy, then y
    ulong trim = offset, multi = 0;
    int i = 0, j = 0;
    for(i = dimCount-1; i>=0; i--) {
        multi = 1;
        for(j=i-1; j>=0; j--) {
            multi *= dimSizes[j];
        }
        thisCoords[i] = trim / multi;
        trim = trim % multi;
    }
}

inline float _calc_map_coord_distance(uint dimCount, __constant uint* bmuCoords, uint* thisCoords) {
    float accum = 0.0f;
    uint i = 0;
    int diff = 0;
    for(i = 0; i < dimCount; i++) {
        diff = bmuCoords[i] - thisCoords[i];
        diff *= diff;
        accum += (float)diff; // THIS line, only, causes CL_DEVICE_NOT_AVAILABLE !!!
    }
    accum = pow(accum,.5f);
    return accum;
}

__kernel void calc_kohonen_som_distances(
    // map data
    __global float* weights,    // weights
    uint nodeWidth,             // the number of weights per node
    uint nodeCount,             // the total number of weights
    __constant float* sample,   // sample, of nodeWidth wide
    __global float* output      // the output distance of each node to the sample
) {
    size_t nodeIndex = get_global_id(0);
    ulong startIdx = nodeIndex * nodeWidth;
    output[nodeIndex] = _calc_sample_distance(weights,startIdx,nodeWidth,sample);
}

__kernel void calc_kohonen_som_update_weights(
    // map data
    __global float* weights,        // weights
    uint nodeWidth,                 // the number of weights per node
    uint dimCount,                  // the number of dimensions
    __constant uint* dimSizes,      // the size of each dimension
    __constant float *sampleData,   // the sample to use for updating the bmu and surrounding units
    __constant uint* bmuCoords,     // the coordinates of the best matching unit, from which we derive offset
    float learningRate,             // calculated on the CPU as per step
    float radius                    // calculated on the CPU as per step
) {
    size_t nodeIndex = get_global_id(0);
    ulong startIdx = nodeIndex * nodeWidth;
    uint* thisCoords = (uint*)malloc(sizeof(uint)*dimCount);
    memset(thisCoords,0,sizeof(uint)*dimCount);
    // determine the coordinates of the offset provided
    if(dimCount!=1) {
        _calc_coords(dimCount,dimSizes,nodeIndex,thisCoords);
    } else {
        thisCoords[0] = nodeIndex;
    }
    float distance = _calc_map_coord_distance(dimCount, bmuCoords, thisCoords);
    if(distance<radius) {
        float influence = exp( (-1*distance)/(2*pow(radius,2.0f)) );
        for(uint i=0;i<dimCount;i++) {
            weights[startIdx+i] = weights[startIdx+i] + ( influence * learningRate * (sampleData[i] - weights[startIdx+i]) );
        }
    }
}

OpenCL: Inserting local atomic_inc to reduction kernel

I am trying to include a local atomic similar to that described by DarkZeros here within a working reduction kernel. The kernel finds a largest value within a set of points; the aim of the local atomic is to allow me to filter selected point_ids into an output array without any gaps.
At present, when I use the local atomic to index an addition into a local array, the kernel runs but produces the wrong overall highest point. If the atomic line is commented out, a correct result is returned.
What is going on here and how do I fix it?
Simplified kernel code:
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items, //points and index
                     __global int* output, __local float4* shared, const unsigned int n,                  //finding highest
                     __global int* filtered, __global const float2* tri_input, const unsigned int pass,   //finding filtered
                     __global int* global_count                                                           //global count
                     ){
    //set everything up
    const unsigned int group_id = get_global_id(0) / get_local_size(0);
    const unsigned int local_id = get_local_id(0);
    const unsigned int group_size = items;
    const unsigned int group_stride = 2 * group_size;
    const int local_stride = group_stride * group_size;
    __local float4 *zeroIt = &shared[local_id];
    zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;
    volatile __local int local_count_set_1;
    volatile __local int global_val_set_1;
    volatile __local int filter_local[64];
    if(local_id==0){
        local_count_set_1 = 0;
        global_val_set_1 = -1;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int i = group_id * group_stride + local_id;
    while (i < n){
        //load up a pair of points using the index to locate them within a massive dataSet
        int ia = input[i];
        float4 a = dataSet[ia-1];
        int ib = input[i + group_size];
        float4 b = dataSet[ib-1];
        //on the first pass kernel increment a local count
        if(pass == 0){
            filter_local[atomic_inc(&local_count_set_1)] = 1; //including this line causes an erroneous highest point result
            //filter_local[local_id] = 1;                     //but including this line does not
            //atomic_inc(&local_count_set_1);                 //and neither does this one
        }
        //find the highest of the pair
        float4 result;
        if(a.z>b.z) result = a;
        else result = b;
        //load up the previous highest result locally
        float4 s = shared[local_id];
        //if the previous highest beat this, stick, else twist
        if(s.z>result.z){ result = s; }
        shared[local_id] = result;
        i += local_stride;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (group_size >= 512){
        if (local_id < 256) {
            __local float4 *a = &shared[local_id];
            __local float4 *b = &shared[local_id+256];
            if(b->z>a->z){ shared[local_id] = shared[local_id+256]; }
    }}
    //repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
    //finally, return the filtered highest result of shared to the global level
    barrier(CLK_LOCAL_MEM_FENCE);
    if(local_id == 0){
        __local float4 *v = &shared[0];
        int send = v->w ;
        output[group_id] = send+1;
}}
[UPDATE]: When the atomic_inc line is included, the 'wrong' highest point result is always a point near the end of the test dataset. I'm guessing this means the atomic_inc is affecting a later comparison, but I'm not sure exactly what or where yet.
[UPDATE]: Edited code to simplify/clarify/update with debugging tweaks. Still not working and it is driving me loopy.
Total face-palm moment. In the setup phase of the kernel there are the lines:
if(local_id==0){
    local_count_set_1 = 0;
    global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
When these are split and the reset of local_count_set_1 is moved inside the while loop, the error does not occur, i.e.:
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
while (i < n){
    if(local_id==0) local_count_set_1 = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    ....
    if(pass == 0){
        filter_local[atomic_inc(&local_count_set_1)] = 1;
    }
    ....
I'm hoping this fixes the issue // will update if not.
Aaaand that's a weekend I'll never get back.

Optimizing kernel shuffled keys code - OpenCL

I have just started getting into OpenCL and am going through the basics of writing kernel code. I have written a kernel for calculating shuffled keys for a points array. For N points, the shuffled keys are calculated in a 3-bit fashion, where the x-bit at depth d (0 < d <= D) is
xd = 0 if p.x < Cd.x
xd = 1, otherwise
The shuffled xyz key is given as:
x1y1z1x2y2z2...xDyDzD
The kernel code is given below. The points are input in column-major format.
__constant float3 boundsOffsetTable[8] = {
    {-0.5,-0.5,-0.5},
    {+0.5,-0.5,-0.5},
    {-0.5,+0.5,-0.5},
    {-0.5,-0.5,+0.5},
    {+0.5,+0.5,-0.5},
    {+0.5,-0.5,+0.5},
    {-0.5,+0.5,+0.5},
    {+0.5,+0.5,+0.5}
};

uint setBit(uint x,unsigned char position)
{
    uint mask = 1<<position;
    return x|mask;
}

__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    float3 pt;
    pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
    code[i] = 0;
    float3 newCenter;
    float newRadius;
    if(pt.x>center.x) code = setBit(code,0);
    if(pt.y>center.y) code = setBit(code,1);
    if(pt.z>center.z) code = setBit(code,2);
    for(int l = 1;l<level;l++)
    {
        for(int i=0;i<8;i++)
        {
            newRadius = radius *0.5;
            newCenter = center + boundsOffsetTable[i]*radius;
            if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
            {
                if(pt.x>newCenter.x) code = setBit(code,3*l);
                if(pt.y>newCenter.y) code = setBit(code,3*l+1);
                if(pt.z>newCenter.z) code = setBit(code,3*l+2);
            }
        }
    }
}
It works, but I just wanted to ask if I am missing something in the code and if there is a way to optimize it.
Try this kernel:
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    float3 pt;
    pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
    uint res;
    res = 0;
    float3 newCenter;
    float newRadius;
    if(pt.x>center.x) res = setBit(res,0);
    if(pt.y>center.y) res = setBit(res,1);
    if(pt.z>center.z) res = setBit(res,2);
    for(int l = 1;l<level;l++)
    {
        for(int i=0;i<8;i++)
        {
            newRadius = radius *0.5;
            newCenter = center + boundsOffsetTable[i]*radius;
            if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
            {
                if(pt.x>newCenter.x) res = setBit(res,3*l);
                if(pt.y>newCenter.y) res = setBit(res,3*l+1);
                if(pt.z>newCenter.z) res = setBit(res,3*l+2);
            }
        }
    }
    //Save the result
    code[i] = res;
}
Rules to optimize:
Avoid global memory: you were reading and writing "code" directly in global memory; I changed that. You should see roughly a 3x increase in performance now.
Avoid ifs; use "select" instead where possible (see the OpenCL documentation, and the sketch after this list).
Use more private memory inside the kernel. You don't need to operate at the bit level; operating at the int level is better and avoids a huge number of calls to "setBit". You can then construct your result at the end.
Another interesting point: since you are operating in 3D, you can just use float3 variables and compute the distances with OpenCL's built-in vector operators. This can increase your performance quite a lot, but it also requires a complete rewrite of your kernel.
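As a rough illustration of the branch-avoidance idea (my own sketch, not part of the original answer; the kernel name morton_level0 is made up), the three if/setBit pairs of the first level can be replaced by shifting the 0/1 results that scalar comparisons produce in OpenCL C, which has the same effect as using select():

__kernel void morton_level0(__global const float* point,
                            __global uint* code,
                            float3 center,
                            int size)
{
    int i = get_global_id(0);
    // column-major layout, as in the original kernel
    float3 pt = (float3)(point[i], point[size + i], point[2 * size + i]);

    uint res = 0;
    res |= ((uint)(pt.x > center.x)) << 0; // x bit of the first level
    res |= ((uint)(pt.y > center.y)) << 1; // y bit
    res |= ((uint)(pt.z > center.z)) << 2; // z bit

    code[i] = res; // single write to global memory, as in the answer above
}

The deeper levels can be handled the same way inside the loop, ORing each level's three bits into res before the final write.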
