Related
How do we find number of subtrees of height 'h' in a binary tree.
Function is defined as
int subtree( node *root, int k);
where k is the specific height.
First, we recursively calculate the height of the tree as follows:
If the tree is empty, the height is 0.
If the tree is non-empty, the height is the maximum height of its children, plus 1.
In C (I assume OP is using C based on response), this looks like
typedef struct Node {
Node* leftChild,
node* rightChild
} Node;
typedef Node* Tree;
unsigned int max(unsigned int a, unsigned int b) {
return a > b ? a : b;
}
unsigned int height(Tree tree) {
return tree ? 1 + max(height(tree->leftChild, tree->rightChild)) : 0;
}
Note that generally, Node will have some additional data. But that data isn't relevant here, so we don't include it (although it's easy enough to do so if you wish to).
Now, we want to modify the height function slightly. To do this, we define
typdef struct Result {
unsigned int height,
unsigned int count
} Result;
/**
* The returned .height should be the height of the tree.
* The returned .count should be the number of subtrees of tree
* with height k.
*/
Result resultCountChildren(Tree tree, unsigned int k) {
if (tree) {
Result leftResult = resultCountChildren(tree->left, k);
Result rightResult = resultCountChildren(tree->right, k);
unsigned int heightOfTree = 1 + max(leftResult.height, rightResult.height);
unsigned int count = leftResult.count + rightResult.count + (heightOfTree == k);
Result result = {
.height = heightOfTree,
.count = count
};
return result;
} else {
unsigned int height = 0;
unsigned int count = (0 == k);
Result result = {
.height = height,
.count = count
};
return result;
}
}
unsigned int count(Tree tree, unsigned int k) {
return resultCountChildren(tree).count;
}
I create a wireframe mesh of two lines between three points:
By these functions:
Qt3DRender::QGeometryRenderer *Utils::createWireframeMesh()
{
Qt3DRender::QGeometryRenderer *mesh = new Qt3DRender::QGeometryRenderer();
Qt3DRender::QGeometry *geometry = new Qt3DRender::QGeometry(mesh);
Qt3DRender::QBuffer *vertexDataBuffer = new Qt3DRender::QBuffer(Qt3DRender::QBuffer::VertexBuffer,
geometry);
Qt3DRender::QBuffer *indexDataBuffer = new Qt3DRender::QBuffer(Qt3DRender::QBuffer::IndexBuffer,
geometry);
QByteArray vertexBufferData;
QByteArray indexBufferData;
int vertexCount = 3; // Three vertices at (0, -1, 0) and (1, 0, 0) and (0, 1, 0)
int lineCount = 2; // Two lines between three vertices
vertexBufferData.resize(vertexCount * 3 * sizeof(float));
indexBufferData.resize(lineCount * 2 * sizeof(ushort));
// Arrow triangle is 2D and is inside XY plane
float *vPtr = reinterpret_cast<float *>(vertexBufferData.data());
vPtr[0] = 0.0f; vPtr[1] = -1.0f; vPtr[2] = 0.0f; // First vertex at (0, -1, 0)
vPtr[3] = 1.0f; vPtr[4] = 0.0f; vPtr[5] = 0.0f; // Second vertex at (1, 0, 0)
vPtr[6] = 0.0f; vPtr[7] = +1.0f; vPtr[8] = 0.0f; // Third vertex at (0, 1, 0)
ushort *iPtr = reinterpret_cast<ushort *>(indexBufferData.data());
iPtr[0] = 0; iPtr[1] = 1; // First line from index 0 to index 1
iPtr[2] = 1; iPtr[3] = 2; // Second line from index 1 to index 2
vertexDataBuffer->setData(vertexBufferData);
indexDataBuffer->setData(indexBufferData);
addPositionAttributeToGeometry(geometry, vertexDataBuffer, vertexCount);
addIndexAttributeToGeometry(geometry, indexDataBuffer, lineCount * 2);
mesh->setInstanceCount(1);
mesh->setIndexOffset(0);
mesh->setFirstInstance(0);
// How to set vertex count here?
mesh->setVertexCount(vertexCount);
mesh->setPrimitiveType(Qt3DRender::QGeometryRenderer::Lines);
mesh->setGeometry(geometry);
return mesh;
}
void Utils::addPositionAttributeToGeometry(Qt3DRender::QGeometry *geometry,
Qt3DRender::QBuffer *buffer, int count)
{
Qt3DRender::QAttribute *posAttribute = new Qt3DRender::QAttribute();
posAttribute->setAttributeType(Qt3DRender::QAttribute::VertexAttribute);
posAttribute->setBuffer(buffer);
posAttribute->setDataType(Qt3DRender::QAttribute::Float);
posAttribute->setDataSize(3);
posAttribute->setByteOffset(0);
posAttribute->setByteStride(0);
posAttribute->setCount(count);
posAttribute->setName(Qt3DRender::QAttribute::defaultPositionAttributeName());
geometry->addAttribute(posAttribute);
}
void Utils::addIndexAttributeToGeometry(Qt3DRender::QGeometry *geometry,
Qt3DRender::QBuffer *buffer, int count)
{
Qt3DRender::QAttribute *indexAttribute = new Qt3DRender::QAttribute();
indexAttribute->setAttributeType(Qt3DRender::QAttribute::IndexAttribute);
indexAttribute->setBuffer(buffer);
indexAttribute->setDataType(Qt3DRender::QAttribute::UnsignedShort);
indexAttribute->setDataSize(1);
indexAttribute->setByteOffset(0);
indexAttribute->setByteStride(0);
indexAttribute->setCount(count);
geometry->addAttribute(indexAttribute);
}
In above code, I tried three different statements at this line:
// How to set vertex count here?
mesh->setVertexCount(vertexCount);
mesh->setVertexCount(vertexCount * 2);
mesh->setVertexCount(vertexCount * 3);
With these results - I do some ray casting in my 3D scene which are surprisingly affected too:
Documentation explains vertexCount property of Qt3DRender::QGeometryRenderer as:
vertexCount : int
Holds the primitive count.
In my case, primitive count is line count, so I tried it but only one line is drawn:
I'm confused about setVertexCount API. Can anybody give me a hint?
vertexCount is the same value that you would pass to glDrawArrays or glDrawElements, ie it's the number of vertices involved in the drawing. Since you're using indexed rendering, that would typically be the number of indexes (assuming you're drawing all in data in the index array). So in the case above, it should be 4.
Please note we recently fixed a bug with line picking when using primitive restart, but that doesn't affect the code you included above.
I am trying to run this fast fourier implementation code. It compiles fine but gives this error at runtime. I have no idea about the error or what it means. Can anyone help me out?
I compiled and run the program by:
mpicc -o exec test.c
./exec
CODE:
This is the code that I found on GITHUB. Its the parallel version of fast fourier algorithm.
#include <stdio.h>
#include <mpi.h> //To use MPI
#include <complex.h> //to use complex numbers
#include <math.h> //for cos() and sin()
#include "timer.h" //to use timer
#define PI 3.14159265
#define bigN 16384 //Problem Size
#define howmanytimesavg 3
int main()
{
int my_rank,comm_sz;
MPI_Init(NULL,NULL); //start MPI
MPI_Comm_size(MPI_COMM_WORLD,&comm_sz); ///how many processes are we
using?
MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); //which process is this?
double start,finish;
double avgtime = 0;
FILE *outfile;
int h;
if(my_rank == 0) //if process 0 open outfile
{
outfile = fopen("ParallelVersionOutput.txt", "w"); //open from current
directory
}
for(h = 0; h < howmanytimesavg; h++) //loop to run multiple times for AVG
time.
{
if(my_rank == 0) //If it's process 0 starts timer
{
start = MPI_Wtime();
}
int i,k,n,j; //Basic loop variables
double complex evenpart[(bigN / comm_sz / 2)]; //array to save the data
for EVENHALF
double complex oddpart[(bigN / comm_sz / 2)]; //array to save the data
for ODDHALF
double complex evenpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array
to save the data for EVENHALF
double complex oddpartmaster[ (bigN / comm_sz / 2) * comm_sz]; //array
to save the data for ODDHALF
double storeKsumreal[bigN]; //store the K real variable so we can abuse
symmerty
double storeKsumimag[bigN]; //store the K imaginary variable so we can
abuse symmerty
double subtable[(bigN / comm_sz)][3]; //Each process owns a subtable
from the table below
double table[bigN][3] = //TABLE of numbers to use
{
0,3.6,2.6, //n, Real,Imaginary CREATES TABLE
1,2.9,6.3,
2,5.6,4.0,
3,4.8,9.1,
4,3.3,0.4,
5,5.9,4.8,
6,5.0,2.6,
7,4.3,4.1,
};
if(bigN > 8) //Everything after row 8 is all 0's
{
for(i = 8; i < bigN; i++)
{
table[i][0] = i;
for(j = 1; j < 3;j++)
{
table[i][j] = 0.0; //set to 0.0
}
}
}
int sendandrecvct = (bigN / comm_sz) * 3; //how much to send and
recieve??
MPI_Scatter(table,sendandrecvct,MPI_DOUBLE,subtable,sendandrecvct,MPI_DOUBLE,0,MPI_COMM_WORLD); //scatter the table to subtables
for (k = 0; k < bigN / 2; k++) //K coeffiencet Loop
{
/* Variables used for the computation */
double sumrealeven = 0.0; //sum of real numbers for even
double sumimageven = 0.0; //sum of imaginary numbers for even
double sumrealodd = 0.0; //sum of real numbers for odd
double sumimagodd = 0.0; //sum of imaginary numbers for odd
for(i = 0; i < (bigN/comm_sz)/2; i++) //Sigma loop EVEN and ODD
{
double factoreven , factorodd = 0.0;
int shiftevenonnonzeroP = my_rank * subtable[2*i][0]; //used to shift index numbers for correct results for EVEN.
int shiftoddonnonzeroP = my_rank * subtable[2*i + 1][0]; //used to shift index numbers for correct results for ODD.
/* -------- EVEN PART -------- */
double realeven = subtable[2*i][1]; //Access table for real number at spot 2i
double complex imaginaryeven = subtable[2*i][2]; //Access table for imaginary number at spot 2i
double complex componeeven = (realeven + imaginaryeven * I); //Create the first component from table
if(my_rank == 0) //if proc 0, dont use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((2*i)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftevenonnonzeroP
{
factoreven = ((2*PI)*((shiftevenonnonzeroP)*k))/bigN; //Calculates the even factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoeven = (cos(factoreven) - (sin(factoreven)*I)); //Create the second component
evenpart[i] = (componeeven * comptwoeven); //store in the evenpart array
/* -------- ODD PART -------- */
double realodd = subtable[2*i + 1][1]; //Access table for real number at spot 2i+1
double complex imaginaryodd = subtable[2*i + 1][2]; //Access table for imaginary number at spot 2i+1
double complex componeodd = (realodd + imaginaryodd * I); //Create the first component from table
if (my_rank == 0)//if proc 0, dont use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((2*i+1)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
else //use shiftoddonnonzeroP
{
factorodd = ((2*PI)*((shiftoddonnonzeroP)*k))/bigN;//Calculates the odd factor for Cos() and Sin()
// *********Reduces computational time*********
}
double complex comptwoodd = (cos(factorodd) - (sin(factorodd)*I));//Create the second component
oddpart[i] = (componeodd * comptwoodd); //store in the oddpart array
}
/*Process ZERO gathers the even and odd part arrays and creates a evenpartmaster and oddpartmaster array*/
MPI_Gather(evenpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,evenpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
MPI_Gather(oddpart,(bigN / comm_sz / 2),MPI_DOUBLE_COMPLEX,oddpartmaster,(bigN / comm_sz / 2), MPI_DOUBLE_COMPLEX,0,MPI_COMM_WORLD);
if(my_rank == 0)
{
for(i = 0; i < (bigN / comm_sz / 2) * comm_sz; i++) //loop to sum the EVEN and ODD parts
{
sumrealeven += creal(evenpartmaster[i]); //sums the realpart of the even half
sumimageven += cimag(evenpartmaster[i]); //sums the imaginarypart of the even half
sumrealodd += creal(oddpartmaster[i]); //sums the realpart of the odd half
sumimagodd += cimag(oddpartmaster[i]); //sums the imaginary part of the odd half
}
storeKsumreal[k] = sumrealeven + sumrealodd; //add the calculated reals from even and odd
storeKsumimag[k] = sumimageven + sumimagodd; //add the calculated imaginary from even and odd
storeKsumreal[k + bigN/2] = sumrealeven - sumrealodd; //ABUSE symmetry Xkreal + N/2 = Evenk - OddK
storeKsumimag[k + bigN/2] = sumimageven - sumimagodd; //ABUSE symmetry Xkimag + N/2 = Evenk - OddK
if(k <= 10) //Do the first 10 K's
{
if(k == 0)
{
fprintf(outfile," \n\n TOTAL PROCESSED SAMPLES : %d\n",bigN);
}
fprintf(outfile,"================================\n");
fprintf(outfile,"XR[%d]: %.4f XI[%d]: %.4f \n",k,storeKsumreal[k],k,storeKsumimag[k]);
fprintf(outfile,"================================\n");
}
}
}
if(my_rank == 0)
{
GET_TIME(finish); //stop timer
double timeElapsed = finish-start; //Time for that iteration
avgtime = avgtime + timeElapsed; //AVG the time
fprintf(outfile,"Time Elaspsed on Iteration %d: %f Seconds\n", (h+1),timeElapsed);
}
}
if(my_rank == 0)
{
avgtime = avgtime / howmanytimesavg; //get avg time
fprintf(outfile,"\nAverage Time Elaspsed: %f Seconds", avgtime);
fclose(outfile); //CLOSE file ONLY proc 0 can.
}
MPI_Barrier(MPI_COMM_WORLD); //wait to all proccesses to catch up before finalize
MPI_Finalize(); //End MPI
return 0;
}
ERROR:
Fatal error in PMPI_Gather: Invalid datatype, error stack:
PMPI_Gather(904): MPI_Gather(sbuf=0x7fffb62799a0, scount=8192,
MPI_DATATYPE_NULL, rbuf=0x7fffb6239980, rcount=8192, MPI_DATATYPE_NULL,
root=0, MPI_COMM_WORLD) failed
PMPI_Gather(815): Datatype for argument sendtype is a null datatype
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=537490947
:
system msg for write_line failure : Bad file descriptor
There is no MPI_DATATYPE_NULL in your code, but you only use MPI_DOUBLE_COMPLEX. Note the latter type is a Fortran datatype, and using it in C is not correct strictly speaking.
My guess is that MPI_DOUBLE_COMPLEX is causing the issue (type not defined or not initialized because you invoked the C version of MPI_Init()).
You can obviously rewrite your code in Fortran, or use your own derived datatype for a C double complex number.
Meanwhile, I suggest you write simple C and Fortran helloworld programs that use MPI_DOUBLE_COMPLEX (MPI_Bcast() of one element for example) to confirm the issue is with MPI_DOUBLE_COMPLEX and is restricted to C or not.
I'm trying to make a program that will use the readings it gets from a distance sensor to control the attributes of circles (size, xy and colour). To do this I'm trying to make it record the current value and apply that to the value when you press the relevant key (Eg. press 's' and it changes the size to whatever the distance was at that point). - Ideally I'd like the circle to change whatever field is next dynamically as you move your hand over the sensor, but that seems a bit beyond me.
I've tried to do as much as I can, but everything I'm not sure of I've commented out. Any tips or advice? I'm really not sure what I'm doing when it comes to classes and constructors.
EDIT: When I run the code, nothing happens.
import processing.serial.*;
int xpos, ypos, s, r, g, b;
Circle circle;
int shapeSize, distance;
String comPortString;
Serial myPort;
void setup(){
size(displayWidth,displayHeight); //Use entire screen size.
//Open the serial port for communication with the Arduino
myPort = new Serial(this, "/dev/cu.usbmodem1411", 9600);
myPort.bufferUntil('\n'); // Trigger a SerialEvent on new line
}
void draw(){
background(0);
delay(50); //Delay used to refresh screen
println(distance);
}
void serialEvent(Serial cPort){
comPortString = (new String(cPort.readBytesUntil('\n')));
if(comPortString != null) {
comPortString=trim(comPortString);
/* Use the distance received by the Arduino to modify the y position
of the first square (others will follow). Should match the
code settings on the Arduino. In this case 200 is the maximum
distance expected. The distance is then mapped to a value
between 1 and the height of your screen */
distance = int(map(Integer.parseInt(comPortString),1,200,1,height));
if(distance<0){
/*If computer receives a negative number (-1), then the
sensor is reporting an "out of range" error. Convert all
of these to a distance of 0. */
distance = 0;
}
}
}
void keyPressed()
{
// N for new circle (and keep old one)
if((key == 'N') || (key == 'n')) {
println("n");
circle = new Circle(1,1,1,1,1,1);
}
//r - change red
if((key == 'R') || (key == 'r')) {
float red = map(distance, 0, 700, 0, 255);
r = int(red);
println("r " + r);
}
//g - change green
if((key == 'G') || (key == 'g')) {
float green = map(distance, 0, 700, 0, 255);
g = int(green);
println("g " + g);
}
//b - change blue
if((key == 'B') || (key == 'b')) {
float blue = map(distance, 0, 700, 0, 255);
b = int(blue);
println("b " + b);
}
//S - change Size
if((key == 'S') || (key == 's')) {
s = distance;
println("s " + s);
}
//X - change x pos
if((key == 'X') || (key == 'x')) {
xpos = distance;
println("x " + xpos);
}
//y - change y pos
if((key == 'Y') || (key == 'y')) {
ypos = distance;
println("y " + ypos);
}
}
class Circle {
Circle(int xpos, int ypos, int s, int r, int g, int b){
ellipse(xpos, ypos, s, s);
color(r, g, b);
}
int getX(){
return xpos;
}
int getY(){
return ypos;
}
}
I would split this into steps/tasks:
Connecting to the Arduino
Reading values from Arduino
Mapping read values
Controlling mapping
You've got the Arduino part pretty much there, but things look messy when trying to map read values to the circle on screen.
For now, for simplicity reasons, let's ignore classes and focus on simply drawing a single ellipse with x,y,size,r,g,b properties.
To get read of jitter you should update the property ellipse continuously, not just when pressing a key. On the key event you should simply change what property gets updated.
You could use extra variables to keep track of what ellipse properties you're updating.
Here's a refactored version of the code based on the points above:
import processing.serial.*;
int xpos,ypos,s,r,g,b;
int distance;
int propertyID = 0;//keep track of what property should be updated on distance
int PROP_XPOS = 0;
int PROP_YPOS = 1;
int PROP_S = 2;
int PROP_R = 3;
int PROP_G = 4;
int PROP_B = 5;
void setup(){
size(400,400);
//setup some defaults to see something on screen
xpos = ypos = 200;
s = 20;
r = g = b = 127;
//initialize arduino - search for port based on OSX name
String[] portNames = Serial.list();
for(int i = 0 ; i < portNames.length; i++){
if(portNames[i].contains("usbmodem")){
try{
Serial arduino = new Serial(this,portNames[i],9600);
arduino.bufferUntil('\n');
return;
}catch(Exception e){
showSerialError();
}
}
}
showSerialError();
}
void showSerialError(){
System.err.println("Error connecting to Arduino!\nPlease check the USB port");
}
void draw(){
background(0);
fill(r,g,b);
ellipse(xpos,ypos,s,s);
}
void serialEvent(Serial arduino){
String rawString = arduino.readString();//fetch raw string
if(rawString != null){
String trimmedString = rawString.trim();//trim the raw string
int rawDistance = int(trimmedString);//convert to integer
distance = (int)map(rawDistance,1,200,1,height);
updatePropsOnDistance();//continously update circle properties
}
}
void updatePropsOnDistance(){
if(propertyID == PROP_XPOS) xpos = distance;
if(propertyID == PROP_YPOS) ypos = distance;
if(propertyID == PROP_S) s = distance;
if(propertyID == PROP_R) r = distance;
if(propertyID == PROP_G) g = distance;
if(propertyID == PROP_B) b = distance;
}
void keyReleased(){//only change what proprty changes on key press
if(key == 'x' || key == 'X') propertyID = PROP_XPOS;
if(key == 'y' || key == 'Y') propertyID = PROP_YPOS;
if(key == 's' || key == 'S') propertyID = PROP_S;
if(key == 'r' || key == 'R') propertyID = PROP_R;
if(key == 'g' || key == 'G') propertyID = PROP_G;
if(key == 'b' || key == 'B') propertyID = PROP_B;
}
//usually a good idea to test - in this case use mouseY instead of distance sensor
void mouseDragged(){
distance = mouseY;
updatePropsOnDistance();
}
If this makes sense, it can easily be encapsulated in a class.
We could use an array to store those properties, but if something like props[0] for x, props1 for y, etc. is harder to read, you could use an IntDict which allows you to index values based on a String instead of a value (so you can do props["x"] instead of props[0]).
Here's an encapsulated version of the code:
import processing.serial.*;
Circle circle = new Circle();
void setup(){
size(400,400);
//initialize arduino - search for port based on OSX name
String[] portNames = Serial.list();
for(int i = 0 ; i < portNames.length; i++){
if(portNames[i].contains("usbmodem")){
try{
Serial arduino = new Serial(this,portNames[i],9600);
arduino.bufferUntil('\n');
return;
}catch(Exception e){
showSerialError();
}
}
}
showSerialError();
}
void showSerialError(){
System.err.println("Error connecting to Arduino!\nPlease check the USB port");
}
void draw(){
background(0);
circle.draw();
}
void serialEvent(Serial arduino){
String rawString = arduino.readString();
if(rawString != null){
String trimmedString = rawString.trim();
int rawDistance = int(trimmedString);
int distance = (int)map(rawDistance,1,200,1,height);
circle.update(distance);
}
}
void keyReleased(){
circle.setUpdateProperty(key+"");//update the circle property based on what key gets pressed. the +"" is a quick way to make a String from the char
}
//usually a good idea to test - in this case use mouseY instead of distance sensor
void mouseDragged(){
circle.update(mouseY);
}
class Circle{
//an IntDict (integer dictionary) is an associative array where instead of accessing values by an integer index (e.g. array[0]
//you access them by a String index (e.g. array["name"])
IntDict properties = new IntDict();
String updateProperty = "x";//property to update
Circle(){
//defaults
properties.set("x",200);
properties.set("y",200);
properties.set("s",20);
properties.set("r",127);
properties.set("g",127);
properties.set("b",127);
}
void draw(){
fill(properties.get("r"),properties.get("g"),properties.get("b"));
ellipse(properties.get("x"),properties.get("y"),properties.get("s"),properties.get("s"));
}
void setUpdateProperty(String prop){
if(properties.hasKey(prop)) updateProperty = prop;
else{
println("circle does not contain property: " + prop+"\navailable properties:");
println(properties.keyArray());
}
}
void update(int value){
properties.set(updateProperty,value);
}
}
In both examples you can test the distance value by dragging your mouse on the Y axis.
Regarding the HC-SR04 sensor, you can find code on the Arduino Playground to get the distance in cm. I haven't used the sensor myself yet, but I notice other people has some issues with it, so it's worth checking this post as well. If you want to roll your own Arduino code, no problem, you can use the HC-SR04 datasheet(pdf link) to get the formula:
Formula: uS / 58 = centimeters or uS / 148 =inch; or: the range = high
level time * velocity (340M/S) / 2; we suggest to use over 60ms
measurement cycle, in order to prevent trigger signal to the echo
signal.
It's important to get accurate values (you'll avoid jitter when using these to draw in Processing). Additionally you can use easing or a moving average.
Here's a basic moving average example:
int historySize = 25;//remember a number of past values
int[] x = new int[historySize];
int[] y = new int[historySize];
void setup(){
size(400,400);
background(255);
noFill();
}
void draw(){
//draw original trails in red
stroke(192,0,0,127);
ellipse(mouseX,mouseY,10,10);
//compute moving average
float avgX = average(x,mouseX);
float avgY = average(y,mouseY);
//draw moving average in green
stroke(0,192,0,127);
ellipse(avgX,avgY,10,10);
}
void mouseReleased(){
background(255);
}
float average(int[] values,int newValue){
//shift elements by 1, from the last to the 2nd: count backwards
float total = 0;
int size = values.length;
for(int i = size-1; i > 0; i--){//count backwards
values[i] = values[i-1];//copy previous value into current
total += values[i];//add values to total
}
values[0] = newValue;//add the newest value at the start of the list
total += values[0];//add the latest value to the total
return (float)total/size;//return the average
}
I am trying to include a local atomic similar to that described by DarkZeros here within a working reduction kernel. The kernel finds a largest value within a set of points; the aim of the local atomic is to allow me to filter selected point_ids into an output array without any gaps.
At present when I use the local atomic to increment the addition to a local array the kernel runs but produces a wrong overall highest point. If the atomic line is commented out then a correct result returns.
What is going on here and how do I fix it?
Simplified kernel code:
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items, //points and index
__global int* output, __local float4* shared, const unsigned int n, //finding highest
__global int* filtered, __global const float2* tri_input, const unsigned int pass, //finding filtered
__global int* global_count //global count
){
//set everything up
const unsigned int group_id = get_global_id(0) / get_local_size(0);
const unsigned int local_id = get_local_id(0);
const unsigned int group_size = items;
const unsigned int group_stride = 2 * group_size;
const int local_stride = group_stride * group_size;
__local float4 *zeroIt = &shared[local_id];
zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;
volatile __local int local_count_set_1;
volatile __local int global_val_set_1;
volatile __local int filter_local[64];
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
int i = group_id * group_stride + local_id;
while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
int ia = input[i];
float4 a = dataSet[ia-1];
int ib = input[i + group_size];
float4 b = dataSet[ib-1];
//on the first pass kernel increment a local count
if(pass == 0){
filter_local[atomic_inc(&local_count_set_1)] = 1; //including this line causes an erroneous highest point result
//filter_local[local_id] = 1; //but including this line does not
//atomic_inc(&local_count_set_1); //and neither does this one
}
//find the highest of the pair
float4 result;
if(a.z>b.z) result = a;
else result = b;
//load up the previous highest result locally
float4 s = shared[local_id];
//if the previous highest beat this, stick, else twist
if(s.z>result.z){ result = s; }
shared[local_id] = result;
i += local_stride;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (group_size >= 512){
if (local_id < 256) {
__local float4 *a = &shared[local_id];
__local float4 *b = &shared[local_id+256];
if(b->z>a->z){ shared[local_id] = shared[local_id+256]; }
}}
//repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
//finally, return the filtered highest result of shared to the global level
barrier(CLK_LOCAL_MEM_FENCE);
if(local_id == 0){
__local float4 *v = &shared[0];
int send = v->w ;
output[group_id] = send+1;
}}
[UPDATE]: When the atomic_inc line is included the 'wrong' highest point result is always a point near the end of the test dataset. I'm guessing that this means that the atomic_inc is affecting a latter comparison, but I'm not sure exactly what or where yet.
[UPDATE]: Edited code to simplify/clarify/update with debugging tweaks. Still not working and it is driving me loopy.
Total face-palm moment. In the setup phase of the kernel there are the lines:
if(local_id==0){
local_count_set_1 = 0;
global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
When these are split and the local_count_set_1 is included within the while loop, the error does not occur. i.e:
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
while (i < n){
if(local_id==0) local_count_set_1 = 0;
barrier(CLK_LOCAL_MEM_FENCE);
....
if(pass = 0){
filter_local[atomic_inc(&local_count_set_1)] = 1;
}
....
I'm hoping this fixes the issue // will update if not.
Aaaand that's a weekend I'll never get back.