pyglet gives error regarding GLSL version - initialization

I am trying to run code that has a GUI built with pyglet,
but it fails with the error below. From searching, I understand that I need to explicitly set the GLSL version used by the shader, but I don't know how. I would be grateful for any help.
b"0:20(27): error: cannot initialize uniform weight in GLSL 1.10 (GLSL 1.20 required)\n0:20(27): error: array constructors forbidden in GLSL 1.10 (GLSL 1.20 or GLSL ES 3.00 required)\n0:20(27): error: initializer of uniform variable `weight' must be a constant expression\n0:79(17): error: could not implicitly convert operands to arithmetic operator\n0:79(16): error: operands to arithmetic operators must be numeric\n0:89(7): warning: `coeff' used uninitialized\n"
this is the shader.py file:
Update:
I have added the GLSL file that declares the uniform `weight`:
// Only when compiling for OpenGL ES: declare high precision as the default for floats.
#ifdef GL_ES
precision highp float;
#endif
// Inputs set by the host application.
// inlet: texture returned by sample() when streams[0] is negative.
uniform sampler2D inlet;
// disp0: main() reads its .yz channels as a texture-coordinate displacement.
uniform sampler2D disp0;
// sigma: values <= 0 make main() output a single unblurred sample.
uniform float sigma;
// xmirror: when true, main() mirrors the x texture coordinate around the displacement lookup.
uniform bool xmirror;
// colors / streams: sample() walks streams[] as thresholds on p.x and returns the matching colors[] entry.
uniform vec3 colors[9];
uniform float streams[9];
/*mat4 gaussm = mat4(0.00000067, 0.00002292, 0.00019117, 0.00038771,
0.00002292, 0.00078634, 0.00655965, 0.01330373,
0.00019117, 0.00655965, 0.05472157, 0.11098164,
0.00038771, 0.01330373, 0.11098164, 0.22508352);*/
// NOTE: initialising a uniform with an array constructor is what the compiler rejects with
// "cannot initialize uniform weight in GLSL 1.10 (GLSL 1.20 required)" when no #version
// directive is present. weight is not referenced by any function in this listing.
uniform float weight[5] = float[](0.2270270270, 0.1945946, 0.1216216216, 0.0540540541, 0.0162162162);
// Returns the colour for texture coordinate p.
// When streams[0] is negative (or not >= 0) the inlet texture is sampled directly.
// Otherwise streams[] is scanned for the first entry that is not below min(p.x, 1.)
// (stopping at index 8 at the latest) and the matching colors[] entry is returned, opaque.
vec4 sample(vec2 p)
{
    if (!(streams[0] >= 0.))
    {
        return texture2D(inlet, p);
    }

    float limit = min(p.x, 1.);
    int band = 0;
    while (streams[band] < limit && band < 8)
    {
        band++;
    }
    return vec4(colors[band], 1.);
}
// Unnormalised Gaussian falloff: exp(-d^2 / (2 s^2)).
// d: distance from the centre, s: width parameter.
float gaussian(float d, float s)
{
    float twoSSquared = 2. * s * s;
    float exponent = - d * d / twoSSquared;
    return exp(exponent);
}
// Gaussian with prefactor: (1 / (4 pi s))^0.5 * exp(-d^2 / (4 s)).
// d: distance from the centre, s: width parameter.
float gaussian2(float d, float s)
{
    float PI = 3.141592653;
    float exponent = -1.0 * (d * d) / (4.0 * s);
    float amplitude = pow(1.0 / (4.0 * PI * s), 0.5);
    return amplitude * exp(exponent);
}
// Weight for grid cell (i, j): the indices are divided by nf, and gaussian2() is
// evaluated at the Euclidean length of the resulting pair with width parameter s.
float gaussf(int i, int j,float nf, float s)
{
    float u = float(i) / nf;
    float v = float(j) / nf;
    float radius = sqrt(u * u + v * v);
    return gaussian2(radius, s);
}
// Hyperbolic cosine computed from its exponential definition, (e^x + e^-x) / 2.
// Not called by any other function in this listing.
float cosh(float x)
{
    float growing = exp(x);
    float decaying = exp(-x);
    return (growing + decaying) / 2.;
}
// Sums a double sine series over odd i, j in [1, 155] at the point derived from d
// (y = d.x, z = d.y / 8 + 0.125), then converts the sum u into the returned value
// 2.0 / (u / 0.0043) -- the original labels this step "velocity to time-of-flight".
float rect_calc(vec2 d)
{
    float PI = 3.141592653;
    float AR = 0.25;
    float y = d.x / 1.;
    float z = d.y / 8. + 0.125;

    // NOTE: "16 * <float>" relies on implicit int-to-float conversion; presumably this is the
    // expression a GLSL 1.10 compiler reports as "could not implicitly convert operands".
    float coeff = (16 * pow(AR, 2.)) / pow(PI, 4.);

    float u = 0.;
    for (float i = 1.; i <= 155.; i += 2.)
    {
        float sinRow = sin(i * PI * y);
        for (float j = 1.; j <= 155.; j += 2.)
        {
            float num = sinRow * sin(j * PI * (z / AR));
            // NOTE(review): the first term of the sum below does not depend on i; confirm
            // whether pow(AR, 2.) * pow(i, 2.) was intended.
            float den = i * j * (pow(AR, 2.) * pow(AR, 2.) + pow(j, 2.));
            u += coeff * (num / den);
        }
    }

    // Convert velocity to time-of-flight
    float u_norm = u / 0.0043;
    return 2.0 / u_norm;
}
// Fragment entry point.
// Looks up a displacement in disp0 (mirrored in x when xmirror is set), offsets the texture
// coordinate by it and writes either one sample (sigma <= 0) or a weighted average of
// samples taken on a symmetric grid around the displaced coordinate.
void main()
{
    vec2 uv = gl_TexCoord[0].st;
    if (xmirror)
    {
        uv.x = 1. - uv.x;
    }
    // The displacement lives in the .yz channels; its second component is scaled by 8.
    vec2 d = texture2D(disp0, uv).yz * vec2(1., 8.);
    if (xmirror)
    {
        d.x = -d.x;
        uv.x = 1. - uv.x;
    }
    vec2 p = uv + d;

    if (sigma <= 0.)
    {
        gl_FragColor = sample(p);
        return;
    }

    float Dt = sigma * rect_calc(uv.xy);
    float s = pow(Dt, 0.5);   // spacing between neighbouring taps
    float s2 = 1.0;           // width parameter handed to gaussf()
    float nf = 7.;            // index normaliser handed to gaussf()

    vec4 c = vec4(0.);        // weighted colour sum
    float t = 0.;             // sum of weights

    // NOTE(review): the weight index (ii, jj) grows while the tap offset (7 - ii, 7 - jj)
    // shrinks, so the centre tap receives gaussf(7, 7, ...); confirm this is intended.
    for (int ii = 0; ii < 7; ii++)
    {
        float ox = (7. - float(ii)) * s;
        for (int jj = 0; jj < 7; jj++)
        {
            float oy = (7. - float(jj)) * s;
            float w = gaussf(ii, jj, nf, s2);
            // One tap per quadrant shares this weight.
            t += w * 4.;
            c += w * (sample(p + vec2(ox, oy)) + sample(p + vec2(-ox, oy)) + sample(p + vec2(-ox, -oy)) + sample(p + vec2(ox, -oy)));
        }
        // Taps on the two axes through p.
        float wAxis = gaussf(ii, 7, nf, s2);
        t += wAxis * 4.;
        c += wAxis * (sample(p + vec2(ox, 0.)) + sample(p + vec2(-ox, 0.)) + sample(p + vec2(0., ox)) + sample(p + vec2(0., -ox)));
    }

    // Centre tap.
    float wCentre = gaussf(7, 7, nf, s2);
    t += wCentre;
    c += wCentre * sample(p);

    gl_FragColor = c / t;
}

It is solved!
I just needed to add the directive `#version 120` as the first line of the shader, like this:
#version 120
// The directive above selects GLSL 1.20, which the initialised uniform array at the end of
// this declaration list requires.
// Only when compiling for OpenGL ES: declare high precision as the default for floats.
#ifdef GL_ES
precision highp float;
#endif
// Inputs set by the host application.
// inlet: texture returned by sample() when streams[0] is negative.
uniform sampler2D inlet;
// disp0: main() reads its .yz channels as a texture-coordinate displacement.
uniform sampler2D disp0;
// sigma: values <= 0 make main() output a single unblurred sample.
uniform float sigma;
// xmirror: when true, main() mirrors the x texture coordinate around the displacement lookup.
uniform bool xmirror;
// colors / streams: sample() walks streams[] as thresholds on p.x and returns the matching colors[] entry.
uniform vec3 colors[9];
uniform float streams[9];
/*mat4 gaussm = mat4(0.00000067, 0.00002292, 0.00019117, 0.00038771,
0.00002292, 0.00078634, 0.00655965, 0.01330373,
0.00019117, 0.00655965, 0.05472157, 0.11098164,
0.00038771, 0.01330373, 0.11098164, 0.22508352);*/
// weight is not referenced by any function in this listing.
uniform float weight[5] = float[](0.2270270270, 0.1945946, 0.1216216216, 0.0540540541, 0.0162162162);
// Returns the colour for texture coordinate p.
// When streams[0] is negative (or not >= 0) the inlet texture is sampled directly.
// Otherwise streams[] is scanned for the first entry that is not below min(p.x, 1.)
// (stopping at index 8 at the latest) and the matching colors[] entry is returned, opaque.
vec4 sample(vec2 p)
{
    if (!(streams[0] >= 0.))
    {
        return texture2D(inlet, p);
    }

    float limit = min(p.x, 1.);
    int band = 0;
    while (streams[band] < limit && band < 8)
    {
        band++;
    }
    return vec4(colors[band], 1.);
}
// Unnormalised Gaussian falloff: exp(-d^2 / (2 s^2)).
// d: distance from the centre, s: width parameter.
float gaussian(float d, float s)
{
    float twoSSquared = 2. * s * s;
    float exponent = - d * d / twoSSquared;
    return exp(exponent);
}
// Gaussian with prefactor: (1 / (4 pi s))^0.5 * exp(-d^2 / (4 s)).
// d: distance from the centre, s: width parameter.
float gaussian2(float d, float s)
{
    float PI = 3.141592653;
    float exponent = -1.0 * (d * d) / (4.0 * s);
    float amplitude = pow(1.0 / (4.0 * PI * s), 0.5);
    return amplitude * exp(exponent);
}
// Weight for grid cell (i, j): the indices are divided by nf, and gaussian2() is
// evaluated at the Euclidean length of the resulting pair with width parameter s.
float gaussf(int i, int j,float nf, float s)
{
    float u = float(i) / nf;
    float v = float(j) / nf;
    float radius = sqrt(u * u + v * v);
    return gaussian2(radius, s);
}
// Hyperbolic cosine computed from its exponential definition, (e^x + e^-x) / 2.
// Not called by any other function in this listing.
float cosh(float x)
{
    float growing = exp(x);
    float decaying = exp(-x);
    return (growing + decaying) / 2.;
}
// Sums a double sine series over odd i, j in [1, 155] at the point derived from d
// (y = d.x, z = d.y / 8 + 0.125), then converts the sum u into the returned value
// 2.0 / (u / 0.0043) -- the original labels this step "velocity to time-of-flight".
float rect_calc(vec2 d)
{
    float PI = 3.141592653;
    float AR = 0.25;
    float y = d.x / 1.;
    float z = d.y / 8. + 0.125;

    // "16 * <float>" relies on implicit int-to-float conversion.
    float coeff = (16 * pow(AR, 2.)) / pow(PI, 4.);

    float u = 0.;
    for (float i = 1.; i <= 155.; i += 2.)
    {
        float sinRow = sin(i * PI * y);
        for (float j = 1.; j <= 155.; j += 2.)
        {
            float num = sinRow * sin(j * PI * (z / AR));
            // NOTE(review): the first term of the sum below does not depend on i; confirm
            // whether pow(AR, 2.) * pow(i, 2.) was intended.
            float den = i * j * (pow(AR, 2.) * pow(AR, 2.) + pow(j, 2.));
            u += coeff * (num / den);
        }
    }

    // Convert velocity to time-of-flight
    float u_norm = u / 0.0043;
    return 2.0 / u_norm;
}
// Fragment entry point.
// Looks up a displacement in disp0 (mirrored in x when xmirror is set), offsets the texture
// coordinate by it and writes either one sample (sigma <= 0) or a weighted average of
// samples taken on a symmetric grid around the displaced coordinate.
void main()
{
    vec2 uv = gl_TexCoord[0].st;
    if (xmirror)
    {
        uv.x = 1. - uv.x;
    }
    // The displacement lives in the .yz channels; its second component is scaled by 8.
    vec2 d = texture2D(disp0, uv).yz * vec2(1., 8.);
    if (xmirror)
    {
        d.x = -d.x;
        uv.x = 1. - uv.x;
    }
    vec2 p = uv + d;

    if (sigma <= 0.)
    {
        gl_FragColor = sample(p);
        return;
    }

    float Dt = sigma * rect_calc(uv.xy);
    float s = pow(Dt, 0.5);   // spacing between neighbouring taps
    float s2 = 1.0;           // width parameter handed to gaussf()
    float nf = 7.;            // index normaliser handed to gaussf()

    vec4 c = vec4(0.);        // weighted colour sum
    float t = 0.;             // sum of weights

    // NOTE(review): the weight index (ii, jj) grows while the tap offset (7 - ii, 7 - jj)
    // shrinks, so the centre tap receives gaussf(7, 7, ...); confirm this is intended.
    for (int ii = 0; ii < 7; ii++)
    {
        float ox = (7. - float(ii)) * s;
        for (int jj = 0; jj < 7; jj++)
        {
            float oy = (7. - float(jj)) * s;
            float w = gaussf(ii, jj, nf, s2);
            // One tap per quadrant shares this weight.
            t += w * 4.;
            c += w * (sample(p + vec2(ox, oy)) + sample(p + vec2(-ox, oy)) + sample(p + vec2(-ox, -oy)) + sample(p + vec2(ox, -oy)));
        }
        // Taps on the two axes through p.
        float wAxis = gaussf(ii, 7, nf, s2);
        t += wAxis * 4.;
        c += wAxis * (sample(p + vec2(ox, 0.)) + sample(p + vec2(-ox, 0.)) + sample(p + vec2(0., ox)) + sample(p + vec2(0., -ox)));
    }

    // Centre tap.
    float wCentre = gaussf(7, 7, nf, s2);
    t += wCentre;
    c += wCentre * sample(p);

    gl_FragColor = c / t;
}

Related

I would like to simulate gravitation between particles, but I think I forgot something?

So, for each star, I compare it to all the other stars to calculate its speed, velocity, etc.
But that didn't work. I'm not very strong in maths, and I think my formula may be wrong. I don't know why it doesn't work; here is my code:
// Step 1: for each star, add up the acceleration contributed by EVERY other star.
// NOTE(review): accelerations[] is only ever added to in this snippet; if it is not reset
// elsewhere, contributions from earlier frames keep accumulating -- confirm against the caller.
for (let i = 0; i < pos.length; i++) {
  for (let j = 0; j < pos.length; j++) {
    if (i === j) {
      continue; // a star does not interact with itself
    }
    // Formula part
    const vector = compute_interaction(pos[i], pos[j], 1.0);
    accelerations[i].x += vector.x;
    accelerations[i].y += vector.y;
    accelerations[i].z += vector.z;
    // No `break` here: leaving the inner loop after the first neighbour would make each
    // star feel only a single other star instead of all of them.
  }
}
// Step 2: integrate acceleration into speed (fixed step of 0.001).
for (let i = 0; i < accelerations.length; i++) {
  speedStars[i].x += accelerations[i].x * 0.001;
  speedStars[i].y += accelerations[i].y * 0.001;
  speedStars[i].z += accelerations[i].z * 0.001;
}
// Step 3: integrate speed into the flat [x0, y0, z0, x1, y1, z1, ...] position array.
for (let i = 0; i < speedStars.length; i++) {
  const i3 = i * 3;
  starsPositions[i3] += speedStars[i].x * 0.001;
  starsPositions[i3 + 1] += speedStars[i].y * 0.001;
  starsPositions[i3 + 2] += speedStars[i].z * 0.001;
}
/**
 * Interaction vector exerted on the star at `currentPosition` by the star at
 * `positionOtherStar`, as originally posted.
 *
 * Each component of the unit direction towards the other star is divided by
 * (other[axis]^2 - current[axis]^2 + smoothing_length). Note that this denominator is a
 * per-axis difference of squared coordinates, not the squared distance between the stars.
 *
 * @param currentPosition    object with numeric x, y, z
 * @param positionOtherStar  object with numeric x, y, z
 * @param smoothing_length   value added to every denominator
 * @returns a new THREE.Vector3
 */
function compute_interaction(currentPosition, positionOtherStar, smoothing_length)
{
  const direction = new THREE.Vector3(
    positionOtherStar.x - currentPosition.x,
    positionOtherStar.y - currentPosition.y,
    positionOtherStar.z - currentPosition.z
  ).normalize();

  const scaled = ['x', 'y', 'z'].map((axis) => {
    const denominator =
      Math.pow(positionOtherStar[axis], 2.0) - Math.pow(currentPosition[axis], 2.0) + smoothing_length;
    return direction[axis] / denominator;
  });

  return new THREE.Vector3(scaled[0], scaled[1], scaled[2]);
}
Here the CodePen: https://codepen.io/n0rvel/pen/ExEXbYN?editors=0010
Here is the formula/code logic I found on one OpenCL program that works:
Probably, the compute_interaction() function should be:
/**
 * Softened inverse-square interaction exerted on the star at `currentPosition` by the star
 * at `positionOtherStar`: unit direction towards the other star divided by
 * (squared distance + smoothing_length).
 *
 * This replaces the per-axis formula, which divided each component by the difference of the
 * squared coordinates instead of by the squared distance.
 *
 * @param currentPosition    object with numeric x, y, z
 * @param positionOtherStar  object with numeric x, y, z
 * @param smoothing_length   softening term that keeps the result finite for near-coincident stars
 * @returns a THREE.Vector3 pointing from the current star towards the other star
 */
function compute_interaction(currentPosition, positionOtherStar, smoothing_length)
{
  const vector = new THREE.Vector3().subVectors(positionOtherStar, currentPosition);
  // Read the squared distance BEFORE normalising: normalize() rescales `vector` in place,
  // so calling lengthSq() afterwards would always yield 1 and drop the distance dependence.
  const distanceSq = vector.lengthSq();
  return vector.normalize().divideScalar(distanceSq + smoothing_length);
}

OpenGL draw sphere using glVertex3f

I needed to draw sphere on OpenGL without using gluSphere() function. I have found somewhere this function:
/*
 * Draws a sphere of radius r centred on the origin as `lats` horizontal bands, each emitted
 * as one GL_QUAD_STRIP with `longs` segments around the Z axis.
 *
 * r     : sphere radius (applied to vertex positions; normals stay unit length)
 * lats  : number of latitude bands between the two poles (must be > 0)
 * longs : number of longitude segments per band (must be > 0)
 *
 * Nothing is drawn when lats or longs is not positive.
 */
void drawSphere(double r, int lats, int longs) {
    int i, j;

    if (lats <= 0 || longs <= 0) {
        return; /* avoids dividing by zero below */
    }

    /* Band i spans latitudes lat0..lat1. Starting at i = 1 makes the first band begin exactly
     * at the south pole (-pi/2); an i = 0 band would start below the pole and only re-draw
     * the first band. */
    for (i = 1; i <= lats; i++) {
        double lat0 = M_PI * (-0.5 + (double) (i - 1) / lats);
        double z0 = sin(lat0);   /* height of the lower ring on the unit sphere */
        double zr0 = cos(lat0);  /* radius of the lower ring on the unit sphere */
        double lat1 = M_PI * (-0.5 + (double) i / lats);
        double z1 = sin(lat1);   /* height of the upper ring */
        double zr1 = cos(lat1);  /* radius of the upper ring */

        glBegin(GL_QUAD_STRIP);
        /* j runs over longs + 1 columns so that the strip closes on itself. */
        for (j = 0; j <= longs; j++) {
            double lng = 2 * M_PI * (double) (j - 1) / longs;
            double x = cos(lng);
            double y = sin(lng);

            /* On a sphere centred at the origin the unit normal equals the unit position. */
            glNormal3f((float) (x * zr0), (float) (y * zr0), (float) z0);
            glVertex3f((float) (r * x * zr0), (float) (r * y * zr0), (float) (r * z0));
            glNormal3f((float) (x * zr1), (float) (y * zr1), (float) z1);
            glVertex3f((float) (r * x * zr1), (float) (r * y * zr1), (float) (r * z1));
        }
        glEnd();
    }
}
But I can't understand what it does. I think it draws a polyhedron that approximates a sphere.
Also, I think lat0 and lat1 are used to determine how far from the Z axis the vertices will be located.

OpenCL double precision different from CPU double precision

I am programming in OpenCL using a GeForce GT 610 card in Linux. My CPU and GPU double precision results are not consistent. I can post part of the code here, but I would first like to know whether anyone else has faced this problem. The difference between the GPU and CPU double precision results get pronounced when I run loops with many iterations. There is really nothing special about the code, but I can post it here if anyone is interested. Thanks a lot. Here is my code. Please excuse the __ and bad formatting as I am new here :) As you can see, I have two loops and my CPU code is essentially almost an identical version.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision floating point not supported by OpenCL implementation."
#endif
/*
 * simpar -- time-steps a chain of 100 coupled elements, one work-item per element, and
 * logs per-step diagnostics.
 *
 * Each of the nprntstps outer steps runs prntstps inner steps of size dt:
 *     x += dt*v + (dt*dt/2)*a
 *     v += (dt/2)*a
 *     a  = alpha*(x[i+1] + x[i-1] - 2*x[i]) + beta*((x[i+1]-x[i])^3 - (x[i]-x[i-1])^3)
 *     v += (dt/2)*a
 * where a neighbour outside the chain counts as 0.0.
 *
 * Buffers, as used below:
 *     x, v, acc     current position / velocity / acceleration per element (read and written)
 *     fp6, keBuf    per-element kinetic energy 0.5*v*v
 *     fp8, fp8Plus  potential-energy terms
 *     fp, fp5       written by work-item 0 only, two doubles per outer step
 *     fp1, fp3, fp7 per-outer-step logs of position, velocity and acceleration
 *     peBuf         only element 0 is read
 *
 * NOTE(review): barrier() synchronises the work-items of ONE work-group only and must be
 * reached by every work-item of that group. The kernel therefore presumably expects a launch
 * in which all 100 active work-items share a single work-group and no work-item of that
 * group fails the global_id < chainlngth test -- confirm against the host code.
 */
__kernel void simpar(__global double* fp, __global double* fp1,
                     __global double* fp3, __global double* fp5,
                     __global double* fp6, __global double* fp7,
                     __global double* fp8, __global double* fp8Plus,
                     __global double* x, __global double* v, __global double* acc,
                     __global double* keBuf, __global double* peBuf,
                     unsigned int prntstps, unsigned int nprntstps, double dt
                     ) {
    unsigned int m,i,j,k,l,t;
    unsigned int chainlngth=100;
    double dxi, twodxi, dxipl1, dximn1, fac, fac1, fac2, fac13, fac23;
    double ke,pe,tke,tpe,te,dx;
    double hdt, hdt2;
    double alpha=0.16;
    double beta=0.7;
    double cmass;
    double peTemp;
    // NOTE(review): the three kernel arguments below are overwritten with constants, so the
    // values passed by the host are ignored.
    nprntstps=1001;
    dt=0.01;
    prntstps=100;
    // NOTE(review): named alphaby4 but computed from beta.
    double alphaby4=beta/4.0;
    hdt=0.5*dt;
    hdt2=dt*0.5*dt;
    double Xlocal,Vlocal,Acclocal;
    unsigned int global_id=get_global_id(0);
    if (global_id<chainlngth){
        Xlocal=x[global_id];
        Vlocal=v[global_id];
        Acclocal=acc[global_id];
        for (m=0;m<nprntstps;m++){
            for(l=0;l<prntstps;l++){
                // Position update, published to global memory for the neighbours.
                Xlocal =Xlocal+dt *Vlocal+hdt2*Acclocal;
                x[global_id]=Xlocal;
                // x[] lives in global memory and is read by the neighbouring work-items
                // below, so the barrier must order global (not local) memory accesses.
                barrier(CLK_GLOBAL_MEM_FENCE);
                // First half of the velocity update.
                Vlocal =Vlocal+ hdt * Acclocal;
                barrier(CLK_LOCAL_MEM_FENCE);
                // Neighbour positions; elements beyond either end of the chain count as 0.0.
                j = global_id - 1;
                k = global_id + 1;
                if (global_id == 0) {
                    dximn1 = 0.0;
                } else {
                    dximn1 = x[j];
                }
                if (k == chainlngth) {
                    dxipl1 = 0.0;
                } else {
                    dxipl1 = x[k];
                }
                dxi = Xlocal;
                twodxi = 2.0 * dxi;
                fac = dxipl1 + dximn1 - twodxi;
                fac1 = dxipl1 - dxi;
                fac2 = dxi - dximn1;
                fac13 = fac1 * fac1 * fac1;
                fac23 = fac2 * fac2 * fac2;
                Acclocal = alpha * fac + beta * (fac13 - fac23);
                barrier(CLK_GLOBAL_MEM_FENCE);
                // Second half of the velocity update.
                Vlocal += hdt * Acclocal;
                v[global_id]=Vlocal;
                acc[global_id]=Acclocal;
                // Everyone must finish reading x[] before the next step overwrites it.
                barrier(CLK_GLOBAL_MEM_FENCE);
            }
            barrier(CLK_GLOBAL_MEM_FENCE);
            // Kinetic energy of this element.
            tke = tpe = te = dx = 0.0;
            ke=0.5*Vlocal*Vlocal;//Vlocal*Vlocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
            fp6[(m*100)+global_id]=ke;
            keBuf[global_id]=ke;
            ke=0.0;
            barrier(CLK_GLOBAL_MEM_FENCE);
            // Work-item 0 sums the kinetic energy of the whole chain.
            if (global_id ==0){
                for(t=0;t<100;t++)
                    tke+=keBuf[t];
            }
            barrier(CLK_GLOBAL_MEM_FENCE);
            // Potential-energy term from the extension towards the left neighbour.
            k = global_id-1;
            if (global_id == 0) {
                dx = Xlocal;
            }else{
                dx = Xlocal-x[k];
            }
            fac = dx * dx;
            peTemp = alpha * 0.5 * fac + alphaby4 * fac * fac;
            // NOTE(review): index global_id*m differs from the m*chainlngth+global_id layout
            // used for fp1/fp3/fp7 -- confirm the intended layout of fp8.
            fp8[global_id*m]=peTemp;
            if (global_id == 0)
                tpe+=peTemp;
            barrier(CLK_GLOBAL_MEM_FENCE);
            cmass=0.0;
            // Term for the last element of the chain.
            dx = -x[100-1];
            fac = dx*dx;
            pe=alpha*0.5*fac+alphaby4*fac*fac;
            if (global_id==0){
                fp8Plus[m]=pe;
                // NOTE(review): peBuf is never written by this kernel.
                tpe+=peBuf[0];
                // NOTE(review): i is never assigned before this read.
                fp5[m*2]=i;
                fp5[m*2+1]=cmass;
                te=tke+tpe;
                fp[m*2]=m;
                fp[m*2+1]=te;
            }
            barrier(CLK_GLOBAL_MEM_FENCE);
            //cmass /=100;
            fp1[(m*chainlngth)+global_id]=Xlocal-cmass;
            // barrier(CLK_GLOBAL_MEM_FENCE);
            fp3[(m*chainlngth)+global_id]=Vlocal;
            // barrier(CLK_GLOBAL_MEM_FENCE);
            fp7[(m*chainlngth)+global_id]=Acclocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
        }
    }
}
This is somewhat expected behavior, actually.
On older x86 CPUs, floating-point values are held in 80-bit registers (Intel's "long double") during computation and are truncated to 64 bits only when need be.
When SIMD units/instructions for floating point arithmetics arrived for x86 CPUs, floating point double precision became 64bit by default; however, 80bit is still possible, depending on your compiler settings. There's a lot to read about this out there: Wikipedia: Floating Point.
Check your compiler settings for OpenCL and host code on floating point "magic tricks", to get better agreement of your results. Calculate the absolute and relative error of your values and check if this error margin is safe for your application.

OpenCL traversal kernel - further optimization

Currently, I have an OpenCL kernel for ray traversal, shown below. I'd be glad if someone had some pointers on optimizing this quite large kernel.
The thing is, I'm running this code with SAH BVH and I'd like to get performance similar to Timo Aila with his traversals in his paper (Understanding the Efficiency of Ray Traversal on GPUs), of course his code uses SplitBVH (which I might consider using in place of SAH BVH, but in my opinion it has really slow build times). But I'm asking about traversal, not BVH (also I've so far worked only with scenes, where SplitBVH won't give you much advantages over SAH BVH).
First of all, here is what I have so far (standard while-while traversal kernel).
// Sampler used by traverse() for every read of the node image; only the filter mode
// (nearest) is specified explicitly here.
__constant sampler_t sampler = CLK_FILTER_NEAREST;
// Largest of four scalars, reduced left to right with the built-in max().
inline float max4(float a, float b, float c, float d)
{
    float ab = max(a, b);
    float abc = max(ab, c);
    return max(abc, d);
}
// Smallest of four scalars, reduced left to right with the built-in min().
inline float min4(float a, float b, float c, float d)
{
    float ab = min(a, b);
    float abc = min(ab, c);
    return min(abc, d);
}
// Traversal kernel: one work-item per ray, stack-based ("while-while") BVH traversal.
//
// nodes     - node image; node k occupies four consecutive texels starting at linear texel
//             index 4*k, addressed as (index % w, index / w):
//               +0: child 0 bounds (lo x, hi x, lo y, hi y)
//               +1: child 1 bounds (lo x, hi x, lo y, hi y)
//               +2: z bounds (child 0 lo, child 0 hi, child 1 lo, child 1 hi)
//               +3: specs, read as integers: z == 0 marks an inner node whose children are
//                   nodes x and y; z > 0 marks a leaf holding triangles [x, y)
//             NOTE(review): the +1..+3 offsets are applied to the x coordinate only, so this
//             presumably assumes w is a multiple of 4 -- confirm.
// triangles - four float4 per triangle; elements 0..2 are used by the intersection test
// rays      - two float4 per ray: origin at 2*idx, direction at 2*idx + 1
// result    - one float4 per ray: (u, v, t, triangle index bit-cast to float) of the nearest
//             hit, or (0, 0, 0, 1) when nothing was hit
// num       - number of rays; work-items with idx >= num do nothing
// w         - row width used to turn a linear texel index into 2D image coordinates
// h         - not referenced in the kernel body
__kernel void traverse( __read_only image2d_t nodes,
__global const float4* triangles,
__global const float4* rays,
__global float4* result,
const int num,
const int w,
const int h)
{
// Ray index
int idx = get_global_id(0);
if(idx < num)
{
// Stack of nodes still to visit.
// NOTE(review): pushes are not bounds-checked against the 32 entries.
int todo[32];
int todoOffset = 0;
// Current node
int nodeNum = 0;
// Valid hit interval along the ray; depth shrinks to the nearest hit found so far.
float tmin = 0.0f;
float depth = 2e30f;
// Fetch ray origin, direction and compute invdirection
float4 origin = rays[2 * idx + 0];
float4 direction = rays[2 * idx + 1];
float4 invdir = native_recip(direction);
// Result written when no triangle is hit.
float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f);
// Traversal loop
while(true)
{
// Fetch node information
int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
// While node isn't leaf
while(specs.z == 0)
{
// Fetch child bounding boxes
float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));
// Test ray against child bounding boxes (slab test: t = bound * invdir - origin * invdir)
float oodx = origin.x * invdir.x;
float oody = origin.y * invdir.y;
float oodz = origin.z * invdir.z;
float c0lox = n0xy.x * invdir.x - oodx;
float c0hix = n0xy.y * invdir.x - oodx;
float c0loy = n0xy.z * invdir.y - oody;
float c0hiy = n0xy.w * invdir.y - oody;
float c0loz = nz.x * invdir.z - oodz;
float c0hiz = nz.y * invdir.z - oodz;
float c1loz = nz.z * invdir.z - oodz;
float c1hiz = nz.w * invdir.z - oodz;
// Entry/exit distances of child 0, clamped to [tmin, depth]
float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin);
float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth);
float c1lox = n1xy.x * invdir.x - oodx;
float c1hix = n1xy.y * invdir.x - oodx;
float c1loy = n1xy.z * invdir.y - oody;
float c1hiy = n1xy.w * invdir.y - oody;
// Entry/exit distances of child 1, clamped to [tmin, depth]
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin);
float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth);
// A child is hit when its clamped interval is non-empty
bool traverseChild0 = (c0max >= c0min);
bool traverseChild1 = (c1max >= c1min);
// Default to descending into child 0 (specs.x); nodeAbove holds child 1 (specs.y)
nodeNum = specs.x;
int nodeAbove = specs.y;
// We hit just one out of 2 childs
if(traverseChild0 != traverseChild1)
{
if(traverseChild1)
{
nodeNum = nodeAbove;
}
}
// We hit either both or none
else
{
// If we hit none, pop node from stack (or exit traversal, if stack is empty)
if (!traverseChild0)
{
if(todoOffset == 0)
{
// Leaves only the inner loop; specs.z is still 0 here, so the leaf code below is
// skipped and the empty-stack check after it ends the traversal.
break;
}
nodeNum = todo[--todoOffset];
}
// If we hit both
else
{
// Sort them (so nearest goes 1st, further 2nd)
if(c1min < c0min)
{
unsigned int tmp = nodeNum;
nodeNum = nodeAbove;
nodeAbove = tmp;
}
// Push further on stack
todo[todoOffset++] = nodeAbove;
}
}
// Fetch next node information
nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
}
// If node is leaf & has some primitives
if(specs.z > 0)
{
// Loop through primitives & perform intersection with them (Woop triangles)
for(int i = specs.x; i < specs.y; i++)
{
// Fetch first point from global memory
float4 v0 = triangles[i * 4 + 0];
float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z;
float i_z = 1.0f / (direction.x * v0.x + direction.y * v0.y + direction.z * v0.z);
// Candidate hit distance; accepted only in front of the origin and nearer than the best hit
float t = o_z * i_z;
if(t > 0.0f && t < depth)
{
// Fetch second point from global memory
float4 v1 = triangles[i * 4 + 1];
float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z;
float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z;
// First coordinate of the hit point inside the triangle
float u = o_x + t * d_x;
if(u >= 0.0f && u <= 1.0f)
{
// Fetch third point from global memory
float4 v2 = triangles[i * 4 + 2];
float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z;
float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z;
// Second coordinate of the hit point inside the triangle
float v = o_y + t * d_y;
if(v >= 0.0f && u + v <= 1.0f)
{
// We got successful hit, store the information
depth = t;
temp.x = u;
temp.y = v;
temp.z = t;
// Triangle index stored bit-for-bit in the float lane
temp.w = as_float(i);
}
}
}
}
}
// Pop node from stack (if empty, finish traversal)
if(todoOffset == 0)
{
break;
}
nodeNum = todo[--todoOffset];
}
// Store the ray traversal result in global memory
result[idx] = temp;
}
}
First question of the day is, how could one write his Persistent while-while and Speculative while-while kernel in OpenCL?
Ad Persistent while-while, do I get it right, that I actually just start kernel with global work size equivalent to local work size, and both these numbers should be equal to warp/wavefront size of the GPU?
I get that with CUDA the persistent thread implementation looks like this:
// Persistent-thread work loop (fragment): instead of one thread per job, each group of
// threads sharing threadIndex.y repeatedly claims a batch of WARP_SIZE jobs until none remain.
// index, nextJobArray, warpCounter, totalJobs and threadIndex are declared outside this fragment.
do
{
// Per-row slot through which lane 0 publishes the first job index of the claimed batch;
// declared volatile so each access goes to memory.
volatile int& jobIndexBase = nextJobArray[threadIndex.y];
// Only lane 0 of the row claims the next batch from the shared counter.
if(threadIndex.x == 0)
{
jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE);
}
// Every lane takes the job at its own offset within the batch.
// NOTE(review): no explicit synchronisation separates the write above from this read;
// presumably the lanes of a row are assumed to execute in lock-step -- confirm.
index = jobIndexBase + threadIndex.x;
// Past the end of the job list: this thread is finished.
if(index >= totalJobs)
return;
/* Perform work for task numbered 'index' */
}
while(true);
What could the equivalent look like in OpenCL? I know I'll have to add some barriers, and I also know that one should come right after the point where I atomically add WARP_SIZE to warpCounter.
Ad Speculative traversal - I don't really have any idea how this should be implemented in OpenCL, so any hints are welcome. I also have no idea where to put the barriers (because putting them around a simulated __any results in a driver crash).
If you made it here, thanks for reading & any hints, answers, etc. are welcome!
An optimization you can do is to use vector variables and the fused multiply-add function to speed up your set-up math. As for the rest of the kernel, it is slow because it is branchy. If you can make assumptions about the input data, you might be able to reduce the execution time by reducing the number of code branches. I have not checked the float4 swizzles (the .xxyy and .x .y .z .w after the float4 variables), so please check them.
// Fetch child bounding boxes:
//   n0xy / n1xy = (lo x, hi x, lo y, hi y) of child 0 / child 1
//   nz          = (child 0 lo z, child 0 hi z, child 1 lo z, child 1 hi z)
float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));
// Slab test t = bound * invdir - origin * invdir, done four lanes at a time with fma().
// The origin term must be swizzled exactly like invdir so every lane uses its own axis.
float4 oodf4 = -origin * invdir;
float4 c0xyf4 = fma(n0xy, invdir.xxyy, oodf4.xxyy);
float4 c0zc1z = fma(nz, invdir.zzzz, oodf4.zzzz);
// Child 0 takes its z slab from lanes .x/.y of c0zc1z.
float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.x, c0zc1z.y), tmin);
float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.x, c0zc1z.y), depth);
float4 c1xy = fma(n1xy, invdir.xxyy, oodf4.xxyy);
// Child 1 takes its z slab from lanes .z/.w of c0zc1z.
float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin);
float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);

Dijkstra's algorithm in CUDA

I am having troubles with this piece of CUDA code I have written. This is supposed to be the CUDA implementation of the Dijkstra's algorithm. The code is as follows:
// Relaxation step: one block per vertex (tid = blockIdx.x). A vertex whose mask entry is '1'
// clears it and relaxes all of its outgoing edges while holding a global spin lock.
//
//   Va   - vertex coordinates, three floats per vertex; the edge weight is the Euclidean
//          distance between the two end points
//   Ea   - edge targets
//   Sa   - per vertex: [2*tid] = first index into Ea, [2*tid + 1] = number of edges
//   Ca   - current cost per vertex (read and lowered here)
//   Ua   - not referenced by this kernel
//   Ma   - mask per vertex, '1' = needs processing
//   lock - single word used as a spin lock (0 = free, 1 = taken)
__global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){
    int tid = blockIdx.x;
    if(Ma[tid]=='1'){
        Ma[tid] = '0';
        int ind_Ea = Sa[tid * 2];
        int num_edges = Sa[(tid * 2) + 1];
        unsigned int leaveloop = 0u;
        while(leaveloop==0u){
            // Try to take the lock; on failure spin and retry.
            if(atomicExch(lock, 1u) == 0u){
                // The WHOLE edge loop runs inside the critical section.
                for(int v = 0; v < num_edges; v++){
                    int nb = Ea[ind_Ea + v];
                    float dx = Va[tid * 3] - Va[nb * 3];
                    float dy = Va[(tid * 3) + 1] - Va[(nb * 3) + 1];
                    float dz = Va[(tid * 3) + 2] - Va[(nb * 3) + 2];
                    float wt = sqrt(dx * dx + dy * dy + dz * dz);
                    if(Ca[nb] > (Ca[tid] + wt)){
                        Ca[nb] = Ca[tid] + wt;
                        Ma[nb] = '1';
                    }
                }
                // Publish the updates, then release the lock exactly once, after the last
                // edge. Releasing inside the edge loop would let other vertices enter while
                // the remaining edges are still being relaxed, and would never release the
                // lock for a vertex without edges.
                __threadfence();
                leaveloop = 1u;
                atomicExch(lock, 0u);
            }
        }
    }
}
The problem is in the relaxation phase of the Dijkstra's algorithm. I have implemented such a phase as a critical section. If there is a vertex (lets say a) which is a neighbor of more than one vertex (i.e., connecting to other vertices with edges), then all of the threads for those vertices will try to write to the location of vertex a in the Cost Array Ca. Now my goal is to have the smaller value written in that location. To do that, I am trying to serialize the process and applying __threadfence() as well so that value written by one thread is visible to others and then eventually the smaller value is retained in the location of vertex a. But the problem is, that this logic is not working. The location of vertex a does not get the smallest value of all the threads trying to write to that location and I don't understand why. Any help will be highly appreciated.
There is a "classical" (at least, mostly referenced) implementation of Dijkstra's Single-Source Shortest Path (SSSP) algorithm on the GPU contained in the paper
Accelerating large graph algorithms on the GPU using CUDA by Parwan Harish and P.J. Narayanan
However, the implementation in that paper has been recognized to be bugged, see
CUDA Solutions for the SSSP Problem by Pedro J. Martín, Roberto Torres, and Antonio Gavilanes
I'm reporting below the implementation suggested in the first paper fixed according to the remark of the second. The code also contains a C++ version.
#include <sstream>
#include <vector>
#include <iostream>
#include <stdio.h>
#include <float.h>
#include "Utilities.cuh"
#define NUM_ASYNCHRONOUS_ITERATIONS 20 // Number of async loop iterations before attempting to read results back
#define BLOCK_SIZE 16
/***********************/
/* GRAPHDATA STRUCTURE */
/***********************/
// --- The graph data structure is an adjacency list.
typedef struct GraphData {
    int   *vertexArray;   // per vertex: integer offset of its edge list inside edgeArray
    int    numVertices;   // overall number of vertices
    int   *edgeArray;     // per edge: the "destination" vertex the edge is attached to
    int    numEdges;      // overall number of edges
    float *weightArray;   // per edge: its weight
} GraphData;
/**********************************/
/* GENERATE RANDOM GRAPH FUNCTION */
/**********************************/
// Fills *graph with a random graph in which every vertex has exactly neighborsPerVertex
// outgoing edges to distinct vertices other than itself; weights are in [0, 1).
//
// graph              - output; its three arrays are allocated here with malloc()
// numVertices        - number of vertices
// neighborsPerVertex - out-degree of every vertex; must be smaller than numVertices,
//                      otherwise the rejection loop below cannot find enough distinct
//                      neighbours and never terminates
void generateRandomGraph(GraphData *graph, int numVertices, int neighborsPerVertex) {
    graph -> numVertices = numVertices;
    graph -> vertexArray = (int *)malloc(graph -> numVertices * sizeof(int));
    graph -> numEdges = numVertices * neighborsPerVertex;
    graph -> edgeArray = (int *)malloc(graph -> numEdges * sizeof(int));
    graph -> weightArray = (float *)malloc(graph -> numEdges * sizeof(float));

    // Every vertex owns a contiguous run of neighborsPerVertex edges.
    for (int i = 0; i < graph -> numVertices; i++) graph -> vertexArray[i] = i * neighborsPerVertex;

    // Neighbours already chosen for the vertex being processed.
    int *tempArray = (int *)malloc(neighborsPerVertex * sizeof(int));

    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < neighborsPerVertex; l++) tempArray[l] = INT_MAX;

        for (int l = 0; l < neighborsPerVertex; l++) {
            int temp;
            bool accepted;
            // Draw until the candidate is neither k itself nor an already chosen neighbour.
            do {
                temp = (rand() % graph -> numVertices);
                accepted = (temp != k);
                for (int t = 0; t < neighborsPerVertex; t++)
                    if (temp == tempArray[t]) accepted = false;
            } while (!accepted);
            tempArray[l] = temp;

            graph -> edgeArray  [k * neighborsPerVertex + l] = temp;
            graph -> weightArray[k * neighborsPerVertex + l] = (float)(rand() % 1000) / 1000.0f;
        }
    }

    // The scratch list is no longer needed.
    free(tempArray);
}
/************************/
/* minDistance FUNCTION */
/************************/
// --- Returns the index of the not-yet-finalized vertex with the smallest distance.
//     Ties go to the highest index (the comparison is <=). When no vertex qualifies,
//     sourceVertex is returned.
int minDistance(float *shortestDistances, bool *finalizedVertices, const int sourceVertex, const int N) {
    int   bestVertex   = sourceVertex;
    float bestDistance = FLT_MAX;

    for (int v = 0; v < N; ++v) {
        if (finalizedVertices[v]) continue;

        const float candidate = shortestDistances[v];
        if (candidate <= bestDistance) {
            bestDistance = candidate;
            bestVertex   = v;
        }
    }
    return bestVertex;
}
/************************/
/* dijkstraCPU FUNCTION */
/************************/
// Single-source shortest paths on the CPU.
//
// graph               - dense N x N matrix, row-major; graph[u * N + v] is the weight of
//                       the edge u -> v, and 0 means "no edge"
// h_shortestDistances - output, N entries; FLT_MAX marks vertices that were not reached
// sourceVertex        - index of the source vertex
// N                   - number of vertices
void dijkstraCPU(float *graph, float *h_shortestDistances, int sourceVertex, const int N) {
    // --- h_finalizedVertices[i] is true if vertex i is included in the shortest path tree
    //     or the shortest distance from the source node to i is finalized
    bool *h_finalizedVertices = (bool *)malloc(N * sizeof(bool));

    // --- Initialize h_shortestDistances as infinite and h_finalizedVertices as false
    for (int i = 0; i < N; i++) h_shortestDistances[i] = FLT_MAX, h_finalizedVertices[i] = false;

    // --- The distance of the source vertex from itself is always 0
    h_shortestDistances[sourceVertex] = 0.f;

    // --- Dijkstra iterations
    for (int iterCount = 0; iterCount < N - 1; iterCount++) {
        // --- Selecting the minimum distance vertex from the set of vertices not yet
        //     processed. currentVertex is always equal to sourceVertex in the first iteration.
        int currentVertex = minDistance(h_shortestDistances, h_finalizedVertices, sourceVertex, N);

        // --- Mark the current vertex as processed
        h_finalizedVertices[currentVertex] = true;

        // --- Relaxation loop
        for (int v = 0; v < N; v++) {
            // --- Update h_shortestDistances[v] only if v is not finalized, there is an edge
            //     from currentVertex to v, and the cost of the path from the source vertex to
            //     v through currentVertex is smaller than the current value
            if (!h_finalizedVertices[v] &&
                graph[currentVertex * N + v] &&
                h_shortestDistances[currentVertex] != FLT_MAX &&
                h_shortestDistances[currentVertex] + graph[currentVertex * N + v] < h_shortestDistances[v])
                h_shortestDistances[v] = h_shortestDistances[currentVertex] + graph[currentVertex * N + v];
        }
    }

    // --- Release the bookkeeping array
    free(h_finalizedVertices);
}
/***************************/
/* MASKARRAYEMPTY FUNCTION */
/***************************/
// --- Returns true when no entry of finalizedVertices is set, false as soon as a set entry
//     is found. An empty (or non-positive length) array yields true.
bool allFinalizedVertices(bool *finalizedVertices, int numVertices) {
    int i = 0;
    while (i < numVertices) {
        if (finalizedVertices[i]) {
            return false;
        }
        ++i;
    }
    return true;
}
/*************************/
/* ARRAY INITIALIZATIONS */
/*************************/
// One thread per vertex: the source vertex starts flagged with distance 0, every other
// vertex starts unflagged with distance FLT_MAX. Both distance arrays get the same value.
__global__ void initializeArrays(bool * __restrict__ d_finalizedVertices, float* __restrict__ d_shortestDistances, float* __restrict__ d_updatingShortestDistances,
                                 const int sourceVertex, const int numVertices) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= numVertices) return;   // surplus threads of the last block

    const bool  isSource = (sourceVertex == tid);
    const float initial  = isSource ? 0.f : FLT_MAX;

    d_finalizedVertices[tid]         = isSource;
    d_shortestDistances[tid]         = initial;
    d_updatingShortestDistances[tid] = initial;
}
/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
// One thread per vertex: a flagged vertex clears its flag and offers each of its neighbours
// the distance "own distance + edge weight" through atomicMin on updatingShortestDistances.
// The edges of vertex tid are edgeArray[vertexArray[tid] .. vertexArray[tid + 1]); the last
// vertex's list ends at numEdges.
__global__ void Kernel1(const int * __restrict__ vertexArray, const int* __restrict__ edgeArray,
                        const float * __restrict__ weightArray, bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances,
                        float * __restrict__ updatingShortestDistances, const int numVertices, const int numEdges) {
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid >= numVertices) return;
    if (!finalizedVertices[tid]) return;

    finalizedVertices[tid] = false;

    const int edgeStart = vertexArray[tid];
    const int edgeEnd   = (tid + 1 < numVertices) ? vertexArray[tid + 1] : numEdges;

    for (int edge = edgeStart; edge < edgeEnd; ++edge) {
        const int nid = edgeArray[edge];
        atomicMin(&updatingShortestDistances[nid], shortestDistances[tid] + weightArray[edge]);
    }
}
/**************************/
/* DIJKSTRA GPU KERNEL #2 */
/**************************/
// --- Commit kernel, one thread per vertex.
//     If the tentative cost in updatingShortestDistances is strictly lower than the committed cost in
//     shortestDistances, the tentative cost is committed and the vertex is flagged in finalizedVertices.
//     In every case the tentative cost is then set equal to the committed one.
//     vertexArray, edgeArray and weightArray are part of the signature but are not read by this kernel.
__global__ void Kernel2(const int * __restrict__ vertexArray, const int * __restrict__ edgeArray, const float* __restrict__ weightArray,
                        bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, float* __restrict__ updatingShortestDistances,
                        const int numVertices) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= numVertices) return;

    const float committed = shortestDistances[tid];
    const float tentative = updatingShortestDistances[tid];
    const bool  improved  = committed > tentative;

    if (improved) {
        shortestDistances[tid] = tentative;
        finalizedVertices[tid] = true;
    }

    // --- Re-align the tentative cost with whatever is now committed
    updatingShortestDistances[tid] = improved ? tentative : committed;
}
/************************/
/* dijkstraGPU FUNCTION */
/************************/
// --- Host driver for the GPU shortest-path computation.
//     graph               : adjacency list (vertexArray, edgeArray, weightArray, numVertices, numEdges) on the host
//     sourceVertex        : vertex the distances are measured from
//     h_shortestDistances : host output buffer, receives graph->numVertices floats
//     The function uploads the adjacency list, initializes the mask and the two cost arrays on the device,
//     then alternates Kernel1 / Kernel2 until the mask read back from the device has no flag set. Finally it
//     downloads the committed costs and releases every buffer it allocated.
void dijkstraGPU(GraphData *graph, const int sourceVertex, float * __restrict__ h_shortestDistances) {
    // --- Byte sizes of the per-vertex and per-edge buffers
    const size_t vertexIndexBytes = sizeof(int)   * graph->numVertices;
    const size_t edgeIndexBytes   = sizeof(int)   * graph->numEdges;
    const size_t edgeWeightBytes  = sizeof(float) * graph->numEdges;
    const size_t maskBytes        = sizeof(bool)  * graph->numVertices;
    const size_t costBytes        = sizeof(float) * graph->numVertices;

    // --- Launch configuration shared by all kernels: one thread per vertex
    const dim3 grid(iDivUp(graph->numVertices, BLOCK_SIZE));
    const dim3 block(BLOCK_SIZE);

    // --- Device-side adjacency list: vertex array Va, edge array Ea and weight array Wa
    int   *d_vertexArray;
    int   *d_edgeArray;
    float *d_weightArray;
    gpuErrchk(cudaMalloc(&d_vertexArray, vertexIndexBytes));
    gpuErrchk(cudaMalloc(&d_edgeArray,   edgeIndexBytes));
    gpuErrchk(cudaMalloc(&d_weightArray, edgeWeightBytes));

    // --- Upload the adjacency list
    gpuErrchk(cudaMemcpy(d_vertexArray, graph->vertexArray, vertexIndexBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_edgeArray,   graph->edgeArray,   edgeIndexBytes,   cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_weightArray, graph->weightArray, edgeWeightBytes,  cudaMemcpyHostToDevice));

    // --- Mask array Ma, cost array Ca and updating cost array Ua, each with one entry per vertex
    bool  *d_finalizedVertices;
    float *d_shortestDistances;
    float *d_updatingShortestDistances;
    gpuErrchk(cudaMalloc(&d_finalizedVertices,         maskBytes));
    gpuErrchk(cudaMalloc(&d_shortestDistances,         costBytes));
    gpuErrchk(cudaMalloc(&d_updatingShortestDistances, costBytes));

    // --- Host mirror of the mask, used for the termination test
    bool *h_finalizedVertices = (bool *)malloc(maskBytes);

    // --- Raise the mask flag for the source only; set Ca and Ua to 0 for the source and to FLT_MAX
    //     (standing in for infinity) everywhere else
    initializeArrays <<<grid, block>>>(d_finalizedVertices, d_shortestDistances,
                                       d_updatingShortestDistances, sourceVertex, graph->numVertices);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- First read-back of the mask, device -> host
    gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, maskBytes, cudaMemcpyDeviceToHost));
    bool maskIsEmpty = allFinalizedVertices(h_finalizedVertices, graph->numVertices);

    while (!maskIsEmpty) {
        // --- Run a batch of NUM_ASYNCHRONOUS_ITERATIONS relax/commit rounds between two mask read-backs.
        //     This may execute more rounds than strictly needed, in exchange for fewer device -> host copies.
        for (int round = 0; round < NUM_ASYNCHRONOUS_ITERATIONS; round++) {
            Kernel1 <<<grid, block>>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices,
                                      d_shortestDistances, d_updatingShortestDistances,
                                      graph->numVertices, graph->numEdges);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            Kernel2 <<<grid, block>>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices,
                                      d_shortestDistances, d_updatingShortestDistances,
                                      graph->numVertices);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, maskBytes, cudaMemcpyDeviceToHost));
        maskIsEmpty = allFinalizedVertices(h_finalizedVertices, graph->numVertices);
    }

    // --- Download the committed costs
    gpuErrchk(cudaMemcpy(h_shortestDistances, d_shortestDistances, costBytes, cudaMemcpyDeviceToHost));

    // --- Release host and device buffers
    free(h_finalizedVertices);
    gpuErrchk(cudaFree(d_vertexArray));
    gpuErrchk(cudaFree(d_edgeArray));
    gpuErrchk(cudaFree(d_weightArray));
    gpuErrchk(cudaFree(d_finalizedVertices));
    gpuErrchk(cudaFree(d_shortestDistances));
    gpuErrchk(cudaFree(d_updatingShortestDistances));
}
/****************/
/* MAIN PROGRAM */
/****************/
// --- Demo driver.
//     Generates a small random graph, prints it as an adjacency list and as an adjacency matrix, computes the
//     shortest distances from sourceVertex with dijkstraCPU and with dijkstraGPU, and prints both results.
//     Returns 0.
int main() {
    // --- Number of graph vertices
    int numVertices = 8;
    // --- Number of edges per graph vertex
    int neighborsPerVertex = 6;
    // --- Source vertex
    int sourceVertex = 0;

    // --- Allocate memory for arrays
    GraphData graph;
    generateRandomGraph(&graph, numVertices, neighborsPerVertex);

    // --- From adjacency list to adjacency matrix.
    //     Initializing the adjacency matrix: FLT_MAX marks "no edge"
    float *weightMatrix = (float *)malloc(numVertices * numVertices * sizeof(float));
    for (int k = 0; k < numVertices * numVertices; k++) weightMatrix[k] = FLT_MAX;

    // --- Displaying the adjacency list and constructing the adjacency matrix
    printf("Adjacency list\n");
    // --- Zero cost from every vertex to itself
    for (int k = 0; k < numVertices; k++) weightMatrix[k * numVertices + k] = 0.f;
    for (int k = 0; k < numVertices; k++)
        for (int l = 0; l < neighborsPerVertex; l++) {
            weightMatrix[k * numVertices + graph.edgeArray[graph.vertexArray[k] + l]] = graph.weightArray[graph.vertexArray[k] + l];
            printf("Vertex nr. %i; Edge nr. %i; Weight = %f\n", k, graph.edgeArray[graph.vertexArray[k] + l],
                   graph.weightArray[graph.vertexArray[k] + l]);
        }

    // --- Raw dump of the edge and weight arrays
    for (int k = 0; k < numVertices * neighborsPerVertex; k++)
        printf("%i %i %f\n", k, graph.edgeArray[k], graph.weightArray[k]);

    // --- Displaying the adjacency matrix
    printf("\nAdjacency matrix\n");
    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < numVertices; l++)
            if (weightMatrix[k * numVertices + l] < FLT_MAX)
                printf("%1.3f\t", weightMatrix[k * numVertices + l]);
            else
                printf("--\t");
        printf("\n");
    }

    // --- Running Dijkstra on the CPU
    float *h_shortestDistancesCPU = (float *)malloc(numVertices * sizeof(float));
    dijkstraCPU(weightMatrix, h_shortestDistancesCPU, sourceVertex, numVertices);
    printf("\nCPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesCPU[k]);

    // --- Allocate space for the h_shortestDistancesGPU
    float *h_shortestDistancesGPU = (float*)malloc(sizeof(float) * graph.numVertices);
    dijkstraGPU(&graph, sourceVertex, h_shortestDistancesGPU);
    printf("\nGPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesGPU[k]);

    // --- Release every buffer allocated in this function (the adjacency matrix was previously leaked)
    free(weightMatrix);
    free(h_shortestDistancesCPU);
    free(h_shortestDistancesGPU);
    // NOTE(review): the arrays inside `graph` are set up by generateRandomGraph, whose allocation strategy is
    // not visible from here; confirm whether they need an explicit release as well.
    return 0;
}

Resources