PCL Poisson reconstruction function "NonLinearUpdateWeightContribution" does not generate unit weight - point-cloud-library

Function in pcl/1.13.0_2/include/pcl-1.13/pcl/surface/3rdparty/poisson4/multi_grid_octree_data.hpp:
// Distributes `weight` for a sample located at `position` over the 3x3x3 block of nodes
// returned by neighborKey.setNeighbors( node ), adding to each existing neighbor's
// nodeData.centerWeightContribution. Always returns 0.
template<int Degree>
int Octree<Degree>::NonLinearUpdateWeightContribution( TreeOctNode* node , const Point3D<Real>& position , Real weight )
{
TreeOctNode::Neighbors3& neighbors = neighborKey.setNeighbors( node );
// dx[i][0..2]: the three per-axis weights along axis i; x and dxdy are scratch values.
double x,dxdy,dx[DIMENSION][3];
double width;
Point3D<Real> center;
Real w;
node->centerAndWidth( center , w );
// Widen the node width to double so the per-axis arithmetic below is done in double.
width=w;
// 0.125, 0.75 and 0.125 are the values the three per-axis weights take when the sample sits
// exactly at the node center, so SAMPLE_SCALE is the reciprocal of their sum of squares.
const double SAMPLE_SCALE = 1. / ( 0.125 * 0.125 + 0.75 * 0.75 + 0.125 * 0.125 );
for( int i=0 ; i<DIMENSION ; i++ )
{
// Offset between node center and sample, in units of the node width, shifted by one node.
x = ( center[i] - position[i] - width ) / width;
dx[i][0] = 1.125 + 1.500*x + 0.500*x*x;
// Same offset without the one-node shift.
x = ( center[i] - position[i] ) / width;
dx[i][1] = 0.750 - x*x;
// Chosen so that the three weights sum to one (before the scaling below).
dx[i][2] = 1. - dx[i][1] - dx[i][0];
// Note that we are splatting along a co-dimension one manifold, so uniform point samples
// do not generate a unit sample weight.
// NOTE(review): the scale is applied to dx[i][0] only, and only after dx[i][2] has been
// derived from the unscaled value, so the three weights no longer sum to one afterwards.
dx[i][0] *= SAMPLE_SCALE;
}
// Tensor-product splat: neighbor (i,j,k) receives weight * dx[0][i] * dx[1][j] * dx[2][k].
for( int i=0 ; i<3 ; i++ ) for( int j=0 ; j<3 ; j++ )
{
dxdy = dx[0][i] * dx[1][j] * weight;
// Null neighbor slots are skipped.
for( int k=0 ; k<3 ; k++ ) if( neighbors.neighbors[i][j][k] )
neighbors.neighbors[i][j][k]->nodeData.centerWeightContribution += Real( dxdy * dx[2][k] );
}
return 0;
}
My confusions are:
why is the PCL version adding the additional SAMPLE_SCALE, and only onto the first spline coefficient?
since the implicit indicator function is defined on R^3 (ignoring discreteness), where do the co-dimension one and SAMPLE_SCALE come from?
I found the version of Poisson reconstruction shipped with MeshLab does not have this issue in meshlab/unsupported/plugins_unsupported/filter_poisson/src/MultiGridOctreeData.inl:
// Splat `weight` for the sample at `position` onto the 3x3x3 neighborhood of `node`.
// For every axis three weights are computed from the offset between the node center and the
// sample (in units of the node width); the third is defined so the three sum to one. Each
// existing neighbor (a,b,c) then receives weight * splat[0][a] * splat[1][b] * splat[2][c]
// in nodeData.centerWeightContribution. Always returns 0.
template<int Degree>
int Octree<Degree>::NonLinearUpdateWeightContribution(TreeOctNode* node,const Point3D<Real>& position,const Real& weight){
	TreeOctNode::Neighbors& nbrs=neighborKey.setNeighbors(node);

	Point3D<Real> nodeCenter;
	Real nodeWidth;
	node->centerAndWidth(nodeCenter,nodeWidth);
	const double width=nodeWidth;

	// Per-axis weights.
	double splat[DIMENSION][3];
	for(int axis=0;axis<DIMENSION;axis++){
		double t=(nodeCenter.coords[axis]-position.coords[axis]-width)/width;
		splat[axis][0]=1.125+1.500*t+0.500*t*t;
		t=(nodeCenter.coords[axis]-position.coords[axis])/width;
		splat[axis][1]=0.750 - t*t;
		splat[axis][2]=1.0-splat[axis][1]-splat[axis][0];
	}

	// Tensor-product accumulation; empty neighbor slots are skipped.
	for(int a=0;a<3;a++){
		for(int b=0;b<3;b++){
			const double planar=splat[0][a]*splat[1][b]*weight;
			for(int c=0;c<3;c++){
				if(!nbrs.neighbors[a][b][c]){continue;}
				nbrs.neighbors[a][b][c]->nodeData.centerWeightContribution+=Real(planar*splat[2][c]);
			}
		}
	}
	return 0;
}

Related

I would like to simulate gravitation between particles, but I think I forgot something?

So, for each star, i compare this one to all other stars to calculate his speed, velocity, etc.
But that didn't work. I'm not very strong in maths, and I think my formula may be wrong. I don't know why it doesn't work; here is my code:
// For each star, add up the pull exerted by every other star.
// The inner loop must visit ALL other stars: leaving it early would make each star feel
// only the first star that is not itself.
// NOTE(review): this only adds to accelerations[i]; presumably the accumulators are reset
// to zero once per frame elsewhere - confirm.
for (let i = 0; i < pos.length; i++) {
  for (let j = 0; j < pos.length; j++) {
    if (i === j) {
      continue; // a star exerts no force on itself
    }
    // Formula part
    const pull = compute_interaction(pos[i], pos[j], 1.0);
    accelerations[i].x += pull.x;
    accelerations[i].y += pull.y;
    accelerations[i].z += pull.z;
  }
}
// Integration step: velocities pick up acceleration * 0.001, then the flat position buffer
// (three consecutive slots per star: x, y, z) picks up velocity * 0.001.
for (let star = 0; star < accelerations.length; star++) {
  const velocity = speedStars[star];
  const acceleration = accelerations[star];
  velocity.x += acceleration.x * 0.001;
  velocity.y += acceleration.y * 0.001;
  velocity.z += acceleration.z * 0.001;
}
for (let star = 0; star < speedStars.length; star++) {
  const velocity = speedStars[star];
  const base = star * 3; // index of this star's x slot in starsPositions
  starsPositions[base] += velocity.x * 0.001;
  starsPositions[base + 1] += velocity.y * 0.001;
  starsPositions[base + 2] += velocity.z * 0.001;
}
// Returns a THREE.Vector3 whose components are the unit direction from currentPosition to
// positionOtherStar, each divided by
// (otherStar[axis]^2 - current[axis]^2 + smoothing_length).
// NOTE(review): each denominator is a difference of squared coordinates on a single axis,
// not the squared distance between the two stars; it can be zero or negative.
function compute_interaction(currentPosition, positionOtherStar, smoothing_length)
{
  const direction = new THREE.Vector3(
    positionOtherStar.x - currentPosition.x,
    positionOtherStar.y - currentPosition.y,
    positionOtherStar.z - currentPosition.z
  ).normalize();
  const scaled = ['x', 'y', 'z'].map((axis) =>
    direction[axis] / (Math.pow(positionOtherStar[axis],2.0) - Math.pow(currentPosition[axis],2.0)+ smoothing_length)
  );
  return new THREE.Vector3(scaled[0], scaled[1], scaled[2]);
}
Here the CodePen: https://codepen.io/n0rvel/pen/ExEXbYN?editors=0010
Here is the formula/code logic I found on one OpenCL program that works:
Probably, the compute_interaction() function should be:
// Softened inverse-square attraction exerted on the star at currentPosition by the star at
// positionOtherStar: the unit direction towards the other star divided by
// (squared distance + smoothing_length). smoothing_length keeps the result finite when the
// two stars (nearly) coincide.
function compute_interaction(currentPosition, positionOtherStar, smoothing_length)
{
//const vector = new THREE.Vector3(positionOtherStar.x - currentPosition.x, positionOtherStar.y - currentPosition.y, positionOtherStar.z - currentPosition.z).normalize();
//let x = vector.x / (Math.pow(positionOtherStar.x,2.0) - Math.pow(currentPosition.x,2.0)+ smoothing_length)
//let y = vector.y / (Math.pow(positionOtherStar.y,2.0) - Math.pow(currentPosition.y,2.0)+ smoothing_length)
//let z = vector.z / (Math.pow(positionOtherStar.z,2.0) - Math.pow(currentPosition.z,2.0)+ smoothing_length)
//return new THREE.Vector3(x, y, z);
  const offset = new THREE.Vector3().subVectors(positionOtherStar, currentPosition);
  // Vector3.normalize() modifies the vector in place, so the squared distance has to be
  // read BEFORE normalizing; afterwards lengthSq() would just report 1.
  const distanceSq = offset.lengthSq();
  return offset.normalize().divideScalar(distanceSq + smoothing_length);
}

Opencl 3D array indexing

I have 3D array (height, width, depth). My global worksize is (height * width * depth). and local work size is 1. In kernel code how I can get row offset and column offset?
I am doing the convolution operation in opencl. In C we do as follow,
// Direct convolution: one OUTPUT value per (filter c, output row h, output column w).
// INPUT is read as [channel][row][column] and OUTPUT is written as [filter][row][column],
// both flattened row-major.
// NOTE(review): FILTER is indexed by channel and window position only, so every filter c
// uses the same coefficients. If FILTER stores number_of_filters separate kernels, add the
// per-filter offset here - confirm the FILTER layout.
for(c = 0; c < number_of_filters; c++)
{
    for(h = 0; h < out_height; h++)
    {
        // Rows [vert_start, vert_end) of the input covered by this output row.
        vert_start = h * stride;
        vert_end = vert_start + f_size;
        for(w = 0; w < out_width; w++)
        {
            // Columns [hor_start, hor_end) of the input covered by this output column.
            hor_start = w * stride;
            hor_end = hor_start + f_size;
            sum = 0;
            for(c_f = 0; c_f < input_channel; c_f++)
            {
                for(h_f = vert_start; h_f < vert_end; h_f++)
                {
                    for(w_f = hor_start; w_f < hor_end; w_f++)
                    {
                        // computing convolution: (h_f, w_f) walk the INPUT window, while the
                        // FILTER is addressed relative to the window's top-left corner so its
                        // indices stay inside [0, f_size).
                        sum = sum +
                              (INPUT[(c_f * input_height * input_width) + (h_f * input_width) + w_f] *
                               FILTER[(c_f * filt_height * filt_width) + ((h_f - vert_start) * filt_width) + (w_f - hor_start)]);
                    }
                }
            }
            // storing result in output
            OUTPUT[(c * out_height * out_width) + (h * out_width) + w] = sum;
        }
    }
}
I am not getting how to get that row offset and column offset from image for convolution in opencl?

R Weighted moving average with partial averages

I am trying to code in R a (centered) weighted moving average function that returns a vector of the same size as the input vector.
The following code almost gives me what I want but it does not work for the first and last values of my vector
# Reproducible example: `len` random integers in [0, len) and a normalized 5-point kernel.
set.seed(0)
len=10
x=floor(len*runif(len))
weights=c(1,3,0,3,1)
weights=weights/sum(weights)
# Both calls only produce values where the full window fits inside x.
rollapply(x,width=length(weights), function(x) sum(x*weights),align="center")
na.omit(filter(x,sides=2,weights))
Setting partial=TRUE in the rollapply function is sort of what I want to do. Anyway it does not work since my function does not support an x of changing sizes.
I could do the latter and manually add the edge computations with a loop. It would work, but I would like to find a nicer (computationally faster) way to do it.
For a more rigorous description of my needs here is a mathematical version
r is the vector my function would return
x and the weights w as inputs :
With Rcpp, you can do:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
// Centered weighted moving average with partial windows at both ends.
//
// For every position i the weights w are centered on x[i]; only the part of the window that
// overlaps x is used, and the result is the weighted sum divided by the sum of the weights
// actually used. The output has the same length as x.
//
// x: input values.
// w: window weights; must have an odd, non-zero length so that the window has a center.
//    Raises an R error otherwise.
// Inputs shorter than the window are handled: the window is simply clipped on both sides.
NumericVector roll_mean(const NumericVector& x,
                        const NumericVector& w) {
  const int n = x.size();
  const int w_size = w.size();
  if (w_size == 0 || w_size % 2 == 0) {
    Rcpp::stop("roll_mean: `w` must have an odd, non-zero length");
  }
  const int size = (w_size - 1) / 2;  // number of neighbours on each side of the center
  NumericVector res(n);
  for (int i = 0; i < n; i++) {
    // Clip the window [i - size, i + size] to the valid index range of x.
    const int first = (i - size < 0) ? 0 : i - size;
    const int last = (i + size > n - 1) ? n - 1 : i + size;
    double tmp_wsum = 0, tmp_xwsum = 0;
    for (int ind_x = first; ind_x <= last; ind_x++) {
      // w[size] is the weight of the center element x[i].
      const double tmp_w = w[ind_x - i + size];
      tmp_wsum += tmp_w;
      tmp_xwsum += x[ind_x] * tmp_w;
    }
    // Normalize by the weights that were actually applied (all of them in the interior).
    res[i] = tmp_xwsum / tmp_wsum;
  }
  return res;
}
I use this function in one of my packages.
Just put that in a .cpp file and source it with Rcpp::sourceCpp.

Arduino: float function returns inf

I have a function (shown below) that I need some advice on. The function returns the slope of a line which is fit (via the least squares method) to n data points. To give you a context, my project is a barometric pressure based altimeter which uses this function to determine velocity based on the n most recent altitude-time pairs. These altitude-time pairs are stored in 2 global arrays(times[] and alts[]).
My problem is not that this method doesn't work. It usually does. But sometimes I will run the altimeter and this function will return the value 'inf' interspersed with a bunch of other wrong values (I have also seen 'NaN' but that is more rare). There are a few areas of suspicion I have at this point but I would like a fresh perspective. Here is some further contextual information that may or may not be of use:
I am using interrupts for a quadrature encoder
The times[] array is of type unsigned long
The alts[] array is of type float
n is a const int, in this case n = 9
On the ATMEGA328 a double is the same as a float.. Arduino-double
// Least-squares slope of alts[] against times[] over the n stored samples, used as the
// velocity at that moment. Times are divided by 1000 (and squared times by 1000000) while
// accumulating.
// NOTE(review): the denominator n*sumT2 - sumT*sumT is not checked against zero.
float velF() {
  float sumTY = 0;
  float sumT = 0;
  float sumY = 0;
  float sumT2 = 0;
  for (int i = 0; i < n; i++) {
    const float t = (float)times[i];
    const float y = alts[i];
    sumTY += t * y / 1000;
    sumT += t / 1000;
    sumY += y;
    sumT2 += t * t / 1000000;
  }
  const float numerator = n*sumTY - sumT*sumY;
  const float denominator = n*sumT2 - sumT*sumT;
  return numerator / denominator;
}
Any help or advice would be greatly appreciated!
Code is certainly performing division by zero.
For a variety of reasons, n*sumT2 - sumT*sumT can be zero (see @John Bollinger's comment). In most of these cases, the numerator (dividend) of the division will also be zero, and a return value of zero would be acceptable.
// Least-squares slope of alts[] against times[] over the n stored samples.
// Each sample is range-checked with assert() before it is accumulated, and a zero
// denominator yields 0 instead of a division by zero.
float velF(void) {
  float sumTY = 0;
  float sumT = 0;
  float sumY = 0;
  float sumT2 = 0;
  for (size_t i = 0; i < n; i++) {
    // ensure values are reasonable
    assert(alts[i] >= ALT_MIN && alts[i] <= ALT_MAX);
    assert(times[i] >= TIME_MIN && times[i] <= TIME_MAX);
    const float t = (float)times[i];
    const float y = alts[i];
    sumTY += t * y / 1000;
    sumT += t / 1000;
    sumY += y;
    sumT2 += t * t / 1000000;
  }
  const float d = n*sumT2 - sumT*sumT;
  return (d == 0) ? 0 : (n*sumTY - sumT*sumY) / d;
}
Side note: could factor out the division for improved accuracy and speed. Suggest performing the last calculation as double.
// Least-squares slope of alts[] against times[] over the n stored samples; the result is
// multiplied by 1000 at the end (the per-sample /1000 of the earlier versions, factored out).
// Returns 0 when the denominator is exactly zero (e.g. all timestamps equal).
//
// Timestamps are taken relative to times[0] before being squared and summed. The slope does
// not depend on where the time origin is, but the arithmetic does: with absolute timestamps
// n*sumT2 and sumT*sumT are two huge, nearly equal numbers, and their difference loses all
// significant digits in single precision. Working with small offsets avoids that.
float velF(void) {
  float sumTY = 0, sumT = 0, sumY = 0, sumT2 = 0;
  const unsigned long t0 = times[0];
  for (int i = 0; i < n; i++) {
    // Unsigned subtraction followed by a signed cast gives the correct signed offset even if
    // the sample is older than times[0] or the counter wrapped between the two samples.
    float tf = (float)(long)(times[i] - t0);
    sumTY += tf * alts[i];
    sumT += tf;
    sumY += alts[i];
    sumT2 += tf * tf;
  }
  // Final combination in double (the question notes double is the same as float on the
  // ATmega328, so the origin shift above is what actually protects the result there).
  double nd = n;
  double sumTd = sumT;
  double d = nd*sumT2 - sumTd*sumTd;
  if (d == 0) return 0;
  return (nd*sumTY - sumTd*sumY)*1000 / d;
}

Dijkstra's algorithm in CUDA

I am having troubles with this piece of CUDA code I have written. This is supposed to be the CUDA implementation of the Dijkstra's algorithm. The code is as follows:
// Relaxation step for one vertex per block (tid = blockIdx.x).
//   Va : vertex coordinates, 3 floats per vertex (x, y, z)
//   Ea : concatenated neighbor lists
//   Sa : per vertex, [2*tid] = start offset into Ea, [2*tid + 1] = number of edges
//   Ca : current cost per vertex
//   Ua : not used by this kernel
//   Ma : per-vertex mask, '1' = vertex must be processed
//   lock : global spin lock word, 0 = free, 1 = held
// If the vertex is flagged, its flag is cleared and, under the global lock, every neighbor
// whose cost can be lowered through this vertex (edge weight = Euclidean distance) is
// updated and flagged.
// The whole edge loop runs inside the critical section, and the lock is released exactly
// once after the loop. This keeps every relaxation serialized and also lets a vertex with
// zero edges leave the spin loop.
// NOTE(review): indexing by blockIdx.x alone assumes one thread per block - confirm the
// launch configuration.
__global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){
    int tid = blockIdx.x;
    if(Ma[tid]=='1'){
        Ma[tid] = '0';
        int ind_Ea = Sa[tid * 2];
        int num_edges = Sa[(tid * 2) + 1];
        unsigned int leaveloop = 0u;
        while(leaveloop==0u){
            // Try to take the lock; only the thread that swaps 0 -> 1 enters.
            if(atomicExch(lock, 1u) == 0u){
                for(int v = 0; v < num_edges; v++){
                    int nid = Ea[ind_Ea + v];
                    float dx = Va[tid * 3] - Va[nid * 3];
                    float dy = Va[(tid * 3) + 1] - Va[(nid * 3) + 1];
                    float dz = Va[(tid * 3) + 2] - Va[(nid * 3) + 2];
                    float wt = sqrtf(dx * dx + dy * dy + dz * dz);
                    if(Ca[nid] > (Ca[tid] + wt)){
                        Ca[nid] = Ca[tid] + wt;
                        Ma[nid] = '1';
                    }
                }
                // Make the cost/mask writes visible before the lock is handed over.
                __threadfence();
                leaveloop = 1u;
                atomicExch(lock, 0u);
            }
        }
    }
}
The problem is in the relaxation phase of the Dijkstra's algorithm. I have implemented such a phase as a critical section. If there is a vertex (lets say a) which is a neighbor of more than one vertex (i.e., connecting to other vertices with edges), then all of the threads for those vertices will try to write to the location of vertex a in the Cost Array Ca. Now my goal is to have the smaller value written in that location. To do that, I am trying to serialize the process and applying __threadfence() as well so that value written by one thread is visible to others and then eventually the smaller value is retained in the location of vertex a. But the problem is, that this logic is not working. The location of vertex a does not get the smallest value of all the threads trying to write to that location and I don't understand why. Any help will be highly appreciated.
There is a "classical" (at least, mostly referenced) implementation of Dijkstra's Single-Source Shortest Path (SSSP) algorithm on the GPU contained in the paper
Accelerating large graph algorithms on the GPU using CUDA by Parwan Harish and P.J. Narayanan
However, the implementation in that paper has been recognized to be bugged, see
CUDA Solutions for the SSSP Problem by Pedro J. Martín, Roberto Torres, and Antonio Gavilanes
I'm reporting below the implementation suggested in the first paper fixed according to the remark of the second. The code also contains a C++ version.
#include <sstream>
#include <vector>
#include <iostream>
#include <stdio.h>
#include <float.h>
#include "Utilities.cuh"
#define NUM_ASYNCHRONOUS_ITERATIONS 20 // Number of async loop iterations before attempting to read results back
#define BLOCK_SIZE 16
/***********************/
/* GRAPHDATA STRUCTURE */
/***********************/
// --- The graph data structure is an adjacency list.
// Adjacency-list graph.
struct GraphData {
    // For each vertex, the offset of its first edge in edgeArray / weightArray.
    int *vertexArray;
    // Number of vertices.
    int numVertices;
    // For each edge, the "destination" vertex it is attached to.
    int *edgeArray;
    // Number of edges.
    int numEdges;
    // For each edge, its weight.
    float *weightArray;
};
/**********************************/
/* GENERATE RANDOM GRAPH FUNCTION */
/**********************************/
// Fills `graph` with a random directed graph in which every vertex has exactly
// neighborsPerVertex outgoing edges to distinct vertices other than itself. Edge weights
// are multiples of 0.001 in [0, 0.999].
//
// graph              : output; vertexArray, edgeArray and weightArray are malloc'ed here and
//                      are owned by the caller afterwards.
// numVertices        : number of vertices.
// neighborsPerVertex : out-degree of every vertex; must be smaller than numVertices because
//                      a vertex needs that many distinct neighbors besides itself. The
//                      program is terminated with an error message otherwise (the neighbor
//                      search could never finish).
void generateRandomGraph(GraphData *graph, int numVertices, int neighborsPerVertex) {
    if (numVertices < 0 || neighborsPerVertex < 0 ||
        (numVertices > 0 && neighborsPerVertex >= numVertices)) {
        fprintf(stderr, "generateRandomGraph: need numVertices >= 0 and 0 <= neighborsPerVertex < numVertices (got numVertices = %d, neighborsPerVertex = %d)\n",
                numVertices, neighborsPerVertex);
        exit(EXIT_FAILURE);
    }
    graph -> numVertices = numVertices;
    graph -> vertexArray = (int *)malloc(graph -> numVertices * sizeof(int));
    graph -> numEdges = numVertices * neighborsPerVertex;
    graph -> edgeArray = (int *)malloc(graph -> numEdges * sizeof(int));
    graph -> weightArray = (float *)malloc(graph -> numEdges * sizeof(float));
    // Every vertex owns a contiguous run of neighborsPerVertex edges.
    for (int i = 0; i < graph -> numVertices; i++) graph -> vertexArray[i] = i * neighborsPerVertex;
    // Neighbors already chosen for the vertex currently being filled.
    int *tempArray = (int *)malloc(neighborsPerVertex * sizeof(int));
    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < neighborsPerVertex; l++) {
            bool goOn = false;
            int temp;
            // Draw until we hit a vertex that is neither k itself nor already a neighbor.
            while (goOn == false) {
                goOn = true;
                temp = (rand() % graph->numVertices);
                for (int t = 0; t < l; t++)
                    if (temp == tempArray[t]) goOn = false;
                if (temp == k) goOn = false;
                if (goOn == true) tempArray[l] = temp;
            }
            graph -> edgeArray [k * neighborsPerVertex + l] = temp;
            graph -> weightArray[k * neighborsPerVertex + l] = (float)(rand() % 1000) / 1000.0f;
        }
    }
    free(tempArray);
}
/************************/
/* minDistance FUNCTION */
/************************/
// --- Finds the vertex with minimum distance value, from the set of vertices not yet included in shortest path tree
// Returns the index of the not-yet-finalized vertex with the smallest distance. On ties the
// highest index wins; if every vertex is finalized, sourceVertex is returned.
int minDistance(float *shortestDistances, bool *finalizedVertices, const int sourceVertex, const int N) {
    int best = sourceVertex;
    float bestDistance = FLT_MAX;
    for (int v = 0; v < N; ++v) {
        if (finalizedVertices[v]) continue;
        if (!(shortestDistances[v] <= bestDistance)) continue;
        bestDistance = shortestDistances[v];
        best = v;
    }
    return best;
}
/************************/
/* dijkstraCPU FUNCTION */
/************************/
// Reference CPU Dijkstra on an N x N row-major weight matrix.
//
// graph               : graph[u * N + v] is the weight of the edge u -> v; an entry equal
//                       to 0 is treated as "no edge".
// h_shortestDistances : output, N floats; receives the shortest distance from sourceVertex
//                       to every vertex (FLT_MAX for vertices that were never reached).
// sourceVertex        : index of the start vertex.
// N                   : number of vertices.
// NOTE(review): because 0 means "no edge", genuine zero-weight edges are ignored here.
void dijkstraCPU(float *graph, float *h_shortestDistances, int sourceVertex, const int N) {
    // --- h_finalizedVertices[i] is true if vertex i is included in the shortest path tree
    //     or the shortest distance from the source node to i is finalized
    bool *h_finalizedVertices = (bool *)malloc(N * sizeof(bool));
    // --- Initialize all distances as infinite and all finalized flags as false
    for (int i = 0; i < N; i++) h_shortestDistances[i] = FLT_MAX, h_finalizedVertices[i] = false;
    // --- The distance of the source vertex from itself is always 0
    h_shortestDistances[sourceVertex] = 0.f;
    // --- Dijkstra iterations
    for (int iterCount = 0; iterCount < N - 1; iterCount++) {
        // --- Selecting the minimum distance vertex from the set of vertices not yet
        //     processed. currentVertex is always equal to sourceVertex in the first iteration.
        int currentVertex = minDistance(h_shortestDistances, h_finalizedVertices, sourceVertex, N);
        // --- Mark the current vertex as processed
        h_finalizedVertices[currentVertex] = true;
        // --- Relaxation loop
        for (int v = 0; v < N; v++) {
            // --- Update the distance of v only if v is not finalized, there is an edge
            //     from currentVertex to v, and the path from the source to v through
            //     currentVertex is shorter than the current value of h_shortestDistances[v]
            if (!h_finalizedVertices[v] &&
                graph[currentVertex * N + v] &&
                h_shortestDistances[currentVertex] != FLT_MAX &&
                h_shortestDistances[currentVertex] + graph[currentVertex * N + v] < h_shortestDistances[v])
                h_shortestDistances[v] = h_shortestDistances[currentVertex] + graph[currentVertex * N + v];
        }
    }
    // --- Release the scratch flags
    free(h_finalizedVertices);
}
/***************************/
/* MASKARRAYEMPTY FUNCTION */
/***************************/
// --- Check whether all the vertices have been finalized. This tells the algorithm whether it needs to continue running or not.
// Returns true when no entry of the mask is set (including an empty mask), false as soon as
// one set entry is found.
bool allFinalizedVertices(bool *finalizedVertices, int numVertices) {
    int idx = 0;
    while (idx < numVertices) {
        if (finalizedVertices[idx]) return false;
        ++idx;
    }
    return true;
}
/*************************/
/* ARRAY INITIALIZATIONS */
/*************************/
// One thread per vertex: the source vertex gets its flag set and both distances set to 0;
// every other vertex gets its flag cleared and both distances set to FLT_MAX.
__global__ void initializeArrays(bool * __restrict__ d_finalizedVertices, float* __restrict__ d_shortestDistances, float* __restrict__ d_updatingShortestDistances,
                                 const int sourceVertex, const int numVertices) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the last vertex have nothing to do.
    if (tid >= numVertices) return;
    const bool isSource = (sourceVertex == tid);
    const float initialDistance = isSource ? 0.f : FLT_MAX;
    d_finalizedVertices[tid] = isSource;
    d_shortestDistances[tid] = initialDistance;
    d_updatingShortestDistances[tid] = initialDistance;
}
/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
// One thread per vertex. A vertex whose flag is set clears the flag and offers each of its
// neighbors the distance "own shortest distance + edge weight"; the offer is merged into
// updatingShortestDistances with atomicMin.
// NOTE(review): atomicMin on float operands is not one of CUDA's built-in overloads;
// presumably it comes from Utilities.cuh - confirm.
__global__ void Kernel1(const int * __restrict__ vertexArray, const int* __restrict__ edgeArray,
                        const float * __restrict__ weightArray, bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances,
                        float * __restrict__ updatingShortestDistances, const int numVertices, const int numEdges) {
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid >= numVertices) return;
    if (!finalizedVertices[tid]) return;
    finalizedVertices[tid] = false;
    // Edges of this vertex occupy [firstEdge, pastLastEdge); the last vertex runs to numEdges.
    const int firstEdge = vertexArray[tid];
    const int pastLastEdge = (tid + 1 < (numVertices)) ? vertexArray[tid + 1] : numEdges;
    for (int e = firstEdge; e < pastLastEdge; ++e) {
        const int neighbor = edgeArray[e];
        atomicMin(&updatingShortestDistances[neighbor], shortestDistances[tid] + weightArray[e]);
    }
}
/**************************/
/* DIJKSTRA GPU KERNEL #2 */
/**************************/
// One thread per vertex. If the pending distance is smaller than the stored shortest
// distance, it is adopted and the vertex is flagged; in every case the pending distance is
// then reset to the (possibly updated) shortest distance.
__global__ void Kernel2(const int * __restrict__ vertexArray, const int * __restrict__ edgeArray, const float* __restrict__ weightArray,
                        bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, float* __restrict__ updatingShortestDistances,
                        const int numVertices) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= numVertices) return;
    const float pending = updatingShortestDistances[tid];
    if (shortestDistances[tid] > pending) {
        shortestDistances[tid] = pending;
        finalizedVertices[tid] = true;
    }
    updatingShortestDistances[tid] = shortestDistances[tid];
}
/************************/
/* dijkstraGPU FUNCTION */
/************************/
// Host driver for the GPU single-source shortest path computation.
//   graph               : host-side adjacency list (vertexArray / edgeArray / weightArray).
//   sourceVertex        : index of the start vertex.
//   h_shortestDistances : host output buffer of graph->numVertices floats.
// Copies the graph to the device, initializes the mask and distance arrays, then alternates
// Kernel1 and Kernel2 until the mask read back from the device has no entry set, and finally
// copies the distances back and releases every buffer allocated here.
void dijkstraGPU(GraphData *graph, const int sourceVertex, float * __restrict__ h_shortestDistances) {
// --- Create device-side adjacency-list, namely, vertex array Va, edge array Ea and weight array Wa from G(V,E,W)
int *d_vertexArray; gpuErrchk(cudaMalloc(&d_vertexArray, sizeof(int) * graph -> numVertices));
int *d_edgeArray; gpuErrchk(cudaMalloc(&d_edgeArray, sizeof(int) * graph -> numEdges));
float *d_weightArray; gpuErrchk(cudaMalloc(&d_weightArray, sizeof(float) * graph -> numEdges));
// --- Copy adjacency-list to the device
gpuErrchk(cudaMemcpy(d_vertexArray, graph -> vertexArray, sizeof(int) * graph -> numVertices, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_edgeArray, graph -> edgeArray, sizeof(int) * graph -> numEdges, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_weightArray, graph -> weightArray, sizeof(float) * graph -> numEdges, cudaMemcpyHostToDevice));
// --- Create mask array Ma, cost array Ca and updating cost array Ua of size V
bool *d_finalizedVertices; gpuErrchk(cudaMalloc(&d_finalizedVertices, sizeof(bool) * graph->numVertices));
float *d_shortestDistances; gpuErrchk(cudaMalloc(&d_shortestDistances, sizeof(float) * graph->numVertices));
float *d_updatingShortestDistances; gpuErrchk(cudaMalloc(&d_updatingShortestDistances, sizeof(float) * graph->numVertices));
// --- Host copy of the mask, used only to decide when to stop iterating
bool *h_finalizedVertices = (bool *)malloc(sizeof(bool) * graph->numVertices);
// --- Initialize the mask Ma, the cost array Ca and the updating cost array Ua on the device
//     (see initializeArrays for the values written for the source and for all other vertices)
initializeArrays <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_finalizedVertices, d_shortestDistances,
d_updatingShortestDistances, sourceVertex, graph -> numVertices);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Read mask array from device -> host
gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));
// --- Keep iterating while at least one mask entry is still set
while (!allFinalizedVertices(h_finalizedVertices, graph->numVertices)) {
// --- In order to improve performance, we run some number of iterations without reading the mask back. This might result
// in running more iterations than necessary at times, but it will in most cases be faster because we are doing fewer
// device -> host copies of the mask.
for (int asyncIter = 0; asyncIter < NUM_ASYNCHRONOUS_ITERATIONS; asyncIter++) {
// --- Relaxation: flagged vertices offer new distances to their neighbors
Kernel1 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances,
d_updatingShortestDistances, graph->numVertices, graph->numEdges);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Commit: adopt improved distances and flag the vertices that changed
Kernel2 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances, d_updatingShortestDistances,
graph->numVertices);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
// --- Refresh the host copy of the mask for the termination test
gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));
}
// --- Copy the result to host
gpuErrchk(cudaMemcpy(h_shortestDistances, d_shortestDistances, sizeof(float) * graph->numVertices, cudaMemcpyDeviceToHost));
// --- Release the host mask copy and all device buffers
free(h_finalizedVertices);
gpuErrchk(cudaFree(d_vertexArray));
gpuErrchk(cudaFree(d_edgeArray));
gpuErrchk(cudaFree(d_weightArray));
gpuErrchk(cudaFree(d_finalizedVertices));
gpuErrchk(cudaFree(d_shortestDistances));
gpuErrchk(cudaFree(d_updatingShortestDistances));
}
/****************/
/* MAIN PROGRAM */
/****************/
// Builds a small random graph, prints it as adjacency list and adjacency matrix, runs the
// CPU and the GPU shortest-path routines from the same source vertex and prints both result
// sets so they can be compared. All host buffers allocated along the way are released
// before returning.
int main() {
    // --- Number of graph vertices
    int numVertices = 8;
    // --- Number of edges per graph vertex
    int neighborsPerVertex = 6;
    // --- Source vertex
    int sourceVertex = 0;
    // --- Allocate memory for arrays
    GraphData graph;
    generateRandomGraph(&graph, numVertices, neighborsPerVertex);
    // --- From adjacency list to adjacency matrix.
    //     Initializing the adjacency matrix: FLT_MAX marks "no edge"
    float *weightMatrix = (float *)malloc(numVertices * numVertices * sizeof(float));
    for (int k = 0; k < numVertices * numVertices; k++) weightMatrix[k] = FLT_MAX;
    // --- Displaying the adjacency list and constructing the adjacency matrix
    printf("Adjacency list\n");
    // --- Zero on the diagonal: no cost from a vertex to itself
    for (int k = 0; k < numVertices; k++) weightMatrix[k * numVertices + k] = 0.f;
    for (int k = 0; k < numVertices; k++)
        for (int l = 0; l < neighborsPerVertex; l++) {
            weightMatrix[k * numVertices + graph.edgeArray[graph.vertexArray[k] + l]] = graph.weightArray[graph.vertexArray[k] + l];
            printf("Vertex nr. %i; Edge nr. %i; Weight = %f\n", k, graph.edgeArray[graph.vertexArray[k] + l],
                   graph.weightArray[graph.vertexArray[k] + l]);
        }
    // --- Raw dump of the edge and weight arrays
    for (int k = 0; k < numVertices * neighborsPerVertex; k++)
        printf("%i %i %f\n", k, graph.edgeArray[k], graph.weightArray[k]);
    // --- Displaying the adjacency matrix
    printf("\nAdjacency matrix\n");
    for (int k = 0; k < numVertices; k++) {
        for (int l = 0; l < numVertices; l++)
            if (weightMatrix[k * numVertices + l] < FLT_MAX)
                printf("%1.3f\t", weightMatrix[k * numVertices + l]);
            else
                printf("--\t");
        printf("\n");
    }
    // --- Running Dijkstra on the CPU
    float *h_shortestDistancesCPU = (float *)malloc(numVertices * sizeof(float));
    dijkstraCPU(weightMatrix, h_shortestDistancesCPU, sourceVertex, numVertices);
    printf("\nCPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesCPU[k]);
    // --- Allocate space for the h_shortestDistancesGPU
    float *h_shortestDistancesGPU = (float*)malloc(sizeof(float) * graph.numVertices);
    dijkstraGPU(&graph, sourceVertex, h_shortestDistancesGPU);
    printf("\nGPU results\n");
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesGPU[k]);
    // --- Release every host allocation, including the graph arrays created by
    //     generateRandomGraph and the adjacency matrix
    free(h_shortestDistancesCPU);
    free(h_shortestDistancesGPU);
    free(weightMatrix);
    free(graph.vertexArray);
    free(graph.edgeArray);
    free(graph.weightArray);
    return 0;
}

Resources