opencl kernel implementing a simple mathematical formula

opencl kernel implementing a simple mathematical formula - opencl

What are the best practices to consider when implementing an error function defined as
using an OpenCL kernel?
A, B and C are 3D float arrays and \delta is the Kronecker delta.
Typical values for (N, M) = (2, 7) or (N, M) = (3, 23).
The naive implementation (given below) is by several orders of magnitude slower than the CPU version.
Thanks,
T.
__kernel void cl_bilinear_alg(
__global float * A,
__global float * B,
__global float * C,
__global const int M,
__global const int N,
__global float * R)
{
int index = get_global_id(0);
int N2 = N * N;
int mat_offset = index * N2 * M;
float s1, s2, err = 0.0f;
for (int i = 0; i < N; ++i)
{
for (int j = 0; j < N; ++j)
{
for (int k = 0; k < N; ++k)
{
for (int l = 0; l < N; ++l)
{
for (int m = 0; m < N; ++m)
{
for (int n = 0; n < N; ++n)
{
s1 = (n == i) * (j == k) * (l == m);
s2 = 0;
for (int r = 0; r < M; ++r)
{
s2 += A[mat_offset + r * N2 + i * N + j] *
B[mat_offset + r * N2 + k * N + l] *
C[mat_offset + r * N2 + m * N + n];
}
err += (s2 - s1) * (s2 - s1);
}
}
}
}
}
}
R[index] = err;
}
UPDATE
The primary target is a Geforce GTX 570, though this could change in the future.
UPDATE2
After vectorizing the code, moving bits to local memory, unrolling some loops and passing precomputed Kronecker products explicitly to the kernel the code looks as follows:
__kernel void cl_bilinear_alg(__global const float * A,
__global const float * B,
__global const float * C,
__global const int N,
__global const int M,
__global const float * kron,
__global float * R)
{
__private int index = get_global_id(0);
__private int cM = ceil(M / 4.0f);
__private int N2 = N*N;
__private int N4 = N2*N2;
__private int mat_offset = index * N2 * M;
__private float s1, s2, err = 0;
__private float4 vzero = (float4) (0.0f, 0.0f, 0.0f, 0.0f);
__local float4 va[54], vb[54], vc[54];
for (int ij = 0, k = 0; ij < N2; ++ij)
{
int r = 0;
for (; r < M / 4; r += 4, ++k)
{
int idx0 = mat_offset + N2 * r + ij;
int idx1 = mat_offset + N2 * (r + 1) + ij;
int idx2 = mat_offset + N2 * (r + 2) + ij;
int idx3 = mat_offset + N2 * (r + 3) + ij;
va[k] = (float4) (A[idx0], A[idx1], A[idx2], A[idx3]);
vb[k] = (float4) (B[idx0], B[idx1], B[idx2], B[idx3]);
vc[k] = (float4) (C[idx0], C[idx1], C[idx2], C[idx3]);
}
if (M % 4)
{
float buffa[4] = {0}, buffb[4] = {0}, buffc[4] = {0};
for (; r < M; ++r)
{
int idx = mat_offset + N2 * r + ij;
buffa[r % 4] = A[idx];
buffb[r % 4] = B[idx];
buffc[r % 4] = C[idx];
}
va[k] = vload4(0, buffa);
vb[k] = vload4(0, buffb);
vc[k++] = vload4(0, buffc);
}
}
for (int ij = 0; ij < N2; ++ij)
{
for (int kl = 0; kl < N2; ++kl)
{
for (int mn = 0; mn < N2; ++mn)
{
s1 = kron[ij * N4 + kl * N2 + mn];
s2 = 0;
for (int r = 0; r < cM; ++r)
s2 += dot(va[cM * ij + r], mad(vb[cM * kl + r], vc[cM * mn + r], vzero));
//the most expensive line
err += (s2 - s1) * (s2 - s1);
}
}
}
R[index] = err;
}
By applying these changes a 4x speed increase was observed compared to the naive implementation. Furthermore, it was revealed that the most expensive line of all is the error update, i.e.
err += (s2 - s1) * (s2 - s1);
Any suggestions?

Typically you'd want to break some of those loops up... a lot...
- the outer loops become split over multiple workgroups, which run on their own compute unit (there are around 16 compute units per GPU, not many)
- the next few loops would be split over different threads within each workgroup
If you try to run all the calculations all at the same time, they will all try to load the data into memory at the same time, and this will simply thrash horribly. GPUs have very limited memory. Sure, the global memory sounds large enough, several gigabytes, but the global GPU memory is slow. You want to get the data into the local memory, which is per compute unit, and is of the order of 32-64KB, not much more than that.
You'd typically want to somehow divide your task into very small tasks, and do the following, for each workgroup:
load a chunk of memory from global memory into local memory
the whole workgroup warp of threads can participate in doing the copy, using coallesced access
do work on this memory, like doing some sums, and so on
write the results back to global memory
then, can either iterate a bit, or simply exit, and leave other workgroups to handle other bits of the work
On the CPU, the mathematical operations tend to be a major bottleneck, but on the GPU, generally the cores are mostly spinning uselessly, whilst waiting for data to gradually get to them, from global memory. Whatever you can do to optimize this process, prevent conflicting demands, and so on, will make the kernel significantly faster.

Related

PyOpenCL - not seeing expected speedup

In experimenting with PyOpenCL, I noticed my code was running slower than expected. It turned out that it ran faster on CPU than on GPU (running on PyOpenCL in both cases, achieving just 1 GFLOP).
To debug this, I then tried naive matrix multiplication as a comparison, and only see a 2x speedup on GPU vs CPU (~20 GFLOPs vs ~10 GFLOPs). My system is i7 8750H + GTX 1070 Max-Q.
Does anyone have any thoughts they could share about what I might be doing wrong? I know that the code below is not optimal, but I would have expected that with the much increased floating point capability and memory bandwidth of my GPU there would be a bigger difference.
import pyopencl as cl
import pyopencl.array as pycl_array
import numpy as np
import numpy.linalg as la
import time
size = 4000
m1 = np.random.normal(size = [size,size]).astype(np.float32)
m2 = np.random.normal(size = [size,size]).astype(np.float32)
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)
a = pycl_array.to_device(queue, m1)
b = pycl_array.to_device(queue, m2)
res = pycl_array.empty_like(a)
prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
res[size * i + j] = 0;
for (int k = 0; k < size; k++)
{
res[size * i + j] += a[k + size * j] * b[i + size * k];
}
}
""").build()
t = time.time()
task = prg.multiplymatrices(queue, m1.shape, None, np.int32(size), a.data, b.data, res.data)
task.wait()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))

Following the suggestion to use a local register to accumulate the results, I modified my code as follows, getting about 90 gflops at about 360 GB/s of memory bandwidth (which is the maximum bandwidth my GPU is capable of). Improving the gflops would require a more sophisticated matrix multiplication algorithm which reuses the same data stored in cache multiple times, but is outside the scope of this question.
__kernel void multiplymatrices(const unsigned int size, __global const float * a,
__global const float * b, __global float * res) {
int i = get_global_id(0);
int j = get_global_id(1);
float temp = 0;
for (int k = 0; k < size; k++)
{
temp += a[k + size * j] * b[i + size * k];
}
res[size * i + j] = temp;
}
EDIT: For those looking for an example of fast matrix multiplication, which showcases using local memory with workgroups as well as 2D register tiling, I have created the below based on the tutorial here. It gets 1.4 TFLOPs on my GPU.
prg4 = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global const float * A,
__global const float * B, __global float * res) {
int ig = get_group_id(0);
int jg = get_group_id(1);
int il = get_local_id(0);
int jl = get_local_id(1);
const int memtile = 64;
const int regtile = 4;
volatile int il2;
volatile int jl2;
int iglob = memtile*ig + regtile*il;
int jglob = memtile*jg + regtile*jl;
__local float Asub[64][64];
__local float Bsub[64][64];
float acc[4][4];
float Areg;
float Breg[4];
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
acc[k][m] = 0;
}
}
for (int l = 0; l < size/memtile; l++) {
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
il2 = il*regtile + k;
jl2 = jl*regtile + m;
Asub[il2][jl2] = A[size*(iglob + k) + memtile*l + jl2];
Bsub[il2][jl2] = B[size*(memtile*l + il2) + jglob + m];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < regtile; k++) {
for (int r = 0; r < regtile; r++) {
Breg[r] = Bsub[il*regtile+k][jl*regtile+r];
}
for (int m = 0; m < regtile; m++) {
Areg = Asub[il*regtile+m][jl*regtile+k];
for (int r = 0; r < regtile; r++) {
acc[k][m] += Areg*Breg[r];
}
}
}
}
for (int k = 0; k < regtile; k++) {
for (int m = 0; m < regtile; m++) {
res[size*(iglob+k)+jglob+m] = acc[k][m];
}
}
}
""").build()
t = time.time()
memtile = 64
regtile = 4
wgsize = int(memtile/regtile)
global_size = int(size/regtile)
task = prg4.multiplymatrices(queue, (global_size,global_size), (wgsize,wgsize), np.int32(size), a.data, b.data, res.data)
queue.finish()
tot_time = time.time()-t
print("gflops", 2*size**3/(tot_time*1000**3))
print("GB/s total", 2*4*size**3/(tot_time*1000**3))
print("GB/s global", 2*4*size**3/(memtile*tot_time*1000**3))

OpenCL Matrix Multiplication Altera Example

I am very new to OpenCL and am going through the Altera OpenCL examples.
In their matrix multiplication example, they have used the concept of blocks, where dimensions of the input matrices are multiple of block size. Here's the code:
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
// Widths of matrices.
int A_width, int B_width)
{
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
}
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
Assume block size is 2, then: block_x and block_y are both 0; and local_x and local_y are both 0.
Then A_local[0][0] would be A[0] and B_local[0][0] would be B[0].
Sizes of A_local and B_local are 4 elements each.
In that case, how would A_local and B_local access other elements of the block in that iteration?
Also would separate threads/cores be assigned for each local_x and local_y?

There is definitely a barrier missing in your code sample. The outer for loop as you have it will only produce correct results if all work items are executing instructions in lockstep fashion, thus guaranteeing the local memory is populated before the for k loop.
Maybe this is the case for Altera and other FPGAs, but this is not correct for CPUs and GPUs.
You should add barrier(CLK_LOCAL_MEM_FENCE); if you are getting unexpected results, or want to be compatible with other type of hardware.
float running_sum = 0.0f;
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
}
}

A_local and B_local are both shared by all work items of the work group, so all their elements are loaded in parallel (by all work items of the work group) at each step of the encompassing for loop.
Then each work item uses some of the loaded values (not necessarily the values the work item loaded itself) to do its share of the computation.
And finally, the work item stores its individual result into the global output matrix.
It is a classical tiled implementation of a matrix-matrix multiplication. However, I'm really surprised not to see any sort of call to a memory synchronisation function, such as work_group_barrier(CLK_LOCAL_MEM_FENCE) between the load of A_local and B_local and their use in the k loop... But I might very well have overlooked something here.

Different results GPU & CPU when more than one 8 work items per group

I'm new in open cl. And tried as my first work to write code that checks intersection between many polylines to single polygon.
I'm running the code in both cpu and gpu.. and get different results.
First I sent NULL as local parameter when called clEnqueueNDRangeKernel.
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, null, 2, &evtCalcBounds, &evtKernel);
After trying many things i saw that if i send 1 as local it is working good. and returning the same results for the cpu and gpu.
size_t local = 1;
clEnqueueNDRangeKernel(command_queue, kIntersect, 1, NULL, &global, &local, 2, &evtCalcBounds, &evtKernel);
Played abit more and found that the cpu returns false result when i run the kernel with local 8 or more (for some reason).
I'm not using any local memory, just globals and privates.
I didn't added the code because i think it is irrelevant to the problem (note that for single work group it is working good), and it is long. If it is needed, i will try to simplify it.
The code flow is going like this:
I have polylines coordinates stored in a big buffer. and the single polygon in another. In addition i'm providing another buffer with single int that holds the current results count. All buffers are __global arguments.
In the kernel i'm simply checking intersection between all the lines of the "polyline[get_global(0)]" with the lines of the polygon. If true,
i'm using atomic_inc for the results count. There is no read and write memory from the same buffer, no barriers or mem fences,... the atomic_inc is the only thread safe mechanism i'm using.
-- UPDATE --
Added my code:
I know that i can maybe have better use of open cl functions for calculating some vectors, but for now, i'm simply convert code from my old regular CPU single threaded program to CL. so this is not my concern now.
bool isPointInPolygon(float x, float y, __global float* polygon) {
bool blnInside = false;
uint length = convert_uint(polygon[4]);
int s = 5;
uint j = length - 1;
for (uint i = 0; i < length; j = i++) {
uint realIdx = s + i * 2;
uint realInvIdx = s + j * 2;
if (((polygon[realIdx + 1] > y) != (polygon[realInvIdx + 1] > y)) &&
(x < (polygon[realInvIdx] - polygon[realIdx]) * (y - polygon[realIdx + 1]) / (polygon[realInvIdx + 1] - polygon[realIdx + 1]) + polygon[realIdx]))
blnInside = !blnInside;
}
return blnInside;
}
bool isRectanglesIntersected(float p_dblMinX1, float p_dblMinY1,
float p_dblMaxX1, float p_dblMaxY1,
float p_dblMinX2, float p_dblMinY2,
float p_dblMaxX2, float p_dblMaxY2) {
bool blnResult = true;
if (p_dblMinX1 > p_dblMaxX2 ||
p_dblMaxX1 < p_dblMinX2 ||
p_dblMinY1 > p_dblMaxY2 ||
p_dblMaxY1 < p_dblMinY2) {
blnResult = false;
}
return blnResult;
}
bool isLinesIntersects(
double Ax, double Ay,
double Bx, double By,
double Cx, double Cy,
double Dx, double Dy) {
double distAB, theCos, theSin, newX, ABpos;
// Fail if either line is undefined.
if (Ax == Bx && Ay == By || Cx == Dx && Cy == Dy)
return false;
// (1) Translate the system so that point A is on the origin.
Bx -= Ax; By -= Ay;
Cx -= Ax; Cy -= Ay;
Dx -= Ax; Dy -= Ay;
// Discover the length of segment A-B.
distAB = sqrt(Bx*Bx + By*By);
// (2) Rotate the system so that point B is on the positive X axis.
theCos = Bx / distAB;
theSin = By / distAB;
newX = Cx*theCos + Cy*theSin;
Cy = Cy*theCos - Cx*theSin; Cx = newX;
newX = Dx*theCos + Dy*theSin;
Dy = Dy*theCos - Dx*theSin; Dx = newX;
// Fail if the lines are parallel.
return (Cy != Dy);
}
bool isPolygonInersectsPolyline(__global float* polygon, __global float* polylines, uint startIdx) {
uint polylineLength = convert_uint(polylines[startIdx]);
uint start = startIdx + 1;
float x1 = polylines[start];
float y1 = polylines[start + 1];
float x2;
float y2;
int polygonLength = convert_uint(polygon[4]);
int polygonLength2 = polygonLength * 2;
int startPolygonIdx = 5;
for (int currPolyineIdx = 0; currPolyineIdx < polylineLength - 1; currPolyineIdx++)
{
x2 = polylines[start + (currPolyineIdx*2) + 2];
y2 = polylines[start + (currPolyineIdx*2) + 3];
float polyX1 = polygon[0];
float polyY1 = polygon[1];
for (int currPolygonIdx = 0; currPolygonIdx < polygonLength; ++currPolygonIdx)
{
float polyX2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 2) % polygonLength2];
float polyY2 = polygon[startPolygonIdx + (currPolygonIdx * 2 + 3) % polygonLength2];
if (isLinesIntersects(x1, y1, x2, y2, polyX1, polyY1, polyX2, polyY2)) {
return true;
}
polyX1 = polyX2;
polyY1 = polyY2;
}
x1 = x2;
y1 = y2;
}
// No intersection found till now so we check containing
return isPointInPolygon(x1, y1, polygon);
}
__kernel void calcIntersections(__global float* polylines, // My flat points array - [pntCount, x,y,x,y,...., pntCount, x,y,... ]
__global float* pBounds, // The rectangle bounds of each polyline - set of 4 values [top, left, bottom, right....]
__global uint* pStarts, // The start index of each polyline in the polylines array
__global float* polygon, // The polygon i want to intersect with - first 4 items are the rectangle bounds [top, left, bottom, right, pntCount, x,y,x,y,x,y....]
__global float* output, // Result array for saving the intersections polylines indices
__global uint* resCount) // The result count
{
int i = get_global_id(0);
uint start = convert_uint(pStarts[i]);
if (isRectanglesIntersected(pBounds[i * 4], pBounds[i * 4 + 1], pBounds[i * 4 + 2], pBounds[i * 4 + 3],
polygon[0], polygon[1], polygon[2], polygon[3])) {
if (isPolygonInersectsPolyline(polygon, polylines, start)){
int oldVal = atomic_inc(resCount);
output[oldVal] = i;
}
}
}
Can anyone explain it to me ?

OpenCL Matrix multiplication: inner product versus outer product

I'm hoping everyone is familiar with the standard "naive" method of multiplying two (n x n square for simplicity) matrices. In C this is:
for(int i = 0; i < n; ++i)
for(int j = 0; j < n; ++j)
for(int k = 0; k < n; ++k)
C[i*n + j] += A[i*n + k] * B[k*n + j];
The above method computes the dot (inner) product of a row of A with a column of B and is easy to implement in OpenCL as follows:
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
)
{
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
for(int i = 0; i < n; i++)
C[row*n + col] += A[row*n + i]*B[i*n + col];
}
Interchanging the two inner-most loops of the original C implementation results in a method that computes outer products, i.e., it computes rank-1 updates of the rows of the C matrix:
for(int i = 0; i < n; ++i)
for(int k = 0; k < n; ++k)
for(int j = 0; j < n; ++j)
C[i*n + j] += A[i*n + k] * B[k*n + j];
Does anybody know how to properly implement the above outer-product method in OpenCL? I have two of my attempts pasted below but I just can't seem to nail it
Attempt 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
const int n
)
{
const int row = get_global_id(1); // row
const int col = get_global_id(0); // col
__local float r;
r = A[row*n + col];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = 0; i < n; ++i)
C[row*n + i] += r * B[col*n + i];
}
Attempt 2
#define TS 1
__kernel void matmul_ocl(
__global const float *A,
__global const float *B,
__global float *C,
int n)
{
// Thread coordinates
const int row = get_local_id(1); // row
const int col = get_local_id(0); // col
// Group tile coordinates
const int by = get_group_id(1); // row
const int bx = get_group_id(0); // col
A += TS*by + TS*bx*n + n*row + (col);
B += TS*by*n + n*row + (col);
C += TS*bx*n + n*(row) + col;
__global const float *Blast = B + n;
float c[2] = {0.0f,0.0f};
float* cptr = &c[0];
__local float bs[2];
do
{
bs[0] = B[0];
bs[1] = B[n];
barrier(CLK_LOCAL_MEM_FENCE);
*cptr += A[0] * bs[0];
*cptr++ += A[0] * bs[1];
B++;
barrier(CLK_LOCAL_MEM_FENCE);
} while( B < Blast );
C[0] += c[0];
C[1] += c[1];
}

The OpenCL implementation of the common algorithm maps the outer two loops to the OpenCL NDRange implicit loops. This works because the outer two loops can be safely run in parallel.
There are a few problems with Attempt 1:
The __local variable r is assigned different values from multiple work-items simultaneously. There is a race condition here, the value of r is undefined. This could be fixed by just making r a private variable instead.
The more serious problem is that there is a race condition in the assignment of C. Every value of col (NDRange dimension 0) will be running its own loop over i in parallel.
There isn't a simple way around the second issue. The loop over k (in the transposed version) cannot be run in parallel. You can only map either the outer loop or the inner loop to a single dimensional NDRange in OpenCL.

Dijkstra's algorithm in CUDA

I am having troubles with this piece of CUDA code I have written. This is supposed to be the CUDA implementation of the Dijkstra's algorithm. The code is as follows:
__global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){
int tid = blockIdx.x;
if(Ma[tid]=='1'){
Ma[tid] = '0';
int ind_Ea = Sa[tid * 2];
int num_edges = Sa[(tid * 2) + 1];
int v;
float wt = 0;
unsigned int leaveloop;
leaveloop = 0u;
while(leaveloop==0u){
if(atomicExch(lock, 1u) == 0u){
for(v = 0; v < num_edges; v++){
wt = (Va[tid * 3] - Va[Ea[ind_Ea + v] * 3]) * (Va[tid * 3] - Va[Ea[ind_Ea + v] * 3]) +
(Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) * (Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) +
(Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]) * (Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]) ;
wt = sqrt(wt);
if(Ca[Ea[ind_Ea + v]] > (Ca[tid] + wt)){
Ca[Ea[ind_Ea + v]] = Ca[tid] + wt;
Ma[Ea[ind_Ea + v]] = '1';
}
__threadfence();
leaveloop = 1u;
atomicExch(lock, 0u);
}
}
}
}
}
The problem is in the relaxation phase of the Dijkstra's algorithm. I have implemented such a phase as a critical section. If there is a vertex (lets say a) which is a neighbor of more than one vertex (i.e., connecting to other vertices with edges), then all of the threads for those vertices will try to write to the location of vertex a in the Cost Array Ca. Now my goal is to have the smaller value written in that location. To do that, I am trying to serialize the process and applying __threadfence() as well so that value written by one thread is visible to others and then eventually the smaller value is retained in the location of vertex a. But the problem is, that this logic is not working. The location of vertex a does not get the smallest value of all the threads trying to write to that location and I don't understand why. Any help will be highly appreciated.

There is a "classical" (at least, mostly referenced) implementation of Dijkstra's Single-Source Shortest Path (SSSP) algorithm on the GPU contained in the paper
Accelerating large graph algorithms on the GPU using CUDA by Parwan Harish and P.J. Narayanan
However, the implementation in that paper has been recognized to be bugged, see
CUDA Solutions for the SSSP Problem by Pedro J. Martín, Roberto Torres, and Antonio Gavilanes
I'm reporting below the implementation suggested in the first paper fixed according to the remark of the second. The code also contains a C++ version.
#include <sstream>
#include <vector>
#include <iostream>
#include <stdio.h>
#include <float.h>
#include "Utilities.cuh"
#define NUM_ASYNCHRONOUS_ITERATIONS 20 // Number of async loop iterations before attempting to read results back
#define BLOCK_SIZE 16
/***********************/
/* GRAPHDATA STRUCTURE */
/***********************/
// --- The graph data structure is an adjacency list.
typedef struct {
// --- Contains the integer offset to point to the edge list for each vertex
int *vertexArray;
// --- Overall number of vertices
int numVertices;
// --- Contains the "destination" vertices each edge is attached to
int *edgeArray;
// --- Overall number of edges
int numEdges;
// --- Contains the weight of each edge
float *weightArray;
} GraphData;
/**********************************/
/* GENERATE RANDOM GRAPH FUNCTION */
/**********************************/
void generateRandomGraph(GraphData *graph, int numVertices, int neighborsPerVertex) {
graph -> numVertices = numVertices;
graph -> vertexArray = (int *)malloc(graph -> numVertices * sizeof(int));
graph -> numEdges = numVertices * neighborsPerVertex;
graph -> edgeArray = (int *)malloc(graph -> numEdges * sizeof(int));
graph -> weightArray = (float *)malloc(graph -> numEdges * sizeof(float));
for (int i = 0; i < graph -> numVertices; i++) graph -> vertexArray[i] = i * neighborsPerVertex;
int *tempArray = (int *)malloc(neighborsPerVertex * sizeof(int));
for (int k = 0; k < numVertices; k++) {
for (int l = 0; l < neighborsPerVertex; l++) tempArray[l] = INT_MAX;
for (int l = 0; l < neighborsPerVertex; l++) {
bool goOn = false;
int temp;
while (goOn == false) {
goOn = true;
temp = (rand() % graph->numVertices);
for (int t = 0; t < neighborsPerVertex; t++)
if (temp == tempArray[t]) goOn = false;
if (temp == k) goOn = false;
if (goOn == true) tempArray[l] = temp;
}
graph -> edgeArray [k * neighborsPerVertex + l] = temp;
graph -> weightArray[k * neighborsPerVertex + l] = (float)(rand() % 1000) / 1000.0f;
}
}
}
/************************/
/* minDistance FUNCTION */
/************************/
// --- Finds the vertex with minimum distance value, from the set of vertices not yet included in shortest path tree
int minDistance(float *shortestDistances, bool *finalizedVertices, const int sourceVertex, const int N) {
// --- Initialize minimum value
int minIndex = sourceVertex;
float min = FLT_MAX;
for (int v = 0; v < N; v++)
if (finalizedVertices[v] == false && shortestDistances[v] <= min) min = shortestDistances[v], minIndex = v;
return minIndex;
}
/************************/
/* dijkstraCPU FUNCTION */
/************************/
void dijkstraCPU(float *graph, float *h_shortestDistances, int sourceVertex, const int N) {
// --- h_finalizedVertices[i] is true if vertex i is included in the shortest path tree
// or the shortest distance from the source node to i is finalized
bool *h_finalizedVertices = (bool *)malloc(N * sizeof(bool));
// --- Initialize h_shortestDistancesances as infinite and h_shortestDistances as false
for (int i = 0; i < N; i++) h_shortestDistances[i] = FLT_MAX, h_finalizedVertices[i] = false;
// --- h_shortestDistancesance of the source vertex from itself is always 0
h_shortestDistances[sourceVertex] = 0.f;
// --- Dijkstra iterations
for (int iterCount = 0; iterCount < N - 1; iterCount++) {
// --- Selecting the minimum distance vertex from the set of vertices not yet
// processed. currentVertex is always equal to sourceVertex in the first iteration.
int currentVertex = minDistance(h_shortestDistances, h_finalizedVertices, sourceVertex, N);
// --- Mark the current vertex as processed
h_finalizedVertices[currentVertex] = true;
// --- Relaxation loop
for (int v = 0; v < N; v++) {
// --- Update dist[v] only if it is not in h_finalizedVertices, there is an edge
// from u to v, and the cost of the path from the source vertex to v through
// currentVertex is smaller than the current value of h_shortestDistances[v]
if (!h_finalizedVertices[v] &&
graph[currentVertex * N + v] &&
h_shortestDistances[currentVertex] != FLT_MAX &&
h_shortestDistances[currentVertex] + graph[currentVertex * N + v] < h_shortestDistances[v])
h_shortestDistances[v] = h_shortestDistances[currentVertex] + graph[currentVertex * N + v];
}
}
}
/***************************/
/* MASKARRAYEMPTY FUNCTION */
/***************************/
// --- Check whether all the vertices have been finalized. This tells the algorithm whether it needs to continue running or not.
bool allFinalizedVertices(bool *finalizedVertices, int numVertices) {
for (int i = 0; i < numVertices; i++) if (finalizedVertices[i] == true) { return false; }
return true;
}
/*************************/
/* ARRAY INITIALIZATIONS */
/*************************/
__global__ void initializeArrays(bool * __restrict__ d_finalizedVertices, float* __restrict__ d_shortestDistances, float* __restrict__ d_updatingShortestDistances,
const int sourceVertex, const int numVertices) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numVertices) {
if (sourceVertex == tid) {
d_finalizedVertices[tid] = true;
d_shortestDistances[tid] = 0.f;
d_updatingShortestDistances[tid] = 0.f; }
else {
d_finalizedVertices[tid] = false;
d_shortestDistances[tid] = FLT_MAX;
d_updatingShortestDistances[tid] = FLT_MAX;
}
}
}
/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
__global__ void Kernel1(const int * __restrict__ vertexArray, const int* __restrict__ edgeArray,
const float * __restrict__ weightArray, bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances,
float * __restrict__ updatingShortestDistances, const int numVertices, const int numEdges) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < numVertices) {
if (finalizedVertices[tid] == true) {
finalizedVertices[tid] = false;
int edgeStart = vertexArray[tid], edgeEnd;
if (tid + 1 < (numVertices)) edgeEnd = vertexArray[tid + 1];
else edgeEnd = numEdges;
for (int edge = edgeStart; edge < edgeEnd; edge++) {
int nid = edgeArray[edge];
atomicMin(&updatingShortestDistances[nid], shortestDistances[tid] + weightArray[edge]);
}
}
}
}
/**************************/
/* DIJKSTRA GPU KERNEL #1 */
/**************************/
__global__ void Kernel2(const int * __restrict__ vertexArray, const int * __restrict__ edgeArray, const float* __restrict__ weightArray,
bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, float* __restrict__ updatingShortestDistances,
const int numVertices) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numVertices) {
if (shortestDistances[tid] > updatingShortestDistances[tid]) {
shortestDistances[tid] = updatingShortestDistances[tid];
finalizedVertices[tid] = true; }
updatingShortestDistances[tid] = shortestDistances[tid];
}
}
/************************/
/* dijkstraGPU FUNCTION */
/************************/
void dijkstraGPU(GraphData *graph, const int sourceVertex, float * __restrict__ h_shortestDistances) {
// --- Create device-side adjacency-list, namely, vertex array Va, edge array Ea and weight array Wa from G(V,E,W)
int *d_vertexArray; gpuErrchk(cudaMalloc(&d_vertexArray, sizeof(int) * graph -> numVertices));
int *d_edgeArray; gpuErrchk(cudaMalloc(&d_edgeArray, sizeof(int) * graph -> numEdges));
float *d_weightArray; gpuErrchk(cudaMalloc(&d_weightArray, sizeof(float) * graph -> numEdges));
// --- Copy adjacency-list to the device
gpuErrchk(cudaMemcpy(d_vertexArray, graph -> vertexArray, sizeof(int) * graph -> numVertices, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_edgeArray, graph -> edgeArray, sizeof(int) * graph -> numEdges, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_weightArray, graph -> weightArray, sizeof(float) * graph -> numEdges, cudaMemcpyHostToDevice));
// --- Create mask array Ma, cost array Ca and updating cost array Ua of size V
bool *d_finalizedVertices; gpuErrchk(cudaMalloc(&d_finalizedVertices, sizeof(bool) * graph->numVertices));
float *d_shortestDistances; gpuErrchk(cudaMalloc(&d_shortestDistances, sizeof(float) * graph->numVertices));
float *d_updatingShortestDistances; gpuErrchk(cudaMalloc(&d_updatingShortestDistances, sizeof(float) * graph->numVertices));
bool *h_finalizedVertices = (bool *)malloc(sizeof(bool) * graph->numVertices);
// --- Initialize mask Ma to false, cost array Ca and Updating cost array Ua to \u221e
initializeArrays <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_finalizedVertices, d_shortestDistances,
d_updatingShortestDistances, sourceVertex, graph -> numVertices);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Read mask array from device -> host
gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));
while (!allFinalizedVertices(h_finalizedVertices, graph->numVertices)) {
// --- In order to improve performance, we run some number of iterations without reading the results. This might result
// in running more iterations than necessary at times, but it will in most cases be faster because we are doing less
// stalling of the GPU waiting for results.
for (int asyncIter = 0; asyncIter < NUM_ASYNCHRONOUS_ITERATIONS; asyncIter++) {
Kernel1 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances,
d_updatingShortestDistances, graph->numVertices, graph->numEdges);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
Kernel2 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances, d_updatingShortestDistances,
graph->numVertices);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost));
}
// --- Copy the result to host
gpuErrchk(cudaMemcpy(h_shortestDistances, d_shortestDistances, sizeof(float) * graph->numVertices, cudaMemcpyDeviceToHost));
free(h_finalizedVertices);
gpuErrchk(cudaFree(d_vertexArray));
gpuErrchk(cudaFree(d_edgeArray));
gpuErrchk(cudaFree(d_weightArray));
gpuErrchk(cudaFree(d_finalizedVertices));
gpuErrchk(cudaFree(d_shortestDistances));
gpuErrchk(cudaFree(d_updatingShortestDistances));
}
/****************/
/* MAIN PROGRAM */
/****************/
int main() {
// --- Number of graph vertices
int numVertices = 8;
// --- Number of edges per graph vertex
int neighborsPerVertex = 6;
// --- Source vertex
int sourceVertex = 0;
// --- Allocate memory for arrays
GraphData graph;
generateRandomGraph(&graph, numVertices, neighborsPerVertex);
// --- From adjacency list to adjacency matrix.
// Initializing the adjacency matrix
float *weightMatrix = (float *)malloc(numVertices * numVertices * sizeof(float));
for (int k = 0; k < numVertices * numVertices; k++) weightMatrix[k] = FLT_MAX;
// --- Displaying the adjacency list and constructing the adjacency matrix
printf("Adjacency list\n");
for (int k = 0; k < numVertices; k++) weightMatrix[k * numVertices + k] = 0.f;
for (int k = 0; k < numVertices; k++)
for (int l = 0; l < neighborsPerVertex; l++) {
weightMatrix[k * numVertices + graph.edgeArray[graph.vertexArray[k] + l]] = graph.weightArray[graph.vertexArray[k] + l];
printf("Vertex nr. %i; Edge nr. %i; Weight = %f\n", k, graph.edgeArray[graph.vertexArray[k] + l],
graph.weightArray[graph.vertexArray[k] + l]);
}
for (int k = 0; k < numVertices * neighborsPerVertex; k++)
printf("%i %i %f\n", k, graph.edgeArray[k], graph.weightArray[k]);
// --- Displaying the adjacency matrix
printf("\nAdjacency matrix\n");
for (int k = 0; k < numVertices; k++) {
for (int l = 0; l < numVertices; l++)
if (weightMatrix[k * numVertices + l] < FLT_MAX)
printf("%1.3f\t", weightMatrix[k * numVertices + l]);
else
printf("--\t");
printf("\n");
}
// --- Running Dijkstra on the CPU
float *h_shortestDistancesCPU = (float *)malloc(numVertices * sizeof(float));
dijkstraCPU(weightMatrix, h_shortestDistancesCPU, sourceVertex, numVertices);
printf("\nCPU results\n");
for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesCPU[k]);
// --- Allocate space for the h_shortestDistancesGPU
float *h_shortestDistancesGPU = (float*)malloc(sizeof(float) * graph.numVertices);
dijkstraGPU(&graph, sourceVertex, h_shortestDistancesGPU);
printf("\nGPU results\n");
for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesGPU[k]);
free(h_shortestDistancesCPU);
free(h_shortestDistancesGPU);
return 0;
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

opencl kernel implementing a simple mathematical formula - opencl

Related

PyOpenCL - not seeing expected speedup

OpenCL Matrix Multiplication Altera Example

Different results GPU & CPU when more than one 8 work items per group

OpenCL Matrix multiplication: inner product versus outer product

Dijkstra's algorithm in CUDA

Categories

Resources