Issue in OpenCL Kernel function

Issue in OpenCL Kernel function - opencl

I am new to Open-cl and I am trying to write kernel code for the following matrix operation:
A is a 2X2 matrix:
A = [1 2] ----> row1
[3 4] ----->row2
I need to compute:
1) s1 = transpose(row1) X row1
2) s1 = transpose(row2) X row2
3) Sum = s1+s2
I wrote kernel code for row level (i.e I can do transpose(row1) X row1 )
-this serves the purpose for first row only
How do I use parallelism to compute this for each row and find the final sum within kernel function ?
private static String programSource1 =
"__kernel"+
" void matrixMul(__global float* A, __global float* C, int rowLength)"+
"{"+
"int row = get_global_id(1);"+
"int col = get_global_id(0);"+
"C[row*rowLength+col] = A[col] * A[row];"+
"}";

#define MAX_ROW_LENGTH 2 // or more
__kernel void matrixMul(__global float* A, __global float* C,
int rowLength)
{
__local float buffer[MAX_ROW_LENGTH * MAX_ROW_LENGTH];
__local float s1[MAX_ROW_LENGTH * MAX_ROW_LENGTH];
int col = get_global_id(0);
int row = get_global_id(1);
int rows = get_global_size(1);
// read the matrix from global to local memory
buffer[row * rowLength + col] = A[row * rowLength + col];
s1[row * rowLength + col] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < rows; ++i)
{
s1[row * rowLength + col] +=
buffer[i * rowLength + col] * buffer[i * rowLength + row];
}
C[row * rowLength + col] = s1[row*rowLength+col];
}
Here is some kernel code that does what you want for small matrices. The kernel uses local memory to reduce global memory access. For such small problems (2x2 matrix) this want achiev anything but if you are computing greater matrices this can speedup the thing a little bit. However, this is a short example and not optimized.Iit comes with some limitations:
this code only supports local workgroup sizes equal to the global
workgroup size (no chunks)
if your matrices get to big the shared memory will limit the utilization of your GPU and
if your matrices get realy big their will not be enough shared memory
If you don't want local memory remove replace the calls for buffer within the for loop by A and write directly to C instead of s1.

Related

OpenCL how to change the memory address of cl_mem?

I want to do a sub-matrix multiplication. Say I have a function:
void MatMul(cl_mem A, cl_mem B, cl_mem C, int M, int K, int N)
where A is M*K, B is K*N, C is M*N, and A, B, C are all row major 1 dimension array passed by host memory float *h_A, *h_B, *hC with the following function:
void ocl_push_array(cl_mem d_x, float *h_x, int n){
size_t data_size = sizeof(float)*n;
err = clEnqueueWriteBuffer(queue, d_x, CL_TRUE, 0, data_size, h_x, 0, NULL, NULL);
}
I want to ask:
if I want to do sub-matrix multiplication, say slicing A by row:
// cl_mem A, B, C;
for(int x=0; x<M; x+=16)
{
cl_mem A_sub = (cl_mem)((float *)A+x*K);
cl_mem C_sub = (cl_mem)((float *)C+x*N);
if((M-x+1)>=16)
MatMul(A_sub, B, C_sub, 16, K, N);
else
MatMul(A_sub, B, C_sub, M-x+1, K, N);
}
Is it the right code to do this operation? I got a run time error says: "CL_INVALID_MEM_OBJECT" (-38) when it assigns arguments to OpenCL kernel (clSetKernelArg).
The reason I want to do this operation is that I found the matrix multiplication got wrong answers when my input matrix A and B become big.
My OpenCL kernel is:
#define BLOCK_SIZE 16
#define AS(i, j) As[j + i * BLOCK_SIZE]
#define BS(i, j) Bs[j + i * BLOCK_SIZE]
__kernel void
matrixMul(__global float* A, __global float* B, __global float* C,
__local float* As, __local float* Bs, int uiWA, int uiWB)
{
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = uiWA * BLOCK_SIZE * by;
int aEnd = aBegin + uiWA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * uiWB;
float Csub = 0.0f;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
AS(ty, tx) = A[a + uiWA * ty + tx];
BS(ty, tx) = B[b + uiWB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += AS(ty, k) * BS(k, tx);
barrier(CLK_LOCAL_MEM_FENCE);
}
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
and the size is:
#define BLOCK_SIZE 16
size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, N), shrRoundUp(BLOCK_SIZE, M)};
size_t shrRoundUp(int group_size, int global_size)
{
int r = global_size % group_size;
if(r == 0)
{
return global_size;
} else
{
return global_size + group_size - r;
}
}
the code is adopted from Nvidia OpenCL matrix multiplication sample. My GPU is: Intel(R) HD Graphics 4600.
Thanks!

I don't think you can do this:
cl_mem A_sub = (cl_mem)((float *)A+x*K);
Because cl_mem is an object in OpenCL, which is actually a complex data structure instead of just a data pointer. It maintains information such as data pointer to the actually memory, reference to the object, memory properties and so on. Different run-times may even have different implementations of cl_mem object. That's why you got the CL_INVALID_MEM_OBJECT error message.
What you can do to get wanted data for the sub matrix is one of the followings:
define two new cl_mem objects, and use a separate kernel to do
the copy work.
use clEnqueueCopyBuffer function to copy the data at the host
code domain.
use CL_MEM_ALLOC_HOST_PTR to memory buffer, and then use
clEnqueueMapBuffer map the GPU memory to host memory pointer, and
then modify the memory content by using the mapped host memory
pointer, when you finish, unmap the pointer to return the GPU memory
to the device memory domain.

OpenCL Matrix Multiplication Altera Example

I am very new to OpenCL and am going through the Altera OpenCL examples.
In their matrix multiplication example, they have used the concept of blocks, where dimensions of the input matrices are multiple of block size. Here's the code:
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
// Widths of matrices.
int A_width, int B_width)
{
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
}
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
Assume block size is 2, then: block_x and block_y are both 0; and local_x and local_y are both 0.
Then A_local[0][0] would be A[0] and B_local[0][0] would be B[0].
Sizes of A_local and B_local are 4 elements each.
In that case, how would A_local and B_local access other elements of the block in that iteration?
Also would separate threads/cores be assigned for each local_x and local_y?

There is definitely a barrier missing in your code sample. The outer for loop as you have it will only produce correct results if all work items are executing instructions in lockstep fashion, thus guaranteeing the local memory is populated before the for k loop.
Maybe this is the case for Altera and other FPGAs, but this is not correct for CPUs and GPUs.
You should add barrier(CLK_LOCAL_MEM_FENCE); if you are getting unexpected results, or want to be compatible with other type of hardware.
float running_sum = 0.0f;
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
}
}

A_local and B_local are both shared by all work items of the work group, so all their elements are loaded in parallel (by all work items of the work group) at each step of the encompassing for loop.
Then each work item uses some of the loaded values (not necessarily the values the work item loaded itself) to do its share of the computation.
And finally, the work item stores its individual result into the global output matrix.
It is a classical tiled implementation of a matrix-matrix multiplication. However, I'm really surprised not to see any sort of call to a memory synchronisation function, such as work_group_barrier(CLK_LOCAL_MEM_FENCE) between the load of A_local and B_local and their use in the k loop... But I might very well have overlooked something here.

OpenCL: Optimize matrix multiplication for uchar

I adapted the attached kernel from one of the NVIDIA OpenCL examples and compared performance to clblasSgemm, and found that they perform equally fast (at least on my setup). I am launching it with a {16, 16} local work size.
Now, assume matrices A and B are both uchar, and C accordingly uint. Is there any way to optimize the multiplication? Simply replacing the types degraded performance. I tried hand-vectorizing with uchar4 and uchar16, but that made it slower.
Any suggestions welcome! (I am new to GPU programming and OpenCL)
/*
* This software contains source code provided by NVIDIA Corporation.
*/
#define BLOCK_SIZE 16
__kernel void mat_mul(const __global float* A, const __global float* B,
__global float* C,
const int A_cols, const int B_cols) {
// Block index
const int bx = get_group_id(0);
const int by = get_group_id(1);
// Thread index
const int tx = get_local_id(0);
const int ty = get_local_id(1);
// Index of the first sub-matrix of A processed by the block
const int a0 = A_cols * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
const int a1 = a0 + A_cols - 1;
const int a_step = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
const int b0 = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
const int b_step = BLOCK_SIZE * B_cols;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Loop over all the sub-matrices of A and B required to compute the
// block sub-matrix
for (int a=a0, b=b0; a<=a1; a+=a_step, b+=b_step) {
// Load the matrices from device memory to shared memory;
// each thread loads one element of each matrix
As[ty][tx] = A[a + A_cols * ty + tx];
Bs[ty][tx] = B[b + B_cols * ty + tx];
// Synchronize to make sure the matrices are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element of the block sub-matrix
#pragma unroll
for (int k=0; k<BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding computation is done
// before loading two new sub-matrices of A and B in the next
// iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}

There is very simple way to measure if your kernel is good. Calculate it's OPS & bandwidth (how many data in form of matrix are you processing per second). Then compare it to theoretical limits. You will get factor, limiting performance. Usually, it's load-store operations.

OpenCL / try to understand Kernel Code

I am studying an OpenCL code wich simulates the N-body problem from the following tutorial :
http://www.browndeertechnology.com/docs/BDT_OpenCL_Tutorial_NBody-rev3.html
My main issue relies on the kernel code :
for(int jb=0; jb < nb; jb++) { /* Foreach block ... */
19 pblock[ti] = pos_old[jb*nt+ti]; /* Cache ONE particle position */
20 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in the work-group */
21 for(int j=0; j<nt; j++) { /* For ALL cached particle positions ... */
22 float4 p2 = pblock[j]; /* Read a cached particle position */
23 float4 d = p2 - p;
24 float invr = rsqrt(d.x*d.x + d.y*d.y + d.z*d.z + eps);
25 float f = p2.w*invr*invr*invr;
26 a += f*d; /* Accumulate acceleration */
27 }
28 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in work-group */
29 }
I don't understand what exactly happens at the execution : the kernel code is executed n times where n is the number of work-items (which is also the number of threads) but in the above part of code, we use the local memory for each work-group (there are nb work-groups it seems)
So, at the execution, up to the first "barrier", do I fill locally the pblock array with the global values of pos_old ?
Always up to the first barrier, for another work-group, the pblock array will have contain the same values as the arrays of the others work-groups, since jb=0 before the barrier ?
It seems that's a way to share these arrays by all the work-groups but this is not totally clear for me.
Any help is welcome.

Can you post the entire kernel code please? I have to make assumptions about the params and private variables.
It looks like there are nt number of work items in the group, and ti represents the current work item. When the loop executes, each item in the group will copy only single element. Usually this copy is from a global data source. The first barrier forces the work item to wait until the other items have made their copy. This is necessary because every work item in the group needs to read the data copied from every other work item. The values should not be the same, because ti should be different for each work item. (jb*nt would still equal zero for the first loop though)
Here is the entire kernel code :
__kernel
void
nbody_sim(
__global float4* pos ,
__global float4* vel,
int numBodies,
float deltaTime,
float epsSqr,
__local float4* localPos,
__global float4* newPosition,
__global float4* newVelocity)
{
unsigned int tid = get_local_id(0);
unsigned int gid = get_global_id(0);
unsigned int localSize = get_local_size(0);
// Number of tiles we need to iterate
unsigned int numTiles = numBodies / localSize;
// position of this work-item
float4 myPos = pos[gid];
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for(int i = 0; i < numTiles; ++i)
{
// load one tile into local memory
int idx = i * localSize + tid;
localPos[tid] = pos[idx];
// Synchronize to make sure data is available for processing
barrier(CLK_LOCAL_MEM_FENCE);
// calculate acceleration effect due to each body
// a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
for(int j = 0; j < localSize; ++j)
{
// Calculate acceleartion caused by particle j on particle i
float4 r = localPos[j] - myPos;
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
float invDist = 1.0f / sqrt(distSqr + epsSqr);
float invDistCube = invDist * invDist * invDist;
float s = localPos[j].w * invDistCube;
// accumulate effect of all particles
acc += s * r;
}
// Synchronize so that next tile can be loaded
barrier(CLK_LOCAL_MEM_FENCE);
}
float4 oldVel = vel[gid];
// updated position and velocity
float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
newPos.w = myPos.w;
float4 newVel = oldVel + acc * deltaTime;
// write to global memory
newPosition[gid] = newPos;
newVelocity[gid] = newVel;
}
There are "numTiles" work-groups with "localSize" work-items for each work-group.
"gid" is the global index and "tid" is the local index.
Let's start at the first iteration of the loop "for(int i = 0; i < numTiles; ++i)" with "i=0":
If I take for example :
numTiles = 4, localSize = 25 and numBodies = 100 = number of work-items.
Then, at the execution, if I have gid = 80, then tid = 5, idx = 5 and the first assignement will be : localPos[5] = pos[5]
Now, I take gid = 5, then tid = 5 and idx = 5, I will have the same assignement with : localPos[5] = pos[5]
So, from what I understand, in the first iteration and after the first "barrier", each work-items contains the same Local array "localPos", i.e the sub-array of the first global block, which is "pos[0:24]".
Is this a good explanation of what happens ?

Can this OpenCL code be optimized?

I am working on a piece of OpencL code for a specialized matrix function: for a Dx1 vector v, two DxD matrices A and B and a constant c, return 1xD vector r where r[i] = c * sum_over_j (v[j] * A[i][j] * B[i][j])
Below is what I have so far, but it runs freakishly slow. A version without summing that returns a DxD matrix is about ten times faster. It's called from PyOpenCL if that makes any difference.
Is anything done wrong? Could it be optimized?
#define D 1000
...
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
int y = get_global_id(1);
float sum = 0;
for(int k = 0; k < D; k++)
{
sum += vector[k] * matrix[(y*D) + k]
* matrix2[(y*D) + k ];
}
result[y] = sum * factor;
}
Cheers!

Optimization #1: make vector __local.
My first pass at this got a decent improvement in performance. I noticed that each vector[k] is read a total of D times, so I copied it to a __local. This is only possible because D is small enough to allow this. The kernel as you have it above suffers from a terrible ALU:fetch ratio of 0.08 on both the 5870 and the 6970 gpus. Even the slower gpus are still waiting on the memory access.
#define D 1000
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
int y = get_global_id(0);
float sum = 0;
__local float vectCopy[D];
int ls = get_local_size(0);
int lid = get_local_id(0);
for(int i=0;i<D;i+=ls){
vectCopy[i+lid] = vector[i+lid];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
for(int k = 0; k < D; k++)
{
sum += vectCopy[k] * matrix[(y*D) + k] * matrix2[(y*D) + k ];
}
result[y] = sum * factor;
}
With this change, APP profiler is showing a new ALU:fetch ratio of 0.20 for the 5870 and 6970 gpus. Average times changed from 1513-->1034, and 1261-->861 on the same cards. The low end gpus are now bound by ALU instead of fetch. (greater than 4:1 ratio)
Opimization #2: calculate each result[y] using an entire work group.
You would have to do this id D were much larger (100k+). The idea is to get the best memory access pattern by using the work group to compute a single element of the result at a time. I defined ls (local size) to be 64 here, because it works on my hardware, as well as most vendors'. The workgroup size you use from the host-side will have to be 64 unless you change that definition. It needs to be defined to create the sum[ls] storage as __local, and I don't like passing variable sized __local vars into my kernels.
results: 5870 ALU:fetch=0.59:1, avg=708. 6970 ALU:fetch=0.72, avg=590. According to APP profiler, this is about twice as fast as your original listing.
#define D 1000
#define ls 64
__kernel void element_mult(
__global float *result,
__global const float *vector,
__global const float *matrix,
__global const float *matrix2,
const float factor)
{
__local float vectCopy[D];
int lid = get_local_id(0);
for(int i=0;i<D;i+=ls){
vectCopy[i+lid] = vector[i+lid];
}
mem_fence(CLK_LOCAL_MEM_FENCE);
int ng = get_num_groups(0);
int gid = get_group_id(0);
int y, k;
__local float sum[ls];
for(y = gid; y < D; y+=ng){
for(k = lid; k < D; k+=ls)
{
sum[lid] += vectCopy[k] * matrix[(y*D) + k] * matrix2[(y*D) + k ];
}
if(lid==0){
result[y] = sum[0];
for(k=1;k<ls;k++){
result[y] += sum[k];
}
result[y] *= factor;
}
mem_fence(CLK_LOCAL_MEM_FENCE);
}
}
EDIT: APP profiler = AMD APP KernelAnalyzer