OpenCL: Expected identifier in kernel

I am running the following kernel on Windows 7, 64-bit, with an Intel CPU and HD Graphics.
I get a very strange error reported by clGetProgramBuildInfo for the following code:
#define BLOCK_SIZE 256
__kernel void reduce4(__global uint* input, __global uint* output, __local uint* sdata)
{
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int blockSize = get_local_size(0);
    unsigned int index = bid*(BLOCK_SIZE*2) + tid;
    sdata[tid] = input[index] + input[index+BLOCK_SIZE];
    barrier(CLK_LOCAL_MEM_FENCE);
    for(unsigned int s = BLOCK_SIZE/2; s > 64; s >>= 1) {
        // Unrolling the last wavefront and we cut 7 iterations of this
        // for-loop while we practice wavefront-programming
        if(tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid < 64) {
        if (blockSize >= 128) sdata[tid] += sdata[tid + 64];
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
        if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
        if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
        if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
    }
    // write result for this block to global mem
    if(tid == 0)
    {
        output[bid] = sdata[0];
    }
}
It always says:
Compilation started
:38:2: error: expected identifier or '('
Compilation failed
This refers to the last line, where I have put the closing }. What is wrong here?
Update:
This is how I am reading the kernel file:
int offset = 0;
for(int i = 0; i < numOfDevices; ++i, ++offset ) {
    /* Load the two source files into temporary datastores */
    const char *file_names[] = {"SimpleOptimizations.cl"};
    const int NUMBER_OF_FILES = 1;
    char* buffer[NUMBER_OF_FILES];
    size_t sizes[NUMBER_OF_FILES];
    loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);

    /* Create the OpenCL program object */
    program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
    if(error != CL_SUCCESS) {
        perror("Can't create the OpenCL program object");
        exit(1);
    }
Definition of loadProgramSource:
void loadProgramSource(const char** files,
                       size_t length,
                       char** buffer,
                       size_t* sizes) {
    /* Read each source file (*.cl) and store the contents into a temporary datastore */
    for(size_t i=0; i < length; i++) {
        FILE* file = fopen(files[i], "r");
        if(file == NULL) {
            perror("Couldn't read the program file");
            exit(1);
        }
        fseek(file, 0, SEEK_END);
        sizes[i] = ftell(file);
        rewind(file); // reset the file pointer so that 'fread' reads from the front
        buffer[i] = (char*)malloc(sizes[i]+1);
        buffer[i][sizes[i]] = '\0';
        fread(buffer[i], sizeof(char), sizes[i], file);
        fclose(file);
    }
}

I believe this is an issue with the way Windows deals with text files opened with fopen(). If you take a look at the MSDN page for fopen(), it indicates that if you open a file with just "r" as the mode string, some translations will happen with regard to line endings. This means that the size of the file you query may not match the amount of data read by fread().
To solve this, simply change the mode string to indicate that you wish to read the file as binary data (i.e. without any pesky translations):
FILE* file = fopen(files[i], "rb");
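An alternative sketch, if you want to keep text mode, is to trust the byte count that fread() actually returns instead of the ftell() size (variable names as in loadProgramSource above):

size_t bytesRead = fread(buffer[i], sizeof(char), sizes[i], file);
buffer[i][bytesRead] = '\0'; /* terminate at the translated length */
sizes[i] = bytesRead;        /* pass the real size to clCreateProgramWithSource */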

Related

strlen segfaults on 2d dynamic array of strings

Why do I get a segfault when I try to print the strlen of a string, which is part of an array of strings? I can print each string - the printf works perfectly. But why does the strlen cause a segfault?
The program below first takes an input n, which is the number of strings I want to allocate dynamically. Then I allocate 32 bytes for each string:
int main() {
    int i;
    int n;
    char **nums;
    scanf("%d", &n);
    printf("n = %d\n", n);
    nums = malloc(sizeof(char *) * n);
    printf("allocated nums\n");
    for(i=0; i<n; i++) {
        nums[i] = malloc(sizeof(char) * 32);
        memset(nums[i], '\0', sizeof(char) * 32);
    }
    for(i=0; i < n; i++) {
        scanf("%s", &nums[i]);
    }
    for(i=0; i<n; i++) {
        // THIS PRINTS FINE
        printf("string = %s\n", &nums[i]);
        // SEGFAULT HERE IMMEDIATELY
        printf("length = %d\n", strlen(nums[i]));
    }
Here is the console output. As a test, I entered n=3, followed by the numbers 45, 46, and 47:
3
n = 3
allocated nums
45
46
47
string = 45
Segmentation fault (core dumped)
Additionally, I get a segfault when I try to access an individual character in each string. Again, the first printf in the outer for loop prints the string, then I get a segfault accessing nums[i][k]:
int i, k=0;
for(i=0; i<n; i++) {
    printf("Printiiing: %s\n", &nums[i]);
    // WHY DOES THIS SEGFAULT???
    //printf("first char = %c\n", nums[i][0]);
    while(k<32 && nums[i][k] != '\0') {
        // THIS CAUSES A SEG FAULT
        printf("char = %c", (nums[i])[k]);
        k++;
    }
}
This is fine:
for(i=0; i<n; i++) {
    nums[i] = malloc(sizeof(char) * 32);
    memset(nums[i], '\0', sizeof(char) * 32);
}
After this loop, each nums[i] is a pointer to a 32-byte buffer.
But this corrupts (overwrites) all the pointers, instead of reading strings into the allocated buffers:
for(i=0; i < n; i++) {
    scanf("%s", &nums[i]);
}
To fix the bug, use: scanf("%s", nums[i]);.
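A corrected read loop might look like this (a sketch; the %31s field width is an extra guard against overflowing the 32-byte buffers):

for(i=0; i < n; i++) {
    scanf("%31s", nums[i]); /* nums[i] already points at the allocated buffer */
}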
Note that &nums[i] and nums[i] are different pointers. Because the buggy scanf wrote the characters into &nums[i] (the pointer slot itself), strlen((const char*)&nums[i]) would return the length of what was read, while nums[i] no longer holds a valid address. The explanation by @Employee of Russia above gives the exact reason.

Update UDP checksum in fragmented packets

I'm building a network appliance. I need to support NAT and IP packet fragmentation. When I change the source or destination address of a UDP packet, I have to correct the UDP checksum (and the IP checksum too, but that's trivial). When the packet is fragmented, I'd have to collect all the fragments to recalculate the checksum. I know the old address and the new address. I'd like to:
Un-negate the checksum
Subtract the old address
Add the new address
Re-reduce the sum and negate
This process doesn't always work. Is there any way to update the checksum versus having to recalculate it from scratch?
I've tried:
long CalcCheckSumAdd(unsigned char *pbHeader, int iSize, long lInitial){
    long lSum = lInitial;
    while (iSize > 1){
        lSum += *((unsigned short*)pbHeader);
        pbHeader += 2;
        iSize -= 2;
    }
    if (iSize > 0) lSum += *pbHeader;
    return lSum;
}

long CalcCheckSumSubract(unsigned char *pbHeader, int iSize, long lInitial){
    long lSum = lInitial;
    while (iSize > 1){
        lSum -= *((unsigned short*)pbHeader);
        pbHeader += 2;
        iSize -= 2;
    }
    if (iSize > 0) lSum -= *pbHeader;
    return lSum;
}

unsigned short CalcCheckSumFinish(long lSum){
    while (lSum >> 16){
        lSum = (lSum & 0xFFFF) + (lSum >> 16);
    }
    return (unsigned short)(~lSum);
}

long CalcCheckSumUnfinish(unsigned short usSum){
    // Can't totally undo lossy finish logic
    return ~usSum;
}

unsigned short CalcCheckSumUpdateAddress(unsigned short usOldSum, unsigned long ulOldAddress, unsigned long ulNewAddress){
    long lSumFixed = CalcCheckSumUnfinish(usOldSum);
    lSumFixed = CalcCheckSumSubract((unsigned char*)&ulOldAddress,sizeof(ulOldAddress),lSumFixed);
    lSumFixed = CalcCheckSumAdd((unsigned char*)&ulNewAddress,sizeof(ulNewAddress),lSumFixed);
    return CalcCheckSumFinish(lSumFixed);
}
Thanks!
EDIT: Added unit test code below
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

long CalcCheckSumAdd(unsigned char *pbHeader, int iSize, long lInitial){
    long lSum = lInitial;
    while (iSize > 1){
        lSum += *((unsigned short*)pbHeader);
        pbHeader += 2;
        iSize -= 2;
    }
    if (iSize > 0) lSum += *pbHeader;
    return lSum;
}

unsigned short CalcCheckSumFinish(long lSum){
    while (lSum >> 16){
        lSum = (lSum & 0xFFFF) + (lSum >> 16);
    }
    return (unsigned short)(~lSum);
}

void Randomize(unsigned char *pucPacket, unsigned long ulSize){
    for (unsigned long ulByte = 0; ulByte < ulSize; ulByte++){
        pucPacket[ulByte] = (unsigned char)(255 * rand() / RAND_MAX);
    }
}

unsigned short Calc(unsigned char *pucPacket, unsigned long ulSize){
    long lSum = CalcCheckSumAdd(pucPacket,ulSize,0);
    return CalcCheckSumFinish(lSum);
}

unsigned short Fix(unsigned short usOrig, unsigned int uiOld, unsigned int uiNew){
    // TODO: Replace this with something that makes main never fail
    usOrig -= uiOld & 0xffff;
    usOrig -= uiOld >> 16 & 0xffff;
    usOrig += uiNew & 0xffff;
    usOrig += uiNew >> 16 & 0xffff;
    return usOrig;
}

void Break(unsigned char *pucPacket, unsigned int *puiOld, unsigned int *puiNew){
    unsigned int *puiChange = (unsigned int*)pucPacket;
    *puiOld = *puiChange;
    Randomize((unsigned char*)puiNew,sizeof(unsigned int));
    *puiChange = *puiNew;
}

void PrintBuffer(const char *szName, unsigned char *pucBuff, unsigned int uiSize){
    printf("%s: ",szName);
    for (unsigned int uiByte = 0; uiByte < uiSize; uiByte++){
        printf("%02X",(unsigned int)pucBuff[uiByte]);
    }
    printf("\n");
}

void PrintTestCase(unsigned char *pucOrig, unsigned char *pucChanged, unsigned int uiSize, unsigned short usOrig, unsigned short usChanged, unsigned short usFixed){
    PrintBuffer("Original Buffer",pucOrig,uiSize);
    PrintBuffer("Changed Buffer ",pucChanged,uiSize);
    printf("Orig checksum: %04X\n",(unsigned int)usOrig);
    printf("Changed checksum: %04X\n",(unsigned int)usChanged);
    printf("Fixed checksum: %04X\n",(unsigned int)usFixed);
}

int main(){
    srand((unsigned int)time(nullptr));
    unsigned char pucDataOrig[100];
    unsigned char pucDataChanged[100];
    bool bTestFailed = false;
    while (!bTestFailed){
        Randomize(pucDataOrig,sizeof(pucDataOrig));
        memcpy(pucDataChanged,pucDataOrig,sizeof(pucDataOrig));
        unsigned short usOrig = Calc(pucDataOrig,sizeof(pucDataOrig));
        unsigned int uiOld = 0,
                     uiNew = 0;
        Break(pucDataChanged,&uiOld,&uiNew);
        unsigned short usFixed = Fix(usOrig,uiOld,uiNew);
        unsigned short usChanged = Calc(pucDataChanged,sizeof(pucDataChanged));
        if (usChanged == usFixed){
            printf(".");
        }else{
            printf("\nTest case failed\n");
            PrintTestCase(pucDataOrig,pucDataChanged,sizeof(pucDataOrig),usOrig,usChanged,usFixed);
            bTestFailed = true;
        }
    }
    return 0;
}
You are right, the solution above works only in some cases, but I have a new implementation that works for all kinds of packets (fragmented or not; UDP, TCP, IP). Here is the implementation:
/* incremental checksum update (one's-complement arithmetic with
 * end-around carry, in the style of RFC 1624: HC' = ~(~HC + ~m + m')) */
static inline void
cksum_update(uint16_t *csum, uint32_t from, uint32_t to)
{
    uint32_t sum, csum_c, from_c, res, res2, ret, ret2;

    csum_c = ~((uint32_t)*csum);   /* undo the final negation */
    from_c = ~from;                /* subtract 'from' by adding its complement */
    res = csum_c + from_c;
    ret = res + (res < from_c);    /* propagate the end-around carry */
    res2 = ret + to;               /* add the new value */
    ret2 = res2 + (res2 < to);     /* propagate the end-around carry */
    sum = ret2;
    sum = (sum & 0xffff) + (sum >> 16);  /* fold back to 16 bits */
    sum = (sum & 0xffff) + (sum >> 16);
    *csum = (uint16_t)~sum;
}
You can now use this function once you have translated your packet address, before sending:
/* Update L4 checksums on all packets apart from the [2nd, n] fragments */
switch (IS_FRAG(ipv4_hdr) ? 0 : ipv4_hdr->next_proto_id) {
case IPPROTO_TCP:
{
    struct tcp_hdr *tcp_hdr = tcp_header(pkt);
    /* Compute TCP checksum using incremental update */
    cksum_update(&tcp_hdr->cksum, old_ip_addr, *address);
    break;
}
case IPPROTO_UDPLITE:
case IPPROTO_UDP:
{
    struct udp_hdr *udp_hdr = udp_header(pkt);
    /* Compute UDP checksum using incremental update */
    cksum_update(&udp_hdr->dgram_cksum, old_ip_addr, *address);
    break;
}
default:
    break;
}
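Note that IS_FRAG is not shown in this answer; a plausible sketch, assuming the ipv4_hdr layout implied by the field names above (fragment_offset holding the flags and 13-bit offset in network byte order), is:

/* Hypothetical helper: the packet is a fragment if the MF flag (0x2000)
   is set or the fragment offset (low 13 bits) is nonzero */
#define IS_FRAG(ip) ((ntohs((ip)->fragment_offset) & 0x3fff) != 0)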
You have to subtract the old IP address from the UDP checksum and add the new one; here is the pseudocode:
udp_hdr->dgram_cksum -= old_ipv4_addr & 0xffff;
udp_hdr->dgram_cksum -= old_ipv4_addr >> 16 & 0xffff;
udp_hdr->dgram_cksum += new_ipv4_addr & 0xffff;
udp_hdr->dgram_cksum += new_ipv4_addr >>16 & 0xffff;
That should handle UDP checksum on IP fragments.
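For reference, here is the same update written with explicit carry folding (a sketch, equivalent in spirit to cksum_update above; without folding, the bare 16-bit arithmetic in the pseudocode can drop one's-complement carries):

uint32_t sum = (uint16_t)~udp_hdr->dgram_cksum;   /* undo the final negation */
sum += (uint16_t)~(old_ipv4_addr & 0xffff);       /* subtract the old address... */
sum += (uint16_t)~(old_ipv4_addr >> 16 & 0xffff); /* ...in one's-complement terms */
sum += new_ipv4_addr & 0xffff;                    /* add the new address */
sum += new_ipv4_addr >> 16 & 0xffff;
sum = (sum & 0xffff) + (sum >> 16);               /* fold the carries back in */
sum = (sum & 0xffff) + (sum >> 16);
udp_hdr->dgram_cksum = (uint16_t)~sum;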

Parallel reduction using local memory in OpenCL

I implemented a reduce kernel in OpenCL to sum up all entries in the input vector of size N. For easier testing I initialize the input vector with 1.0f, so the result should be N. But it is not!
Here is my reduce-kernel:
kernel void reduce(global float* input, global float* output, const unsigned int N, local float* cache)
{
    const uint local_id = get_local_id(0);
    const uint global_id = get_global_id(0);
    const uint local_size = get_local_size(0);

    cache[local_id] = (global_id < N) ? input[global_id] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (unsigned int s = local_size >> 1; s > 0; s >>= 1) {
        if (local_id < s) {
            cache[local_id] += cache[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_id == 0) output[local_size] = cache[0];
}
And here is the OpenCL setup:
const uint N = 8196;
cl_float a[N];
cl_float b[N];
for (uint i=0; i<N; i++) {
    a[i] = 1.0f;
    b[i] = 0.0f;
}

cl::Buffer inputBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*N);
cl::Buffer resultBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*N);
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, sizeof(cl_float)*N, a);
queue.enqueueWriteBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b);

cl::Kernel addVectorKernel = cl::Kernel(program, "reduce");
size_t localSize = addVectorKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); // e.g. => 512
size_t globalSize = roundUp(localSize, N); // rounds up to a multiple of localSize

addVectorKernel.setArg(0, inputBuffer);
addVectorKernel.setArg(1, resultBuffer);
addVectorKernel.setArg(2, N);
addVectorKernel.setArg(3, (sizeof(cl_float) * localSize), NULL);

queue.enqueueNDRangeKernel(
    addVectorKernel,
    cl::NullRange,
    cl::NDRange(globalSize),
    cl::NDRange(localSize)
);
queue.finish(); // wait for ending
queue.enqueueReadBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b); // e.g. => 1024
The result depends on the workgroup size. What am I doing wrong? Is it the kernel itself or is it the settings for OpenCL?
You should be using the group's id when writing the sum back to global memory.
if (local_id == 0) output[local_size] = cache[0];
That line will write to output[512] repeatedly. You need each work group to write to a dedicated location in the output.
kernel void reduce(global float* input, global float* output, const unsigned int N, local float* cache)
{
    const uint local_id = get_local_id(0);
    const uint global_id = get_global_id(0);
    const uint group_id = get_group_id(0);
    const uint local_size = get_local_size(0);

    cache[local_id] = (global_id < N) ? input[global_id] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (unsigned int s = local_size >> 1; s > 0; s >>= 1) {
        if (local_id < s) {
            cache[local_id] += cache[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_id == 0) output[group_id] = cache[0];
}
Then you need to sum the values from the output on the host. Note that 'b' in the host code does not need to hold N elements. Only one element for each work group will be used.
//replace (globalSize/localSize) with the pre-calculated/known number of work groups
for (i=1; i<(globalSize/localSize); i++) {
    b[0] += b[i];
}
Now b[0] is your grand total.
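As a small follow-up sketch, the read-back can be trimmed to just those partial sums (same cl:: objects as in the question's host code; globalSize/localSize is the number of work groups):

const size_t numGroups = globalSize / localSize;
queue.enqueueReadBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float) * numGroups, b);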
In the reduction for loop, you need this:
for(unsigned int s = localSize >> 1; s > 0; s >>= 1)
You are shifting one more bit than you should when initializing s.
After that's fixed, let's look at what your kernel is doing. The host code executes it with globalSize of 8192 and localSize of 512, which results in 16 work groups. Inside the kernel you first sum the data from the two consecutive memory locations at index 2*global_id. For work group with id 15, work item 0, that will be at index 15*512*2 = 15,360 and 15,361, which is outside the boundaries of your input array. I am surprised you don't get a crash. At the same time, this explains why you have double the values that you expect.
To fix it, you can do this:
cache[local_id] = input[global_id];
Or specify a global size that's half of the number of the current one.
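A sketch of that second option, reusing the roundUp helper from the question's host code (each work item then covers two input elements):

size_t globalSize = roundUp(localSize, N / 2);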

Optimal workgroup size for sum reduction in OpenCL

I am using the following kernel for sum reduction.
__kernel void reduce(__global float* input, __global float* output, __local float* sdata)
{
    // load shared mem
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);
    unsigned int stride = gid * 2;
    sdata[tid] = input[stride] + input[stride + 1];
    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    for(unsigned int s = localSize >> 2; s > 0; s >>= 1)
    {
        if(tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // write result for this block to global mem
    if(tid == 0) output[bid] = sdata[0];
}
It works fine, but I don't know how to choose the optimal workgroup size or number of workgroups if I need more than one workgroup (for example if I want to calculate the sum of 1048576 elements). As far as I understand, the more workgroups I use, the more subresults I will get, which also means that I will need more global reductions at the end.
I've seen the answers to the general workgroup size question here. Are there any recommendations that concern reduction operations specifically?
This question is a possible duplicate of one I answered a while back:
What is the algorithm to determine optimal work group size and number of workgroup.
Experimentation will be the best way to know for sure for any given device.
Update:
I think you can safely stick to 1-dimensional work groups, as you have done in your sample code. On the host, you can try out the best values.
For each device:
1) Query for CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.
2) Loop over a few multiples and run the kernel with that group size. Save the execution time for each test.
3) When you think you have an optimal value, hard-code it into a new kernel for use with that specific device. This will give a further boost to performance. You can also eliminate your sdata parameter in the device-specific kernel.
//define your own context, kernel, queue here
int err;
size_t global_size; //set this somewhere to match your test data size
size_t preferred_size;
size_t max_group_size;

err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_size, NULL);
//check err
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, NULL);
//check err

size_t test_size;
//your vars for hi-res timer go here
for (size_t i=preferred_size; i<=max_group_size; i+=preferred_size){
    //reset timer
    test_size = i;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &test_size, 0, NULL, NULL);
    if(err){
        fail("Unable to enqueue kernel"); //implement your own fail function somewhere..
    }else{
        clFinish(queue);
        //stop timer, save value
        //output timer value and test_size
    }
}
The device-specific kernel can look like this, except the first line should have your optimal value substituted:
#define LOCAL_SIZE 32
__kernel void reduce(__global float* input, __global float* output)
{
    unsigned int tid = get_local_id(0);
    unsigned int stride = get_global_id(0) * 2;
    __local float sdata[LOCAL_SIZE];

    sdata[tid] = input[stride] + input[stride + 1];
    barrier(CLK_LOCAL_MEM_FENCE);

    for(unsigned int s = LOCAL_SIZE >> 2; s > 0; s >>= 1){
        if(tid < s){
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(tid == 0) output[get_group_id(0)] = sdata[0];
}
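With sdata gone, the host also stops passing the local-memory argument; a minimal sketch using the C API as in the timing loop above (the buffer variables are placeholders):

err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer);
//check err
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer);
//check err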

OpenCL clEnqueueNDRangeKernel how to set work group size correctly

In OpenCL, if I want to add two N-dimensional vectors, the global work group size (globalSize) should satisfy globalSize = ceil(N/localSize) * localSize, where localSize is the local work group size. Is this correct? If N = 1000 and localSize = 128, globalSize should be 1024? Can we always set globalSize to some multiple of localSize that is larger than needed?
I tried many times and it worked well for 1-dimensional problems.
However, when it comes to 2D problems, for example multiplying two matrices of dimensions m*n and n*p (the result matrix is of order m*p), things get more complicated.
The max work group size on my device is 128, so I set localSize [2] = {16,8} and
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}.
It is similar to the 1-dimensional case, but the result is wrong!
If I set localSize [2] = {1,128} and change the globalSize accordingly, I can get the correct result. So where is the problem? Can anyone tell me why?
In addition, I found out the indices where the matrix elements are wrong.
It seems that the result is wrong at (i,j) where i*p + j = n * some constant (n = 1,2,3...).
Why?
Here is my kernel function:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
                  global float *A, global float *B, global float *C)
{
    int i = get_global_id(1);
    int j = get_global_id(0);
    if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
    else
    {
        float tmp = 0;
        for(int k = 0; k < Ndim; k++)
            tmp += A[i*Ndim+k] * B[k*Pdim+j];
        C[i*Pdim + j] = tmp;
    }
}
And here is the host program:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;

int main()
{
    // Create the two input matrices
    int m = 1000;
    int n = 1000;
    int p = 1000;
    float *A = new float[m*n];
    float *B = new float[n*p];
    for(int i = 0; i < m*n; i++)
    {
        A[i] = i;
    }
    for(int i = 0; i < n*p; i++)
    {
        B[i] = i;
    }

    try
    {
        // Get available platforms
        vector<Platform> platforms;
        Platform::get(&platforms);

        // Select the default platform and create a context using this platform and the GPU
        cl_context_properties cps[3] =
        {
            CL_CONTEXT_PLATFORM,
            (cl_context_properties)(platforms[0])(),
            0
        };
        Context context( CL_DEVICE_TYPE_GPU, cps);

        // Get a list of devices on this platform
        vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create a command queue and use the first device
        CommandQueue queue = CommandQueue(context, devices[0]);

        // Read source file
        std::ifstream sourceFile("mmul.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));

        // Make program of the source code in the context
        Program program = Program(context, source);

        // Build program for these specific devices
        program.build(devices);

        // Make kernel
        Kernel kernel(program, "mmult");

        // Create memory buffers
        Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
        Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
        Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));

        // Copy lists A and B to the memory buffers
        queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
        queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);

        // Set arguments to kernel
        kernel.setArg(0, m);
        kernel.setArg(1, n);
        kernel.setArg(2, p);
        kernel.setArg(3, bufferA);
        kernel.setArg(4, bufferB);
        kernel.setArg(5, bufferC);

        // Run the kernel on specific ND range
        NDRange global((ceil((float)(p)/16))*16, (ceil((float)(m)/8))*8);
        NDRange local(16,8);
        queue.enqueueNDRangeKernel(kernel, NullRange, global, local);

        // Read buffer C into a local list
        float *C = new float[m*p];
        queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);

        // check the correctness of the result
        float *c = new float[m*p];
        for(int i = 0; i < m; i++)
            for(int j = 0; j < p; j++)
            {
                float z = 0.0;
                for(int k = 0; k < n; k++)
                {
                    z += A[i*n+k] * B[k*p+j];
                }
                c[i*p+j] = z;
            }
        for(int i = 0; i < m*p; i++)
        {
            if(fabs(c[i]-C[i])>0.001)
                std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
        }

        delete []A;
        delete []B;
        delete []C;
    }
    catch(Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }
    return 0;
}
Your bounds checking code inside your OpenCL kernel is incorrect. Instead of this:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
You should have this:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
With the original check, a work item with j == Pdim slips through and writes C[i*Pdim + Pdim], which is the same memory as C[(i+1)*Pdim], the first element of the next row (and i == Mdim writes past the end of C entirely). That would explain why the wrong values show up at linear indices that are multiples of p.
Let's assume that you have a float matrix of size 1000x1000:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
Determine the size of the local group first:
size_t localSize[] = {16, 8};
Then determine how many local groups you need:
size_t numLocalGroups[] = {ceil(size/(float)localSize[0]), ceil(size/(float)localSize[1])}; // cast so the division isn't truncated before ceil()
Finally, determine NDRange size:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
Don't forget to handle out-of-bounds accesses in the right-most local groups.
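A minimal sketch of that guard inside the kernel, matching the bounds fix from the first answer:

int i = get_global_id(1);
int j = get_global_id(0);
if(i >= Mdim || j >= Pdim) return; /* skip the padded work items */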
