Continuing with my OpenCL adventure, this is what I have so far, ported from my CUDA implementation. I was trying to check whether at least the first kernel call was working, but I got error 48 and have no idea what I'm missing. I was following the example on this page.
KERNEL
__kernel
void clut_distributePixels(__global int *pixelGroup, int c_rows, int c_cols, int c_numColors){
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= c_cols || y >= c_rows) return;
int index = y * c_cols + x;
pixelGroup[index] = index/c_numColors;
}
Read Kernel from file
char *file_contents(const char *filename, int *length){
FILE *f = fopen(filename, "r");
void *buffer;
if (!f) {
fprintf(stderr, "Unable to open %s for reading\n", filename);
return NULL;
}
fseek(f, 0, SEEK_END);
*length = ftell(f);
fseek(f, 0, SEEK_SET);
buffer = malloc(*length+1);
*length = fread(buffer, 1, *length, f);
fclose(f);
((char*)buffer)[*length] = '\0';
return (char*)buffer;
}
CODE
#include <iostream>
#include <cstdlib>   // atoi
#include <cstring>   // strcmp
#include <OpenCL/OpenCL.h>
#include "Utilities.hpp"
int main(int argc, const char * argv[]){
if (argc < 3) {
std::cout << "Use: {GPU|CPU} nColors" << std::endl;
return 1;
}
/************************************************
HOST SIDE INITIALIZATION
************************************************/
int h_numColors = atoi(argv[2]);
Color *h_image;
int h_rows, h_cols;
if (readText2RGB("LenaOriginal.txt", &h_image, &h_rows, &h_cols) != SUCCESS){
return 1;
}
int *h_pixelGroup = new int[h_rows*h_cols];
Color *h_groupRep = new Color[h_numColors];
Color *h_clutImage = new Color[h_rows*h_cols];
int h_change = 0;
/************************************************
PLATFORM AND DEVICE SETUP
************************************************/
cl_int errorStatus;
//Use the first platform
cl_platform_id platform;
errorStatus = clGetPlatformIDs(1, &platform, NULL);
//Use the first device that matches the type selected
cl_device_id device;
if (strcmp(argv[1], "CPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}else if (strcmp(argv[1], "GPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
}else{
std::cout << "Unknown device type. Choose either CPU or GPU" << std::endl;
return 1;
}
//Define context properties and create context
cl_context_properties contextProps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context context = clCreateContext(contextProps, 1, &device, NULL, NULL, &errorStatus);
//Create the command queue
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &errorStatus);
/************************************************
DEVICE VARIABLE SETUP
************************************************/
cl_mem d_image;
cl_mem d_pixelGroup;
cl_mem d_groupRep;
cl_mem d_clutImage;
cl_mem d_change;
d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Color)*h_rows*h_cols, h_image, &errorStatus);
d_pixelGroup = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*h_rows*h_cols, NULL, &errorStatus);
d_groupRep = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_numColors, NULL, &errorStatus);
d_clutImage = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_rows*h_cols, NULL, &errorStatus);
d_change = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &errorStatus);
/************************************************
CREATE, COMPILE PROGRAM and CREATE KERNEL
************************************************/
int pl;
size_t sourceLength;
char * sourceCode = file_contents("vectorQuantization.cl", &pl);
sourceLength = (size_t)pl;
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourceCode, &sourceLength, &errorStatus);
errorStatus = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel k_clut_distributePixels = clCreateKernel(program, "clut_distributePixels", &errorStatus);
errorStatus = clSetKernelArg(k_clut_distributePixels, 0, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_distributePixels, 1, sizeof(int), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_distributePixels, 2, sizeof(int), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_distributePixels, 3, sizeof(int), (void*)&h_numColors);
cl_kernel k_clut_checkDistances = clCreateKernel(program, "clut_checkDistances", &errorStatus);
errorStatus = clSetKernelArg(k_clut_checkDistances, 0, sizeof(cl_mem), (void*)&d_image);
errorStatus = clSetKernelArg(k_clut_checkDistances, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_checkDistances, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_checkDistances, 3, sizeof(int), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_checkDistances, 4, sizeof(int), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_checkDistances, 5, sizeof(int), (void*)&h_numColors);
errorStatus = clSetKernelArg(k_clut_checkDistances, 6, sizeof(cl_mem), (void*)&d_change);
cl_kernel k_clut_createImage = clCreateKernel(program, "clut_createImage", &errorStatus);
errorStatus = clSetKernelArg(k_clut_createImage, 0, sizeof(cl_mem), (void*)&d_clutImage);
errorStatus = clSetKernelArg(k_clut_createImage, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_createImage, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_createImage, 3, sizeof(int), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_createImage, 4, sizeof(int), (void*)&h_cols);
/************************************************
EXECUTE PROGRAM AND GET RESULTS
************************************************/
/*STEP 1: evenly distribute pixels among the colors in the CLUT */
size_t grid[2] = {static_cast<size_t>(h_cols), static_cast<size_t>(h_rows)}; //dim 0 -> x (columns), dim 1 -> y (rows)
errorStatus = clEnqueueNDRangeKernel(queue, k_clut_distributePixels, 2, NULL, grid, NULL, 0, NULL, NULL);
clFinish(queue);
/*********/
/* ERROR */
/*********/
errorStatus = clEnqueueReadBuffer(queue, d_pixelGroup, CL_TRUE, 0, sizeof(int)*h_rows*h_cols, h_pixelGroup, 0, NULL, NULL);
std::cout << h_pixelGroup[7] << ", " << h_pixelGroup[8] << ", " << h_pixelGroup[9] << ", " << h_pixelGroup[10] << std::endl;
//do {
/*STEP 2: compute reprenstative */
/*STEP 3: compute distances and reassign pixel to group */
//copyFromConstantMemory
//} while (h_change != 0);
std::cout << "Done !!" << std::endl;
return 0;
}
I found my error. First of all: always check return values when you are learning new stuff. I remembered that from when I was learning CUDA, so with this simple macro I started checking everything:
#define CL_SUCCESS_OR_RETURN(code) do { \
    /* pass a status variable, not a function call: the argument is evaluated twice */ \
    assert(code == CL_SUCCESS); \
    if (code != CL_SUCCESS) { return code; } \
} while (0)
And the error was at the very beginning, where I check whether it is CPU or GPU. I forgot that strcmp returns 0 when the strings are equal. After fixing this, everything worked beautifully!
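For reference, this is roughly what the corrected branch looks like with the macro in use (just a sketch; deviceType is an illustrative local variable, and the strcmp result has to be compared against 0 because 0 means the strings matched):
cl_device_type deviceType;
if (strcmp(argv[1], "CPU") == 0){
    deviceType = CL_DEVICE_TYPE_CPU;
}else if (strcmp(argv[1], "GPU") == 0){
    deviceType = CL_DEVICE_TYPE_GPU;
}else{
    std::cout << "Unknown device type. Choose either CPU or GPU" << std::endl;
    return 1;
}
//store the status in a variable first; the macro evaluates its argument twice
errorStatus = clGetDeviceIDs(platform, deviceType, 1, &device, NULL);
CL_SUCCESS_OR_RETURN(errorStatus);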
Anyway, if you have any other suggestions or advice, or you see something ugly or not a best practice in the code, please comment.
Related
I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on an NVIDIA GPU.
Here is the code:
Host file:
#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h"
int main()
{
cl_uint ret_num_platforms;
cl_uint ret_num_devices;
cl_platform_id platform_id = NULL;
cl_kernel kernel2 = NULL;
cl_program program2 = NULL;
cl_command_queue command_queue = NULL;
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_int ret;
FILE * fp2;
char fileName2[]="./kernel.cl";
int for_var=0;
char * source_str2;
size_t source_size2;
size_t globalWorkSize[1];
size_t localWorkSize[1];
cl_mem cl_buffer3;
cl_mem cl_buffer2;
cl_mem cl_buffer1;
cl_mem cl_buffer0;
int *A;
int *B;
int *C;
int *n;
int i;
n = (int *)malloc(sizeof(int));
printf("Enter the number of elements of vector : \n");
scanf("%d",n);
A = (int *)malloc((*n) * sizeof(int));
B = (int *)malloc((*n) * sizeof(int));
C = (int *)malloc((*n) * sizeof(int));
printf("\nInput Vector1 :\n");
for (i = 0; i <= *n - 1; i += 1) {
A[i] = (2 * i);
printf("%d ",A[i]);
}
printf("\n\nInput Vector2 :\n");
for (i = 0; i <= *n - 1; i += 1) {
B[i] = (3 * i);
printf("%d ",B[i]);
}
ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
if (ret != CL_SUCCESS) {
printf("Platform error");
}
ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
if (ret != CL_SUCCESS)
printf("device err");
context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
if (!context)
printf("context err");
command_queue = clCreateCommandQueue(context,device_id,0,&ret);
if (!command_queue)
printf("command queue error");
localWorkSize[0] = 16;
globalWorkSize[0] =16400;
cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &ret);
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
fp2 = fopen(fileName2,"r");
if (!fp2) {
fprintf(stderr,"Failed");
exit(1);
}
source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
fclose(fp2);
program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
if(!program2)
printf("error creating program2");
ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
if (ret)
printf("error building program2");
kernel2 = clCreateKernel(program2, "ADD" , &ret);
ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
printf("\n\nAddition of vectors :\n");
for (i = 0; i <= *n - 1; i += 1) {
printf("%d ",C[i]);
}
clReleaseMemObject(cl_buffer0);
clReleaseMemObject(cl_buffer1);
clReleaseMemObject(cl_buffer2);
clReleaseMemObject(cl_buffer3);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
Kernel file (kernel.cl):
__kernel void ADD(__constant int *A,__constant int *B,__global int *C,__constant int *n)
{
int i = get_global_id(0);
if (i <= *n - 1) {
C[i] = (A[i] + B[i]);
}
}
The program works fine if I give 16384 as the total number of vector elements, but it gives 0 as output for anything larger than that. I want to run this program with a large data set so that I can compare its performance with the version running on the CPU.
Please guide me on how I can do so.
There's at least one bug in your code - you're copying MEM_SIZE * sizeof(int) bytes from n to buffer 3:
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
however, n is only sizeof(int) bytes long:
n = ((int *)(malloc((sizeof(int )))));
I don't know what problems this might be causing, and it's entirely possible there are other, more severe bugs, but this one certainly isn't helping.
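Another thing worth checking (this is only a guess based on the code shown, not something confirmed above): globalWorkSize is hard-coded to 16400, so for a larger n most elements are never touched by any work-item. A sketch of deriving the sizes from n instead, rounding the global size up so it stays a multiple of the local size (required in OpenCL 1.x when an explicit local size is given); the padding work-items are filtered out by the kernel's existing if (i <= *n - 1) guard:
localWorkSize[0] = 64;  /* any work-group size the device supports */
globalWorkSize[0] = ((*n + localWorkSize[0] - 1) / localWorkSize[0]) * localWorkSize[0];  /* round up to a multiple of the local size */
ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);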
I feel like I don't understand a basic parallel programming concept. The kernel below is a simple/contrived example that reproduces the problem I'm having. It attempts to use all the values in "points" to calculate a value and assign it to every item in "blocks". I want to push the limits of the size of these arrays. While I can make the "blocks" array quite large (>100 million floats), I get an "invalid command queue" error when "points" is filled with more than ~100 thousand floats (after calling clFinish immediately after clEnqueueNDRangeKernel). Could any of you help me understand why?
__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)
{
int globalId = get_global_id(0);
int count = 0;
for (int i = 0; i < pointsCount; i++)
{
count++;
}
blocks[globalId] = count;
};
Some Device Info:
CL_DEVICE_LOCAL_MEM_SIZE = 49,152
CL_DEVICE_GLOBAL_MEM_SIZE = 2,147,483,648
CL_DEVICE_MAX_MEM_ALLOC_SIZE = 536,870,912
Host Code:
#include "stdafx.h"
#include "CL\opencl.h"
#include <iostream>
#include <fstream>
#include <string>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#define NUM_POINTS 100000
#define NUM_BLOCKS 100000000
struct openClData
{
cl_device_id deviceId = NULL;
cl_uint numDevices;
cl_uint numPlatforms;
cl_int ret;
cl_platform_id *platforms = NULL;
cl_context context;
cl_command_queue commandQueue;
cl_program program;
cl_kernel kernel;
char* kernelCode;
size_t kernelCodeSize;  // must be size_t: its address is passed as const size_t* to clCreateProgramWithSource
size_t globalItemSize;
size_t localItemSize = 1;
};
char* getKernelCode();
void printErrorLog(openClData oclData);
void printRet(openClData oclData, int line);
int countFileChars(const char *fileName);
int _tmain(int argc, _TCHAR* argv[])
{
openClData oclData;
oclData.globalItemSize = NUM_POINTS;
oclData.kernelCode = getKernelCode();
std::cout << oclData.kernelCode << std::endl;
oclData.kernelCodeSize = strlen(oclData.kernelCode);
int numPoints = NUM_POINTS;
int numBlocks = NUM_BLOCKS;
cl_long localMemSize = 0, globalMemSize = 0, maxAllocMemSize = 0;
float *blocks = new float[numBlocks]{0};
float *points = new float[numPoints]{0};
//prepare platform, device, context and command queue
oclData.ret = clGetPlatformIDs(0, NULL, &oclData.numPlatforms);
printRet(oclData, __LINE__);
oclData.platforms = (cl_platform_id *)malloc(oclData.numPlatforms * sizeof(cl_platform_id));
oclData.ret = clGetPlatformIDs(oclData.numPlatforms, oclData.platforms, NULL);
printRet(oclData, __LINE__);
oclData.ret = clGetDeviceIDs(oclData.platforms[0], CL_DEVICE_TYPE_GPU, 1, &oclData.deviceId, &oclData.numDevices);
printRet(oclData, __LINE__);
oclData.context = clCreateContext(NULL, 1, &oclData.deviceId, NULL, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.commandQueue = clCreateCommandQueue(oclData.context, oclData.deviceId, 0, &oclData.ret);
printRet(oclData, __LINE__);
//prepare cl_mem objects
cl_mem memObjBlocks = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numBlocks, NULL, &oclData.ret);
printRet(oclData, __LINE__);
cl_mem memObjPoints = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numPoints, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjPoints, CL_TRUE, 0, sizeof(float) * numPoints, points, 0, NULL, NULL);
printRet(oclData, __LINE__);
//prepare program
oclData.program = clCreateProgramWithSource(oclData.context, 1, (const char**)&oclData.kernelCode, (const size_t *)&oclData.kernelCodeSize, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clBuildProgram(oclData.program, 1, &oclData.deviceId, NULL, NULL, NULL);
printRet(oclData, __LINE__);
if (oclData.ret == CL_BUILD_PROGRAM_FAILURE) printErrorLog(oclData);
oclData.kernel = clCreateKernel(oclData.program, "openClTesting", &oclData.ret);
printRet(oclData, __LINE__);
//set arguments
oclData.ret = clSetKernelArg(oclData.kernel, 0, sizeof(cl_mem), &memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 1, sizeof(cl_mem), &memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 2, sizeof(int), &numPoints);
printRet(oclData, __LINE__);
//run
oclData.ret = clEnqueueNDRangeKernel(oclData.commandQueue, oclData.kernel, 1, NULL, &oclData.globalItemSize, &oclData.localItemSize, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueReadBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
//print some device info
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, 0);
std::cout << "CL_DEVICE_LOCAL_MEM_SIZE = " << localMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_long), &globalMemSize, 0);
std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE = " << globalMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxAllocMemSize, 0);
std::cout << "CL_DEVICE_MAX_MEM_ALLOC_SIZE = " << maxAllocMemSize << '\n';
//clean up
oclData.ret = clFlush(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseKernel(oclData.kernel);
printRet(oclData, __LINE__);
oclData.ret = clReleaseProgram(oclData.program);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clReleaseCommandQueue(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseContext(oclData.context);
printRet(oclData, __LINE__);
for (size_t i = 0; i < 10; i++)
{
std::cout << blocks[i] << std::endl;
}
delete[] blocks;
delete[] points;
return 0;
}
char* getKernelCode()
{
char* kernelCode =
"__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)"
"{"
" int globalId = get_global_id(0);"
" int count = 0;"
" for (int i = 0; i < pointsCount; i++)"
" {"
" count++;"
" }"
"blocks[globalId] = count;"
"}";
return kernelCode;
}
void printErrorLog(openClData oclData)
{
size_t log_size;
clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *)malloc(log_size);
clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
std::cout << log;
free(log);
}
void printRet(openClData oclData, int line)
{
std::cout << line << ", " << oclData.ret << std::endl;
}
int countFileChars(const char *fileName)
{
std::ifstream ifs(fileName);
ifs.seekg(0, std::ios_base::end);
size_t count = ifs.tellg();
ifs.seekg(0, std::ios_base::beg);
return count;
}
A few things I notice:
You're launching NUM_POINTS work-items, but write the result of each to blocks[globalId] - which has NUM_BLOCKS items. So that's undefined behaviour when NUM_POINTS is greater than NUM_BLOCKS. It also explains why varying NUM_BLOCKS does nothing (outside of the above restriction): aside from the memory allocation, the value of NUM_BLOCKS has no effect. (And the memory allocation limit you found roughly matches the CL_DEVICE_MAX_MEM_ALLOC_SIZE value for your implementation.)
You might be running into a kernel timeout condition here. 100000 loop iterations in a single work-item is quite a lot. Depending on the OpenCL implementation, kernels can be killed off if they take too long to run. Consider making better use of the thread-parallelism available, and split the work more horizontally across work-items, rather than looping. Many, shortish-running work-items are typically better than few long-running ones.
On a general note, localItemSize = 1 should be avoided, because it forces every OpenCL work-group to consist of a single work-item, which reduces your parallelism to the number of work-groups your compute device can run in parallel; that is much smaller than the number of work-items it can run. You can simply pass NULL for the local item size instead and let the OpenCL implementation figure out a reasonable value on its own (a sketch of picking a size explicitly follows after this answer):
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
This may also be the source of your error, because you are creating NUM_POINTS work-groups, but the size of the queue on the device is memory-limited (CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE).
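If you do want to choose a local size explicitly rather than passing NULL, a rough sketch (the variable names below are illustrative, not taken from the code above) is to query the per-kernel and per-device limits, then round the global size up to a multiple of the chosen local size, with a bounds check added in the kernel so the padding work-items do nothing:
size_t kernelMax = 0, deviceMax = 0;
clGetKernelWorkGroupInfo(oclData.kernel, oclData.deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelMax), &kernelMax, NULL);
clGetDeviceInfo(oclData.deviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(deviceMax), &deviceMax, NULL);
size_t localSize = (kernelMax < deviceMax) ? kernelMax : deviceMax;
size_t globalSize = ((NUM_POINTS + localSize - 1) / localSize) * localSize;  // round up to a multiple of localSize
// kernel side would then need: if (globalId >= pointsCount) return;
oclData.ret = clEnqueueNDRangeKernel(oclData.commandQueue, oclData.kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);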
I have an OpenCL (1.2) kernel that takes a constant argument, which is a sub-buffer. When I run this kernel, it seems like the parent buffer is used instead. If I use a global const argument, it works as expected.
I would put this down to a driver bug, except I can reproduce it on both Intel (Linux, beignet git) and nVidia (Linux, 367.44-3) implementations on different machines, which makes me think I've made a mistake somewhere.
Below is a working example. The expected output is 1, 1025, 1, 1025,, but instead 1, 1, 1, 1025, is printed.
#include <CL/cl.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#define NELEMS(x) (sizeof(x) / sizeof(*x))
#define PLATFORM 0
#define DEVICE 0
const char src[] =
"kernel void test1(constant int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n"
"\n"
"kernel void test2(global const int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n";
const size_t src_len = sizeof(src);
const char * const kernels[] = {"test1", "test2"};
int main(void) {
cl_int err = -1;
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
assert(num_platforms > PLATFORM);
cl_platform_id * platforms = malloc(sizeof(*platforms) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_uint num_devices;
clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
assert(num_devices >= DEVICE);
cl_device_id * devices = malloc(sizeof(*devices) * num_devices);
clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_context_properties context_properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[PLATFORM], 0
};
cl_context context = clCreateContext(context_properties, 1, &devices[DEVICE], NULL, NULL, &err);
assert(err == CL_SUCCESS);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
cl_command_queue queue = clCreateCommandQueue(context, devices[DEVICE], 0, &err);
#pragma GCC diagnostic pop
assert(err == CL_SUCCESS);
cl_program program;
{
// Crashes if directly using src[]
char * source = malloc(src_len);
memcpy(source, src, src_len);
program = clCreateProgramWithSource(context, 1, (const char **) &source, &src_len, &err);
assert(err == CL_SUCCESS);
free(source);
}
err = clBuildProgram(program, 1, &devices[DEVICE], "", NULL, NULL);
assert(err == CL_SUCCESS);
size_t buffer_size = 8192;
size_t subbuffer_size = buffer_size / 2;
{
cl_uint align;
err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align), &align, NULL);
assert(err == CL_SUCCESS);
assert(subbuffer_size % align == 0);
cl_ulong constbuf_size;
err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(constbuf_size), &constbuf_size, NULL);
assert(err == CL_SUCCESS);
assert(constbuf_size > subbuffer_size);
}
cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, &err);
assert(err == CL_SUCCESS);
cl_mem sub_buffers[2];
for (size_t i = 0; i < NELEMS(sub_buffers); i++){
cl_buffer_region region = {
.origin = i * subbuffer_size,
.size = subbuffer_size,
};
sub_buffers[i] = clCreateSubBuffer(buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
assert(err == CL_SUCCESS);
}
{
cl_int * data = clEnqueueMapBuffer(queue, buffer, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, 0, NULL, NULL, &err);
assert(err == CL_SUCCESS);
for (size_t i = 0; i < buffer_size / sizeof(cl_int); i++)
data[i] = i;
cl_event unmap_event;
err = clEnqueueUnmapMemObject(queue, buffer, data, 0, NULL, &unmap_event);
assert(err == CL_SUCCESS);
err = clWaitForEvents(1, &unmap_event);
assert(err == CL_SUCCESS);
}
for (size_t k = 0; k < NELEMS(kernels); k++) {
cl_kernel kernel = clCreateKernel(program, kernels[k], &err);
assert(err == CL_SUCCESS);
cl_event run_event;
for (size_t i = 0; i < NELEMS(sub_buffers); i++){
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &sub_buffers[i]);
assert(err == CL_SUCCESS);
size_t work_size[] = {subbuffer_size / sizeof(cl_int)};
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, &run_event);
assert(err == CL_SUCCESS);
err = clWaitForEvents(1, &run_event);
assert(err == CL_SUCCESS);
err = clFinish(queue);
assert(err == CL_SUCCESS);
}
clReleaseKernel(kernel);
}
puts("");
for (size_t i = 0; i < NELEMS(sub_buffers); i++)
clReleaseMemObject(sub_buffers[i]);
clReleaseMemObject(buffer);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(devices);
free(platforms);
return 0;
}
This is interesting. I tried it on different devices: on a MacBook Pro there are 3 devices (NVIDIA, Iris, and Intel), and all give the correct output. On Windows 10 on the same MBP with the NVIDIA driver, the output is wrong in exactly the same way.
I think it's an NVIDIA bug, but it may not be limited to NVIDIA.
I need help with one function in OpenCL. When I start using clEnqueueNDRangeKernel instead of clEnqueueTask, the program takes much more time to finish. Why is that? As I understand it, the program should use the data-parallel model and run faster; am I wrong? And if I am, how can I change the code to see the data-parallel model actually at work?
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
for (int i = 0, j = 0; i < InSize; i+=4, j++)
{
unsigned char Value = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
pDataOut[j] = Value;
}
}
int iWidth, iHeight, iBpp;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
int err = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
if (err != 0 || pDataIn.size() == 0 || iBpp != 32)
{
std::cout << "error load input file!\n";
}
pDataOut.resize(pDataIn.size()/4);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
unsigned int SizeIn, SizeOut;
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
FILE *fp;
char fileName[] = "./kernel.cl";
char *source_str;
size_t source_size;
//Loading kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
system("PAUSE");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
//Getting Platform and Device
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//Create context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
//create kernel program
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
//build it
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//create queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//create bufer
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, pDataIn.size(), NULL, &ret);
memobj1 = clCreateBuffer(context, CL_MEM_READ_WRITE,pDataOut.size(), NULL, &ret);
//copy buffer to kernel
ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, pDataIn.size(), pDataIn.data(), 0, NULL, NULL);
//create opencl kernel
kernel = clCreateKernel(program, "red_to_green", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
ret = clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&SizeIn);
ret = clSetKernelArg(kernel, 3, sizeof(unsigned int), (void *)&SizeOut);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = 8;
size_t local_item_size = 4;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
//copy from buffer
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
ret = clFinish(command_queue);
float end_time = clock();
search_time = end_time - start_time;
//float last_time = last_time + search_time;
cout << search_time << endl;
}
clock_t t1 = clock();
double time_seconds = (t1-t0)*CLOCKS_PER_SEC/cycles_max;
cout << time_seconds/1000 <<endl;
WriteBmpFile(L"3840x2160_wb.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);
system("PAUSE");
from the docs page:
The kernel is executed using a single work-item.
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with
work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1,
and local_work_size[0] set to 1.
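So the commented-out clEnqueueTask call in the question corresponds to launching exactly one work-item. A sketch of that equivalence, using the variable names from the question:
size_t one = 1;
//same effect as: ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &one, &one, 0, NULL, NULL);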
When you use clEnqueueNDRangeKernel, you are using 2 work groups of 4 work items, but they are all doing the same work. They all read from the same global memory, but more importantly, they all try to write to the same locations in global memory.
You need to take into account the worker's global id when doing your computations.
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
int gid = get_global_id(0);
int gsize = get_global_size(0);
for (int j = gid; j < (InSize >> 2); j+= gsize)
{
unsigned char Value = (pDataIn[j*4] + pDataIn[j*4 + 1] + pDataIn[j*4 + 2]) / 3;
pDataOut[j] = Value;
}
}
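Since every work-item now strides through the whole image, correctness no longer depends on a particular global size; a launch like the following would work (the 8192 is only an illustrative choice, and more work-items generally expose more parallelism, up to one per output pixel):
size_t global_item_size = 8192;  //illustrative; any positive size works with the strided kernel
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);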
It looks like you are iterating over all pixels of an input image in your kernel. This will cause all threads to calculate the image intensity for all pixels. Try to launch a single thread for each pixel instead. To do so, change your kernel source code to only calculate the output value for one pixel:
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut) {
int j = get_global_id(0);
int i = j*4;
pDataOut[j] = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
}
This code now performs the averaging over the RGB values of your RGBA input image for the single pixel with index j. Now all you need to do is launch as many threads as your image has pixels. Relevant changes:
//create opencl kernel
kernel = clCreateKernel(program, "black_white_img", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = iWidth * iHeight;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);
This should give a considerable speedup compared to your code.
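A side note on the timing itself (not part of the change above): clock() around the loop also measures the enqueue and read-back overhead. If the goal is to compare just the kernel time, the queue can be created with profiling enabled and each launch timed with an event; a sketch reusing the variable names from the question:
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
cl_event evt;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, NULL, 0, NULL, &evt);
clWaitForEvents(1, &evt);
cl_ulong t_start = 0, t_end = 0;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL);
cout << "kernel time: " << (t_end - t_start) * 1e-6 << " ms" << endl;  //profiling info is in nanoseconds
clReleaseEvent(evt);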
I'm new to OpenCL, and I'm having trouble running my first ever code. I have an ATI graphics card and I've done the necessary installation from http://www.thebigblob.com/getting-started-with-opencl-and-gpu-computing/. The following is the code I picked off that website to test my setup.
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 1024;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
...as per the instructions there. I keep getting "Failed to load kernel", which means it could not open the file. Where do I get the kernel source from? Could someone please tell me how to get it to run? Thanks in advance.
gcc -c -I /home/suraj/Desktop/Intern/softwares/AMD-APP-SDK-v2.7-lnx32/AMD-APP-SDK-v2.7-RC-lnx32/include opencl.c -o opencl.o
gcc opencl.o -o host -L /home/suraj/Desktop/Intern/softwares/AMD-APP-SDK-v2.7-lnx32/AMD-APP-SDK-v2.7-RC-lnx32/lib/x86 -l OpenCL
I just went through this exercise with this source code as well, and everything worked fine for me. Did you grab the kernel program from github? It is the third program down in the list called "vector_add_kernel.cl". The C program needs the kernel to actually run.
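For reference, the kernel this host code expects looks roughly like this (a sketch matching the host-side arguments; the actual file in the tutorial's repository may differ slightly). Save it as vector_add_kernel.cl next to the executable:
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    // each work-item adds one pair of elements
    int i = get_global_id(0);
    C[i] = A[i] + B[i];
}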