OpenCl Error -40 - runtime-error

I've been coding some openCl programs for a while and now i'm trying to make a simple program that read an image in .pam format and then save it as is.
Now the problem is that when i run it, giving a name to open an image it returns an error -40 when creating the first 2dImage.
Here's the code:
#define SEPARATOR "==============================================\n"
/* Copia tutto tranne <imgInfo>.data
* -Questo vuol dire che l'immagine avrà anche stessa altezza e larghezza */
void copy_img_info(imgInfo* src, imgInfo *dst){
dst->channels=src->channels;
dst->data_size=src->data_size;
dst->depth=src->depth;
dst->height=src->height;
dst->maxval=src->maxval;
dst->width=src->width;
}
cl_event launch_op(cl_command_queue que, cl_kernel k_op,
imgInfo info_open, imgInfo info_to_save,
int _lws,
cl_mem src, cl_mem dst,
cl_int num_events, const cl_event* wait_list){
cl_int err;
cl_event evt_kernel;
size_t lws[]={
_lws ? _lws : 16, _lws ? _lws : 16
};
size_t gws[]={
round_mul_up(info_open.height, lws[0]), round_mul_up(info_open.width, lws[1])
};
err = clSetKernelArg(k_op, 0, sizeof(src),&src);
ocl_check(err, "Set op k arg 0");
err = clSetKernelArg(k_op, 1, sizeof(dst),&dst);
ocl_check(err, "Set op k arg 1");
err = clEnqueueNDRangeKernel(que, k_op, 2, NULL, gws, lws, num_events, wait_list, &evt_kernel);
ocl_check(err, "Enqueue kernel op");
return evt_kernel;
}
int main(int argc, char* argv[]) {
if( argc <2 ){
printf("Inserire nome del file\n");
exit(EXIT_FAILURE);
}
int _lws;
if( argc <3)
_lws= 0;
else{
_lws= atoi(argv[2]);
printf("Sarà usato %d come local work size\n",_lws);
}
imgInfo info_open, info_to_save;
cl_event evt_fill[2], evt_upload, evt_op, evt_download;
cl_image_format format = {
.image_channel_data_type = CL_UNSIGNED_INT16,
.image_channel_order = CL_RGBA
};
cl_int err;
cl_platform_id p = select_platform();
cl_device_id d = select_device(p);
cl_context ctx = create_context(p, d);
cl_command_queue que = create_queue(ctx, d);
cl_program prog = create_program("Kernels/image_tr.ocl",NULL , ctx, d);
if(load_pam(argv[1], &info_open)){
printf("Errore durante apertura file\n");
exit(EXIT_FAILURE);
}
copy_img_info(&info_open, &info_to_save);
/* ALLOCATION OF BUFFERS */
cl_mem image_to_open = clCreateImage2D(ctx, CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY , &format, info_open.width, info_open.height,
info_open.width, NULL, &err );
ocl_check(err, "Allocate image to open");
cl_mem image_to_save = clCreateImage2D(ctx, CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY , &format, info_to_save.width, info_to_save.height,
info_to_save.width, NULL, &err );
ocl_check(err, "Allocate image to save");
/* ALLOCATION OF BUFFERS */
/* FILL IMAGES */
const size_t fill_color[]={0,0,0,0};
const size_t origin[3]= {0,0,0};
const size_t region[3]= { info_open.width, info_open.height, 1};
err = clEnqueueFillImage(que, image_to_open, fill_color , origin, region, 0, NULL, evt_fill);
ocl_check(err, "Enqueue Fill Buffer to open");
err = clEnqueueFillImage(que, image_to_save, fill_color , origin, region, 0, NULL, evt_fill + 1);
ocl_check(err, "Enqueue Fill Buffer to save");
/* FILL IMAGES */
/* UPLOAD IMAGE ON GPU */
err = clEnqueueWriteImage(que, image_to_open, CL_TRUE, origin, region,
info_open.width, 0, info_open.data, 1, evt_fill, &evt_upload );
ocl_check(err, "Upload image on GPU");
/* UPLOAD IMAGE ON GPU */
/* CREATION OF KERNELS */
cl_kernel k_op = clCreateKernel(prog, "op", &err );
ocl_check(err, "Creation of kernel op");
/* CREATION OF KERNELS */
/* LAUNCH CUSTOMS KERNELS */
const cl_event evt_wait_list [] = {
evt_fill, evt_upload
};
evt_op = launch_op(que, k_op, info_open, info_to_save, _lws, image_to_open, image_to_save, 2, evt_wait_list);
/* LAUNCH CUSTOMS KERNELS */
/* DOWNLOAD FROM GPU */
err = clEnqueueReadImage(que, image_to_save, CL_TRUE, origin, region, info_to_save.width, 0,
info_to_save.data, 1, &evt_op, &evt_download);
ocl_check(err, "Download from device");
/* DOWNLOAD FROM GPU */
/* SAVE PAM FILE */
if(save_pam("ocl_image_tr.pam",&info_to_save))
printf("Errore salvataggio file Pam\n");
/* SAVE PAM FILE */
/* BENCHMARKING */
cl_ulong runtime_fill[2]={
runtime_ns(*evt_fill), runtime_ns(*(evt_fill+1) )
};
cl_ulong runtime_upload= runtime_ns(evt_upload);
cl_ulong runtime_op = runtime_ns(evt_op);
cl_ulong runtime_download = runtime_ns(evt_download);
printf(SEPARATOR);
printf(" Kernel \t Runtime \t Bandwidth \t GFLOPS\n");
printf(" Fill_1 \t %gms \t %gGB/s \t %g GFLOPS\n", 1.0e-6*runtime_fill[0],
(double)(info_open.height*info_open.width*sizeof(cl_short4))/(runtime_fill[0]),
(double)(info_open.height*info_open.width*sizeof(cl_short4))/(runtime_fill[0]));
printf(" Fill_2 \t %gms \t %gGB/s \t %g GFLOPS\n", 1.0e-6*runtime_fill[1],
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_fill[1]),
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_fill[1]));
printf(" Upload \t %gms \t %gGB/s \t %g GFLOPS\n", 1.0e-6*runtime_upload,
(double)(info_open.height*info_open.width*sizeof(cl_short4))/(runtime_upload),
(double)(info_open.height*info_open.width*sizeof(cl_short4))/(runtime_upload));
printf(" Op \t %gms \t %gGB/s \t %g GFLOPS\n", 1.0e-6*runtime_op,
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_op),
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_op));
printf(" Download \t %gms \t %gGB/s \t %g GFLOPS\n", 1.0e-6*runtime_download,
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_download),
(double)(info_to_save.height*info_to_save.width*sizeof(cl_short4))/(runtime_download));
printf(SEPARATOR);
/* BENCHMARKING */
/* CLEANING... */
clReleaseEvent(evt_op);
clReleaseEvent(evt_download);
clReleaseEvent(*evt_wait_list);
clReleaseEvent(*(evt_wait_list+1));
clReleaseEvent(*evt_fill);
clReleaseEvent(*(evt_fill+1));
clReleaseMemObject(image_to_open);
clReleaseMemObject(image_to_save);
clReleaseDevice(d);
clReleaseKernel(k_op);
clReleaseProgram(prog);
clReleaseContext(ctx);
clReleaseCommandQueue(que);
return 0;
}
And here is the kernel:
__kernel void op(image2d_t read_only src, image2d_t write_only dst){
// Questa mappatura è più efficiente per la GPU
const sampler_t sampler_ui = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST ;
int row= get_global_id(1);
int col=get_global_id(0);
if( row > get_image_height(src) && col > get_image_width(src))
return;
uint4 pix= read_imageui(src, sampler_ui ,(int2)(row,col));
write_imageui(dst, (int2)(row,col), pix);
}
The headers i used are been created by my teacher of GPGPU and i can't give it.
There's the error given:
Allocate image to open - error -40
The message is that I print in allocating the images.
What is going on?

For anyone that is having the same problem.
I resolved in this way
cl_uint numb;
clGetSupportedImageFormats(ctx, NULL , CL_MEM_OBJECT_IMAGE2D, 0, NULL, &numb);
printf("Numb: %u\n", numb);
This returns me always Numb: 0
But when i changed the CreateImage2d with CreateImage from Opencl 1.2...
It finally works

Related

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (lets name it "+") with associative property. When you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'am learning OpenCL and trying to implement this approach to summarize all elements in array. I am a total newbie in this technology, so the program might look something weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
{
size_t gl = get_global_id (0);
size_t s = get_local_size (0);
int i;
float accum = 0;
for (i=0; i<s; i++) {
accum += input[s*gl+i];
}
output[gl] = accum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
double gettime ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run kernel until there is only one element in the buffer. Is this correct approach to compute sum of elements in OpenCL? The time which I measure with gettime is about 10 times slower when execution time of a simple loop on CPU (compiled clang 4.0.0 and -O2 -ffast-math flags). Hardware I use: Amd Ryzen 5 1600X and Amd Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

preferred vector width in opencl device

I'm a beginner in OpenCL and am trying to run the sample codes of the "OpenLC in Action" book. I have the following code to get the preferred vector width of my device. The platforms detected on my computer are from Intel Core i7 and HD graphics and another one from NVIDIA GeForce 940M. Whenever I run the code, it gives "1" for vector width of every type unless type double which is zero because it is not supported. Even when I change the platform in my computer to check its devices, results are the same. I ran the code on an AMD computer and it seemed to work properly because it gave me different numbers for different types. But, I am not sure why this code keeps giving me "1" for every type on different platforms of my computer. Any ideas?
Here is the output:
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices, vector_width;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
if (strstr(devi_name_data, "GeForce 940M")){
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in chars: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in shorts: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in ints: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in longs: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in floats: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in doubles: %u\n", vector_width);
}
}
}
return 0;
}
Short answer: You are querying it correctly, and the platform compiler knows what is the best vector width size. So yes, it is correct the value of 1.
Long answer: For a CPU (any type of CPU) it is likely to prefer non vectored. Specially on Intel CPU + Compiler, since the Intel compiler does the vectorization as part of the optimization process, so it prefers the user NOT to vectorize the code in the first place.
Indeed it looks like nVIDIA also prefers the user to input non vectorized code. It does not mean code will run slower if vectorized already. It just means the compiler (due to the optimization techniques it has) prefers the code to be unvectorized.
Updates to the OpenCL drivers may lead to a change of these values.
Also, you should take them as orientative. Other factors as: local memory usage, coalesced global access, local size, etc... are way more important usually.
Here is one experiment that I've done to see how vectorized operations perform in a device which prefers to do scalar operations. I have implemented the reduction algorithm with two different kernels. The first kernel treats data as scalars while the second treats data as float4 vectors (Codes are given below). Here is the execution results. It is clear that although the NVIDIA device prefers non-vectorized operation, vectorized operation is faster.
Preferred vector width: 1
reduction_scalar: Check passed.
Total time = 4471424
reduction_vector: Check passed.
Total time = 1723776
And here is the code:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"
#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err == CL_DEVICE_NOT_FOUND) {
printf(" GPU is not first! Going on CPU :(");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel[NUM_KERNELS];
cl_command_queue queue;
cl_event prof_event;
cl_int i, j, err, preferred_width;
size_t local_size, global_size;
char kernel_names[NUM_KERNELS][20] =
{ "reduction_scalar", "reduction_vector" };
/* Data and buffers */
float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
//float data[ARRAY_SIZE];
float sum, actual_sum, *scalar_sum, *vector_sum;
cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
cl_int num_groups;
cl_ulong time_start, time_end, total_time;
/* Initialize data */
for (i = 0; i<ARRAY_SIZE; i++) {
data[i] = 1.0f*i;
}
/* Create device and determine local size */
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(preferred_width), &preferred_width, NULL);
printf("Preferred vector width: %d\n", preferred_width);
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(local_size), &local_size, NULL);
if (err < 0) {
perror("Couldn't obtain device information");
exit(1);
}
/* Allocate and initialize output arrays */
num_groups = ARRAY_SIZE / local_size;
scalar_sum = (float*)malloc(num_groups * sizeof(float));
vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
for (i = 0; i<num_groups; i++) {
scalar_sum[i] = 0.0f;
}
for (i = 0; i<num_groups / 4; i++) {
vector_sum[i] = 0.0f;
}
/* Create a context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
if (err < 0) {
perror("Couldn't create a buffer");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
for (i = 0; i<NUM_KERNELS; i++) {
/* Create a kernel */
kernel[i] = clCreateKernel(program, kernel_names[i], &err);
if (err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create kernel arguments */
err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
if (i == 0) {
global_size = ARRAY_SIZE;
err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
}
else {
global_size = ARRAY_SIZE / 4;
err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
}
if (err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Finish processing the queue and get profiling information */
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
/* Read the result */
if (i == 0) {
err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups; j++) {
sum += scalar_sum[j];
}
}
else {
err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups / 4; j++) {
sum += vector_sum[j];
}
}
/* Check result */
printf("%s: ", kernel_names[i]);
actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
if (fabs(sum - actual_sum) > 0.01*fabs(sum))
printf("Check failed.\n");
else
printf("Check passed.\n");
printf("Total time = %lu\n\n", total_time);
/* Deallocate event */
clReleaseEvent(prof_event);
}
/* Deallocate resources */
free(scalar_sum);
free(vector_sum);
for (i = 0; i<NUM_KERNELS; i++) {
clReleaseKernel(kernel[i]);
}
clReleaseMemObject(scalar_sum_buffer);
clReleaseMemObject(vector_sum_buffer);
clReleaseMemObject(data_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
and the kernels:
__kernel void reduction_scalar(__global float* data,
__local float* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = partial_sums[0];
}
}
__kernel void reduction_vector(__global float4* data,
__local float4* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
}
}

openCL Long Overflowing

Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
__kernel void collatz(__global int* in, __global int* out)
{
uint id = get_global_id(0);
unsigned long n = (unsigned long)id;
uint count = 0;
while (n > 1) {
if (n % 2 == 0) {
n = n / 2;
} else {
if(n == 1572066143) {
unsigned long test = n;
printf("BEFORE - %lu\n", n);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
n = (3 * n) + 1;
} else {
n = (3 * n) + 1;
}
}
count = count + 1;
}
out[id] = count;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host side and device size values have different sizes. In host, long can vary from 32 to 64bits, depending on the platform. In device, long refers to 64bits only.
printf() function, as defined in C says that %ld is to print long (host side long) numbers. You are using printf in a kernel, so.... It could be that the C-like parser is used, therefore printing the variable as a 32bits long.
Can you try printing it as %lld or as a floating point?

OpenCL clCreateKernel throws CL_INVALID_PROGRAM_EXECUTABLE

im new with OpenCL, I have a problem in clCreateKernel, it throws CL_INVALID_PROGRAM_EXECUTABLE, could anybody help, the code is based on http://www.cs.bris.ac.uk/home/simonm/workshops/OpenCL_lecture3.pdf , the last optimization
Here is the code:
#define ORDER 10 // Order of the square matrices A, B, and C
#define AVAL 3.0 // A elements are constant and equal to AVAL
#define BVAL 5.0 // B elements are constant and equal to BVAL
#define TOL (0.001) // tolerance used in floating point comparisons
#define DIM 2 // Max dim for NDRange
#define COUNT 1 // number of times to do each multiplication
#define SUCCESS 1
#define FAILURE 0
// Funciones Auxiliares
void initmat(int Mdim, int Ndim, int Pdim, float *A, float *B, float *C)
{
int i, j;
/* Initialize matrices */
for (i = 0; i < Ndim; i++)
for (j = 0; j < Pdim; j++)
A[i*Ndim+j] = AVAL;
for (i = 0; i < Pdim; i++)
for (j = 0; j < Mdim; j++)
B[i*Pdim+j] = BVAL;
for (i = 0; i < Ndim; i++)
for (j = 0; j < Mdim; j++)
C[i*Ndim+j] = 0.0f;
}
// Definicion de la funcion:
char * readKernel(void)
{
size_t *source_length;
FILE *fp = fopen("kernel.cl", "r");
if (fp == NULL)
{
printf("Cannot Open Kernel.cl\n");
}
else
{
printf("Kernel.cl Opened\n");
}
fseek(fp, 0, SEEK_END);
source_length[0] = ftell(fp);
if (source_length[0] == 0)
{
printf("Kernel.cl is empty\n");
}
else
{
printf("Kernel.cl length: %zu bytes\n", source_length[0]);
}
char *source = (char*) calloc(source_length[0] + 1, 1);
if (source == 0)
{
printf("Memory allocation failed");
}
fseek(fp, 0, SEEK_SET);
fread(source, 1, source_length[0], fp);
printf("Kernel.cl Read\n");
return source;
}
int main(int argc, char **argv)
{
// Declare and iniciate data
float *A, *B, *C;
int Mdim, Ndim, Pdim;
int err, szA, szB, szC;
size_t global[DIM];
size_t local[DIM];
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in, b_in, c_out;
Ndim = ORDER;
Pdim = ORDER;
Mdim = ORDER;
szA = Ndim*Pdim;
szB = Pdim*Mdim;
szC = Ndim*Mdim;
A = (float *)malloc(szA*sizeof(float));
B = (float *)malloc(szB*sizeof(float));
C = (float *)malloc(szC*sizeof(float));
const char* C_elem_KernelSource =
"__kernel \n"
"void mmul( \n"
" const int Mdim, \n"
" const int Ndim, \n"
" const int Pdim, \n"
" __global float* A, \n"
" __global float* B, \n"
" __global float* C, \n"
" __local float* Bwrk) \n"
"{ \n"
" int k,j; \n"
" int i = get_global_id(0); \n"
" int iloc = get_local_id(0); \n"
" int nloc = get_local_size(0); \n"
" float Awrk[10]; \n"
" float tmp; \n"
" for (k=0; k<Pdim; k++) \n"
" Awrk[k] = A[i*Ndim+k]; \n"
" for (j=0; j<Mdim; j++){ \n"
" for (k=iloc; k<Pdim; k=k+nloc) \n"
" Bwrk[k] = B[k*Pdim+j]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" tmp = 0.0f; \n"
" for (k=0; k<Pdim; k++) \n"
" tmp += Awrk[k] * Bwrk[k]; \n"
" C[i*Ndim+j] += tmp; \n"
"} \n"
;
initmat(Mdim, Ndim, Pdim, A, B, C);
// Setup the plataform
cl_uint num_platforms;
if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS)
{
printf("Unable to get platform!\n");
}else{
printf("Plataformas Disponibles: %u \n", num_platforms);
}
//Identificador
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, &num_platforms);
printf("Plataformas creada\n");
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err==CL_SUCCESS){
printf("Device creado \n");
}else {
printf("Error %d \n", err);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
if (err==CL_SUCCESS){
printf("Contexto creado \n");
}else {
printf("Error creando contexto \n");
}
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (err==CL_SUCCESS){
printf("cola de comandos creadas \n");
}else {
printf("Error creando cola de comandos \n");
}
// Setup buffers and write A and B matrices to the device memory
a_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szA, NULL, NULL);
b_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szB, NULL, NULL);
c_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
// Build the program, define the kernel and setup arguments
program = clCreateProgramWithSource(context, 1, (const char **) &C_elem_KernelSource, NULL, &err);
if (err==CL_SUCCESS){
printf("programa creado \n");
}else {
printf("Error generado %d creando programa\n", err);
}
//Compila el programa en el dispositivo elegido
clBuildProgram(program, 1, &device_id, NULL, NULL, NULL );
if (err==CL_SUCCESS){
printf("programa compilado 1\n");
}else {
printf("Error generado %d compilando programa 1\n", err);
}
kernel = clCreateKernel(program, "mmul", &err);
if (err==CL_SUCCESS){
printf("Kernel creado \n");
}else {
printf("Error generado %d creando kernel\n", err);
}
err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err |= clSetKernelArg(kernel, 6, sizeof(float)*Pdim, NULL);
if (err==CL_SUCCESS){
printf("Argumentos del Kernel configurados \n");
}else {
printf("Error configurando argumentos del kernel \n");
}
//Run the kernel and collect results
// 1D ND Range set to dimensions of C matrix
//Local Dim set to 250 so number of work-groups match number of
//compute units (4 in this case) for our order 1000 matrices
//Pass local memory to kernels. This requires a change to the kernel
//argument list … a new call to clSetKernelArg is needed
printf("Encolando Kernel:\n");
global[0] = (size_t) Ndim; global[1] = (size_t) Mdim; local[0] = (size_t) 2;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, NULL);
if (err==CL_SUCCESS){
printf("Kernel enviado a device \n");
}else {
printf("Error enviando kernel a device \n");
}
clFinish(commands);
err = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, NULL, NULL );
//test_results(A, B, c_out);
}
Thanks
The main problem is that the open brace on line 112 has no matching closing brace:
" for (j=0; j<Mdim; j++){ \n"
Also note that the pointer declared on line 34 is used without initialization:
size_t *source_length;
On line 170, an err= should be added to the clBuildProgram() call so that the error checking works as intended. Then you can add logic to use clGetProgramBuildInfo() to get details in the case of a build fail.

openCL simple add vector returns gargabe values

Here is my attempt to write a opencl code to add 2 vectors
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
//24/12
//data structure platform, device, context,program, kernel, command queue
void main()
{
/////////////////////////////////////////////////////////////////////
//PLATFORM QUERY:
/////////////////////////////////////////////////////////////////////
//clGetPlatformIDs(num_entries, platforms, &num_platforms);
// two part: platform = NULL
// malloc and get platforms*
cl_uint num_platforms; //must be uint
cl_platform_id *platforms;
clGetPlatformIDs(5, NULL, &num_platforms);
printf("There are %d platforms \n", num_platforms);
platforms = (cl_platform_id*) malloc (num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(5, platforms, &num_platforms);
for(int i = 0; i < num_platforms; i++)
{
char name[40],vendor[40],version[40], profile[40],extensions[4096];
clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME, sizeof(name), &name, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(vendor), &version, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE, sizeof(vendor), &profile, NULL);
//clGetPlatformInfo(platforms[i],CL_PLATFORM_EXTENSIONS, sizeof(vendor), &extensions, NULL);
printf("Platform %d \n", i);
printf("Name %s \n", name);
printf("Vendor %s \n", vendor);
printf("Version %s \n", version);
printf("Profile %s \n", profile);
//printf("Extension %s \n", extensions);
printf("----------------------------------\n");
}
////////////////////////////////////////////////////////////////
//DEVICES QUERYING
////////////////////////////////////////////////////////////////
cl_device_id* devices;
cl_uint num_devices;
cl_device_fp_config flag ;
for(int i= 0; i< num_platforms; i++)
{
printf("Platform %d has:\n",i);
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 3, NULL, &num_devices);
devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
char name[40];
for(int j=0; j < num_devices; j++)
{
int err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(name),name,NULL);
if (err<0)
{
//printf("Error querying devices name\n");
}
else
{
printf("Device name %s \n", name);
}
err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(flag),&flag,NULL);
if (flag & CL_FP_DENORM)
{
printf("This device support denormalized number \n");
}
}
printf("-----------------------------------\n");
}
///////////////////////////////////////////////////////
//CONTEXT QUERYING AND CREATING
////////////////////////////////////////////////////////
//NOTE clCreateContext returns cl_context instead of errors
//REF_COUNT if very important in the future
//create context for GPU
cl_context context;
cl_uint ref_count;
cl_int err;
char name[40];
context= clCreateContext(NULL,1,&devices[0], NULL,NULL,&err);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Original reference count is %d \n",ref_count);
/*clRetainContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Incremented reference count is %d \n",ref_count);
clReleaseContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Decremented reference count is %d \n",ref_count);*/
////////////////////////////////////////////////////////
//Create programme
///////////////////////////////////////////////////////
size_t program_size;
err=0;
cl_program program;
char* program_buffer;
FILE* program_handle = fopen("kernel.cl","r");
//More recommendable than source code???
program_buffer = (char*)malloc(MAX_SOURCE_SIZE);
program_size = fread( program_buffer, 1, MAX_SOURCE_SIZE, program_handle);
fclose( program_handle );
program = clCreateProgramWithSource(context,1,(const char**) &program_buffer,
(size_t*)&program_size, &err);
////////////////////////////////////////////////////////
//Build Program
///////////////////////////////////////////////////////
//const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
char* program_log;
size_t log_size;
err= clBuildProgram(program, 1 , devices, NULL, NULL, NULL);
if(err < 0) //debug , printing log
{
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size+1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,
program_log,NULL);
printf("%s\n",program_log);
free(program_log);
//exit(1);
}
///////////////////////////////////////////////////////////////////////////////////
//create kernel
///////////////////////////////////////////////////////////////////////////////////
cl_uint num_kernels;
cl_kernel kernel;
char kernel_name[40];
kernel = clCreateKernel(program,"add",&err);
if (err<0)
{
perror("could not found any kernels\n");
}
//kernels = (cl_kernel*)malloc(num_kernels*sizeof(cl_kernel));
//clCreateKernelsInProgram(program, num_kernels, kernels, NULL);
///FOR REFERNECE
//for(int i=0; i<num_kernels; i++)
//{
clGetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,sizeof(kernel_name),kernel_name,NULL);
printf("Kernel function: %s \n",kernel_name);
//}
/////////////////////////////////////////////////////
//Create command queue
/////////////////////////////////////////////////////
cl_command_queue queue = clCreateCommandQueue(context, devices[0],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
if (err < 0)
{
printf("Couldn't create command queue \n");
exit(1);
}
clEnqueueTask(queue, kernel, 0, NULL, NULL);//only enqueue
//////////////////////////////////////////
unsigned int n= 1000;
int* h_a;
int* h_b;
int* h_c;
cl_mem d_a;
cl_mem d_b;
cl_mem d_c;
h_a = (int*) malloc(n*sizeof(int));
h_b = (int*) malloc(n*sizeof(int));
h_c = (int*) malloc(n*sizeof(int));
for(int i=0; i< n; i++)
{
h_a[i]= 1;//sinf(i)*sinf(i);
h_b[i]= 1;//cosf(i)*cosf(i);
}
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
err = clEnqueueWriteBuffer(queue,d_a,CL_TRUE,0,sizeof(h_a),h_a,0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue,d_b,CL_TRUE,0,sizeof(h_b),h_a,0, NULL, NULL);
//////set argument
err= clSetKernelArg(kernel,0,sizeof(cl_mem),&d_a);
err= clSetKernelArg(kernel,1,sizeof(cl_mem),&d_b);
err= clSetKernelArg(kernel,2,sizeof(cl_mem),&d_c);
err= clSetKernelArg(kernel,3,sizeof(unsigned int),&n);
///////////////
size_t globalsize, localsize;
localsize=64;
globalsize=ceil(n/(float)localsize)*localsize;
err= clEnqueueNDRangeKernel(queue,kernel,1, NULL,&globalsize,&localsize,0,NULL,NULL);
////////////////////////
clFinish(queue);
err=clEnqueueReadBuffer(queue, d_c,CL_TRUE, 0, sizeof(h_c), h_c, 0 , NULL, NULL);
for(int i = 0; i< n; i++)
{
printf(" h_c[%d] = %d \n", i, h_c[i]);
}
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseKernel(kernel);
free(h_a);
free(h_b);
free(h_c);
getchar();
}
and here is my kernel.cl
__kernel void add(__global int * a, __global int *b, __global int* c, const unsigned n)
{
int id= get_global_id(0);
if (id<n)
c[id]= a[id] + b[id];
}
With this, I only received garbage values , for example h_c[i]= -842150451 for all i.
Please help me to fix it. Thanks!
This statement is not correct :
sizeof(h_a)
Should be something like :
n * sizeof(int)
Indeed h_a is just a pointer so sizeof(h_a) = sizeof(int) => you have the space for only one item.

Resources