preferred vector width in opencl device - opencl

I'm a beginner in OpenCL and am trying to run the sample codes of the "OpenLC in Action" book. I have the following code to get the preferred vector width of my device. The platforms detected on my computer are from Intel Core i7 and HD graphics and another one from NVIDIA GeForce 940M. Whenever I run the code, it gives "1" for vector width of every type unless type double which is zero because it is not supported. Even when I change the platform in my computer to check its devices, results are the same. I ran the code on an AMD computer and it seemed to work properly because it gave me different numbers for different types. But, I am not sure why this code keeps giving me "1" for every type on different platforms of my computer. Any ideas?
Here is the output:
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices, vector_width;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
if (strstr(devi_name_data, "GeForce 940M")){
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in chars: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in shorts: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in ints: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in longs: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in floats: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in doubles: %u\n", vector_width);
}
}
}
return 0;
}

Short answer: You are querying it correctly, and the platform compiler knows what is the best vector width size. So yes, it is correct the value of 1.
Long answer: For a CPU (any type of CPU) it is likely to prefer non vectored. Specially on Intel CPU + Compiler, since the Intel compiler does the vectorization as part of the optimization process, so it prefers the user NOT to vectorize the code in the first place.
Indeed it looks like nVIDIA also prefers the user to input non vectorized code. It does not mean code will run slower if vectorized already. It just means the compiler (due to the optimization techniques it has) prefers the code to be unvectorized.
Updates to the OpenCL drivers may lead to a change of these values.
Also, you should take them as orientative. Other factors as: local memory usage, coalesced global access, local size, etc... are way more important usually.

Here is one experiment that I've done to see how vectorized operations perform in a device which prefers to do scalar operations. I have implemented the reduction algorithm with two different kernels. The first kernel treats data as scalars while the second treats data as float4 vectors (Codes are given below). Here is the execution results. It is clear that although the NVIDIA device prefers non-vectorized operation, vectorized operation is faster.
Preferred vector width: 1
reduction_scalar: Check passed.
Total time = 4471424
reduction_vector: Check passed.
Total time = 1723776
And here is the code:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"
#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err == CL_DEVICE_NOT_FOUND) {
printf(" GPU is not first! Going on CPU :(");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel[NUM_KERNELS];
cl_command_queue queue;
cl_event prof_event;
cl_int i, j, err, preferred_width;
size_t local_size, global_size;
char kernel_names[NUM_KERNELS][20] =
{ "reduction_scalar", "reduction_vector" };
/* Data and buffers */
float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
//float data[ARRAY_SIZE];
float sum, actual_sum, *scalar_sum, *vector_sum;
cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
cl_int num_groups;
cl_ulong time_start, time_end, total_time;
/* Initialize data */
for (i = 0; i<ARRAY_SIZE; i++) {
data[i] = 1.0f*i;
}
/* Create device and determine local size */
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(preferred_width), &preferred_width, NULL);
printf("Preferred vector width: %d\n", preferred_width);
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(local_size), &local_size, NULL);
if (err < 0) {
perror("Couldn't obtain device information");
exit(1);
}
/* Allocate and initialize output arrays */
num_groups = ARRAY_SIZE / local_size;
scalar_sum = (float*)malloc(num_groups * sizeof(float));
vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
for (i = 0; i<num_groups; i++) {
scalar_sum[i] = 0.0f;
}
for (i = 0; i<num_groups / 4; i++) {
vector_sum[i] = 0.0f;
}
/* Create a context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
if (err < 0) {
perror("Couldn't create a buffer");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
for (i = 0; i<NUM_KERNELS; i++) {
/* Create a kernel */
kernel[i] = clCreateKernel(program, kernel_names[i], &err);
if (err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create kernel arguments */
err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
if (i == 0) {
global_size = ARRAY_SIZE;
err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
}
else {
global_size = ARRAY_SIZE / 4;
err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
}
if (err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Finish processing the queue and get profiling information */
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
/* Read the result */
if (i == 0) {
err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups; j++) {
sum += scalar_sum[j];
}
}
else {
err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups / 4; j++) {
sum += vector_sum[j];
}
}
/* Check result */
printf("%s: ", kernel_names[i]);
actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
if (fabs(sum - actual_sum) > 0.01*fabs(sum))
printf("Check failed.\n");
else
printf("Check passed.\n");
printf("Total time = %lu\n\n", total_time);
/* Deallocate event */
clReleaseEvent(prof_event);
}
/* Deallocate resources */
free(scalar_sum);
free(vector_sum);
for (i = 0; i<NUM_KERNELS; i++) {
clReleaseKernel(kernel[i]);
}
clReleaseMemObject(scalar_sum_buffer);
clReleaseMemObject(vector_sum_buffer);
clReleaseMemObject(data_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
and the kernels:
__kernel void reduction_scalar(__global float* data,
__local float* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = partial_sums[0];
}
}
__kernel void reduction_vector(__global float4* data,
__local float4* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
}
}

Related

openCL wrong result

Could somebody explain me why is it happen?
I tought it'd increment the value of my array.
#include <iostream>
#pragma comment(lib, "OpenCL.lib")
#include <CL/cl.h>
const std::string source_str = R"(
__kernel void add(__global int* c) {
int i = get_global_id(0);
c[i]=c[i]+1;
})";
size_t source_size = source_str.length();
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_context context;
cl_command_queue command_queue;
cl_mem a_mem_obj;
cl_program program;
cl_kernel kernel;
int* a;
#define SIZE 100
// ## You may add your own initialization routines here ##
void init() {
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret != CL_SUCCESS)
std::cout << ret << 1;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
if (ret != CL_SUCCESS)
std::cout << ret << 2;
// Create an OpenCL context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
if (ret != CL_SUCCESS)
std::cout << ret << 3;
// Create a command queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
if (ret != CL_SUCCESS)
std::cout << ret << 4;
a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
SIZE * sizeof(int), NULL, &ret);
if (ret != CL_SUCCESS)
std::cout << ret << 6;
// Create a program from the kernel source
program = clCreateProgramWithSource(context, 1,
(const char**)&source_str, (const size_t*)&source_size, &ret);
if (ret != CL_SUCCESS)
std::cout << ret << 9;
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 10;
// Create the OpenCL kernel
kernel = clCreateKernel(program, "add", &ret);
if (ret != CL_SUCCESS)
std::cout << ret << 11;
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&a_mem_obj);
if (ret != CL_SUCCESS)
std::cout << ret << 13;
}
void KernelStart() {
// Copy to the memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
SIZE * sizeof(int), a, 0, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 7;
// Execute the OpenCL kernel on the list
size_t static global_item_size = SIZE; // Process the entire lists
size_t static local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 14;
ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
SIZE * sizeof(int), a, 0, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 15;
}
int main() {
a = new int[SIZE];
for (size_t i = 0; i < SIZE; i++)
{
a[i] = 1;
}
for (size_t i = 0; i < SIZE; i++)
{
std::cout << a[i];
}
std::cout << std::endl;
init();
KernelStart();
for (size_t i = 0; i < SIZE; i++)
{
std::cout << a[i];
}
}
result:
1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
-4913-54141111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
Your logging already shows you what's going on, albeit not particularly readably.
First Problem
Your program's output:
-4913
The corresponding code:
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&a_mem_obj);
if (ret != CL_SUCCESS)
std::cout << ret << 13;
-49 is CL_INVALID_ARG_INDEX. Kernel arguments are numbered from 0, your kernel has 1 argument, so the only valid index is 0.
Second Problem
Your program's output:
-5414
The code:
#define SIZE 100
…
// Execute the OpenCL kernel on the list
size_t static global_item_size = SIZE; // Process the entire lists
size_t static local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
std::cout << ret << 14;
-54 corresponds to CL_INVALID_WORK_GROUP_SIZE.
There are 3 possible reasons specified:
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size or does not match the work-group size specified for kernel using the __attribute__((reqd_work_group_size(X, Y, Z))) qualifier in program source.
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of work-items in the work-group computed as local_work_size[0] *... local_work_size[work_dim - 1] is greater than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in the table of OpenCL Device Queries for clGetDeviceInfo.
CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the __attribute__((reqd_work_group_size(X, Y, Z))) qualifier is used to declare the work-group size for kernel in the program source.
Your local size is 64, your global size is 100. This means you're running into the first condition: you'll need to make sure your global size is an integer multiple of the local size.

OpenCL program works only for the multiple of itemsize

I'm new to openCL program and this is the problem I'm facing while executing a simple vector addition.
I have the following kernel code
#include <CL/cl.hpp>
#include<iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
I have integrated gpu and amd gpus on my system. I'm trying to perform vector addition on my intel gpu and for which I have installed the intel opencl drivers (i7 3rd gen processor with hd graphics).
I have the below openCL code
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
int i;
const int LIST_SIZE = 50;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//std::cout<<source_str<<std::endl;
// Get platform and device information
cl_platform_id* platforms1 = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, platforms1, &ret_num_platforms);
platforms1= (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);
clGetPlatformIDs(ret_num_platforms, platforms1, NULL);
/*
* Platform 0: Intel Graphics
* Platform 1 : AMD Graphics
*/
//CHANGE THE PLATFORM ACCORDING TO YOUR SYSTEM!!!!
ret = clGetDeviceIDs( platforms1[0], CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 16; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
//FREE
return 0;
}
If the LISTSIZE is 50, it prints only till 48 that is 16*3. It prints only the multiple of LISTSIZE and I'm not able to figure out why?.
OpenCL kernels execute only for a multiple of the local thread block size (local Range, in your code local_item_size), which should not be smaller than 32 and must be a multiple of 2, (so it can be (32, 64, 128, 256, ...). If you set it to 16, half of the GPU will be idle at any time. global_item_size must be a multiple of local_item_size. You need at least 32 data items for the kernel to function and a lot more for it to yield good performance.
Also the part
#include <CL/cl.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
is not OpenCL C code and does not belong in the .cl source file. If it is not too lengthy, you can write the OpenCL C code directly in the .cpp file as a raw string:
const string kernel_code = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
)";
char* source_str = kernel_code.c_str();

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (lets name it "+") with associative property. When you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'am learning OpenCL and trying to implement this approach to summarize all elements in array. I am a total newbie in this technology, so the program might look something weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
{
size_t gl = get_global_id (0);
size_t s = get_local_size (0);
int i;
float accum = 0;
for (i=0; i<s; i++) {
accum += input[s*gl+i];
}
output[gl] = accum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
double gettime ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run kernel until there is only one element in the buffer. Is this correct approach to compute sum of elements in OpenCL? The time which I measure with gettime is about 10 times slower when execution time of a simple loop on CPU (compiled clang 4.0.0 and -O2 -ffast-math flags). Hardware I use: Amd Ryzen 5 1600X and Amd Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

openCL simple add vector returns gargabe values

Here is my attempt to write a opencl code to add 2 vectors
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
//24/12
//data structure platform, device, context,program, kernel, command queue
void main()
{
/////////////////////////////////////////////////////////////////////
//PLATFORM QUERY:
/////////////////////////////////////////////////////////////////////
//clGetPlatformIDs(num_entries, platforms, &num_platforms);
// two part: platform = NULL
// malloc and get platforms*
cl_uint num_platforms; //must be uint
cl_platform_id *platforms;
clGetPlatformIDs(5, NULL, &num_platforms);
printf("There are %d platforms \n", num_platforms);
platforms = (cl_platform_id*) malloc (num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(5, platforms, &num_platforms);
for(int i = 0; i < num_platforms; i++)
{
char name[40],vendor[40],version[40], profile[40],extensions[4096];
clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME, sizeof(name), &name, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(vendor), &version, NULL);
clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE, sizeof(vendor), &profile, NULL);
//clGetPlatformInfo(platforms[i],CL_PLATFORM_EXTENSIONS, sizeof(vendor), &extensions, NULL);
printf("Platform %d \n", i);
printf("Name %s \n", name);
printf("Vendor %s \n", vendor);
printf("Version %s \n", version);
printf("Profile %s \n", profile);
//printf("Extension %s \n", extensions);
printf("----------------------------------\n");
}
////////////////////////////////////////////////////////////////
//DEVICES QUERYING
////////////////////////////////////////////////////////////////
cl_device_id* devices;
cl_uint num_devices;
cl_device_fp_config flag ;
for(int i= 0; i< num_platforms; i++)
{
printf("Platform %d has:\n",i);
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 3, NULL, &num_devices);
devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
char name[40];
for(int j=0; j < num_devices; j++)
{
int err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(name),name,NULL);
if (err<0)
{
//printf("Error querying devices name\n");
}
else
{
printf("Device name %s \n", name);
}
err= clGetDeviceInfo(devices[j],CL_DEVICE_NAME,sizeof(flag),&flag,NULL);
if (flag & CL_FP_DENORM)
{
printf("This device support denormalized number \n");
}
}
printf("-----------------------------------\n");
}
///////////////////////////////////////////////////////
//CONTEXT QUERYING AND CREATING
////////////////////////////////////////////////////////
//NOTE clCreateContext returns cl_context instead of errors
//REF_COUNT if very important in the future
//create context for GPU
cl_context context;
cl_uint ref_count;
cl_int err;
char name[40];
context= clCreateContext(NULL,1,&devices[0], NULL,NULL,&err);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Original reference count is %d \n",ref_count);
/*clRetainContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Incremented reference count is %d \n",ref_count);
clReleaseContext(context);
clGetContextInfo(context,CL_CONTEXT_REFERENCE_COUNT,sizeof(ref_count), &ref_count, NULL);
printf("Decremented reference count is %d \n",ref_count);*/
////////////////////////////////////////////////////////
//Create programme
///////////////////////////////////////////////////////
size_t program_size;
err=0;
cl_program program;
char* program_buffer;
FILE* program_handle = fopen("kernel.cl","r");
//More recommendable than source code???
program_buffer = (char*)malloc(MAX_SOURCE_SIZE);
program_size = fread( program_buffer, 1, MAX_SOURCE_SIZE, program_handle);
fclose( program_handle );
program = clCreateProgramWithSource(context,1,(const char**) &program_buffer,
(size_t*)&program_size, &err);
////////////////////////////////////////////////////////
//Build Program
///////////////////////////////////////////////////////
//const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
char* program_log;
size_t log_size;
err= clBuildProgram(program, 1 , devices, NULL, NULL, NULL);
if(err < 0) //debug , printing log
{
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size+1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,
program_log,NULL);
printf("%s\n",program_log);
free(program_log);
//exit(1);
}
///////////////////////////////////////////////////////////////////////////////////
//create kernel
///////////////////////////////////////////////////////////////////////////////////
cl_uint num_kernels;
cl_kernel kernel;
char kernel_name[40];
kernel = clCreateKernel(program,"add",&err);
if (err<0)
{
perror("could not found any kernels\n");
}
//kernels = (cl_kernel*)malloc(num_kernels*sizeof(cl_kernel));
//clCreateKernelsInProgram(program, num_kernels, kernels, NULL);
///FOR REFERNECE
//for(int i=0; i<num_kernels; i++)
//{
clGetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,sizeof(kernel_name),kernel_name,NULL);
printf("Kernel function: %s \n",kernel_name);
//}
/////////////////////////////////////////////////////
//Create command queue
/////////////////////////////////////////////////////
cl_command_queue queue = clCreateCommandQueue(context, devices[0],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
if (err < 0)
{
printf("Couldn't create command queue \n");
exit(1);
}
clEnqueueTask(queue, kernel, 0, NULL, NULL);//only enqueue
//////////////////////////////////////////
unsigned int n= 1000;
int* h_a;
int* h_b;
int* h_c;
cl_mem d_a;
cl_mem d_b;
cl_mem d_c;
h_a = (int*) malloc(n*sizeof(int));
h_b = (int*) malloc(n*sizeof(int));
h_c = (int*) malloc(n*sizeof(int));
for(int i=0; i< n; i++)
{
h_a[i]= 1;//sinf(i)*sinf(i);
h_b[i]= 1;//cosf(i)*cosf(i);
}
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(h_a),NULL,NULL);
err = clEnqueueWriteBuffer(queue,d_a,CL_TRUE,0,sizeof(h_a),h_a,0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue,d_b,CL_TRUE,0,sizeof(h_b),h_a,0, NULL, NULL);
//////set argument
err= clSetKernelArg(kernel,0,sizeof(cl_mem),&d_a);
err= clSetKernelArg(kernel,1,sizeof(cl_mem),&d_b);
err= clSetKernelArg(kernel,2,sizeof(cl_mem),&d_c);
err= clSetKernelArg(kernel,3,sizeof(unsigned int),&n);
///////////////
size_t globalsize, localsize;
localsize=64;
globalsize=ceil(n/(float)localsize)*localsize;
err= clEnqueueNDRangeKernel(queue,kernel,1, NULL,&globalsize,&localsize,0,NULL,NULL);
////////////////////////
clFinish(queue);
err=clEnqueueReadBuffer(queue, d_c,CL_TRUE, 0, sizeof(h_c), h_c, 0 , NULL, NULL);
for(int i = 0; i< n; i++)
{
printf(" h_c[%d] = %d \n", i, h_c[i]);
}
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseKernel(kernel);
free(h_a);
free(h_b);
free(h_c);
getchar();
}
and here is my kernel.cl
__kernel void add(__global int * a, __global int *b, __global int* c, const unsigned n)
{
int id= get_global_id(0);
if (id<n)
c[id]= a[id] + b[id];
}
With this, I only received garbage values , for example h_c[i]= -842150451 for all i.
Please help me to fix it. Thanks!
This statement is not correct :
sizeof(h_a)
Should be something like :
n * sizeof(int)
Indeed h_a is just a pointer so sizeof(h_a) = sizeof(int) => you have the space for only one item.

Efficient Repeated calling of NDRangeKernel in OpenCL

I've written the following code. I have a loop which iterates between two red and black kernels. In each iteration I call clEnqueueReadBuffer which I think is not efficient. Is there any other way to repeat calling kernels efficiently?
Thanks
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl
Utils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define DATA_SIZE (1048576)
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
#define LIMIT 100
#define BIG_RANGE LIMIT*4*100
#define EPS 1e-2
#define SQ 1024
#define A(i,j) A[i*SQ+j]
using namespace std;
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
char *kernel_names[] = {"Red","Black"};
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
float *A,*oldX,*newX,*b;
int main(int argc, char** argv) {
unsigned int count = DATA_SIZE;
int i,j;
clock_t start,end;
float *XX,*XXnew;
A = (float*)malloc(sizeof(float)*count);
newX = (float*)malloc(sizeof(float)*SQ);
oldX = (float*)malloc(sizeof(float)*SQ);
b = (float*)malloc(sizeof(float)*SQ);
XX = (float*)malloc(sizeof(float)*SQ);
float h=1.0f/SQ;
float xx[SQ];
for (i=0;i<SQ;i++){
XX[i] = 0.0f;
oldX[i]=0.0f;
xx[i] = 0.0f + (i+1)*h;
if (i != 0) b[i] = -2.0f*xx[i]; else b[i] = -2.0f*xx[i]-1.0f/(h*h)+1.0f/(2.0f*h);
for(j=0;j<SQ;j++) A(i,j) =0.0f;
A(i,i) = -2.0f/(h*h);
if (i!=SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h); else A(i,i+1) = 0.0f;
if (i != 0) A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h); else A(i,i-1) = 0.0f;
}
newX[0] = BIG_RANGE;
int cnt = 0;
CreateQueue();
CreateKernel();
CreateBuffer(count);
Kernel_Arg_Set(kernel_red ,count);
Kernel_Arg_Set(kernel_black,count);
end=0.0f;start =clock();cnt =0;
Enqueue_Write_Buffer(count);
while(norm(oldX,newX,SQ) > EPS && cnt<LIMIT){
Create_Work_Group(kernel_red, count);
Enqueue_Read_Buffer(count);
Create_Work_Group(kernel_black, count);
cnt++;
Enqueue_Read_Buffer(count);
}
clFinish(command);
Shutdown();
free(oldX);
free(newX);
free(XX);
free(XXnew);
return 0;
}
void CreateQueue(){
err = clGetPlatformIDs(1, &platforms, NULL);
if(err<0){
perror("no platform");getchar();exit(1);}
err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device,NULL);
if(err<0){
perror("no device");getchar();exit(1);}
context = clCreateContext(NULL, 1, &device,NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");exit(1);}
command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (!command)
{
printf("Error: Failed to create a command commands!\n");
exit(1);
}
clEnqueueBarrier(command);
}
void CreateBuffer(unsigned int count){
input_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * count, A, NULL);
in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, oldX, NULL);
input_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, b, NULL);
if (!input_A || !input_b || !in_out_X)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
}
void CreateKernel(){
FILE *fp;
size_t program_size;
string kernel_src;
fp = fopen("Red.cl", "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
kernel_src.resize(program_size + 1);
fseek(fp, 0, SEEK_SET);
fread(&kernel_src[0], program_size, 1, fp);
fclose(fp);
kernel_src[program_size] = '\0';
const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
if (!program1)
{
printf("clCreateProgramWithSource failed\n");
exit(1);
}
err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2*2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
kernel_red = clCreateKernel(program1, kernel_names[0], &err);
if (!kernel_red || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
kernel_black = clCreateKernel(program1, kernel_names[1], &err);
if (!kernel_black || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
}
void Create_Work_Group(cl_kernel kernel, unsigned int count){
size_t global[] = {SQ,SQ,0};
size_t local[] = {32,32,0};
err = clEnqueueNDRangeKernel(command, kernel, 2, NULL, global, local, 0, NULL,NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
exit(1);
}
}
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_A);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_out_X);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_b);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
}
void Enqueue_Read_Buffer(unsigned int count){
err = clEnqueueReadBuffer( command, in_out_X, CL_TRUE, 0, sizeof(float) * SQ, oldX, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
}
void Enqueue_Write_Buffer(unsigned int count){
err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count, A, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ , b, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ ,oldX, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
}
What you do is quite inefficient. You can write the buffer only once, then enqueue as many kernels as you want, with the same buffer as their argument. Of course if you need to compute the norm, you need to read data back. I would suggest something like this:
Create an additional buffer for the norm; check at the beginning of every kernel what the norm is (just by reading its value); if it is smaller than threshold value, return immediately.
Create a new kernel which will compute the norm for you.
Enque tasks like:
write buffers,
kernels: { {red,black}*10, updateNorm}*10
read buffers.
The computation will run 10x, then norm will be updated. In case it is already ok, already enqueued computation kernels will be will retrun immediately. After the queue is finished, read buffers back and check norm on the CPU. If the norm is still not OK, enqueue the same batch of kernels again.
In the worst case, you will waste 9 real and 90 immediately returning kernel runs.

Resources