openCL - difference in handling array and scalar - opencl

I'm trying to dig into openCL. At the moment I'm asking myself why a scalar differs from array when it comes to transferring to GPU.
Below there are two inputs. A scalar and a array. Why is there such a big difference in transferring them to the GPU?
Thanks in advance!
// input size
int input_size = 4;
err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &_input_size);
if (err != CL_SUCCESS) {
throw std::runtime_error("Failed to set kernel arguments!");
}
// input
int input[input_size];
input[0] = 1;
input[1] = 2;
input[2] = 3;
input[3] = 4;
cl_mem cl_input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * _input_size, NULL, NULL);
if (!cl_input) {
throw std::runtime_error("Failed to allocate device memory!");
}
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input);
if (err != CL_SUCCESS) {
throw std::runtime_error("Failed to set kernel arguments!");
}
err = clEnqueueWriteBuffer(commands, cl_input, CL_TRUE, 0, sizeof(int) * _input_size, _input, 0, NULL, NULL);
if (err != CL_SUCCESS) {
throw std::runtime_error("Failed to write to source array!");
}
// output
int output_size = 4;
int output[output_size];
cl_mem cl_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * _output_size, NULL, NULL);
if (!cl_output) {
throw std::runtime_error("Failed to allocate device memory!");
}
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_output);
if (err != CL_SUCCESS) {
throw std::runtime_error("Failed to set kernel arguments!" );
}

In OpenCL scalars are passed by value and are constants to each thread. Arrays are passed by reference (address) and can be used a input (read only) , output ( write only) or input/output ( read/write). Arrays require the cl_mem object be passed as the arguments. cl_mem is a meta object that describes the array. Scalars are so simple they just require their size as meta data.
So in you above example the kernel should look like ....
kernel void mykernel( const int arg0, global int * arg1, global int * arg2 ... ) {
int i = get_global_id(0);
// In every thread arg0 is constant value and cannot be changed
if (i < arg0)
arg2[i] = arg[1[i];
}

Related

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (lets name it "+") with associative property. When you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'am learning OpenCL and trying to implement this approach to summarize all elements in array. I am a total newbie in this technology, so the program might look something weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
{
size_t gl = get_global_id (0);
size_t s = get_local_size (0);
int i;
float accum = 0;
for (i=0; i<s; i++) {
accum += input[s*gl+i];
}
output[gl] = accum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
double gettime ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run kernel until there is only one element in the buffer. Is this correct approach to compute sum of elements in OpenCL? The time which I measure with gettime is about 10 times slower when execution time of a simple loop on CPU (compiled clang 4.0.0 and -O2 -ffast-math flags). Hardware I use: Amd Ryzen 5 1600X and Amd Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

OpenCL GPU calculation wrong

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
global double *gamma,
global double *delnu,
global double *nu,
global double *del_air,
global double *f)
{
double pie = 4.0*atan(1.0);
int i = get_global_id(0);
int j = get_global_id(1);
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
__kernel void test(__global double *gamma,
__global double *inten,
__global double *delnu,
__global double *delair,
__global double *f,
const int lines)
{
double pie = 4.0*atan(1.0);
int j = get_global_id(0);
f[j] = 0;
for(i=0,i<lines;i++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id

preferred vector width in opencl device

I'm a beginner in OpenCL and am trying to run the sample codes of the "OpenLC in Action" book. I have the following code to get the preferred vector width of my device. The platforms detected on my computer are from Intel Core i7 and HD graphics and another one from NVIDIA GeForce 940M. Whenever I run the code, it gives "1" for vector width of every type unless type double which is zero because it is not supported. Even when I change the platform in my computer to check its devices, results are the same. I ran the code on an AMD computer and it seemed to work properly because it gave me different numbers for different types. But, I am not sure why this code keeps giving me "1" for every type on different platforms of my computer. Any ideas?
Here is the output:
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices, vector_width;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
if (strstr(devi_name_data, "GeForce 940M")){
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in chars: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in shorts: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in ints: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in longs: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in floats: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in doubles: %u\n", vector_width);
}
}
}
return 0;
}
Short answer: You are querying it correctly, and the platform compiler knows what is the best vector width size. So yes, it is correct the value of 1.
Long answer: For a CPU (any type of CPU) it is likely to prefer non vectored. Specially on Intel CPU + Compiler, since the Intel compiler does the vectorization as part of the optimization process, so it prefers the user NOT to vectorize the code in the first place.
Indeed it looks like nVIDIA also prefers the user to input non vectorized code. It does not mean code will run slower if vectorized already. It just means the compiler (due to the optimization techniques it has) prefers the code to be unvectorized.
Updates to the OpenCL drivers may lead to a change of these values.
Also, you should take them as orientative. Other factors as: local memory usage, coalesced global access, local size, etc... are way more important usually.
Here is one experiment that I've done to see how vectorized operations perform in a device which prefers to do scalar operations. I have implemented the reduction algorithm with two different kernels. The first kernel treats data as scalars while the second treats data as float4 vectors (Codes are given below). Here is the execution results. It is clear that although the NVIDIA device prefers non-vectorized operation, vectorized operation is faster.
Preferred vector width: 1
reduction_scalar: Check passed.
Total time = 4471424
reduction_vector: Check passed.
Total time = 1723776
And here is the code:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"
#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err == CL_DEVICE_NOT_FOUND) {
printf(" GPU is not first! Going on CPU :(");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel[NUM_KERNELS];
cl_command_queue queue;
cl_event prof_event;
cl_int i, j, err, preferred_width;
size_t local_size, global_size;
char kernel_names[NUM_KERNELS][20] =
{ "reduction_scalar", "reduction_vector" };
/* Data and buffers */
float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
//float data[ARRAY_SIZE];
float sum, actual_sum, *scalar_sum, *vector_sum;
cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
cl_int num_groups;
cl_ulong time_start, time_end, total_time;
/* Initialize data */
for (i = 0; i<ARRAY_SIZE; i++) {
data[i] = 1.0f*i;
}
/* Create device and determine local size */
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(preferred_width), &preferred_width, NULL);
printf("Preferred vector width: %d\n", preferred_width);
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(local_size), &local_size, NULL);
if (err < 0) {
perror("Couldn't obtain device information");
exit(1);
}
/* Allocate and initialize output arrays */
num_groups = ARRAY_SIZE / local_size;
scalar_sum = (float*)malloc(num_groups * sizeof(float));
vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
for (i = 0; i<num_groups; i++) {
scalar_sum[i] = 0.0f;
}
for (i = 0; i<num_groups / 4; i++) {
vector_sum[i] = 0.0f;
}
/* Create a context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
if (err < 0) {
perror("Couldn't create a buffer");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
for (i = 0; i<NUM_KERNELS; i++) {
/* Create a kernel */
kernel[i] = clCreateKernel(program, kernel_names[i], &err);
if (err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create kernel arguments */
err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
if (i == 0) {
global_size = ARRAY_SIZE;
err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
}
else {
global_size = ARRAY_SIZE / 4;
err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
}
if (err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Finish processing the queue and get profiling information */
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
/* Read the result */
if (i == 0) {
err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups; j++) {
sum += scalar_sum[j];
}
}
else {
err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups / 4; j++) {
sum += vector_sum[j];
}
}
/* Check result */
printf("%s: ", kernel_names[i]);
actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
if (fabs(sum - actual_sum) > 0.01*fabs(sum))
printf("Check failed.\n");
else
printf("Check passed.\n");
printf("Total time = %lu\n\n", total_time);
/* Deallocate event */
clReleaseEvent(prof_event);
}
/* Deallocate resources */
free(scalar_sum);
free(vector_sum);
for (i = 0; i<NUM_KERNELS; i++) {
clReleaseKernel(kernel[i]);
}
clReleaseMemObject(scalar_sum_buffer);
clReleaseMemObject(vector_sum_buffer);
clReleaseMemObject(data_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
and the kernels:
__kernel void reduction_scalar(__global float* data,
__local float* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = partial_sums[0];
}
}
__kernel void reduction_vector(__global float4* data,
__local float4* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
}
}

openCL Long Overflowing

Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
__kernel void collatz(__global int* in, __global int* out)
{
uint id = get_global_id(0);
unsigned long n = (unsigned long)id;
uint count = 0;
while (n > 1) {
if (n % 2 == 0) {
n = n / 2;
} else {
if(n == 1572066143) {
unsigned long test = n;
printf("BEFORE - %lu\n", n);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
n = (3 * n) + 1;
} else {
n = (3 * n) + 1;
}
}
count = count + 1;
}
out[id] = count;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host side and device size values have different sizes. In host, long can vary from 32 to 64bits, depending on the platform. In device, long refers to 64bits only.
printf() function, as defined in C says that %ld is to print long (host side long) numbers. You are using printf in a kernel, so.... It could be that the C-like parser is used, therefore printing the variable as a 32bits long.
Can you try printing it as %lld or as a floating point?

Efficient Repeated calling of NDRangeKernel in OpenCL

I've written the following code. I have a loop which iterates between two red and black kernels. In each iteration I call clEnqueueReadBuffer which I think is not efficient. Is there any other way to repeat calling kernels efficiently?
Thanks
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl
Utils.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define DATA_SIZE (1048576)
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
#define LIMIT 100
#define BIG_RANGE LIMIT*4*100
#define EPS 1e-2
#define SQ 1024
#define A(i,j) A[i*SQ+j]
using namespace std;
cl_platform_id platforms;
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
char *kernel_names[] = {"Red","Black"};
float norm (float*,float*,int);
void swap(float **in, float **out);
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();
float *A,*oldX,*newX,*b;
int main(int argc, char** argv) {
unsigned int count = DATA_SIZE;
int i,j;
clock_t start,end;
float *XX,*XXnew;
A = (float*)malloc(sizeof(float)*count);
newX = (float*)malloc(sizeof(float)*SQ);
oldX = (float*)malloc(sizeof(float)*SQ);
b = (float*)malloc(sizeof(float)*SQ);
XX = (float*)malloc(sizeof(float)*SQ);
float h=1.0f/SQ;
float xx[SQ];
for (i=0;i<SQ;i++){
XX[i] = 0.0f;
oldX[i]=0.0f;
xx[i] = 0.0f + (i+1)*h;
if (i != 0) b[i] = -2.0f*xx[i]; else b[i] = -2.0f*xx[i]-1.0f/(h*h)+1.0f/(2.0f*h);
for(j=0;j<SQ;j++) A(i,j) =0.0f;
A(i,i) = -2.0f/(h*h);
if (i!=SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h); else A(i,i+1) = 0.0f;
if (i != 0) A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h); else A(i,i-1) = 0.0f;
}
newX[0] = BIG_RANGE;
int cnt = 0;
CreateQueue();
CreateKernel();
CreateBuffer(count);
Kernel_Arg_Set(kernel_red ,count);
Kernel_Arg_Set(kernel_black,count);
end=0.0f;start =clock();cnt =0;
Enqueue_Write_Buffer(count);
while(norm(oldX,newX,SQ) > EPS && cnt<LIMIT){
Create_Work_Group(kernel_red, count);
Enqueue_Read_Buffer(count);
Create_Work_Group(kernel_black, count);
cnt++;
Enqueue_Read_Buffer(count);
}
clFinish(command);
Shutdown();
free(oldX);
free(newX);
free(XX);
free(XXnew);
return 0;
}
void CreateQueue(){
err = clGetPlatformIDs(1, &platforms, NULL);
if(err<0){
perror("no platform");getchar();exit(1);}
err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device,NULL);
if(err<0){
perror("no device");getchar();exit(1);}
context = clCreateContext(NULL, 1, &device,NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");exit(1);}
command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (!command)
{
printf("Error: Failed to create a command commands!\n");
exit(1);
}
clEnqueueBarrier(command);
}
void CreateBuffer(unsigned int count){
input_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * count, A, NULL);
in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, oldX, NULL);
input_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, b, NULL);
if (!input_A || !input_b || !in_out_X)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
}
void CreateKernel(){
FILE *fp;
size_t program_size;
string kernel_src;
fp = fopen("Red.cl", "r");
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
kernel_src.resize(program_size + 1);
fseek(fp, 0, SEEK_SET);
fread(&kernel_src[0], program_size, 1, fp);
fclose(fp);
kernel_src[program_size] = '\0';
const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);
if (!program1)
{
printf("clCreateProgramWithSource failed\n");
exit(1);
}
err =clBuildProgram(program1, 1, &device, options, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2*2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
kernel_red = clCreateKernel(program1, kernel_names[0], &err);
if (!kernel_red || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
kernel_black = clCreateKernel(program1, kernel_names[1], &err);
if (!kernel_black || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
}
void Create_Work_Group(cl_kernel kernel, unsigned int count){
size_t global[] = {SQ,SQ,0};
size_t local[] = {32,32,0};
err = clEnqueueNDRangeKernel(command, kernel, 2, NULL, global, local, 0, NULL,NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
exit(1);
}
}
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_A);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_out_X);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_b);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
}
void Enqueue_Read_Buffer(unsigned int count){
err = clEnqueueReadBuffer( command, in_out_X, CL_TRUE, 0, sizeof(float) * SQ, oldX, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
}
void Enqueue_Write_Buffer(unsigned int count){
err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count, A, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ , b, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ ,oldX, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
}
What you do is quite inefficient. You can write the buffer only once, then enqueue as many kernels as you want, with the same buffer as their argument. Of course if you need to compute the norm, you need to read data back. I would suggest something like this:
Create an additional buffer for the norm; check at the beginning of every kernel what the norm is (just by reading its value); if it is smaller than threshold value, return immediately.
Create a new kernel which will compute the norm for you.
Enque tasks like:
write buffers,
kernels: { {red,black}*10, updateNorm}*10
read buffers.
The computation will run 10x, then norm will be updated. In case it is already ok, already enqueued computation kernels will be will retrun immediately. After the queue is finished, read buffers back and check norm on the CPU. If the norm is still not OK, enqueue the same batch of kernels again.
In the worst case, you will waste 9 real and 90 immediately returning kernel runs.

Resources