Swap memory buffers effectively in OpenCL: implementation - opencl

I have faced the same problem as here: How to effectively swap OpenCL memory buffers?. My first implementation was the same as has been described in the question, at each cycle it writes/reads memory buffers to/from the device. As pointed out this introduces useless read/write buffer overhead. The code (with memory overhead) below works fine:
//THIS WORKS!!!
f0_mem = clCreateBuffer(
context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof (int)*(capacity + 1),
NULL,
&err);
f1_mem = (..."the same as above"...);
m_d_mem = clCreateBuffer(..., CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof (int)*capacity,...);
// Host loop of the working (but slow) version: one kernel launch per item k.
// f0 and f1 alternate between the input and output roles according to the
// parity of k. Per the inline comments, writeBufferToDevice/readBufferFromDevice
// wrap clEnqueueWriteBuffer/clEnqueueReadBuffer, so data is transferred in
// both directions on every iteration.
for (int k = 0; k < numelem; k++) {
sumK = sumK - weight[k];
// This zero is overwritten by the assignment on the next line.
cmax = 0;
cmax = max(capacity - sumK, weight[k]);
total_elements = (size_t) (capacity - cmax + 1);
if (k % 2 == 0) {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f0_mem, f1_mem, f0, f1);
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f1_mem, f0_mem, f1, f0);
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer of cl_mem buffers
readBufferFromDevice(f0_mem, f1_mem, m_d_mem, f0, f1, m_d);
// Store this item's decision flags as row k of M (capacity ints per row).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
EDIT: My kernel:
/*
 * One knapsack step for a single item.
 *
 * Work-item g handles capacity index c = g + cmax and does nothing unless
 * g < maxelem. It compares keeping the previous value input_f[c] against
 * taking the item (input_f[c - weightk] + pk), writes the larger of the two
 * to output_f[c], and stores 1 in m_d[c-1] when the item is taken.
 *
 * Note that m_d is only ever set to 1 here; it is never cleared.
 */
void kernel knapsack(global int *input_f, global int *output_f, global int *m_d, int cmax, int weightk, int pk, int maxelem){
    int c = get_global_id(0)+cmax;

    /* Guard clause: work-items beyond the active range have nothing to do. */
    if (!(get_global_id(0) < maxelem)) {
        return;
    }

    int without_item = input_f[c];
    int with_item = input_f[c - weightk] + pk;

    if (without_item < with_item) {
        output_f[c] = with_item;
        m_d[c-1] = 1;
    } else {
        output_f[c] = without_item;
    }
}
After I have tried to implement the two suggested solutions:
simply swapping setKernelArgs(...)
create two kernels
For the first one, this is my code:
//ARGUMENTS SWAP
f0_mem = ...
f1_mem = ...
m_d_mem = ...
//clEnqueueWriteBuffer occurs here
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
// Argument-swap variant: the buffers are uploaded once before the loop, and
// each iteration only exchanges which of f0_mem/f1_mem is passed first to
// setKernelArgs, depending on the parity of k.
// NOTE(review): the knapsack kernel shown earlier only ever stores 1 into m_d,
// and m_d_mem is not rewritten inside this loop, so flags set by earlier
// iterations are still set when m_d is read back below.
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
if (k % 2 == 0) {
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
// Blocking read (CL_TRUE): returns once m_d holds the device contents.
err = clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
// Store this item's decision flags as row k of M (capacity ints per row).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
The second solution is implemented in this way:
//TWO KERNELS
f0_mem = ...
f1_mem = ...
m_d_mem = ...
//clEnqueueWriteBuffer occurs here
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
// Two-kernel variant: even iterations launch kernel0 with (f0_mem, f1_mem),
// odd iterations launch kernel1 with (f1_mem, f0_mem).
// NOTE(review): the even branch calls setKernelArgs without a kernel argument
// while the odd branch passes kernel1 explicitly - confirm that the even
// branch really sets the arguments of kernel0.
// NOTE(review): the knapsack kernel shown earlier only ever stores 1 into m_d,
// and m_d_mem is not rewritten inside this loop, so flags set by earlier
// iterations are still set when m_d is read back below.
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
if (k % 2 == 0) {
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
} else {
setKernelArgs(kernel1, f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
}
// Blocking read (CL_TRUE): returns once m_d holds the device contents.
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
// Store this item's decision flags as row k of M (capacity ints per row).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
Neither of the two solutions works for me (it seems to me that no swapping occurs at all!). What am I doing wrong?
Sub-question: in the last two solutions, is it possible to have memory buffers filled with zeroes without using writeBufferToDevice( f0_mem, f1_mem, m_d_mem...) before the for cycle?
This work is based on this article:
Solving knapsack problems on GPU by V. Boyera, D. El Baza, M. Elkihel
related work: Accelerating the knapsack problem on GPUs by Bharath Suri

Both attempted solutions look correct to me, but there may be some dependencies between iterations - you would have to post your kernel to check.
It works fine in your solution probably because you are writing and reading each iteration which works slower so it's enough time to synchronize itself.
You can try to add clFinish(command); after each OpenCL API call to see if that makes a difference.
Apart from that there is 3rd solution you could try: swapping pointers in the kernel. You will need to move your loop from CPU to GPU.
/*
 * Exchange the two __global double pointers that A and B point to.
 * Only the pointer values held by the caller are swapped; the data they
 * refer to is not touched.
 */
inline void swap_pointers(__global double **A, __global double **B)
{
    __global double *saved_b = *B;
    *B = *A;
    *A = saved_b;
}
// Sketch of a kernel that runs the whole iteration loop on the device and
// swaps its two buffer pointers after every step instead of having the host
// re-bind kernel arguments. The "..." in the parameter list and the
// "some stuff here" comment are placeholders, so this is not compilable as is.
__kernel void my_kernel(
__global double *pA,
__global double *pB,
...
)
{
for (int k = 0; k < numelem; k++)
{
// some stuff here
// The swap only changes this work-item's private copies of pA and pB.
swap_pointers(&pA, &pB);
// NOTE(review): per the OpenCL specification barrier() synchronizes the
// work-items of one work-group only - confirm that the launch uses a single
// work-group, or that iterations do not depend on other work-groups.
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
}
}
Then read everything in one go on the host (m_d_mem must be big enough to store data from all iterations):
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity*numelem, m_d, 0, NULL, NULL);

Solution:
At each cycle, after copying m_d to M, m_d should be reset and written back to the m_d_mem buffer object with Knapsack::writeBuffer_m_d_ToDevice():
ksack.readBuffer_m_d_FromDevice();
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
ksack.writeBuffer_m_d_ToDevice();//resets m_d_mem

Related

OpenCL brute force TEA block 32bit, key 64bit

I decided to study OpenCL on my own and wrote a brute-force key search for the TEA algorithm. Did I understand OpenCL correctly? Can anything be improved for speed? What mistakes have I made?
I prepare the first 5 bytes in host loops; the remaining 3 bytes are enumerated by the kernel, using 256 work-items with 65536 iterations each.
in the main program:
/*
 * Host-side sweep over the first five key bytes (KEY[0]..KEY[4]), counting
 * each byte down to 0. For every combination the key is uploaded, the kernel
 * is launched with 0x0100 work-items, and the hit counter CADR[0] is read
 * back; when it is non-zero the candidate keys in RETC are listed in Memo1.
 *
 * A kernel-launch failure or a set Terminated flag now stops the whole sweep:
 * previously `break` only left the innermost loop, so the outer loops went on
 * launching kernels. The byte resets to 0xFF are still performed on the way
 * out, exactly as before.
 */
bool stop = false;
for (int x5 = KEY[0]; x5 >= 0; x5--) {
    KEY[0] = x5;
    for (int x4 = KEY[1]; x4 >= 0; x4--) {
        KEY[1] = x4;
        for (int x3 = KEY[2]; x3 >= 0; x3--) {
            KEY[2] = x3;
            for (int x2 = KEY[3]; x2 >= 0; x2--) {
                KEY[3] = x2;
                for (int x = KEY[4]; x >= 0; x--) {
                    KEY[4] = x;
                    ret = clEnqueueWriteBuffer(command_queue, key_mem_obj, CL_TRUE, 0,
                                               8 * sizeof(int), KEY, 0, NULL, NULL);
                    ret = clEnqueueWriteBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
                                               1 * sizeof(int), CADR, 0, NULL, NULL);
                    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&key_mem_obj);
                    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&cadr_mem_obj);
                    NDRange = 0x0100;
                    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                                 &NDRange, NULL, 0, NULL, NULL);
                    if (ret != CL_SUCCESS) {
                        stop = true;
                        break;
                    }
                    ret = clEnqueueReadBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
                                              1 * sizeof(int), CADR, 0, NULL, NULL);
                    if (CADR[0] > 0) {
                        /* Number of candidate keys reported by the kernel. */
                        uint16_t k = CADR[0];
                        ret = clEnqueueReadBuffer(command_queue, retc_mem_obj, CL_TRUE, 0,
                                                  524280 * sizeof(int), RETC, 0, NULL, NULL);
                        for (i = 0; i < k; i++) {
                            Form1->Memo1->Lines->BeginUpdate();
                            /* Each hit occupies 8 ints in RETC, one key byte per int. */
                            Form1->Memo1->Lines->Add(IntToHex(RETC[i*8],2)+IntToHex(RETC[i*8+1],2)+
                                IntToHex(RETC[i*8+2],2)+IntToHex(RETC[i*8+3],2)+IntToHex(RETC[i*8+4],2)+
                                IntToHex(RETC[i*8+5],2)+IntToHex(RETC[i*8+6],2)+IntToHex(RETC[i*8+7],2));
                            Form1->Memo1->Lines->EndUpdate();
                            Form1->Label6->Caption = IntToStr(Form1->Memo1->Lines->Count-1);
                        }
                        CADR[0] = 0;
                    }
                    /* Snapshot of the key that has just been processed. */
                    KEY2[0] = KEY[0];
                    KEY2[1] = KEY[1];
                    KEY2[2] = KEY[2];
                    KEY2[3] = KEY[3];
                    KEY2[4] = KEY[4];
                    KEY2[5] = KEY[5];
                    KEY2[6] = KEY[6];
                    KEY2[7] = KEY[7];
                    if (Terminated) {
                        stop = true;
                        break;
                    }
                }
                KEY[4] = 0xFF;
                if (stop) {
                    break;
                }
            }
            KEY[3] = 0xFF;
            if (stop) {
                break;
            }
        }
        KEY[2] = 0xFF;
        if (stop) {
            break;
        }
    }
    KEY[1] = 0xFF;
    if (stop) {
        break;
    }
}
KEY[0]=0xFF;
Kernel:
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
/*
 * Brute-force search over the low 24 bits of a 64-bit key for a 32-bit block.
 *
 * KEY  - eight ints, one key byte each; bytes 0..4 are fixed by the host.
 * DAT  - four ints, one byte each, forming the two 16-bit data words.
 * CADR - CADR[0] is a shared hit counter, incremented atomically.
 * RETC - output; hit number a stores its eight key bytes at RETC[a*8 .. a*8+7].
 *
 * Each work-item fixes the low byte of Key[2] to its global id and iterates
 * Key[3] over 0xFFFF..0. A key is reported when both data words become 0
 * after the 32 rounds.
 */
__kernel void brute(__global const int *KEY, __global const int *DAT,
                    __global int *CADR, __global int *RETC)
{
    int i = get_global_id(0);
    ushort Data[2];
    ushort Key[4];

    Key[0] = (KEY[0] << 8) + KEY[1];
    Key[1] = (KEY[2] << 8) + KEY[3];
    Key[2] = (KEY[4] << 8) + i;
    /* Key[3] is assigned at the top of every loop iteration below, so the
     * former initial store from KEY[6]/KEY[7] was dead and has been dropped. */

    for (int j = 0xFFFF; j >= 0; j--) {
        Key[3] = j;
        Data[0] = (DAT[0] << 8) + DAT[1];
        Data[1] = (DAT[2] << 8) + DAT[3];

        ushort delta = 0x9e37;
        ushort sum = (delta << 5);
        for (uint n = 0; n < 32; ++n) {
            Data[1] -= (((Data[0]) + Key[2]) ^ (Data[0] + sum) ^ ((Data[0] >> 5) + Key[3]));
            Data[0] -= (((Data[1] << 4) + Key[0]) ^ (Data[1] + sum) ^ (Data[1] + Key[1]));
            sum -= delta;
        }

        if ((Data[0] == 0x0000) && (Data[1] == 0x0000)) {
            /* atomic_inc returns the value held before the increment, so each
             * hit gets a unique slot. Reading CADR[0] and incrementing it in
             * two separate steps let concurrent hits claim the same slot. */
            int a = atomic_inc(CADR);
            RETC[a*8]   = (Key[0] >> 8);
            RETC[a*8+1] = (Key[0] & 0xFF);
            RETC[a*8+2] = (Key[1] >> 8);
            RETC[a*8+3] = (Key[1] & 0xFF);
            RETC[a*8+4] = (Key[2] >> 8);
            RETC[a*8+5] = (Key[2] & 0xFF);
            RETC[a*8+6] = (Key[3] >> 8);
            RETC[a*8+7] = (Key[3] & 0xFF);
        }
    }
}
If you only launch 256 threads that each do 65536 iterations of the same thing, your GPU will not be saturated and performance will be very poor. GPUs have thousands of "cores", and if you launch 256 threads you will only use 256 of them while the rest remains idle.
The idea of GPU parallelization is to split the work up into as many independent problems as there are. In your case this means: launch 256*65536 threads that do one iteration each. Then performance will be much better.

What is the best practice to do reduce in OpenCL?

Imagine a binary operation (let's name it "+") with the associative property. Then you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'm learning OpenCL and trying to implement this approach to sum all the elements of an array. I am a total newbie in this technology, so the program might look somewhat weird.
This is the kernel:
/*
 * Partial-sum kernel: every work-item adds up one contiguous chunk of input
 * and writes the total to output[global id].
 *
 * The chunk length is the work-group size, and the chunk of work-item g starts
 * at element local_size * g. The input buffer therefore has to hold at least
 * global_size * local_size elements.
 */
__kernel void reduce (__global float *input, __global float *output)
{
    const size_t item = get_global_id (0);
    const size_t chunk_len = get_local_size (0);
    __global float *chunk = input + chunk_len * item;
    float total = 0;
    int k = 0;

    while (k < chunk_len) {
        total += chunk[k];
        k++;
    }
    output[item] = total;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
/* Return the current wall-clock time from gettimeofday() in seconds, with the
 * microsecond field folded in as the fractional part. */
double gettime ()
{
    struct timeval now;
    double whole_seconds;
    double fraction;

    gettimeofday (&now, NULL);
    whole_seconds = (double)now.tv_sec;
    fraction = 0.000001 * (double)now.tv_usec;
    return whole_seconds + fraction;
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run the kernel until there is only one element left in the buffer. Is this the correct approach to compute the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
// Variant of the reduction loop without clFinish inside the loop: every
// launch records an event, and every launch after the first waits on the
// event of the previous one.
size_t size = N;
// Count how many passes (and therefore events) the loop below will produce.
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
// (first pass: empty wait list, its event is stored in events[0])
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
// later passes wait on the previous pass's event and record their own
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
// Swap roles; afterwards tmp refers to the buffer that was just written.
tmp = output;
output = input;
input = tmp;
event_index++;
}
// Single synchronization point after all passes have been queued.
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
// Blocking read of the first element of the last output buffer into array[0].
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.

OpenCL GPU calculation wrong

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
/*
 * Build an evenly spaced array start, start+step, start+2*step, ...
 *
 * The element count is (end - start) / step, truncated, plus one - the same
 * formula as before, so existing callers get arrays of the same length.
 *
 * Returns a malloc'd array that the caller must free(), or NULL when the
 * count would be less than one (zero step, NaN input, or a step pointing away
 * from `end`), when it is too large to allocate, or when malloc fails.
 * Previously those cases converted an out-of-range double to int, which is
 * undefined, and the malloc result was used unchecked.
 */
double *arange(double start, double end, double step)
{
    const double max_count = (double)((size_t)-1 / sizeof(double));
    double count = ((end - start) / step) + 1;
    size_t arr_size;
    size_t i;
    double *output;

    /* Written as !(count >= 1.0) so that NaN is rejected as well. */
    if (!(count >= 1.0) || count > max_count)
        return NULL;

    arr_size = (size_t)count;
    output = malloc(arr_size * sizeof *output);
    if (output == NULL)
        return NULL;

    for (i = 0; i < arr_size; i++)
    {
        output[i] = start + (step * (double)i);
    }
    return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/*
 * Adds the contribution of one (line, grid point) pair to f.
 *
 * Dimension 0 of the index space selects the line i, dimension 1 the grid
 * point j. The term is
 *   inten[i] * (1/pi) * gamma[i] / (gamma[i]^2 + (delnu[j] - nu[i] + del_air[i]*950/1013)^2)
 * and it is accumulated into f[j].
 *
 * NOTE(review): every work-item with the same get_global_id(1) performs a
 * plain read-modify-write on the same f[j], so concurrent work-items can
 * overwrite each other's updates.
 */
kernel void abs_cross(global double *inten,
                      global double *gamma,
                      global double *delnu,
                      global double *nu,
                      global double *del_air,
                      global double *f)
{
    const double pie = 4.0*atan(1.0);
    const int line = get_global_id(0);
    const int point = get_global_id(1);

    const double gam = gamma[line];
    const double offset = delnu[point] - nu[line] + del_air[line] * 950.0/1013.0;
    const double shape = gam / (pown(gam,2) + pown(offset,2));

    f[point] += inten[line] * ((1.0/pie) * shape);
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
/*
 * One work-item per grid point j (1-D launch with global_work_size equal to
 * delnu_size): loops over all lines and stores the complete sum in f[j], so
 * no two work-items write to the same element.
 *
 * Fixes relative to the first draft, which did not compile:
 *  - the loop header used a comma instead of the first semicolon, and `i`
 *    was never declared;
 *  - the body read `nu`, which was not a parameter; it is appended as the last
 *    argument so that the indices of the existing arguments stay the same;
 *  - the body referred to `del_air` while the parameter is named `delair`.
 * The sum is kept in a private variable and written to f[j] once.
 */
__kernel void test(__global double *gamma,
                   __global double *inten,
                   __global double *delnu,
                   __global double *delair,
                   __global double *f,
                   const int lines,
                   __global double *nu)
{
    double pie = 4.0*atan(1.0);
    int j = get_global_id(0);
    int i;
    double sum = 0;

    for(i=0;i<lines;i++)
    {
        sum += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + delair[i] * 950.0/1013.0),2))));
    }
    f[j] = sum;
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id

openCL Long Overflowing

Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
/*
 * Counts the Collatz steps needed to bring n down to 1 (or 0), where n starts
 * as the work-item's global id, and stores the count in out[id].
 * The `in` buffer is not read.
 *
 * For the single value n == 1572066143 the odd step additionally prints the
 * value before and after 3n+1 is applied (debug output).
 */
__kernel void collatz(__global int* in, __global int* out)
{
    uint id = get_global_id(0);
    unsigned long n = (unsigned long)id;
    uint count = 0;

    for (; n > 1; count = count + 1) {
        if (n % 2 == 0) {
            n = n / 2;
            continue;
        }

        if (n == 1572066143) {
            /* Debug trace on a copy; n itself is advanced below. */
            unsigned long test = n;
            printf("BEFORE - %lu\n", n);
            test = (3 * test) + 1;
            printf("AFTER - %lu\n", test);
        }
        n = (3 * n) + 1;
    }
    out[id] = count;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host-side and device-side values have different sizes. On the host, long can vary from 32 to 64 bits, depending on the platform. On the device, long is always 64 bits.
printf() function, as defined in C says that %ld is to print long (host side long) numbers. You are using printf in a kernel, so.... It could be that the C-like parser is used, therefore printing the variable as a 32bits long.
Can you try printing it as %lld or as a floating point?

Open global_work_size misunderstanding

I'm trying to understand a simple OpenCL example, which is vector addition. The kernel is the following:
/* Element-wise vector addition: the work-item with global id n computes
 * c[n] = a[n] + b[n]. Elements whose index is not covered by the global work
 * size are not touched by this kernel. */
__kernel void addVec(__global double* a, __global double* b, __global double* c)
{
    const size_t idx = get_global_id(0);
    const double lhs = a[idx];
    const double rhs = b[idx];

    c[idx] = lhs + rhs;
}
For example, my input arrays have a size of 1 million elements each.
In my host program, I set global_work_size to be exactly the size of the vectors input arrays (1 million).
But when i set it to a smaller value, for example 1000, it also works with this kernel!
I don't understand why global_work_size can be smaller than the problem dimension while the OpenCL program still appears to compute every element of the input arrays.
Could someone clarify on this?
EDIT: here is the code where I copy the data:
size_t arraySize = 1000000;
const size_t global_work_size[1] = {512};
double *host_a = malloc(arraySize*sizeof(double));
double *host_b = malloc(arraySize*sizeof(double));
double *host_c = calloc(arraySize, sizeof(double));
...
// Create the input and output arrays in device memory for our calculation
device_a = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_b = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, arraySize*sizeof(double), NULL, NULL);
...
// Copy data set into the input array in device memory. [host --> device]
status = clEnqueueWriteBuffer(command_queue, device_a, CL_TRUE, 0, arraySize*sizeof(double), host_a, 0, NULL, NULL);
status |= clEnqueueWriteBuffer(command_queue, device_b, CL_TRUE, 0, arraySize*sizeof(double), host_b, 0, NULL, NULL);
...
// Copy-back the results from the device [host <-- device]
clEnqueueReadBuffer(command_queue, device_c, CL_TRUE, 0, arraySize*sizeof(double), host_c, 0, NULL, NULL );
...
// Scan host_c and report the first element that fails the check, then stop.
printf("checking result validity ...\n");
for (size_t i=0; i<arraySize; ++i)
// NOTE(review): this condition is one-sided - it only flags values above
// 1 + 1e-6. Any smaller value, including the zeros host_c was calloc'd with,
// is accepted as valid.
if(host_c[i] - 1 > 1e-6) // the array is supposed to be 1 everywhere
{
printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
break;
}
Thanks
Your test function doesn't look good, it will be met for any value < 1, it should be like this:
/* Two-sided check: report the first element of host_c that is not within 1e-6
 * of 1, then stop. The comparison is written so that NaN is reported too
 * (both `val > 1e-6` and `val < -1e-6` are false for NaN), and the index is
 * printed with %zu, the conversion that matches size_t. */
for (size_t i=0; i<arraySize; ++i){
    cl_double val = host_c[i] - 1; // the array is supposed to be 1 everywhere
    if(!((val <= 1e-6) && (val >= -1e-6)))
    {
        printf("*** ERROR! Invalid results ! host_c[%zu]=%.9lf\n", i, host_c[i]);
        break;
    }
}
Non initialized values in the GPU are likely to be 0, therefore meeting your condition.
Additionally, remember that if you run the program once with the full size, consecutive reads will still hold the proper processed data (even if you close and open the app again). Since the GPU memory is not cleaned after the buffer is created/destroyed.

Resources