OpenCL local_work_size issues - opencl

I have a vector of size n (unknown before the main program is launched), and I want to experiment with the work-group sizes. However, unless I set the local_work_size to a number that exactly divides n, I only get 0 values in fprop below.
Kernel:
__kernel void palt(__global double *fprop, __global const double *fcoll,
__global const int *nn, const uint max_size)
{
size_t l = get_global_id(0);
if( l > max_size ) return;
fprop[l] = fcoll[nn[l]];
}
Host code:
int block_sz_p = 128;
const int max_size = ns*imax;
// set the parameters for the propagation operator
errNum = clSetKernelArg(propagation_kernel, 0, sizeof(cl_mem), &fpd);
errNum |= clSetKernelArg(propagation_kernel, 1, sizeof(cl_mem), &fcd);
errNum |= clSetKernelArg(propagation_kernel, 2, sizeof(cl_mem), &nnd);
errNum |= clSetKernelArg(propagation_kernel, 3, sizeof(int), (void *) &max_sz);
checkErr(errNum, "clSetKernelArg(propagation)");
// specify the work group size/dim
const size_t work_dim = 3;
const size_t global_work_size_propagation[] = {imax*ns, 1, 1};
const size_t local_work_size_propagation[] = {block_sz_p, 1, 1};
// propagation
clEnqueueNDRangeKernel(queue, propagation_kernel, work_dim, NULL,
global_work_size_propagation, local_work_size_propagation,
0, NULL, &event);
clWaitForEvents(1, &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, NULL);
tker2 = (end-start);
What's going on here?

You should check for CL errors, clEnqueueNDRangeKernel and other calls return a error code (others return the error code by reference).
I assume the problem is when the global workgroup size isn't divisible by the local workgroup size, which is not supported and generates a CL error:
CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size or does not match the work-group size specified for kernel using the attribute((reqd_work_group_size(X, Y, Z))) qualifier in program source.
From the clEnqueueNDRangeKernel man page.

Related

OpenCL: Call parameter type does not match function signature

I'm using #pragma OPENCL EXTENSION cl_khr_fp16 : enable supported GPU with OpenCL 1.2. I wanted to check the performance improvement by changin float precision from 32 to 16. In my device kernel, I converted all float to half like shown below:
__kernel void copy_kernel(int N, __global half *X, __global half *Y)
{
int i = get_global_id(0);
if(i < N) Y[i] = X[i];
}
In my host side, I made cl_mem point to array of cl_half. Host program looks as shown below:
void copy(int N, cl_mem X, cl_mem Y)
{
cl_kernel kernel = get_copy_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y);
check_error_cl(cl);
size_t gsize = N;
cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, NULL);
check_error_cl(cl);
}
But while compiling the kernel, I get the below error:
Call parameter type does not match function signature!
%32 = load half addrspace(1)* %31, align 2
float %33 = call float #llvm.nvvm.mul.rn.f(half %32, half %19)
Broken module found, compilation terminated!
You are passing a half variable to the kernel but the kernel expects a pointer to an array of halfs.
If you want to pass an array of halfs to the GPU you still have to use cl_mem objects which then contains the array of halfs.

OpenCL GPU calculation wrong

I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
global double *gamma,
global double *delnu,
global double *nu,
global double *del_air,
global double *f)
{
double pie = 4.0*atan(1.0);
int i = get_global_id(0);
int j = get_global_id(1);
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
__kernel void test(__global double *gamma,
__global double *inten,
__global double *delnu,
__global double *delair,
__global double *f,
const int lines)
{
double pie = 4.0*atan(1.0);
int j = get_global_id(0);
f[j] = 0;
for(i=0,i<lines;i++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id

Open global_work_size misunderstanding

I'm trying to understand a simple OpenCL example, which is vector addition. The kernel is the following:
__kernel void addVec(__global double* a, __global double* b, __global double* c)
{
size_t id = get_global_id(0);
c[id] = a[id] + b[id];
}
For example, my input arrays have a size of 1 million elements each.
In my host program, I set global_work_size to be exactly the size of the vectors input arrays (1 million).
But when i set it to a smaller value, for example 1000, it also works with this kernel!
I don't understand why the global_work_size can be lesser than the problem dimension, and still, the OpenCL program compute every elements of the input arrays.
Could someone clarify on this?
EDIT: here is the code where I copy the data:
size_t arraySize = 1000000;
const size_t global_work_size[1] = {512};
double *host_a = malloc(arraySize*sizeof(double));
double *host_b = malloc(arraySize*sizeof(double));
double *host_c = calloc(arraySize, sizeof(double));
...
// Create the input and output arrays in device memory for our calculation
device_a = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_b = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize*sizeof(double), NULL, NULL);
device_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, arraySize*sizeof(double), NULL, NULL);
...
// Copy data set into the input array in device memory. [host --> device]
status = clEnqueueWriteBuffer(command_queue, device_a, CL_TRUE, 0, arraySize*sizeof(double), host_a, 0, NULL, NULL);
status |= clEnqueueWriteBuffer(command_queue, device_b, CL_TRUE, 0, arraySize*sizeof(double), host_b, 0, NULL, NULL);
...
// Copy-back the results from the device [host <-- device]
clEnqueueReadBuffer(command_queue, device_c, CL_TRUE, 0, arraySize*sizeof(double), host_c, 0, NULL, NULL );
...
printf("checking result validity ...\n");
for (size_t i=0; i<arraySize; ++i)
if(host_c[i] - 1 > 1e-6) // the array is supposed to be 1 everywhere
{
printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
break;
}
Thanks
Your test function doesn't look good, it will be met for any value < 1, it should be like this:
for (size_t i=0; i<arraySize; ++i){
cl_double val = host_c[i] - 1; // the array is supposed to be 1 everywhere
if((val > 1e-6) || (val < -1e-6))
{
printf("*** ERROR! Invalid results ! host_c[%zi]=%.9lf\n", i, host_c[i]);
break;
}
}
Non initialized values in the GPU are likely to be 0, therefore meeting your condition.
Additionally, remember that if you run the program once with the full size, consecutive reads will still hold the proper processed data (even if you close and open the app again). Since the GPU memory is not cleaned after the buffer is created/destroyed.

OpenCL: Can one kernel call the other kernel

Hi ,
I am trying to run the available convolution code in OpenCL.
I am having heterogeneous system with -
1) CPU
2) GPU
PFB my code base which is running in my system :
convolution.cl
// TODO: Add OpenCL kernel code here.
__kernel
void convolve(
const __global uint * const input,
__constant uint * const mask,
__global uint * const output,
const int inputWidth,
const int maskWidth){
const int x = get_global_id(0);
const int y = get_global_id(1);
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
const int idxIntmp = (y + r) * inputWidth + x;
for (int c = 0; c < maskWidth; c++)
{
sum += mask[(r * maskWidth) + c] * input[idxIntmp + c];
}
}
output[y * get_global_size(0) + x] = sum;
}
and convolution.cpp -
//Convolution-Process of applying a 3×3 mask to an 8×8 input signal,resulting in a 6×6 output signal
#include "CL/cl.h"
#include "vector"
#include "iostream"
#include "time.h"
#include <fstream>
#include <sstream>
#include <string>
using namespace std;
// Constants
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
{3, 1, 1, 4, 8, 2, 1, 3},
{4, 2, 1, 1, 2, 1, 2, 3},
{4, 4, 4, 4, 3, 2, 2, 2},
{9, 8, 3, 8, 9, 0, 0, 0},
{9, 3, 3, 9, 0, 0, 0, 0},
{0, 9, 0, 8, 0, 0, 0, 0},
{3, 0, 8, 8, 9, 4, 4, 4},
{5, 9, 8, 1, 8, 1, 1, 1}
};
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskWidth][maskHeight] =
{
{1, 1, 1},
{1, 0, 1},
{1, 1, 1},
};
inline void checkErr(cl_int err, const char * name)
{
if (err != CL_SUCCESS)
{
std::cerr << "ERROR: " << name
<< " (" << err << ")" << std::endl;
exit(EXIT_FAILURE);
}
}
void CL_CALLBACK contextCallback(
const char * errInfo,
const void * private_info,
size_t cb,
void * user_data)
{
std::cout << "Error occurred during context use: "<< errInfo << std::endl;
exit(EXIT_FAILURE);
}
int main(int argc,char argv[]){
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;
cl_mem maskBuffer;
double start,end,Totaltime;//Timer variables
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr(
(errNum != CL_SUCCESS) ? errNum :
(numPlatforms <= 0 ? -1 : CL_SUCCESS),
"clGetPlatformIDs");
platformIDs = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr(
(errNum != CL_SUCCESS) ? errNum :
(numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(
platformIDs[i],
CL_DEVICE_TYPE_GPU,
0,
NULL,
&numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
}
else if (numDevices > 0)
{
deviceIDs = (cl_device_id *)malloc(
sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(
platformIDs[i],
CL_DEVICE_TYPE_GPU,
numDevices,
&deviceIDs[0],
NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if (deviceIDs == NULL) {
std::cout << "No CPU device found" << std::endl;
exit(-1);
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,(cl_context_properties)platformIDs[i], 0
};
context = clCreateContext(
contextProperties, numDevices, deviceIDs,
&contextCallback, NULL, &errNum);
checkErr(errNum, "clCreateContext");
std::ifstream srcFile("convolution.cl");
checkErr(srcFile.is_open() ? CL_SUCCESS : -1,
"reading convolution.cl");
std::string srcProg(
std::istreambuf_iterator<char>(srcFile),
(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
kernel = clCreateKernel(program, "convolve", &errNum);
checkErr(errNum, "clCreateKernel");
inputSignalBuffer = clCreateBuffer(
context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
static_cast<void *>(inputSignal), &errNum);
checkErr(errNum, "clCreateBuffer(inputSignal)");
maskBuffer = clCreateBuffer(
context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_uint) * maskHeight * maskWidth,
static_cast<void *>(mask), &errNum);
checkErr(errNum, "clCreateBuffer(mask)");
outputSignalBuffer = clCreateBuffer(
context, CL_MEM_WRITE_ONLY,
sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
NULL, &errNum);
checkErr(errNum, "clCreateBuffer(outputSignal)");
queue = clCreateCommandQueue(
context, deviceIDs[0], 0, &errNum);
checkErr(errNum, "clCreateCommandQueue");
errNum = clSetKernelArg(
kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
errNum |= clSetKernelArg(
kernel, 1, sizeof(cl_mem), &maskBuffer);
errNum |= clSetKernelArg(
kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
errNum |= clSetKernelArg(
kernel, 3, sizeof(cl_uint), &inputSignalWidth);
errNum |= clSetKernelArg(
kernel, 4, sizeof(cl_uint), &maskWidth);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] ={ outputSignalWidth * outputSignalHeight };
const size_t localWorkSize[1] = { 1 };
start = clock();
errNum = clEnqueueNDRangeKernel(
queue,
kernel,
1,
NULL,
globalWorkSize,
localWorkSize,
0,
NULL,
NULL
);
checkErr(errNum, "clEnqueueNDRangeKernel");
errNum = clEnqueueReadBuffer(
queue, outputSignalBuffer, CL_TRUE, 0,
sizeof(cl_uint) * outputSignalHeight * outputSignalHeight,
outputSignal, 0, NULL, NULL);
checkErr(errNum, "clEnqueueReadBuffer");
end= clock(); - start;
cout<<"Time in ms = "<<((end/CLOCKS_PER_SEC) * 1000) << endl;
for (int y = 0; y < outputSignalHeight; y++)
{
for (int x = 0; x < outputSignalWidth; x++)
{
std::cout << outputSignal[x][y] << " ";
}
std::cout << std::endl;
}
return 0;
}
Questions :
I am having below doubts-
1) When I am using device type as CL_DEVICE_TYPE_GPU,
am getting 267 ms performance .When I am using CL_DEVICE_TYPE_CPU,execution time changed to 467 ms.
I want to know that what is the difference between running a convolution code on a CPU without GPU and CPU with GPU (by selecting device type as CL_DEVICE_TYPE_CPU) .
2) As I can see the convolution.cl file where there is a for loop which is executing 3 times.Can I call other Kernel for doing this operation from available kernel file ??
I am asking this question as I am new to the OpenCL coding and want to know that thing.
Both CPU & GPU are OpenCL Devices. So, by choosing CL_DEVICE_TYPE_CPU, you are telling OpenCL runtime to compile kernel code to CPU assembler & run it on CPU. When you are choosing CL_DEVICE_TYPE_GPU, kernel code is compiled to GPU assembler & executed on your video card. Ability to change device type without re-writing source code is of the main OpenCL features. It doesn't matter, does your CPU have integrated GPU, and / or discrete GPU is installed, you just pick available Device & run kernel on it.
For OpenCL 1.2 & older you can't call kernel from kernel. Dynamic parallelism is implemented in OpenCL 2.0.
For the first question: you should vectorize the kernel so opencl can easily use SIMD feature of your CPU hence unlock 4x(or 8x) more compute units per core.
__kernel
void convolve(
const __global uint8 * const input, // uint8 fits AVX(AVX2?) and uint4 fits SSE(SSE3?)
__constant uint8 * const mask,
__global uint8 * const output,
const int inputWidth,
const int maskWidth){
const int x = get_global_id(0); // this is 1/8 size now
const int y = get_global_id(1); // this is 1/8 size now
uint8 sum = 0; // a vector of 8 unsigneds
for (int r = 0; r < maskWidth; r++)
{
const int idxIntmp = (y + r) * inputWidth + x;
for (int c = 0; c < maskWidth; c++)
{
sum += mask[(r * maskWidth) + c] * input[idxIntmp + c]; //8 issued per clock
// scalars get promoted when used in direct multiplication of addition.
}
}
output[y * get_global_size(0) + x] = sum;
}
dont forget to decrease total work threads by 7/8 ratio (example: from 8k threads to 1k threads).
Please increase work per thread such as 50 convolutions per thread to increase occupation ratio of work units, then work on some local memory optimizations(for GPU) to get even better results such as 5ms per kernel..
On my AVX capable CPU, a simple matrix multiplication got speed up ratio of 2.4X going for 8-element vectorizations like this.
Running a kernel 3 times is not an issue if you offload enough work on it. If not, you should concatenate multiple kernels into a single one using some tricky algorithm.
If a profiler is not available at the moment, you can check GPU/CPU temperatures to get some idea of how close you are to the limits of hardware.
Play with number of local threads per work group. This can change performance as it lets more or less registers to be used per thread.

OpenCL Transfer rate exceed PCI-e Bandwidth

I made an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from device to host.
The transfer rate is increased as I expected (get transfer rate using AMD APP Profiler 2.4).
The problem is the transfer rate is higher than PCIe bandwidth (93703 GB /s) for matrix 4096 x 4096 (64 MB).
It happened too when I use zero copy buffer ( CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer).
I search some information that it is true if pinned memory and zero copy buffer have high transfer rate but it still limited with PCIe bandwidth for discrete GPU.
So, is it normal if the transfer rate exceed PCIe bandwidth (using PCIe bandwidth 2.0 x 16)?
My OS is Windows 7 64 bit.
I use AMD APP SDK 2.6 and discrete GPU AMD HD 6630M.
Edit:
Here is the code:
#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;
// Create Kernel Object Bound To Kernel Function
naive = clCreateKernel(program, "naiveAlgorithm", &err);
//Set size of global work item and work tem in each work goups
int globalsize = size;
int localsize;
if(globalsize >= 16)
{
localsize =16;
}else
{
localsize = globalsize;
}
size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items [2] = {localsize, localsize};
// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);
// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);
//Release Kernel
err = clReleaseKernel(naive);
}
void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);
MatrixMul(d_A, d_B, d_C, size);
err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);
err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}
//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096;
//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;
//Allocate and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));
//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);
//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);
cl_int err; // error code
cl_platform_id* platforms;
cl_uint platformCount;
cl_device_id device;
int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk
clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices
const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties> (platforms [platformtype]),
0, 0
};
context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
//Load Kernel Source
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Create Program Object
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *),
&source_size, &err);
// Build Program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
Naive(matrixA, matrixB, matrixC, size);
//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);
// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;
free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);
return 0;
}
And here is the kernel code:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {
int tx = get_global_id(0); //2D Thread IDx
int ty = get_global_id(1); //2D Thread IDy
float sum = 0;
//Calculate result of one element of Matrix C
for (int k = 0; k < size; k++) {
sum += A[ty*size+k] * B[k*size+tx];
}
C[ty*size+tx] = sum;
}
And here is the image:
I see that your output array is actually located in host memory because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
This means that you should be using clEnqueueMapBuffer, followed by using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. There is no need for the array matrixC since d_C is already in host memory.
The data transfer from GPU to host actually happens while your kernel is running. The map call makes sure that all data has finished moving from the GPU to the CPU. That is why the transfer times are actually so small.
I can't find any documentation on whether clEnqueueReadBuffer works for pinned memory or not. I also see that you are retrieving the error codes of each operation but do not check these error codes, hence your code may be silently failing.
Regarding the large difference between the time taken by clEnqueueReadBuffer and the time spent transferring data, note that all queued operations don't immediately get dispatched to the GPU. One source of delay is the Windows display driver model (WDDM) for graphics cards. The +-20 micro-seconds used for the clEnqueueReadBuffer sounds right for this delay (I've actually seen longer delays).

Resources