Compile and build .cl file using NVIDIA's nvcc Compiler? - opencl

Is it possible to compile a .cl file using NVIDIA's nvcc compiler? I am trying to set up Visual Studio 2010 to code OpenCL under the CUDA platform. But when I select the CUDA C/C++ Compiler to compile and build the .cl file, it gives me errors such as "nvcc does not exist". What is the issue?

You should be able to use nvcc to compile OpenCL code. Normally, I would suggest using a filename extension of .c for C-compliant code, and .cpp for C++-compliant code (*); however, nvcc has filename-extension override options (-x ...) so that we can modify the behavior. Here is a worked example using CUDA 8.0.61, RHEL 7, and a Tesla K20x:
$ cat t4.cpp
#include <CL/opencl.h>
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
const char source[] =
"__kernel void test_rotate(__global ulong *d_count, ulong loops, ulong patt)"
"{"
" ulong n = patt;"
" for (ulong i = 0; i<loops; i++)"
" n &= (107 << (patt+(i%7)));"
" d_count[0] = n + loops;"
"}"
;
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue1, queue2;
    cl_program program;
    cl_mem mem1, mem2;
    cl_kernel kernel;
    bool two_kernels = false;
    unsigned long long loops = 1000;
    if (argc > 1) loops *= atoi(argv[1]);
    if (argc > 2) two_kernels = true;
    if (two_kernels) printf("running two kernels\n");
    else printf("running one kernel\n");
    printf("running %llu loops\n", loops);
    unsigned long long pattern = 1;
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
    queue2 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
    const char *sources[1] = {source};
    program = clCreateProgramWithSource(context, 1, sources, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
    mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
    kernel = clCreateKernel(program, "test_rotate", NULL);
    const size_t work_size[1] = {1};
    clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
    clSetKernelArg(kernel, 1, sizeof(loops), &loops);
    clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);
    clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
    if (two_kernels){
        clSetKernelArg(kernel, 0, sizeof(mem2), &mem2);
        clSetKernelArg(kernel, 1, sizeof(loops), &loops);
        clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);
        clEnqueueNDRangeKernel(queue2, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
    }
    cl_ulong *buf1 = (cl_ulong *)clEnqueueMapBuffer(queue1, mem1, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
    cl_ulong *buf2 = (cl_ulong *)clEnqueueMapBuffer(queue2, mem2, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
    printf("result1: %lu\n", buf1[0]);
    printf("result2: %lu\n", buf2[0]);
    clEnqueueUnmapMemObject(queue1, mem1, buf1, 0, NULL, NULL);
    clEnqueueUnmapMemObject(queue2, mem2, buf2, 0, NULL, NULL);
    return 0;
}
$ nvcc -arch=sm_35 -o t4 t4.cpp -lOpenCL
$ ./t4
running one kernel
running 1000 loops
result1: 1000
result2: 0
$ cp t4.cpp t4.cl
$ nvcc -arch=sm_35 -x cu -o t4 t4.cl -lOpenCL
$ ./t4
running one kernel
running 1000 loops
result1: 1000
result2: 0
$
Note that the code here doesn't do anything sensible or significant, so I'd prefer to avoid questions about it. It's just a demonstration of compiling a C++-compliant OpenCL code.
(*) Because such files could also be readily processed by an ordinary host compiler, e.g. the GNU compilers, with appropriate switches for include and link options.
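For example, an ordinary g++ invocation along these lines should also build the same file, assuming the CUDA toolkit's headers and libraries live under /usr/local/cuda (the exact paths are an assumption and vary by install):
$ g++ -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -o t4 t4.cpp -lOpenCL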

Related

OpenCL program works only for the multiple of itemsize

I'm new to OpenCL programming, and this is the problem I'm facing while executing a simple vector addition.
I have the following kernel code
#include <CL/cl.hpp>
#include<iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
I have an integrated GPU and an AMD GPU on my system. I'm trying to perform the vector addition on my Intel GPU, for which I have installed the Intel OpenCL drivers (i7 3rd gen processor with HD Graphics).
I have the below OpenCL host code
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
int i;
const int LIST_SIZE = 50;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//std::cout<<source_str<<std::endl;
// Get platform and device information
cl_platform_id* platforms1 = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, platforms1, &ret_num_platforms);
platforms1= (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);
clGetPlatformIDs(ret_num_platforms, platforms1, NULL);
/*
* Platform 0: Intel Graphics
* Platform 1 : AMD Graphics
*/
//CHANGE THE PLATFORM ACCORDING TO YOUR SYSTEM!!!!
ret = clGetDeviceIDs( platforms1[0], CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 16; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
//FREE
return 0;
}
If LIST_SIZE is 50, it prints correctly only up to 48, that is 16*3. It only works for a multiple of the local size, and I'm not able to figure out why.
OpenCL kernels execute only on a multiple of the local work-group size (the local range, local_item_size in your code), which should not be smaller than 32 and must be a power of two (so it can be 32, 64, 128, 256, ...). If you set it to 16, half of the GPU will be idle at any time. global_item_size must be a multiple of local_item_size. You need at least 32 data items for the kernel to function and a lot more for it to yield good performance. One common fix is to round the global size up to the next multiple of the local size and guard the extra work items inside the kernel, as sketched below.
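A minimal sketch of that fix (the padding and the extra kernel parameter n are additions for illustration, not part of the original code):
// Host side: round the global size up to the next multiple of the local size,
// and tell the kernel how many elements are actually valid.
size_t local_item_size = 32; // power of two, at least 32
size_t global_item_size = ((LIST_SIZE + local_item_size - 1) / local_item_size) * local_item_size; // 50 -> 64
int n = LIST_SIZE;
ret = clSetKernelArg(kernel, 3, sizeof(int), &n);

// Kernel side: guard the padded work items so they do nothing.
__kernel void vector_add(__global const int *A, __global const int *B,
                         __global int *C, const int n) {
    int i = get_global_id(0);
    if (i < n)
        C[i] = A[i] + B[i];
}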
Also the part
#include <CL/cl.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
is not OpenCL C code and does not belong in the .cl source file. If it is not too lengthy, you can write the OpenCL C code directly in the .cpp file as a raw string:
const string kernel_code = R"(
    __kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
        int i = get_global_id(0);
        C[i] = A[i] + B[i];
    }
)";
const char* source_str = kernel_code.c_str();
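The string can then be handed straight to clCreateProgramWithSource; a minimal sketch, reusing the context and ret variables from the question's code:
const char *sources[] = { kernel_code.c_str() };
const size_t lengths[] = { kernel_code.size() };
cl_program program = clCreateProgramWithSource(context, 1, sources, lengths, &ret);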

What is the best practice to do a reduction in OpenCL?

Imagine a binary operation (let's name it "+") with the associative property. You can then compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing with the results of the previous step, and so on, until there is one element left.
I'm learning OpenCL and trying to implement this approach to sum all the elements in an array. I am a total newbie in this technology, so the program might look somewhat weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
{
    size_t gl = get_global_id (0);
    size_t s = get_local_size (0);
    int i;
    float accum = 0;
    for (i=0; i<s; i++) {
        accum += input[s*gl+i];
    }
    output[gl] = accum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
double gettime ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and output
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run the kernel until there is only one element left in the buffer. Is this the correct approach to computing the sum of elements in OpenCL? The time I measure with gettime is about 10 times slower than the execution time of a simple loop on the CPU (compiled with clang 4.0.0 and the -O2 -ffast-math flags). Hardware I use: AMD Ryzen 5 1600X and AMD Radeon HD 6950.
There are a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the command queue reaching a synchronization point with the host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyway), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
    total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
    // set the argument list for the kernel command
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    global = size;
    local = 64;
    if(event_index == 0)
        // enqueue the kernel command for execution
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
                               &local, 0, NULL, events);
    else
        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
                               &local, 1, events + (event_index - 1), events + event_index);
    size = size/64;
    tmp = output;
    output = input;
    input = tmp;
    event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
    clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
                    sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. The example originally linked here may be more complicated than you need it to be; a simpler sketch follows.
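A minimal sketch of such a single-kernel approach, added here as an illustration (it is not the originally linked example; the name reduce_local and the uint n parameter are arbitrary). Each work-group cooperatively reduces its chunk in local memory, so one launch leaves just one partial sum per work-group:
__kernel void reduce_local (__global const float *input,
                            __global float *partial_sums,
                            __local float *scratch,
                            const uint n)
{
    size_t gid = get_global_id (0);
    size_t lid = get_local_id (0);
    // Load one element per work item; pad the tail with zeros.
    scratch[lid] = (gid < n) ? input[gid] : 0.0f;
    barrier (CLK_LOCAL_MEM_FENCE);
    // Tree reduction in local memory; local size must be a power of two.
    for (size_t offset = get_local_size (0) / 2; offset > 0; offset /= 2) {
        if (lid < offset)
            scratch[lid] += scratch[lid + offset];
        barrier (CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0)
        partial_sums[get_group_id (0)] = scratch[0];
}
On the host side the scratch space is sized per work-group with clSetKernelArg(kernel, 2, local * sizeof(cl_float), NULL). Each pass still shrinks the element count by the work-group size, but the partial sums are built cooperatively in fast local memory instead of each work item looping over global memory.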

OpenCL Transfer rate exceeds PCI-e Bandwidth

I made an OpenCL program that uses pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from device to host.
The transfer rate increased as I expected (I measured the transfer rate with AMD APP Profiler 2.4).
The problem is that the reported transfer rate is higher than the PCIe bandwidth: 93703 GB/s for a 4096 x 4096 matrix (64 MB).
It happens as well when I use a zero-copy buffer (CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer).
I found some information saying that pinned memory and zero-copy buffers do have high transfer rates, but that they should still be limited by the PCIe bandwidth on a discrete GPU.
So, is it normal for the reported transfer rate to exceed the PCIe bandwidth (PCIe 2.0 x16)?
My OS is Windows 7, 64-bit.
I use the AMD APP SDK 2.6 and a discrete AMD HD 6630M GPU.
Edit:
Here is the code:
#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;
// Create Kernel Object Bound To Kernel Function
naive = clCreateKernel(program, "naiveAlgorithm", &err);
//Set size of global work items and work items in each work group
int globalsize = size;
int localsize;
if(globalsize >= 16)
{
localsize =16;
}else
{
localsize = globalsize;
}
size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items [2] = {localsize, localsize};
// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);
// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);
//Release Kernel
err = clReleaseKernel(naive);
}
void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);
MatrixMul(d_A, d_B, d_C, size);
err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);
err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}
//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096;
//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;
//Allocate and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));
//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);
//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);
cl_int err; // error code
cl_platform_id* platforms;
cl_uint platformCount;
cl_device_id device;
int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk
clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices
const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties> (platforms [platformtype]),
0, 0
};
context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
//Load Kernel Source
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Create Program Object
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &err);
// Build Program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
Naive(matrixA, matrixB, matrixC, size);
//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);
// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;
free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);
return 0;
}
And here is the kernel code:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {
    int tx = get_global_id(0); //2D Thread IDx
    int ty = get_global_id(1); //2D Thread IDy
    float sum = 0;
    //Calculate result of one element of Matrix C
    for (int k = 0; k < size; k++) {
        sum += A[ty*size+k] * B[k*size+tx];
    }
    C[ty*size+tx] = sum;
}
(A profiler screenshot was attached here in the original post; it is not reproduced.)
I see that your output array is actually located in host memory because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
This means that you should be using clEnqueueMapBuffer, followed by using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. There is no need for the array matrixC since d_C is already in host memory.
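A minimal sketch of that pattern, reusing the queue and d_C from the question (error handling omitted):
cl_float *mapped = (cl_float *)clEnqueueMapBuffer(queue, d_C, CL_TRUE, CL_MAP_READ,
                                                  0, sizeof(cl_float)*size*size,
                                                  0, NULL, NULL, &err);
// ... read the results directly through 'mapped' ...
err = clEnqueueUnmapMemObject(queue, d_C, mapped, 0, NULL, NULL);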
The data transfer from GPU to host actually happens while your kernel is running. The map call makes sure that all data has finished moving from the GPU to the CPU. That is why the transfer times are actually so small.
I can't find any documentation on whether clEnqueueReadBuffer works for pinned memory or not. I also see that you are retrieving the error codes of each operation but do not check these error codes, hence your code may be silently failing.
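A small checking helper of the usual kind makes such failures visible; this macro is an illustration added here, not part of the original answer:
#define CL_CHECK(call)                                            \
    do {                                                          \
        cl_int _e = (call);                                       \
        if (_e != CL_SUCCESS) {                                   \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",         \
                    (int)_e, __FILE__, __LINE__);                 \
            exit(1);                                              \
        }                                                         \
    } while (0)

// Usage: CL_CHECK(clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A));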
Regarding the large difference between the time taken by clEnqueueReadBuffer and the time spent transferring data, note that queued operations are not all dispatched to the GPU immediately. One source of delay is the Windows Display Driver Model (WDDM) for graphics cards. The ±20 microseconds used for the clEnqueueReadBuffer sounds right for this delay (I've actually seen longer delays).

How to remove CL_INVALID_PLATFORM error in OpenCL code?

Doing simple matrix multiplication using OpenCL:
// Multiply two matrices A * B = C
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <oclUtils.h>
#define WA 3
#define HA 3
#define WB 3
#define HB 3
#define WC 3
#define HC 3
// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
// set seed for rand()
srand(2006);
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// 5. Initialize OpenCL
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
//shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// Setup device memory
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 6. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My comment\n",
&kernelLength);
//shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// 7. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
//shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 3;
localWorkSize[1] = 3;
globalWorkSize[0] = 3;
globalWorkSize[1] = 3;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 8. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 9. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
{
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
printf("\n");
}
printf("\n");
// 10. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);
}
In the above code I keep getting an error at this place:
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
The error code being returned is -32, which means CL_INVALID_PLATFORM.
How do I remove this error?
OS: Windows 7, 32 bit, NVIDIA GPU GeForce 610
The Nvidia drivers expect you to provide a non-NULL properties pointer as the first argument to the clCreateContextFromType call.
The Khronos specification for clCreateContextFromType states that if NULL is passed for the properties parameter, the platform that is selected is implementation dependent. In case of Nvidia the choice seems to be that no platform at all is selected if a NULL pointer is passed. See clCreateContextFromType for more information.
On the other hand, this behavior is consistent with Issue #3 in the cl_khr_icd extension, which would apply if you are using OpenCL through the ICD, and which states:
3: How will the ICD handle a NULL cl_platform_id?
RESOLVED: The NULL platform is not supported by the ICD.
To pass the properties to clCreateContextFromType, first query the platforms with clGetPlatformIDs. Then construct a properties array with the desired platform ID and pass it to clCreateContextFromType. Something along the following lines should work with a C99 compliant compiler:
// query the number of platforms
cl_uint numPlatforms;
errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
shrCheckError(errcode, CL_SUCCESS);
// now get all the platform IDs
cl_platform_id platforms[numPlatforms];
errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
shrCheckError(errcode, CL_SUCCESS);
// set platform property - we just pick the first one
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[0], 0};
clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);

How to use clCreateProgramWithBinary in OpenCL?

I'm trying to just get a basic program to work using clCreateProgramWithBinary. This is so I know how to use it rather than a "true" application.
I see that one of the parameters is a list of binaries. How exactly would I go about creating a binary to test with? I have some test code which creates a program from source, builds and enqueues it. Is there a binary created at some point during this process which I can feed into clCreateProgramWithBinary?
Here is some of my code, just to give an idea of my overall flow. I've omitted comments and error checks for simplicity.
program = clCreateProgramWithSource(clctx, 1, &dumbkernelsource, NULL, &errcode);
errcode = clBuildProgram(program, env->num_devices, env->device, NULL, NULL, NULL);
mykernel = clCreateKernel(program, "flops", &errcode);
errcode = clGetKernelWorkGroupInfo(mykernel, *(env->device), CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
global = num_workgroups * local;
errcode = clEnqueueNDRangeKernel(commands, mykernel, 1, NULL, &global, &local, 0, NULL, NULL);
After you compile your program, you can get its binary code with clGetProgramInfo, and then save it to a file.
Example code (not tried to compile, but should be something along these lines):
program = clCreateProgramWithSource(clctx, 1, &dumbkernelsource, NULL, &errcode);
errcode = clBuildProgram(program, env->num_devices, env->device, NULL, NULL, NULL);
cl_uint number_of_binaries;
char **binary;
size_t *binary_sizes;
errcode = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &number_of_binaries, NULL);
binary_sizes = new size_t[number_of_binaries];
binary = new char*[number_of_binaries];
errcode = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, number_of_binaries * sizeof(size_t), binary_sizes, NULL);
for (cl_uint i = 0; i < number_of_binaries; ++i) binary[i] = new char[binary_sizes[i]];
errcode = clGetProgramInfo(program, CL_PROGRAM_BINARIES, number_of_binaries * sizeof(char*), binary, NULL);
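To actually write the binaries out to disk, something along these lines should work (the kernel_%u.bin naming is arbitrary):
for (cl_uint i = 0; i < number_of_binaries; ++i) {
    char path[32];
    snprintf(path, sizeof(path), "kernel_%u.bin", i);
    FILE *fbin = fopen(path, "wb");
    fwrite(binary[i], 1, binary_sizes[i], fbin);
    fclose(fbin);
}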
Minimal runnable example
Compile the embedded vector increment shader from CL C source, save the binary to a.bin, load the binary shader, and run it:
./a.out
Assertions are done at the end of the program.
Ignore the CL C shader, load binary from a.bin, and run it:
./a.out 0
Compile and run with:
gcc -ggdb3 -std=c99 -Wall -Wextra a.c -lOpenCL && ./a.out
Tested in Ubuntu 16.10, NVIDIA NVS5400, driver 375.39.
GitHub upstream: https://github.com/cirosantilli/cpp-cheat/blob/b1e9696cb18a12c4a41e0287695a2a6591b04597/opencl/binary_shader.c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl.h>
const char *source =
"__kernel void kmain(__global int *out) {\n"
" out[get_global_id(0)]++;\n"
"}\n"
;
#define BIN_PATH "a.bin"
char* common_read_file(const char *path, long *length_out) {
    char *buffer;
    FILE *f;
    long length;
    f = fopen(path, "r");
    assert(NULL != f);
    fseek(f, 0, SEEK_END);
    length = ftell(f);
    fseek(f, 0, SEEK_SET);
    buffer = malloc(length);
    if (fread(buffer, 1, length, f) < (size_t)length) {
        return NULL;
    }
    fclose(f);
    if (NULL != length_out) {
        *length_out = length;
    }
    return buffer;
}
int main(int argc, char **argv) {
    FILE *f;
    char *binary;
    cl_command_queue command_queue;
    cl_context context;
    cl_device_id device;
    cl_int input[] = {1, 2}, errcode_ret, binary_status;
    cl_kernel kernel, binary_kernel;
    cl_mem buffer;
    cl_platform_id platform;
    cl_program program, binary_program;
    const size_t global_work_size = sizeof(input) / sizeof(input[0]);
    int use_cache;
    long length;
    size_t binary_size;
    if (argc > 1) {
        use_cache = !strcmp(argv[1], "0");
    } else {
        use_cache = 0;
    }
    /* Get the binary, and create a kernel with it. */
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    command_queue = clCreateCommandQueue(context, device, 0, NULL);
    if (use_cache) {
        binary = common_read_file(BIN_PATH, &length);
        binary_size = length;
    } else {
        program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
        clBuildProgram(program, 1, &device, "", NULL, NULL);
        kernel = clCreateKernel(program, "kmain", NULL);
        clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
        binary = malloc(binary_size);
        clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size, &binary, NULL);
        f = fopen(BIN_PATH, "w");
        fwrite(binary, binary_size, 1, f);
        fclose(f);
    }
    binary_program = clCreateProgramWithBinary(
        context, 1, &device, &binary_size,
        (const unsigned char **)&binary, &binary_status, &errcode_ret
    );
    free(binary);
    clBuildProgram(binary_program, 1, &device, NULL, NULL, NULL);
    binary_kernel = clCreateKernel(binary_program, "kmain", &errcode_ret);
    /* Run the kernel created from the binary. */
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(input), input, NULL);
    clSetKernelArg(binary_kernel, 0, sizeof(buffer), &buffer);
    clEnqueueNDRangeKernel(command_queue, binary_kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    clFlush(command_queue);
    clFinish(command_queue);
    clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), input, 0, NULL, NULL);
    /* Assertions. */
    assert(input[0] == 2);
    assert(input[1] == 3);
    /* Cleanup. */
    clReleaseMemObject(buffer);
    if (!use_cache) {
        /* kernel and program are only created on the source-compile path. */
        clReleaseKernel(kernel);
        clReleaseProgram(program);
    }
    clReleaseKernel(binary_kernel);
    clReleaseProgram(binary_program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    return EXIT_SUCCESS;
}
I highly recommend cat a.bin, which contains human readable (and editable) PTX assembly for this implementation.
The official OpenCL Programming Guide book has a nice example of this. There's also a Google code project, opencl-book-samples, which includes the code from the book. The example you're looking for is here.
