Getting nan values from OpenCL FFT kernel on FPGA - opencl

I was trying to use the Intel's FFT1D kernel by writing the Host program by my own for Intel FPGA. Link to Intel's FFT1d can be found here
I have also given my host program below, wherein, I have a file saved (which contains some data), my task is to read that data, calculate its FFT and print some of it. It is a 4K point FFT
#include <stdio.h>
#include <stdlib.h>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#include <string.h>
#include "fft_config.h"
#define N (1<<LOGN) //Please check the FFT Sample Code for Ref (2 to the power 12 gives 4K points)
#define DATA_FILE "complex_input.data"
using namespace aocl_utils;
// OpenCL handles shared by init(), main() and cleanup().
cl_platform_id platform = NULL;
cl_device_id device = NULL;
// queue0 carries kernel0 and the result read-back; queue1 carries the input
// upload and kernel1 (see main()).
cl_command_queue queue0 = NULL;
cl_command_queue queue1 = NULL;
cl_context context = NULL;
cl_program program = NULL;
// init() creates kernel0 from "fft1d" and kernel1 from "fetch".
cl_kernel kernel0, kernel1;
// Device buffers for the input and output samples, created in main().
cl_mem d_inData, d_outData;
// Status of the most recent OpenCL call; reused throughout the file.
cl_int err = 0;
// One complex sample; main() prints it as "x + y j".
typedef struct {
float x;
float y;
} float2;
//float2 h_outData[N], h_inData[N];
// Host-side input and output arrays of N samples each, allocated while the
// globals are being initialised (before main() runs).
float2 *h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
float2 *h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
void init(); //Function that does the job of Querying Platform and Device, creating Context, Command Queues, Program and required Kernels to do the job.
void cleanup(); //Function that releases all the Created Contexts, Buffers etc, in order to finish the execution.
void read_data(); //Reads data from the complex numbers from .data file and fills in the float2 struct h_inData[].
// Passed to kernel0 as argument 1.
// NOTE(review): presumably the number of transforms to run — confirm against the kernel source.
int temp_value = 1;
int main()
{
// h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
//h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
int inverse = false;
int temp =1;
init();
read_data();
d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float2)*N, NULL, &err);
checkError(err,"Failed to allocate Buffer for input array\n");
d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(float2)*N, NULL, &err);
checkError(err, "Failed to allocate the Buffer for output\n");
//WE FINISH THE FETCH KERNEL
err = clEnqueueWriteBuffer(queue1,d_inData, CL_TRUE, 0, sizeof(float2)*N, h_inData, 0, NULL, NULL);
checkError(err,"Failed to Write the input Buffer\n");
err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&d_inData);
checkError(err, "Failed to set KerArg for Kernel1 - 0\n");
err = clSetKernelArg(kernel0, 0, sizeof(cl_mem), (void *)&d_outData);
checkError(err, "Failed to set KerArg for Kernel0 - 0\n");
err = clSetKernelArg(kernel0, 1, sizeof(cl_int), (void *)&temp_value);
checkError(err, "Failed to set KerArg for Kernel0 - 1\n");
err = clSetKernelArg(kernel0, 2, sizeof(cl_int), (void *)&inverse);
checkError(err, "Failed to set KerArg for Kernel0 - 2\n");
printf("FFT Initialization Complete!\n\n");
err = clEnqueueTask(queue0, kernel0, 0, NULL, NULL);
checkError(err, "Failed to Launch the Kernel for FFT\n");
size_t local_work_size = N/8;
size_t global_work_size = local_work_size * 1; //Coz the number of Iterations is just 1
err = clEnqueueNDRangeKernel(queue1, kernel1, 1, NULL, &local_work_size, &global_work_size, 0, NULL, NULL);
checkError(err, "Failed to launch the Fetch Kernel\n");
err = clFinish(queue0);
checkError(err, "Failed to finish FFT\n");
err = clFinish(queue1);
checkError(err, "Failed to finish Fetch kernel\n");
err = clEnqueueReadBuffer(queue0, d_outData, CL_TRUE, 0, sizeof(float2)*N, h_outData, 0, NULL, NULL);
checkError(err, "Failed to Read back the Buffer output\n");
printf("FFT is Complete!\n\n");
printf("Printing some of the values, just to make sure they are non-zero\n\n");
for(int ii=100;ii<125;ii++)
{
printf("%f + %f j -> %f + %f j\n",h_inData[ii].x,h_inData[ii].y,h_outData[ii].x,h_outData[ii].y);
}
printf("\n\n");
cleanup();
return 0;
}
void read_data()
{
size_t sourceSize;
float* temp;
FILE *fp = fopen(DATA_FILE,"r");
if(fp==NULL)
{
printf("Could not find the Random Data File! Exiting!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
sourceSize=ftell(fp);
rewind(fp);
temp = (float *)alignedMalloc(sourceSize);
fread(temp, sizeof(float),sourceSize,fp);
fclose(fp);
for(int i=0;i<N;i++)
{
h_inData[i].x = temp[2*i];
h_inData[i].y = temp[(2*i)+1];
}
}
void init()
{
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
if(platform == NULL)
{
printf("Could not find the Platform\n");
exit(1);
}
scoped_array<cl_device_id> devices;
cl_uint num_devices;
devices.reset(getDevices(platform, CL_DEVICE_TYPE_ACCELERATOR, &num_devices));
device = devices[0];
context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &err);
checkError(err, "Failed to create Context\n");
queue0 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue0\n");
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue1\n");
program = createProgramFromBinary(context, "bin/fft1d.aocx", &device, 1);
err = clBuildProgram(program, 1, &device, "", NULL, NULL);
checkError(err, "Failed to Build Program\n");
kernel0 = clCreateKernel(program, "fft1d", &err);
checkError(err,"Could not Create Kernel0\n");
kernel1 = clCreateKernel(program, "fetch", &err);
checkError(err, "Could not Create Kernel1\n");
printf("Finished with the Initial Setup!\n");
}
void cleanup()
{
if(kernel0)
clReleaseKernel(kernel0);
if(kernel1)
clReleaseKernel(kernel1);
if(program)
clReleaseProgram(program);
if(queue0)
clReleaseCommandQueue(queue0);
if(queue1)
clReleaseCommandQueue(queue1);
if(d_inData)
clReleaseMemObject(d_inData);
if(d_outData)
clReleaseMemObject(d_outData);
if(context)
clReleaseContext(context);
}
I checked if the data from file is being read fine, and It is correct and as expected.
Please let me know where could this go wrong!

Update!
I found the solution. Reading the input from the file itself was not a good idea here. I tried generating the random data in the host program during execution instead, and it worked just fine!

Related

Build opencl kernel failure in Visual Studio

I am using opencl in Visual Studio and I get stuck with an error saying
Failed to build program executable
I can't figure out what is wrong with my code. Could anyone help me?
Here is my code:
// Host-side setup: pick a platform and GPU device, create a context and
// queue, load the kernel source from disk and build it. Each failure prints
// a message and leaves the enclosing function with EXIT_FAILURE.
cl_int err;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
if (err != CL_SUCCESS) {
    printf("Error: Failed to find a platform\n");
    return EXIT_FAILURE;
}
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err != CL_SUCCESS) {
    printf("Error: Failed to create a device group\n");
    return EXIT_FAILURE;
}
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context) {
    printf("Error: Failed to create a compute context\n");
    return EXIT_FAILURE;
}
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
if (!queue) {
    printf("Error: Failed to create a command commands\n");
    return EXIT_FAILURE;
}
// Create the compute program from the kernel source file
const char *fileName = "GOL-kernels.cl";
// Binary mode is required. In text mode the Windows runtime collapses CRLF
// pairs, so fewer than st_size bytes are delivered and the tail of the
// buffer would be handed to the compiler uninitialised.
FILE *fh = fopen(fileName, "rb");
if (!fh) {
    printf("Error: Failed to open file\n");
    return EXIT_FAILURE;
}
struct stat statbuf;
if (stat(fileName, &statbuf) != 0) {
    printf("Error: Failed to open file\n");
    fclose(fh);
    return EXIT_FAILURE;
}
char *kernelSource = (char *)malloc(statbuf.st_size + 1);
if (!kernelSource) {
    printf("Error: Failed to open file\n");
    fclose(fh);
    return EXIT_FAILURE;
}
// Terminate at the number of bytes actually read, not at the expected size.
size_t bytesRead = fread(kernelSource, 1, statbuf.st_size, fh);
fclose(fh);
kernelSource[bytesRead] = '\0';
program = clCreateProgramWithSource(context, 1,
    (const char **)& kernelSource, NULL, &err);
// The runtime copies the source into the program object, so the host copy
// can be released immediately.
free(kernelSource);
if (!program) {
    printf("Error: Failed to create compute program\n");
    return EXIT_FAILURE;
}
// Build the program executable
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
    printf("Error: Failed to build program executable %d\n", err);
    // Print the compiler output; the error code alone does not say what is
    // wrong with the kernel source.
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char *buildLog = (char *)malloc(logSize + 1);
    if (buildLog) {
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logSize, buildLog, NULL);
        buildLog[logSize] = '\0';
        printf("Build log:\n%s\n", buildLog);
        free(buildLog);
    }
    system("pause");
    return EXIT_FAILURE;
}
Most likely the kernel build is failing. Check what's in the log:
// Build the program
ret = clBuildProgram(program, 1, &device_id, "-I. -Werror", NULL, NULL);
size_t len = 0;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, NULL, NULL, &len);
char *log = new char[len];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, log, NULL);
printf("\n\nBuildlog: %s\n\n", log);
It is also a good idea to turn all warnings into errors with -Werror. It saves a lot of time later, when you may be wondering why the kernel is not returning the correct results.

Getting low Host-Device transfer rate with NVIDIA Quadro M4000

I am doing OpenCL on an NVIDIA Quadro M4000 installed on PCIe 3x16. On the card documentation, it is stated that the transfer rate CPU->GPU can go up to 15.7Gb/s while on my benchmark it is yielding only ~2.4Gb/s. I know that effective transfer rate can significantly differ from theoretical one but I wasn't expecting the difference to be that much.
Does anyone have any experience with CPU->GPU data transfer on the Quadro?
Thanks
#include<iostream>
#include<cstdlib>
#include<cstdio>
#include<string>
#include<cmath>
#include<CL/cl.h>
#include <Windows.h>
using namespace std;
SYSTEMTIME last_call;
cl_platform_id platform_id = NULL;
cl_uint ret_num_platform;
cl_device_id device_id = NULL;
cl_uint ret_num_device;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_int err;
void _profile(char* msg){
SYSTEMTIME tmp;
clFinish(command_queue);
GetSystemTime(&tmp);
printf("__Profile --- %s --- : %d : %d : %d\n", msg, (tmp.wMinute - last_call.wMinute),
(tmp.wSecond - last_call.wSecond),
(tmp.wMilliseconds - last_call.wMilliseconds));
last_call = tmp;
}
int main()
{
// Reading Kernel Program
char *kernel_src_std = "__kernel void copy(__global const uchar *x, __global uchar *z){\
const int id = get_global_id(0);\
z[id] = x[id]; \
}";
size_t kernel_src_size = strlen(kernel_src_std);
// Create Input data
int w = 1920;
int h = 1080;
int c = 3;
float* input = (float*)malloc(w * h * c * sizeof(float));
for(int i=0;i<w*h*c;i++)
input[i] = (float)rand()/RAND_MAX;
// getting platform ID
err = clGetPlatformIDs(1, &platform_id, &ret_num_platform);
// Get Device ID
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_device );
// Create Context
context = clCreateContext(NULL,1,&device_id,NULL,NULL,&err);
// Create Command Queue
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create buffer Object
cl_mem buf_in = clCreateBuffer(context,CL_MEM_READ_ONLY, sizeof(float) * w*h*c,
0, &err);
cl_mem buf_out = clCreateBuffer(context,CL_MEM_WRITE_ONLY, sizeof(float) * w*h*c,
0, &err);
_profile("Start transfer input...");
// Copy Data from Host to Device
cl_event event[5];
err = clEnqueueWriteBuffer(command_queue,buf_in,CL_TRUE, 0, sizeof(float)*w*h*c,input,0,NULL, NULL);
_profile("End transfer input...");
// Create and Build Program
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_std, 0, &err);
// Create Kernel
kernel = clCreateKernel(program,"copy",&err );
// Set Kernel Arguments
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
err = clSetKernelArg(kernel, 1,sizeof(cl_mem), (void *)&buf_out);
// Execute Kernel
size_t ws[]={h*w*c};
size_t lws[]={100};
err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, ws, lws, 0, NULL, NULL);
// Create output buf
float* output = (float*)malloc(sizeof(float)*w*h*c);
// Read output Data, from Device to Host
err = clEnqueueReadBuffer(command_queue, buf_out, CL_TRUE, 0, sizeof(float)*w*h*c, output,NULL,NULL,NULL);
//Release Objects
clReleaseMemObject(buf_in);
clReleaseMemObject(buf_out);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
free(input);
free(output);
while(1);
return(0);
}
As your question is vague it is hard to pinpoint the exact reason for your poor performance. Some concrete code might help.
However, in your comments you say that you transfer an array of 6220800 floats. That is about 200 megabits to transfer. At maximum transfer rate (15.7Gb/s) that should give about 12ms.
However, with every new transfer request there is also a latency that is added, which --- for small transfers --- can effectively degrade your transfer rate.
Have you tried benchmarking on significantly bigger arrays (say, 100x the size)?
You're using blocking transfers which means you're incurring a stall on the read/write requests (additionally you're not using pinned memory, but you addressed that). At the moment, your code goes
Begin timing -> Write -> stall -> kernel -> read -> stall -> end timing. This will drastically affect the timings for your memory bandwidth transfer if your transfer scale is on the order of 2ms, as the stalls are comparable in size to this. You'll need to eliminate these stalls if you want to measure the bandwidth accurately

Unable to find device in opencl using beignet

I am trying to run opencl using beignet
https://askubuntu.com/questions/412009/open-cl-in-intel
My system configuration is
Intel HD Graphics 5500
NVIDIA GeForce 830M (2 GB DDR3 dedicated)
When I run the following code:
// HelloWorld.cpp
//
// This is a simple example that demonstrates basic OpenCL setup and
// use.
#include <iostream>
#include <fstream>
#include <sstream>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
///
// Constants
//
// Element count of each host array (a, b, result) and of each device buffer;
// also used as the global work size in main().
const int ARRAY_SIZE = 1000;
///
// Builds an OpenCL context on the first platform reported by the runtime.
// A GPU context is tried first; if that fails a CPU context is tried.
// Returns NULL (after printing a diagnostic) when no platform is found or
// when neither context type can be created.
//
cl_context CreateContext()
{
    cl_uint platformCount;
    cl_platform_id chosenPlatform;

    // Only the first platform is requested.
    cl_int status = clGetPlatformIDs(1, &chosenPlatform, &platformCount);
    if (status != CL_SUCCESS || platformCount == 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }

    cl_context_properties props[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties)chosenPlatform,
        0
    };

    // Preferred device type: GPU.
    cl_context ctx = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,
                                             NULL, NULL, &status);
    if (status == CL_SUCCESS)
        return ctx;

    // Fallback device type: CPU.
    std::cout << "Could not create GPU context, trying CPU..." << std::endl;
    ctx = clCreateContextFromType(props, CL_DEVICE_TYPE_CPU,
                                  NULL, NULL, &status);
    if (status == CL_SUCCESS)
        return ctx;

    std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
    return NULL;
}
///
// Creates a command queue on the first device of `context` and stores that
// device in *device. Returns NULL (with a message on stderr) if the device
// list cannot be queried, is empty, or the queue cannot be created; *device
// is left untouched in those cases.
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
    // Size of the device list in bytes, filled in by the first query.
    size_t listBytes = -1;
    cl_int status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &listBytes);
    if (status != CL_SUCCESS)
    {
        std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
        return NULL;
    }
    if (listBytes == 0)
    {
        std::cerr << "No devices available.";
        return NULL;
    }

    // Second query fetches the device ids themselves.
    cl_device_id *deviceList = new cl_device_id[listBytes / sizeof(cl_device_id)];
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, listBytes, deviceList, NULL);
    if (status != CL_SUCCESS)
    {
        delete [] deviceList;
        std::cerr << "Failed to get device IDs";
        return NULL;
    }

    // Only the first device is used, so the list can be dropped right away.
    const cl_device_id firstDevice = deviceList[0];
    delete [] deviceList;

    cl_command_queue queue = clCreateCommandQueue(context, firstDevice, 0, NULL);
    if (queue == NULL)
    {
        std::cerr << "Failed to create commandQueue for device 0";
        return NULL;
    }

    *device = firstDevice;
    return queue;
}
///
// Reads the kernel source in `fileName`, wraps it in a program object and
// builds it for every device of `context`. On a build failure the log for
// `device` is written to stderr and the program is released. Returns the
// built program, or NULL on any failure.
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
    std::ifstream sourceFile(fileName, std::ios::in);
    if (!sourceFile.is_open())
    {
        std::cerr << "Failed to open file for reading: " << fileName << std::endl;
        return NULL;
    }

    // Slurp the whole file into a single string.
    std::ostringstream contents;
    contents << sourceFile.rdbuf();
    const std::string sourceText = contents.str();
    const char *sourcePtr = sourceText.c_str();

    cl_program prog = clCreateProgramWithSource(context, 1,
                                                (const char**)&sourcePtr,
                                                NULL, NULL);
    if (prog == NULL)
    {
        std::cerr << "Failed to create CL program from source." << std::endl;
        return NULL;
    }

    const cl_int buildStatus = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
    if (buildStatus == CL_SUCCESS)
        return prog;

    // Determine the reason for the error
    char buildLog[16384];
    clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG,
                          sizeof(buildLog), buildLog, NULL);
    std::cerr << "Error in kernel: " << std::endl;
    std::cerr << buildLog;
    clReleaseProgram(prog);
    return NULL;
}
///
// Creates the three device buffers, each ARRAY_SIZE floats long:
//   memObjects[0] - read-only, initialised from host array a
//   memObjects[1] - read-only, initialised from host array b
//   memObjects[2] - read-write, uninitialised
// Returns false (after a message on stderr) if any buffer is NULL.
//
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
                      float *a, float *b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;
    const cl_mem_flags inputFlags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;

    memObjects[0] = clCreateBuffer(context, inputFlags, bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context, inputFlags, bufferBytes, b, NULL);
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, bufferBytes, NULL, NULL);

    const bool allCreated = memObjects[0] != NULL
                         && memObjects[1] != NULL
                         && memObjects[2] != NULL;
    if (!allCreated)
    {
        std::cerr << "Error creating memory objects." << std::endl;
        return false;
    }
    return true;
}
///
// Releases whichever of the given OpenCL objects are non-zero: the three
// memory objects first, then the queue, kernel, program and context.
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
             cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
    int slot = 0;
    while (slot < 3)
    {
        if (memObjects[slot] != 0)
        {
            clReleaseMemObject(memObjects[slot]);
        }
        ++slot;
    }

    if (commandQueue != 0) { clReleaseCommandQueue(commandQueue); }
    if (kernel != 0)       { clReleaseKernel(kernel); }
    if (program != 0)      { clReleaseProgram(program); }
    if (context != 0)      { clReleaseContext(context); }
}
///
// main() for HelloWorld example
//
// Sets up context, queue, program and kernel, fills two input arrays
// (a[i] = i, b[i] = 2*i), runs "hello_kernel" over ARRAY_SIZE work-items,
// reads memObjects[2] back into `result` and prints it.
// Returns 0 on success and 1 on any failure; every failure after the
// context exists goes through Cleanup() first.
//
int main(int argc, char** argv)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = { 0, 0, 0 };
cl_int errNum;
// Create an OpenCL context on first available platform
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
}
// Create a command-queue on the first device available
// on the created context
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL program from HelloWorld.cl kernel source
program = CreateProgram(context, device, "HelloWorld.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "hello_kernel", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create memory objects that will be used as arguments to
// kernel. First create host memory arrays that will be
// used to store the arguments to the kernel
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
{
a[i] = (float)i;
b[i] = (float)(i * 2);
}
if (!CreateMemObjects(context, memObjects, a, b))
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Set the kernel arguments in buffer order: argument 0 is the buffer
// created from a, argument 1 the buffer created from b, and argument 2
// the read-write buffer that is read back as the result below.
// The three status codes are OR-ed so one check covers all calls.
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// One work-item per array element, in work-groups of a single work-item.
size_t globalWorkSize[1] = { ARRAY_SIZE };
size_t localWorkSize[1] = { 1 };
// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Read the output buffer back to the Host (CL_TRUE makes the read blocking)
errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
0, ARRAY_SIZE * sizeof(float), result,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Output the result buffer
for (int i = 0; i < ARRAY_SIZE; i++)
{
std::cout << result[i] << " ";
}
std::cout << std::endl;
std::cout << "Executed program succesfully." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 0;
}
I always get the output:
Number of available platforms: 1
Platform names:
[0] Experiment Intel Gen OCL Driver [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 0
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 0
*** Detailed information for each device ***
I tried various opencl codes and none of them works properly.Why are the devices not being found and what is the solution?
What happens when you run clinfo utility?
You can get clinfo for your linux and then run it. It provides a list of every platform and devices found. If you can't get your device listed by clinfo, you won't have it listed by your program.
It looks like you have an Nvidia Optimus computer, which is unfortunate because Nvidia does not provide official support for Optimus on Linux. However, at least your Intel CPU should be recognized.
If you can't get it listed, you might lack the dri driver for your vendor (nvidia).

OpenCL create subdevices CL_DEVICE_PARTITION_FAILED

I'm stuck at getting clCreateSubDevices working, where CL_DEVICE_PARTITION_FAILED is always returned and I have no clue to solve this problem. I'm trying to create a subdevice with one core only. Here is the code, do you see anything wrong with it? Thanks!
Here are the function signatures:
clCreateSubDevices, clGetPlatformIDs, clGetDeviceIDs
// Pick the first GPU of the first platform and carve out a sub-device that
// owns a single compute unit.
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret != CL_SUCCESS) {
    fprintf(stderr, "failed to get platform id %d!\n", ret);
    return 1;
}
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1,
                      &device_id, &ret_num_devices);
if (ret != CL_SUCCESS || device_id == NULL) {
    fprintf(stderr, "failed to get device id!");
    return 1;
}
// The property list needs TWO terminators: BY_COUNTS_LIST_END closes the
// list of counts, and a final 0 closes the property list itself. Without
// the trailing 0 the runtime reads past the end of the array.
// NOTE(review): the device must also support partitioning by counts —
// check CL_DEVICE_PARTITION_PROPERTIES and CL_DEVICE_PARTITION_MAX_SUB_DEVICES.
const cl_device_partition_property properties[] = {
    CL_DEVICE_PARTITION_BY_COUNTS,
    1, // Use only one compute unit
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
    0
};
cl_device_id subdevice_id;
cl_int error = clCreateSubDevices(device_id, properties, 1, &subdevice_id, NULL);
if (error != CL_SUCCESS) {
    fprintf(stderr, "failed to create sub device %d!\n", error);
    return 1;
}

How to display an image inside kernel using opencl?

I am new to opencl. The task is:
Load an pre-existing image
Write Host code using opencl to send the image ptr to kernel
Calculate hsl threshold of the loaded image inside kernel
Display the threshold or binary image
I've used OpenCV to load a pre-existing 2D image in my program, and I used OpenCL buffer objects to allocate memory and send the image pointer to the kernel. After kernel execution, I need clEnqueueReadBuffer in order to get the calculated image back from the kernel. Then I use OpenCV to display the image from the host. I've attached the code below.
As this takes more time on GPU and CPU I thought to switch over to image memory.
But I like to know whether usage of images also need clenqueueReadImage to copy image from kernel to host or do we any way to display the threshold image in kernel itself?
//My code using opencl buffers
// Input image, loaded while the globals are initialised. cvLoadImage returns
// NULL when the file cannot be read, so the dimensions fall back to 0
// instead of dereferencing a null pointer before main() even starts.
IplImage *src = cvLoadImage("../Input/im2.png",CV_LOAD_IMAGE_COLOR );
int a = src ? src->height : 0;  // image height in pixels
int b = src ? src->width : 0;   // image width in pixels
// Creates a context on the first OpenCL platform, preferring a GPU device
// and falling back to a CPU device. Prints a diagnostic and returns NULL if
// no platform exists or neither device type yields a context.
cl_context CreateContext()
{
    cl_platform_id platformId;
    cl_uint platformsFound;
    cl_int rc = clGetPlatformIDs(1, &platformId, &platformsFound);
    if (rc != CL_SUCCESS || platformsFound == 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }

    cl_context_properties ctxProps[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platformId,
        0
    };

    cl_context result = clCreateContextFromType(ctxProps, CL_DEVICE_TYPE_GPU,
                                                NULL, NULL, &rc);
    if (rc == CL_SUCCESS)
    {
        return result;
    }

    std::cout << "Could not create GPU context, trying CPU..." << std::endl;
    result = clCreateContextFromType(ctxProps, CL_DEVICE_TYPE_CPU, NULL, NULL, &rc);
    if (rc == CL_SUCCESS)
    {
        return result;
    }

    std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
    return NULL;
}
// Creates a profiling-enabled command queue on the first device of
// `context` and writes that device to *device. Returns NULL, with a message
// on stderr, if the device list cannot be obtained or the queue cannot be
// created; *device is not written in that case.
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
    size_t bytesNeeded = -1;
    cl_int rc = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &bytesNeeded);
    if (rc != CL_SUCCESS)
    {
        std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
        return NULL;
    }
    if (bytesNeeded == 0)
    {
        std::cerr << "No devices available.";
        return NULL;
    }

    cl_device_id *ids = new cl_device_id[bytesNeeded / sizeof(cl_device_id)];
    rc = clGetContextInfo(context, CL_CONTEXT_DEVICES, bytesNeeded, ids, NULL);
    if (rc != CL_SUCCESS)
    {
        delete [] ids;
        std::cerr << "Failed to get device IDs";
        return NULL;
    }

    // Keep the first id and release the list before going on.
    const cl_device_id chosen = ids[0];
    delete [] ids;

    // Profiling is enabled so kernel timings can be read from events.
    cl_command_queue queue = clCreateCommandQueue(context, chosen, CL_QUEUE_PROFILING_ENABLE, &rc);
    if (queue == NULL)
    {
        std::cerr << "Failed to create commandQueue for device 0";
        return NULL;
    }

    *device = chosen;
    return queue;
}
// Loads the OpenCL source in `fileName`, creates a program from it and
// builds it. If the build fails, the build log for `device` goes to stderr
// and the program is released. Returns the program, or NULL on failure.
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
    std::ifstream input(fileName, std::ios::in);
    if (!input.is_open())
    {
        std::cerr << "Failed to open file for reading: " << fileName << std::endl;
        return NULL;
    }

    // Entire file -> one std::string.
    std::ostringstream buffer;
    buffer << input.rdbuf();
    const std::string text = buffer.str();
    const char *textPtr = text.c_str();

    cl_program built = clCreateProgramWithSource(context, 1,
                                                 (const char**)&textPtr,
                                                 NULL, NULL);
    if (built == NULL)
    {
        std::cerr << "Failed to create CL program from source." << std::endl;
        return NULL;
    }

    if (clBuildProgram(built, 0, NULL, NULL, NULL, NULL) == CL_SUCCESS)
    {
        return built;
    }

    // Build failed: show the compiler output, then discard the program.
    char buildLog[16384];
    clGetProgramBuildInfo(built, device, CL_PROGRAM_BUILD_LOG,
                          sizeof(buildLog), buildLog, NULL);
    std::cerr << "Error in kernel: " << std::endl;
    std::cerr << buildLog;
    clReleaseProgram(built);
    return NULL;
}
// Creates the two device buffers for the threshold kernel:
//   memObjects[0] - read-only, a*b*3 bytes copied from src_ptr
//   memObjects[1] - read-write, a*b bytes, uninitialised
// Returns false (after a message on stderr) if either buffer is NULL.
bool CreateMemObjects(cl_context context, cl_mem memObjects[2], unsigned char *src_ptr)
{
    const size_t inputBytes = sizeof(unsigned char) * (a*b*3);
    const size_t outputBytes = sizeof(unsigned char) * (a*b);

    memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   inputBytes, src_ptr, NULL);
    memObjects[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   outputBytes, NULL, NULL);

    const bool bothCreated = (memObjects[0] != NULL) && (memObjects[1] != NULL);
    if (!bothCreated)
    {
        std::cerr << "Error creating memory objects" << std::endl;
        return false;
    }
    return true;
}
// Releases the non-zero OpenCL objects passed in: both memory objects first,
// then the queue, kernel, program and context.
void Cleanup(cl_context context, cl_command_queue commandQueue, cl_program program, cl_kernel kernel, cl_mem memObjects[2])
{
    for (cl_mem *buffer = memObjects; buffer != memObjects + 2; ++buffer)
    {
        if (*buffer != 0)
        {
            clReleaseMemObject(*buffer);
        }
    }

    if (commandQueue != 0) { clReleaseCommandQueue(commandQueue); }
    if (kernel != 0)       { clReleaseKernel(kernel); }
    if (program != 0)      { clReleaseProgram(program); }
    if (context != 0)      { clReleaseContext(context); }
}
// Runs the HSL_threshold kernel on the globally loaded image `src`
// (a rows x b columns, 3 bytes per pixel), reports the kernel time taken
// from event profiling, then shows and saves the single-channel result.
// Returns 0 on success and 1 on any failure.
int main()
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem memObjects[2] = { 0,0 };
    cl_int errNum;
    cl_event myEvent;
    cl_ulong start_time,end_time;
    double kernelExecTimeNs;

    if (src == NULL)
    {
        std::cerr << "Failed to load input image" << std::endl;
        return 1;
    }

    IplImage *thres_img1 = cvCreateImage(cvGetSize(src), IPL_DEPTH_8U, 1);
    // Tightly packed staging buffers: one byte per pixel out, three in.
    unsigned char *tur_image1 = (unsigned char*) malloc((a*b) * sizeof(unsigned char));
    unsigned char *src_ptr = (unsigned char*) malloc ((a*b*3) * sizeof(unsigned char));
    if (tur_image1 == NULL || src_ptr == NULL)
    {
        std::cerr << "Failed to allocate host image buffers" << std::endl;
        return 1;
    }

    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." <<std::endl;
        return 1;
    }
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }
    program = CreateProgram(context, device, "hsl_threshold.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }
    kernel = clCreateKernel(program, "HSL_threshold", NULL);
    if (kernel == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    printf("height:%d\n",a);//image height
    printf("width:%d\n",b);//image width
    cvShowImage("color image",src);
    cvWaitKey(0);

    // Copy row by row: IplImage rows are widthStep bytes apart, which can
    // exceed width*3 because of row padding. A single memcpy of a*b*3 bytes
    // would then shear the image.
    for (int row = 0; row < a; ++row)
    {
        memcpy(src_ptr + (size_t)row * b * 3,
               src->imageData + (size_t)row * src->widthStep,
               (size_t)b * 3);
    }

    if (!CreateMemObjects(context, memObjects, src_ptr))
    {
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments" << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }
    std::cout<<"Kernel arguments set successfully";

    size_t globalWorkSize[1]={ (size_t)a * (size_t)b };
    size_t localWorkSize[1]={512};
    // An explicit local size must divide the global size. When it does not,
    // let the runtime choose (NULL) rather than fail the launch.
    const size_t *localSize =
        (globalWorkSize[0] % localWorkSize[0] == 0) ? localWorkSize : NULL;
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localSize, 0, NULL, &myEvent);
    // Check the enqueue status BEFORE touching the event: myEvent is only
    // valid when the enqueue succeeded.
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }
    clWaitForEvents(1,&myEvent);
    clFinish(commandQueue);

    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(start_time), &start_time, NULL);
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(end_time), &end_time, NULL);
    clReleaseEvent(myEvent);
    kernelExecTimeNs = end_time-start_time;
    printf("\nExecution time in milliseconds = %0.3f ms\n",( kernelExecTimeNs / 1000000.0) );
    // The profiling counters are in nanoseconds, so label the raw value so.
    std::cout<<"\n Kernel timings \n"<<kernelExecTimeNs<<"nanoseconds";

    errNum = clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE,
                                 0, (a*b) * sizeof(unsigned char), tur_image1,
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error reading result buffer." << std::endl;
        Cleanup(context, commandQueue, program, kernel, memObjects);
        return 1;
    }

    // Same row-stride handling for the single-channel output image.
    for (int row = 0; row < a; ++row)
    {
        memcpy(thres_img1->imageData + (size_t)row * thres_img1->widthStep,
               tur_image1 + (size_t)row * b,
               (size_t)b);
    }
    cvShowImage( "hsl_thresh",thres_img1);
    cvSaveImage( "../Output/hsl_threshold.png",thres_img1);
    cvWaitKey(0);
    std::cout<<std::endl;
    std::cout<<"Image displayed Successfully"<<std::endl;

    Cleanup(context,commandQueue,program,kernel,memObjects);
    free(tur_image1);
    free(src_ptr);
    cvReleaseImage(&thres_img1);
    printf("\n Free opencl resources");
    std::cin.get();
    return 0;
}
There are ways to directly process data calculated by OpenCL via OpenGL. Your OCL implementation must support the extension cl_khr_gl_sharing.
This mode is called CL/GL-Interop Mode.
If you create an OpenGL-instance first and initialise OpenCL with the pointers to your GL-instance, it is possible for each implementation to access each others data.
(All snippets are taken from code using CL-C++-Bindings, I guess it is okay for the general understanding)
// Context properties: pairs of (property name, value), closed by a single 0.
cl_context_properties properties[] =
// Take this line to create an OCL context in GL-CL-interop-mode.
// OpenGL must already be initialised.
// For interop init see: http://www.khronos.org/registry/cl/extensions/khr/cl_khr_gl_sharing.txt
// USING: CL_GL_CONTEXT_KHR: Rendering Context [Use your OGL-HGLRC variable or do wglGetCurrentContext(); ]
// AND: CL_WGL_HDC_KHR: Device Context [Use your OGL-HDC variable or do wglGetCurrentDC(); ]
{
// Platform handle taken from the first entry of _platforms.
CL_CONTEXT_PLATFORM, (cl_context_properties)(_platforms->at(0))(),
CL_GL_CONTEXT_KHR, (cl_context_properties)myGL->hRC,
// The trailing 0 terminates the list.
CL_WGL_HDC_KHR, (cl_context_properties)myGL->hDC, 0
};
Now you can create OCL-images based on OGL textures
//The following data can be accessed both from OCL and OGL
// Constructed as a value: `new` would produce a pointer, which cannot
// initialise a cl::Image2D object.
cl::Image2D imageFromGL = cl::Image2DGL(*_context, CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, myGL->textures[0]);
Before using the memory in OCL, you have to ask OGL to release it
//Ask OGL to release memory. All OGL actions must be finished before doing so!
_queue->enqueueAcquireGLObjects(&imageFromGL, NULL, &evt);
Now, do what you want, then give it back to OGL:
//Hand memory back to OGL. All OCL actions must be finished before doing so!
_queue->enqueueReleaseGLObjects(&imageFromGL, NULL, &evt);
And finally you can use OpenGL code to display the data on the screen.

Resources