Accessing platforms and devices in AMD OpenCL platform - opencl

I'm trying to run my first codes in OpenCL. I wrote the following program to detect the available platforms and devices in my computer. The code works fine in my computer which has Intel CPU and NVIDIA GPU. It detects every platform and device correctly. But, when I run it on a computer with AMD-FX 770K and Radeon R7 240, the output is as the figure shown below. At least, it must show the Radeon R7 240 GPU as a device in this platform, but it doesn't. Any idea why the code is not working properly on AMD platform?
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
free(plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j, devi_name_data);
free(devi_name_data);
}
}
return 0;
}

Please set num_entries to "0" in both of below calls:
clGetPlatformIDs(0 /* num_entries */_, NULL, &num_platforms);
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0 /* num_entries */, NULL, &num_devices);

Related

Getting nan values from OpenCL FFT kernel on FPGA

I was trying to use the Intel's FFT1D kernel by writing the Host program by my own for Intel FPGA. Link to Intel's FFT1d can be found here
I have also given my host program below, wherein, I have a file saved (which contains some data), my task is to read that data, calculate its FFT and print some of it. It is a 4K point FFT
#include <stdio.h>
#include <stdlib.h>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#include <string.h>
#include "fft_config.h"
#define N (1<<LOGN) //Please check the FFT Sample Code for Ref (2 to the power 12 gives 4K points)
#define DATA_FILE "complex_input.data"
using namespace aocl_utils;
cl_platform_id platform = NULL;
cl_device_id device = NULL;
cl_command_queue queue0 = NULL;
cl_command_queue queue1 = NULL;
cl_context context = NULL;
cl_program program = NULL;
cl_kernel kernel0, kernel1;
cl_mem d_inData, d_outData;
cl_int err = 0;
typedef struct {
float x;
float y;
} float2;
//float2 h_outData[N], h_inData[N];
float2 *h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
float2 *h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
void init(); //Function that does the job of Querying Platform and Device, creating Context, Command Queues, Program and required Kernels to do the job.
void cleanup(); //Function that releases all the Created Contexts, Buffers etc, in order to finish the execution.
void read_data(); //Reads data from the complex numbers from .data file and fills in the float2 struct h_inData[].
int temp_value = 1;
int main()
{
// h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
//h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
int inverse = false;
int temp =1;
init();
read_data();
d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float2)*N, NULL, &err);
checkError(err,"Failed to allocate Buffer for input array\n");
d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(float2)*N, NULL, &err);
checkError(err, "Failed to allocate the Buffer for output\n");
//WE FINISH THE FETCH KERNEL
err = clEnqueueWriteBuffer(queue1,d_inData, CL_TRUE, 0, sizeof(float2)*N, h_inData, 0, NULL, NULL);
checkError(err,"Failed to Write the input Buffer\n");
err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&d_inData);
checkError(err, "Failed to set KerArg for Kernel1 - 0\n");
err = clSetKernelArg(kernel0, 0, sizeof(cl_mem), (void *)&d_outData);
checkError(err, "Failed to set KerArg for Kernel0 - 0\n");
err = clSetKernelArg(kernel0, 1, sizeof(cl_int), (void *)&temp_value);
checkError(err, "Failed to set KerArg for Kernel0 - 1\n");
err = clSetKernelArg(kernel0, 2, sizeof(cl_int), (void *)&inverse);
checkError(err, "Failed to set KerArg for Kernel0 - 2\n");
printf("FFT Initialization Complete!\n\n");
err = clEnqueueTask(queue0, kernel0, 0, NULL, NULL);
checkError(err, "Failed to Launch the Kernel for FFT\n");
size_t local_work_size = N/8;
size_t global_work_size = local_work_size * 1; //Coz the number of Iterations is just 1
err = clEnqueueNDRangeKernel(queue1, kernel1, 1, NULL, &local_work_size, &global_work_size, 0, NULL, NULL);
checkError(err, "Failed to launch the Fetch Kernel\n");
err = clFinish(queue0);
checkError(err, "Failed to finish FFT\n");
err = clFinish(queue1);
checkError(err, "Failed to finish Fetch kernel\n");
err = clEnqueueReadBuffer(queue0, d_outData, CL_TRUE, 0, sizeof(float2)*N, h_outData, 0, NULL, NULL);
checkError(err, "Failed to Read back the Buffer output\n");
printf("FFT is Complete!\n\n");
printf("Printing some of the values, just to make sure they are non-zero\n\n");
for(int ii=100;ii<125;ii++)
{
printf("%f + %f j -> %f + %f j\n",h_inData[ii].x,h_inData[ii].y,h_outData[ii].x,h_outData[ii].y);
}
printf("\n\n");
cleanup();
return 0;
}
void read_data()
{
size_t sourceSize;
float* temp;
FILE *fp = fopen(DATA_FILE,"r");
if(fp==NULL)
{
printf("Could not find the Random Data File! Exiting!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
sourceSize=ftell(fp);
rewind(fp);
temp = (float *)alignedMalloc(sourceSize);
fread(temp, sizeof(float),sourceSize,fp);
fclose(fp);
for(int i=0;i<N;i++)
{
h_inData[i].x = temp[2*i];
h_inData[i].y = temp[(2*i)+1];
}
}
void init()
{
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
if(platform == NULL)
{
printf("Could not find the Platform\n");
exit(1);
}
scoped_array<cl_device_id> devices;
cl_uint num_devices;
devices.reset(getDevices(platform, CL_DEVICE_TYPE_ACCELERATOR, &num_devices));
device = devices[0];
context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &err);
checkError(err, "Failed to create Context\n");
queue0 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue0\n");
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue1\n");
program = createProgramFromBinary(context, "bin/fft1d.aocx", &device, 1);
err = clBuildProgram(program, 1, &device, "", NULL, NULL);
checkError(err, "Failed to Build Program\n");
kernel0 = clCreateKernel(program, "fft1d", &err);
checkError(err,"Could not Create Kernel0\n");
kernel1 = clCreateKernel(program, "fetch", &err);
checkError(err, "Could not Create Kernel1\n");
printf("Finished with the Initial Setup!\n");
}
void cleanup()
{
if(kernel0)
clReleaseKernel(kernel0);
if(kernel1)
clReleaseKernel(kernel1);
if(program)
clReleaseProgram(program);
if(queue0)
clReleaseCommandQueue(queue0);
if(queue1)
clReleaseCommandQueue(queue1);
if(d_inData)
clReleaseMemObject(d_inData);
if(d_outData)
clReleaseMemObject(d_outData);
if(context)
clReleaseContext(context);
}
I checked if the data from file is being read fine, and It is correct and as expected.
Please let me know where could this go wrong!
Update!
I found out the solution. Reading from the itself was not a good idea, here. I tried generating the random there during the execution and it worked just fine!

Unable to find device in opencl using beignet

I am trying to run opencl using beignet
https://askubuntu.com/questions/412009/open-cl-in-intel
My system configuration is
Intel HD Graphics 5500
NVIDIA GeForce 830M (2 GB DDR3 dedicated)
When I run the following code:
// HelloWorld.cpp
//
// This is a simple example that demonstrates basic OpenCL setup and
// use.
#include <iostream>
#include <fstream>
#include <sstream>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
///
// Constants
//
const int ARRAY_SIZE = 1000;
///
// Create an OpenCL context on the first available platform using
// either a GPU or CPU depending on what is available.
//
cl_context CreateContext()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_context context = NULL;
// First, select an OpenCL platform to run on. For this example, we
// simply choose the first available platform. Normally, you would
// query for all available platforms and select the most appropriate one.
errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
std::cerr << "Failed to find any OpenCL platforms." << std::endl;
return NULL;
}
// Next, create an OpenCL context on the platform. Attempt to
// create a GPU-based context, and if that fails, try to create
// a CPU-based context.
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cout << "Could not create GPU context, trying CPU..." << std::endl;
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
return NULL;
}
}
return context;
}
///
// Create a command queue on the first device available on the
// context
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int errNum;
cl_device_id *devices;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
// First get the size of the devices buffer
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
return NULL;
}
if (deviceBufferSize <= 0)
{
std::cerr << "No devices available.";
return NULL;
}
// Allocate memory for the devices buffer
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
delete [] devices;
std::cerr << "Failed to get device IDs";
return NULL;
}
// In this example, we just choose the first available device. In a
// real program, you would likely use all available devices or choose
// the highest performance device based on OpenCL device queries
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
if (commandQueue == NULL)
{
delete [] devices;
std::cerr << "Failed to create commandQueue for device 0";
return NULL;
}
*device = devices[0];
delete [] devices;
return commandQueue;
}
///
// Create an OpenCL program from the kernel source file
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
cl_int errNum;
cl_program program;
std::ifstream kernelFile(fileName, std::ios::in);
if (!kernelFile.is_open())
{
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource(context, 1,
(const char**)&srcStr,
NULL, NULL);
if (program == NULL)
{
std::cerr << "Failed to create CL program from source." << std::endl;
return NULL;
}
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
// Determine the reason for the error
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
clReleaseProgram(program);
return NULL;
}
return program;
}
///
// Create memory objects used as the arguments to the kernel
// The kernel takes three arguments: result (output), a (input),
// and b (input)
//
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
float *a, float *b)
{
memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * ARRAY_SIZE, a, NULL);
memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * ARRAY_SIZE, b, NULL);
memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * ARRAY_SIZE, NULL, NULL);
if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL)
{
std::cerr << "Error creating memory objects." << std::endl;
return false;
}
return true;
}
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
for (int i = 0; i < 3; i++)
{
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (context != 0)
clReleaseContext(context);
}
///
// main() for HelloWorld example
//
int main(int argc, char** argv)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = { 0, 0, 0 };
cl_int errNum;
// Create an OpenCL context on first available platform
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
}
// Create a command-queue on the first device available
// on the created context
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL program from HelloWorld.cl kernel source
program = CreateProgram(context, device, "HelloWorld.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "hello_kernel", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create memory objects that will be used as arguments to
// kernel. First create host memory arrays that will be
// used to store the arguments to the kernel
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
{
a[i] = (float)i;
b[i] = (float)(i * 2);
}
if (!CreateMemObjects(context, memObjects, a, b))
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Set the kernel arguments (result, a, b)
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
size_t globalWorkSize[1] = { ARRAY_SIZE };
size_t localWorkSize[1] = { 1 };
// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Read the output buffer back to the Host
errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
0, ARRAY_SIZE * sizeof(float), result,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Output the result buffer
for (int i = 0; i < ARRAY_SIZE; i++)
{
std::cout << result[i] << " ";
}
std::cout << std::endl;
std::cout << "Executed program succesfully." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 0;
}
I always get the output:
Number of available platforms: 1
Platform names:
[0] Experiment Intel Gen OCL Driver [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 0
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 0
*** Detailed information for each device ***
I tried various opencl codes and none of them works properly.Why are the devices not being found and what is the solution?
What happens when you run clinfo utility?
You can get clinfo for your linux and then run it. It provides a list of every platform and devices found. If you can't get your device listed by clinfo, you won't have it listed by your program.
It looks like you have a Nvidia Optimus computer, and it is very bad because Nvidia does not provide official support for Optimus on Linux. However, at least your Intel CPU shoule be recognized.
If you can't get it listed, you might lack the dri driver for your vendor (nvidia).

Variadic Macros in OpenCL: Where are they supported?

I have been trying to find information on the (non-standard, cf. 1.0, 1.1, 1.2, 2.0) support for variadic macros in OpenCL implementations.
I have access to the following platforms, all of which support variadic macros:
Mac OS X, Intel CPU, OpenCL 1.2, driver: 1.1
Mac OS X, Intel GPU, OpenCL 1.2, driver: 1.2(Dec 23 2014 00:18:31)
Mac OS X, ATI GPU, OpenCL 1.2, driver: 1.2 (Aug 17 2014 20:27:52)
Mac OS X, Nvidia GPU, OpenCL 1.2, driver: 10.2.7 310.41.25f01
Could other please check their available implementations so that we can have a map of implementations that supported variadic macros?
Edit: Here is a self-contained test program that makes uses of a variadic macro.
#include <stdlib.h>
#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
const char* SOURCE =
"#define KERNEL(name, ...) kernel void name(__VA_ARGS__) \n"
" \n"
"KERNEL(test, global float* input, global float* output) \n"
"{ \n"
" int i = get_global_id(0); \n"
" output[i] = input[i]; \n"
"} \n"
" \n"
;
static const int GPU = 1;
int main(int argc, char** argv)
{
int err;
cl_float input[16];
cl_float output[16];
size_t global = 16;
size_t local = 16;
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue command_queue;
cl_program program;
cl_kernel kernel;
cl_mem input_buf;
cl_mem output_buf;
err = clGetPlatformIDs(1, &platform_id, NULL);
if(err != CL_SUCCESS)
{
printf("error: clGetPlatformIDs\n");
return EXIT_FAILURE;
}
err = clGetDeviceIDs(platform_id, GPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if(err != CL_SUCCESS)
{
printf("error: clGetDeviceIDs\n");
return EXIT_FAILURE;
}
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(err != CL_SUCCESS)
{
printf("error: clCreateContext\n");
return EXIT_FAILURE;
}
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
if(err != CL_SUCCESS)
{
printf("error: clCreateCommandQueue\n");
return EXIT_FAILURE;
}
program = clCreateProgramWithSource(context, 1, &SOURCE, NULL, &err);
if(err != CL_SUCCESS)
{
printf("error: clCreateProgramWithSource\n");
return EXIT_FAILURE;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("error: clBuildProgram\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
return EXIT_FAILURE;
}
kernel = clCreateKernel(program, "test", &err);
if(err != CL_SUCCESS)
{
printf("error: clCreateKernel\n");
return EXIT_FAILURE;
}
input_buf = clCreateBuffer(context, CL_MEM_READ_ONLY, 16*sizeof(cl_float), NULL, NULL);
output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 16*sizeof(cl_float), NULL, NULL);
if(!input_buf || !output_buf)
{
printf("error: clCreateBuffer\n");
return EXIT_FAILURE;
}
err = clEnqueueWriteBuffer(command_queue, input_buf, CL_TRUE, 0, 16*sizeof(cl_float), input, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
printf("error: clEnqueueWriteBuffer\n");
return EXIT_FAILURE;
}
err = 0;
err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buf);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buf);
if(err != CL_SUCCESS)
{
printf("error: clSetKernelArg\n");
return EXIT_FAILURE;
}
err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
printf("error: clEnqueueNDRangeKernel\n");
return EXIT_FAILURE;
}
clFinish(command_queue);
err = clEnqueueReadBuffer(command_queue, output_buf, CL_TRUE, 0, 16*sizeof(cl_float), output, 0, NULL, NULL );
if(err != CL_SUCCESS)
{
printf("error: clEnqueueReadBuffer\n");
return EXIT_FAILURE;
}
clReleaseMemObject(input_buf);
clReleaseMemObject(output_buf);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
printf("success\n");
return EXIT_SUCCESS;
}
For posterity, I will answer this myself with a list of platforms that were tested for support of variadic macros. Future visitor, please feel free to add any platform you can test to this list.
Supported:
Mac OS X, Intel CPU, OpenCL 1.2, driver: 1.1
Mac OS X, Intel GPU, OpenCL 1.2, driver: 1.2(Dec 23 2014 00:18:31)
Mac OS X, ATI GPU, OpenCL 1.2, driver: 1.2 (Aug 17 2014 20:27:52)
Debian unstable, AMD GPU, OpenCL 2.0, driver: amdgpu-pro 16.15.2-277429
Mac OS X, Nvidia GPU, OpenCL 1.2, driver: 10.2.7 310.41.25f01
Redhat Enterprise Linux 5, Nvidia CUDA SDK 4.0, OpenCL 1.0 CUDA, driver: 260.19.26
Unsupported:
Debian unstable, Intel GPU, OpenCL 1.2, driver: Beignet 1.1.1
Unsupported:
Win10 x64 2004, Intel CPU & GPU, OpenCL 2.1
Win10 x64 2004, NVIDIA GPU, OpenCL 1.2 CUDA 11.0.140

OpenCL Memory Transfer Issue (Err. code -6)

I'm dying a little inside. I've been working on this all day to no avail. I was having issues running some code that previously ran fine, so I wrote a short "toy" OpenCL program to try and figure out what was going on, but my toy program has me baffled and incredibly frustrated.
I'm working with an Nvidia 780i with 3Gb of global memory. It has a maximum allocation of ~780 Mb. At first, it wouldn't error out when I was intentionally over-allocating. Solved that (it was typographical, but the compiler/analyzer wasn't catching it). Now, even when trying to allocate WAY below what the device should be able to handle, I get an error code of -6 (CL_OUT_OF_HOST_MEMORY) on the second big buffer allocation.
I've been researching this error and I just can't track how it applies in this case. I have 32 gb of ram in the machine I'm using so there's certainly not a shortage there. I figure there's something that I just don't understand going on here.
It will allocate the first buffer alright, but then choke on the second. I basically just cannot allocate nearly the amount of global memory I both want and need to.
Any help is much appreciated. If you can help me with this and you're near LA I'll take you out for drinks. That's how frustrated I am.
Code and output is below for my machine.
Thanks,
John
Main program:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "kernels.cl"
#define KERNEL_NAME "test"
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#define N_PROJ 4000
#define N_CHANNELS 736
#define N_ROWS 32
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
perror("Just a heads up: I'm not going to run on the GPU");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
cl_ulong16 alloc_size,mem_size;
char name[40];
clGetDeviceInfo(dev,CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong16),&alloc_size,NULL);
clGetDeviceInfo(dev,CL_DEVICE_NAME,sizeof(name),name,NULL);
clGetDeviceInfo(dev,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong16),&mem_size,NULL);
printf("Using device: %s\n",name);
printf("Global memory size: %lu\n",mem_size);
printf("Max. allocation: %lu\n",alloc_size);
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle)-13;
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main(int argc, const char * argv[])
{
/* This file serves as a backbone for OpenCL programs. */
/* All the user needs to do is enter their OpenCL data */
/* structures, set kernel args, and kernel dispatches. */
/* Standard OCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
device=create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
program = build_program(context, device, PROGRAM_FILE);
queue = clCreateCommandQueue(context, device,0, NULL);
kernel = clCreateKernel(program, KERNEL_NAME, NULL);
/* User code goes here */
cl_int err;
/* Declare and set data */
int a[]={1,2,3};
float *rebin;
rebin =(float*) calloc(N_PROJ*N_CHANNELS*N_ROWS,sizeof(float));
float *mat;
mat =(float*) calloc(N_PROJ*N_CHANNELS*N_ROWS,sizeof(float));
printf("\nAllocation size: %lu\n",N_PROJ*N_CHANNELS*N_ROWS*sizeof(float));
/* Declare and set buffer objects */
cl_mem a_buff,rebin_buff,mat_buff;
printf("Total memory to be allocated: %lu\n",2*N_PROJ*N_CHANNELS*N_ROWS*sizeof(float)+sizeof(a) );
a_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,sizeof(a),a,NULL);
rebin_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),rebin,&err);
if (err<0){
printf("Error: %i\n",err);
perror("Couldn't create buffer 1");
exit(1);
}
mat_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),mat ,&err);
if (err<0){
printf("Error: %i\n",err);
perror("Couldn't create buffer 2");
exit(1);
}
/* Copy data over to the device */
err=clSetKernelArg(kernel,0,sizeof(cl_mem),&mat_buff);
if (err<0){
perror("Couldn't set kernel argument");
exit(1);
}
err=clSetKernelArg(kernel,1,sizeof(cl_mem),&rebin_buff);
err=clSetKernelArg(kernel,2,sizeof(cl_mem),&a_buff);
clEnqueueTask(queue,kernel,0,NULL,NULL);
clEnqueueReadBuffer(queue,mat_buff ,CL_TRUE,0,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),mat ,0,NULL,NULL);
clEnqueueReadBuffer(queue,rebin_buff,CL_TRUE,0,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),rebin,0,NULL,NULL);
clEnqueueReadBuffer(queue,a_buff, CL_TRUE,0,sizeof(a),a,0,NULL,NULL);
printf("%f %f %f\n",mat[1],mat[2],mat[3]);
printf("%f %f %f\n",rebin[1],rebin[2],rebin[3]);
printf("%i %i %i",a[0],a[1],a[2]);
/***********************/
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
//clReleaseDevice(device);
printf("\n\nProgram apparently executed fully. \n");
return 0;
}
Kernel:
__kernel void test(__global float *mat,__global float *rebin,__global int *a){
a[0]=3;
a[1]=2;
a[2]=1;
rebin[1]=1.0f;
rebin[2]=2.0f;
rebin[3]=3.0f;
mat[1]=3.0f;
mat[2]=2.0f;
mat[3]=1.0f;
}
Console output for my machine:
Using device: GeForce GTX 780
Global memory size: 3221225472
Max. allocation: 805306368
Allocation size: 376832000
Total memory to be allocated: 753664012
Error: -6
Couldn't create buffer 2: No error
Process returned 1 (0x1) execution time : 0.435 s
Press any key to continue.

clGetDeviceIDs fails in OpenCL with error code -30

The output of the following program on my machine with ATI Firepro V8750 is as follows:
"Couldn't find any devices:No error"
(this happens at the call of first clGetDeviceIDs). the error code returned is -30. What does that mean?
I am not able to understand why it is unable to find the device. I have checked that CLinfo.exe lists my GPU along with the Intel CPU I am having. Can some one give my some pointers as to what is wrong here?
Additional info:
AMD APP SK 2.4
Firepro Driver: 8.911.3.3_VistaWin7_X32X64_135673
12-4_vista_win7_32_dd_ccc
Windows 7
Also I must mention that the firePro Driver's some componenets failed to get install.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
int main() {
/* Host/device data structures */
cl_platform_id platform;
cl_device_id *devices;
cl_uint num_devices, addr_data;
cl_int i, err;
/* Extension data */
char name_data[48], ext_data[4096];
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't find any platforms");
exit(1);
}
/* Determine number of connected devices */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, NULL, &num_devices);
if(err < 0) {
perror("Couldn't find any devices");
exit(1);
}
/* Access connected devices */
devices = (cl_device_id*)
malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU,
num_devices, devices, NULL);
/* Obtain data for each connected device */
for(i=0; i<num_devices; i++) {
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME,
sizeof(name_data), name_data, NULL);
if(err < 0) {
perror("Couldn't read extension data");
exit(1);
}
clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS,
sizeof(ext_data), &addr_data, NULL);
clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS,
sizeof(ext_data), ext_data, NULL);
printf("NAME: %s\nADDRESS_WIDTH: %u\nEXTENSIONS: %s",
name_data, addr_data, ext_data);
}
free(devices);
return 0;
}
Here is CLINFO output:
GPU:
CPU:
Why are the two highlighted versions different?
Could it be that you have multiple OpenCL platforms installed on your system? So, perhaps your first platform is a CPU-only playform, so the query for a GPU device fails.
EDIT:
Here's the problem: The first call to clGetDeviceIDs passes 1 for num_entries, but NULL for the devices pointer. I think you want to pass in 0 for num_entries.

Resources