OpenCL pinned memory vs Heap Memory - opencl

I have written a sample program to understand the effects of GPU/CPU pinned memory and Heap memory. The following code illustrates this. I have allocated three buffers of dimensions say 1280x720. I have filled buffers 1 & 2 with some data and in turn used these buffers to fill buffer 3. The mathematical operatio involved in filling buffer 3 is insignificant. In case 1, the memory allocated from these buffers are from heap (malloc call). In case 2, the memory for these buffers are allocated from OpenCL API calls (clCreateBuffer()). There is a performance difference between these 2 cases. I tested it on Intel integrated GPU's. I am unable to explain this difference in performance. Does it have some thing to do with cacheable properties of CPU/GPU pinned memory vs Heap memory.
Have you encountered such behavior before or am i doing something wrong?
#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#define OPENCL
#if defined(_WIN32)
/*
* Win32 specific includes
*/
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#else
#include <sys/time.h>
/* timersub is not provided by msys at this time. */
#ifndef timersub
#define timersub(a, b, result) \
do { \
(result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
(result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
if ((result)->tv_usec < 0) { \
--(result)->tv_sec; \
(result)->tv_usec += 1000000; \
} \
} while (0)
#endif
#endif
struct usec_timer {
#if defined(_WIN32)
LARGE_INTEGER begin, end;
#else
struct timeval begin, end;
#endif
};
static void usec_timer_start(struct usec_timer *t) {
#if defined(_WIN32)
QueryPerformanceCounter(&t->begin);
#else
gettimeofday(&t->begin, NULL);
#endif
}
static void usec_timer_mark(struct usec_timer *t) {
#if defined(_WIN32)
QueryPerformanceCounter(&t->end);
#else
gettimeofday(&t->end, NULL);
#endif
}
static int64_t usec_timer_elapsed(struct usec_timer *t) {
#if defined(_WIN32)
LARGE_INTEGER freq, diff;
diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
QueryPerformanceFrequency(&freq);
return diff.QuadPart * 1000000 / freq.QuadPart;
#else
struct timeval diff;
timersub(&t->end, &t->begin, &diff);
return diff.tv_sec * 1000000 + diff.tv_usec;
#endif
}
#ifdef OPENCL
#include ".\CL\cl.h"
int opencl_init(cl_context *context, cl_command_queue *cmd_queue) {
cl_int status;
cl_uint num_platforms = 0;
cl_platform_id platform;
cl_uint num_devices = 0;
cl_device_id device;
cl_command_queue_properties command_queue_properties = 0;
// Get the number of platforms in the system.
status = clGetPlatformIDs(0, NULL, &num_platforms);
if (status != CL_SUCCESS || num_platforms == 0)
goto fail;
// Get the platform ID for one platform
status = clGetPlatformIDs(1, &platform, NULL);
if (status != CL_SUCCESS)
goto fail;
// Get the number of devices available on the platform
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
if (status != CL_SUCCESS || num_devices == 0)
goto fail;
// Get the device ID for one device
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (status != CL_SUCCESS)
goto fail;
// Create OpenCL context for one device
*context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS || *context == NULL)
goto fail;
// Create command queues for the device
*cmd_queue = clCreateCommandQueue(*context, device, command_queue_properties, &status);
if (status != CL_SUCCESS || *cmd_queue == NULL)
goto fail;
return 0;
fail:
return 1;
}
#endif
int main(int argc, char **argv) {
int x, y, z;
int width = 1280, height = 720;
unsigned char *buffer[3];
int use_gpu;
cl_mem opencl_mem[3];
cl_context context;
cl_command_queue cmd_queue;
cl_int status;
if (argc != 2)
return 0;
use_gpu = atoi(argv[1]);
if (use_gpu) {
if (opencl_init(&context, &cmd_queue))
printf("OpenCL init failure");
}
if (use_gpu) {
for (x = 0; x < 3; x++) {
opencl_mem[x] = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
width * height * sizeof(*buffer[x]), NULL,
&status);
if (status != CL_SUCCESS)
return 0;
buffer[x] = clEnqueueMapBuffer(cmd_queue, opencl_mem[x], CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE, 0,
width * height * sizeof(*buffer[x]), 0,
NULL, NULL, &status);
if (status != CL_SUCCESS) {
clReleaseMemObject(opencl_mem[x]);
opencl_mem[x] = NULL;
return 0;
}
}
} else {
for (x = 0; x < 3; x++) {
buffer[x] = malloc(width * height * sizeof(*buffer[x]));
if (buffer[x] == NULL) {
printf("Unable to alloc memory");
}
}
}
memset(buffer[0], 1, width * height * sizeof(*buffer[0]));
memset(buffer[1], 2, width * height * sizeof(*buffer[1]));
memset(buffer[2], 0, width * height * sizeof(*buffer[2]));
{
struct usec_timer emr_timer;
usec_timer_start(&emr_timer);
for (z = 0; z < 600; z++) {
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
// don't worry about overflows
buffer[2][y * width + x] += buffer[0][y * width + x]
+ buffer[1][y * width + x];
}
}
}
usec_timer_mark(&emr_timer);
printf("Elapsed time %"PRIu64"\n", usec_timer_elapsed(&emr_timer));
}
if (use_gpu) {
for (x = 0; x < 3; x++) {
if (buffer[x] != NULL) {
status = clEnqueueUnmapMemObject(cmd_queue, opencl_mem[0], buffer[0], 0,
NULL, NULL);
status |= clFinish(cmd_queue);
if (status != CL_SUCCESS)
return 0;
buffer[0] = NULL;
}
if (opencl_mem[0] != NULL) {
status = clReleaseMemObject(opencl_mem[0]);
if (status != CL_SUCCESS)
return 0;
opencl_mem[0] = NULL;
}
}
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
} else {
for (x = 0; x < 3; x++) {
free(buffer[x]);
buffer[x] = NULL;
}
}
return 0;
}

If you use malloc + operation + free you are using only CPU resources.
If you use OpenCL you are using CPU + GPU, and you involve in syncronization and data copy penalties.
Alloc in GPU
Map to CPU space (allocs another buffer in CPU)
Operate CPU buffer
Unmap (pinned copy to the GPU buffer + deallocate the CPU one).
Destroy GPU buffer
What makes you think it should have the same speed? Of course is more costly, and will always be. You are doing the same CPU operation + some extra OpenCL operations.
Pinned memory is faster than non-pinned memory in transfers, but it is never faster than non copy, because you simply are not copying anything!
Also for a memory benchmark, doing operation with 3*1280*720 = 2.6MB, is completely silly. It would take just microseconds in common systems. And anyway, that part should be the same for both cases.
The overhead will dominate your results, rather than the throughput.

Related

Arduino global array not modified when accessed over serial?

I'm working on a home automation system, one part of which includes an Arduino Mega2560 connected via USB to a RaspberryPi 3 Model B. The Arduino receives two short, simple commands over serial from the Pi; the first command sets 'regions' of led lights, and the second sets colors on the regions previously described. Syntax is as follows for both commands, where bracketed items are single bytes, len+off are 3 ascii bytes interpreted as base-10, and r+g+b are 2 ascii bytes each interpreted as hex. The color set command supports a variable number of regions as shown:
Region Set Command
!c[regionId][stripId][len2][len1][len0][off2][off1][off0]\r
Color Set Command
!b[r1][r0][g1][g0][b1][b0][numRegions][regionId0]([regionId1]...)\r
Each command sends positive/negative acknowledgements so you'd think I'd be able to tell what's going wrong, but I get unexpected behavior when I try to set light colors with the second command, however, despite getting acknowledgements as expected. So far, the only reasonable conclusion I can come to is that for the global array regions discards changes and stays zero always. For example, this exchange results in no lights lit (comments are not in the actual exchange)
Pi sends: !c00144000\r // create regionId 0 on stripId 0; length 144 offset 0
Pi recieves: O // positive confirmation?
Pi sends: !b00009910\r // set regionId 0 to color 0x000099 (blue)
Pi recieves: O // positive confirmation, but no lights?
Pi sends: !x\r // invalid command; triggers default in switch
Pi recieves: X // as expected, all lights turn red
Here's the stripped down source for the Arduino:
#include "Arduino.h"
#include "FastLED.h"
#define WAIT while(!Serial.available()) ;;
// Forward Declarations
const int max_strips = 2;
const int max_strip_length = 300;
const int max_leds = max_strips * max_strip_length;
CRGB leds[max_strips][max_strip_length];
const int max_regions = 20;
int regions[max_regions][3];
const int baud_rate = 9600;
unsigned char buffer[64];
unsigned char currentByte = 0;
unsigned char incommingByte;
unsigned char startChar = '!';
unsigned char endChar = '\r';
bool store = false;
// Helper Functions
void set(CRGB c) {
for(int i = 0; i < max_strips; i++) {
for(int j = 0; j < max_strip_length; j++) {
leds[i][j] = c;
}
}
}
void set(int stripId, int length, int offset, CRGB c) {
for(int i = offset; i < offset+length; i++) {
leds[stripId][i] = c;
}
}
void setup() {
Serial.begin(baud_rate);
for(int i = 0; i < max_strips; i++) {
switch(i) {
case 0: {
FastLED.addLeds<WS2812B, 2, GRB>(leds[0], max_strip_length);
} break;
case 1: {
FastLED.addLeds<WS2812B, 3, GRB>(leds[1], max_strip_length);
} break;
}
}
set(CRGB::Black);
}
void loop() {
if(Serial.available() > 0) {
unsigned char incomingByte = Serial.read();
if(incomingByte == startChar) {
currentByte = 0;
store = true;
}
if(store) {
if(incomingByte == endChar) {
buffer[currentByte++] = incomingByte;
store = false;
switch(buffer[1]) {
case 'b': {
int red = (buffer[2] - '0')*16 + (buffer[3] - '0');
int green = (buffer[4] - '0')*16 + (buffer[5] - '0');
int blue = (buffer[6] - '0')*16 + (buffer[7] - '0');
int numRegions = buffer[8] - '0';
for(int i = 0; i < numRegions; i++) {
int regionId = buffer[9+i] - '0';
if(regionId >= max_regions) {
Serial.write('S');
return;
}
// Never sets anything
set(regions[regionId][0], regions[regionId][1], regions[regionId][2], CRGB(red, green, blue));
}
Serial.write('O');
} break;
case 'c': {
int regionId = buffer[2] - '0';
int stripId = buffer[3] - '0';
int length = (buffer[4] - '0')*100 + (buffer[5] - '0')*10 + (buffer[6] -'0');
int offset = (buffer[7] - '0')*100 + (buffer[8] - '0')*10 + (buffer[9] -'0');
if(regionId >= max_regions) {
Serial.write('R');
return;
}
if(stripId >= max_strips) {
Serial.write('S');
return;
}
regions[regionId][0] = stripId; // WE LOSE THESE VALUES??
regions[regionId][1] = length; // ??
regions[regionId][2] = offset; // ??
Serial.write('O');
} break;
default: {
set(CRGB::Red);
Serial.write('X');
} break;
}
currentByte = 0;
}else if(currentByte > 64){
store = false;
currentByte = 0;
} else {
buffer[currentByte++] = incomingByte;
}
}
}
FastLED.show();
}
I'd appreciate any advice as to what I'm missing here, and regardless, thanks for reading this far!
This was occurring because I have a lot of garbage showing up on the serial interface for the Arduino I'm testing on. Slowing the baud rate down to 2400bps resolves the issue on this device. Other Arduinos in the system do fine with much higher serial rates -- I suspect there's a hardware problem with the crystal on this particular Arduino so clock speeds aren't matching up correctly between the two sides of the serial connection.

Unable to find device in opencl using beignet

I am trying to run opencl using beignet
https://askubuntu.com/questions/412009/open-cl-in-intel
My system configuration is
Intel HD Graphics 5500
NVIDIA GeForce 830M (2 GB DDR3 dedicated)
When I run the following code:
// HelloWorld.cpp
//
// This is a simple example that demonstrates basic OpenCL setup and
// use.
#include <iostream>
#include <fstream>
#include <sstream>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
///
// Constants
//
const int ARRAY_SIZE = 1000;
///
// Create an OpenCL context on the first available platform using
// either a GPU or CPU depending on what is available.
//
cl_context CreateContext()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_context context = NULL;
// First, select an OpenCL platform to run on. For this example, we
// simply choose the first available platform. Normally, you would
// query for all available platforms and select the most appropriate one.
errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
std::cerr << "Failed to find any OpenCL platforms." << std::endl;
return NULL;
}
// Next, create an OpenCL context on the platform. Attempt to
// create a GPU-based context, and if that fails, try to create
// a CPU-based context.
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cout << "Could not create GPU context, trying CPU..." << std::endl;
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
return NULL;
}
}
return context;
}
///
// Create a command queue on the first device available on the
// context
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int errNum;
cl_device_id *devices;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
// First get the size of the devices buffer
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
return NULL;
}
if (deviceBufferSize <= 0)
{
std::cerr << "No devices available.";
return NULL;
}
// Allocate memory for the devices buffer
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
delete [] devices;
std::cerr << "Failed to get device IDs";
return NULL;
}
// In this example, we just choose the first available device. In a
// real program, you would likely use all available devices or choose
// the highest performance device based on OpenCL device queries
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
if (commandQueue == NULL)
{
delete [] devices;
std::cerr << "Failed to create commandQueue for device 0";
return NULL;
}
*device = devices[0];
delete [] devices;
return commandQueue;
}
///
// Create an OpenCL program from the kernel source file
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
cl_int errNum;
cl_program program;
std::ifstream kernelFile(fileName, std::ios::in);
if (!kernelFile.is_open())
{
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource(context, 1,
(const char**)&srcStr,
NULL, NULL);
if (program == NULL)
{
std::cerr << "Failed to create CL program from source." << std::endl;
return NULL;
}
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
// Determine the reason for the error
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
clReleaseProgram(program);
return NULL;
}
return program;
}
///
// Create memory objects used as the arguments to the kernel
// The kernel takes three arguments: result (output), a (input),
// and b (input)
//
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
float *a, float *b)
{
memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * ARRAY_SIZE, a, NULL);
memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * ARRAY_SIZE, b, NULL);
memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * ARRAY_SIZE, NULL, NULL);
if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL)
{
std::cerr << "Error creating memory objects." << std::endl;
return false;
}
return true;
}
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
for (int i = 0; i < 3; i++)
{
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (context != 0)
clReleaseContext(context);
}
///
// main() for HelloWorld example
//
int main(int argc, char** argv)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = { 0, 0, 0 };
cl_int errNum;
// Create an OpenCL context on first available platform
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
}
// Create a command-queue on the first device available
// on the created context
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL program from HelloWorld.cl kernel source
program = CreateProgram(context, device, "HelloWorld.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "hello_kernel", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Create memory objects that will be used as arguments to
// kernel. First create host memory arrays that will be
// used to store the arguments to the kernel
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
{
a[i] = (float)i;
b[i] = (float)(i * 2);
}
if (!CreateMemObjects(context, memObjects, a, b))
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Set the kernel arguments (result, a, b)
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
size_t globalWorkSize[1] = { ARRAY_SIZE };
size_t localWorkSize[1] = { 1 };
// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Read the output buffer back to the Host
errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
0, ARRAY_SIZE * sizeof(float), result,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
// Output the result buffer
for (int i = 0; i < ARRAY_SIZE; i++)
{
std::cout << result[i] << " ";
}
std::cout << std::endl;
std::cout << "Executed program succesfully." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 0;
}
I always get the output:
Number of available platforms: 1
Platform names:
[0] Experiment Intel Gen OCL Driver [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 0
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 0
*** Detailed information for each device ***
I tried various opencl codes and none of them works properly.Why are the devices not being found and what is the solution?
What happens when you run clinfo utility?
You can get clinfo for your linux and then run it. It provides a list of every platform and devices found. If you can't get your device listed by clinfo, you won't have it listed by your program.
It looks like you have a Nvidia Optimus computer, and it is very bad because Nvidia does not provide official support for Optimus on Linux. However, at least your Intel CPU shoule be recognized.
If you can't get it listed, you might lack the dri driver for your vendor (nvidia).

clEnqueueReadImage throws CL_OUT_OF_RESOURCES error

I am new to OpenCL and I tried to run example code in OpenCL Programming Guide.
I got an error code -5(which corresponds to CL_OUT_OF_RESOURCES) when trying to read filtered image back to host using clEnqueueReadImage.
Here is the code in the book:
//
// Book: OpenCL(R) Programming Guide
// Authors: Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10: 0-321-74964-2
// ISBN-13: 978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs: http://safari.informit.com/9780132488006/
// http://www.openclprogrammingguide.com
//
// ImageFilter2D.cpp
//
// This example demonstrates performing gaussian filtering on a 2D image using
// OpenCL
//
// Requires FreeImage library for image I/O:
// http://freeimage.sourceforge.net/
#include <iostream>
#include <fstream>
#include <sstream>
#include <string.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "FreeImage.h"
///
// Create an OpenCL context on the first available platform using
// either a GPU or CPU depending on what is available.
//
cl_context CreateContext()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_context context = NULL;
// First, select an OpenCL platform to run on. For this example, we
// simply choose the first available platform. Normally, you would
// query for all available platforms and select the most appropriate one.
errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
std::cerr << "Failed to find any OpenCL platforms." << std::endl;
return NULL;
}
// Next, create an OpenCL context on the platform. Attempt to
// create a GPU-based context, and if that fails, try to create
// a CPU-based context.
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cout << "Could not create GPU context, trying CPU..." << std::endl;
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
return NULL;
}
}
return context;
}
///
// Create a command queue on the first device available on the
// context
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int errNum;
cl_device_id *devices;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
// First get the size of the devices buffer
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
return NULL;
}
if (deviceBufferSize <= 0)
{
std::cerr << "No devices available.";
return NULL;
}
// Allocate memory for the devices buffer
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to get device IDs";
return NULL;
}
// In this example, we just choose the first available device. In a
// real program, you would likely use all available devices or choose
// the highest performance device based on OpenCL device queries
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
if (commandQueue == NULL)
{
std::cerr << "Failed to create commandQueue for device 0";
return NULL;
}
*device = devices[0];
delete [] devices;
return commandQueue;
}
///
// Create an OpenCL program from the kernel source file
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
cl_int errNum;
cl_program program;
std::ifstream kernelFile(fileName, std::ios::in);
if (!kernelFile.is_open())
{
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource(context, 1,
(const char**)&srcStr,
NULL, NULL);
if (program == NULL)
{
std::cerr << "Failed to create CL program from source." << std::endl;
return NULL;
}
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
// Determine the reason for the error
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
clReleaseProgram(program);
return NULL;
}
return program;
}
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem imageObjects[2],
cl_sampler sampler)
{
for (int i = 0; i < 2; i++)
{
if (imageObjects[i] != 0)
clReleaseMemObject(imageObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (sampler != 0)
clReleaseSampler(sampler);
if (context != 0)
clReleaseContext(context);
}
///
// Load an image using the FreeImage library and create an OpenCL
// image out of it
//
cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height)
{
FREE_IMAGE_FORMAT format = FreeImage_GetFileType(fileName, 0);
FIBITMAP* image = FreeImage_Load(format, fileName);
// Convert to 32-bit image
FIBITMAP* temp = image;
image = FreeImage_ConvertTo32Bits(image);
FreeImage_Unload(temp);
width = FreeImage_GetWidth(image);
height = FreeImage_GetHeight(image);
char *buffer = new char[width * height * 4];
memcpy(buffer, FreeImage_GetBits(image), width * height * 4);
FreeImage_Unload(image);
// Create OpenCL image
cl_image_format clImageFormat;
clImageFormat.image_channel_order = CL_RGBA;
clImageFormat.image_channel_data_type = CL_UNORM_INT8;
cl_int errNum;
cl_mem clImage;
clImage = clCreateImage2D(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
&clImageFormat,
width,
height,
0,
buffer,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL image object" << std::endl;
return 0;
}
return clImage;
}
///
// Save an image using the FreeImage library
//
bool SaveImage(char *fileName, char *buffer, int width, int height)
{
FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
FIBITMAP *image = FreeImage_ConvertFromRawBits((BYTE*)buffer, width,
height, width * 4, 32,
0xFF000000, 0x00FF0000, 0x0000FF00);
return (FreeImage_Save(format, image, fileName) == TRUE) ? true : false;
}
///
// Round up to the nearest multiple of the group size
//
size_t RoundUp(int groupSize, int globalSize)
{
int r = globalSize % groupSize;
if(r == 0)
{
return globalSize;
}
else
{
return globalSize + groupSize - r;
}
}
///
// main() for HelloBinaryWorld example
//
int main(int argc, char** argv)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem imageObjects[2] = { 0, 0 };
cl_sampler sampler = 0;
cl_int errNum;
if (argc != 3)
{
std::cerr << "USAGE: " << argv[0] << " <inputImageFile> <outputImageFiles>" << std::endl;
return 1;
}
// Create an OpenCL context on first available platform
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
}
// Create a command-queue on the first device available
// on the created context
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Make sure the device supports images, otherwise exit
cl_bool imageSupport = CL_FALSE;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
&imageSupport, NULL);
if (imageSupport != CL_TRUE)
{
std::cerr << "OpenCL device does not support images." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Load input image from file and load it into
// an OpenCL image object
int width, height;
imageObjects[0] = LoadImage(context, argv[1], width, height);
if (imageObjects[0] == 0)
{
std::cerr << "Error loading: " << std::string(argv[1]) << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Create ouput image object
cl_image_format clImageFormat;
clImageFormat.image_channel_order = CL_RGBA;
clImageFormat.image_channel_data_type = CL_UNORM_INT8;
imageObjects[1] = clCreateImage2D(context,
CL_MEM_WRITE_ONLY,
&clImageFormat,
width,
height,
0,
NULL,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL output image object." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Create sampler for sampling image object
sampler = clCreateSampler(context,
CL_FALSE, // Non-normalized coordinates
CL_ADDRESS_CLAMP_TO_EDGE,
CL_FILTER_NEAREST,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL sampler object." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Create OpenCL program
program = CreateProgram(context, device, "ImageFilter2D.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "gaussian_filter", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Set the kernel arguments
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
size_t localWorkSize[2] = { 16, 16};
size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
RoundUp(localWorkSize[1], height) };
// Queue the kernel up for execution
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
// Read the output buffer back to the Host
char *buffer = new char [width * height * 4];
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { width, height, 1};
errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE,
origin, region, 0, 0, buffer,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cout<<errNum<<std::endl;
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 1;
}
std::cout << std::endl;
std::cout << "Executed program succesfully." << std::endl;
//memset(buffer, 0xff, width * height * 4);
// Save the image out to disk
if (!SaveImage(argv[2], buffer, width, height))
{
std::cerr << "Error writing output image: " << argv[2] << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
delete [] buffer;
return 1;
}
delete [] buffer;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
return 0;
}
The image I use was a random image downloaded from internet.
I also use clGetDeviceInfo to return the CL_LOCAL_MEM_SIZE which is 4294967295

OpenCL Memory Transfer Issue (Err. code -6)

I'm dying a little inside. I've been working on this all day to no avail. I was having issues running some code that previously ran fine, so I wrote a short "toy" OpenCL program to try and figure out what was going on, but my toy program has me baffled and incredibly frustrated.
I'm working with an Nvidia 780i with 3Gb of global memory. It has a maximum allocation of ~780 Mb. At first, it wouldn't error out when I was intentionally over-allocating. Solved that (it was typographical, but the compiler/analyzer wasn't catching it). Now, even when trying to allocate WAY below what the device should be able to handle, I get an error code of -6 (CL_OUT_OF_HOST_MEMORY) on the second big buffer allocation.
I've been researching this error and I just can't track how it applies in this case. I have 32 gb of ram in the machine I'm using so there's certainly not a shortage there. I figure there's something that I just don't understand going on here.
It will allocate the first buffer alright, but then choke on the second. I basically just cannot allocate nearly the amount of global memory I both want and need to.
Any help is much appreciated. If you can help me with this and you're near LA I'll take you out for drinks. That's how frustrated I am.
Code and output is below for my machine.
Thanks,
John
Main program:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "kernels.cl"
#define KERNEL_NAME "test"
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#define N_PROJ 4000
#define N_CHANNELS 736
#define N_ROWS 32
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
perror("Just a heads up: I'm not going to run on the GPU");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
cl_ulong16 alloc_size,mem_size;
char name[40];
clGetDeviceInfo(dev,CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong16),&alloc_size,NULL);
clGetDeviceInfo(dev,CL_DEVICE_NAME,sizeof(name),name,NULL);
clGetDeviceInfo(dev,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong16),&mem_size,NULL);
printf("Using device: %s\n",name);
printf("Global memory size: %lu\n",mem_size);
printf("Max. allocation: %lu\n",alloc_size);
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle)-13;
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main(int argc, const char * argv[])
{
/* This file serves as a backbone for OpenCL programs. */
/* All the user needs to do is enter their OpenCL data */
/* structures, set kernel args, and kernel dispatches. */
/* Standard OCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
device=create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
program = build_program(context, device, PROGRAM_FILE);
queue = clCreateCommandQueue(context, device,0, NULL);
kernel = clCreateKernel(program, KERNEL_NAME, NULL);
/* User code goes here */
cl_int err;
/* Declare and set data */
int a[]={1,2,3};
float *rebin;
rebin =(float*) calloc(N_PROJ*N_CHANNELS*N_ROWS,sizeof(float));
float *mat;
mat =(float*) calloc(N_PROJ*N_CHANNELS*N_ROWS,sizeof(float));
printf("\nAllocation size: %lu\n",N_PROJ*N_CHANNELS*N_ROWS*sizeof(float));
/* Declare and set buffer objects */
cl_mem a_buff,rebin_buff,mat_buff;
printf("Total memory to be allocated: %lu\n",2*N_PROJ*N_CHANNELS*N_ROWS*sizeof(float)+sizeof(a) );
a_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,sizeof(a),a,NULL);
rebin_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),rebin,&err);
if (err<0){
printf("Error: %i\n",err);
perror("Couldn't create buffer 1");
exit(1);
}
mat_buff =clCreateBuffer(context,CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),mat ,&err);
if (err<0){
printf("Error: %i\n",err);
perror("Couldn't create buffer 2");
exit(1);
}
/* Copy data over to the device */
err=clSetKernelArg(kernel,0,sizeof(cl_mem),&mat_buff);
if (err<0){
perror("Couldn't set kernel argument");
exit(1);
}
err=clSetKernelArg(kernel,1,sizeof(cl_mem),&rebin_buff);
err=clSetKernelArg(kernel,2,sizeof(cl_mem),&a_buff);
clEnqueueTask(queue,kernel,0,NULL,NULL);
clEnqueueReadBuffer(queue,mat_buff ,CL_TRUE,0,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),mat ,0,NULL,NULL);
clEnqueueReadBuffer(queue,rebin_buff,CL_TRUE,0,N_PROJ*N_CHANNELS*N_ROWS*sizeof(float),rebin,0,NULL,NULL);
clEnqueueReadBuffer(queue,a_buff, CL_TRUE,0,sizeof(a),a,0,NULL,NULL);
printf("%f %f %f\n",mat[1],mat[2],mat[3]);
printf("%f %f %f\n",rebin[1],rebin[2],rebin[3]);
printf("%i %i %i",a[0],a[1],a[2]);
/***********************/
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
//clReleaseDevice(device);
printf("\n\nProgram apparently executed fully. \n");
return 0;
}
Kernel:
__kernel void test(__global float *mat,__global float *rebin,__global int *a){
a[0]=3;
a[1]=2;
a[2]=1;
rebin[1]=1.0f;
rebin[2]=2.0f;
rebin[3]=3.0f;
mat[1]=3.0f;
mat[2]=2.0f;
mat[3]=1.0f;
}
Console output for my machine:
Using device: GeForce GTX 780
Global memory size: 3221225472
Max. allocation: 805306368
Allocation size: 376832000
Total memory to be allocated: 753664012
Error: -6
Couldn't create buffer 2: No error
Process returned 1 (0x1) execution time : 0.435 s
Press any key to continue.

How to display an image inside kernel using opencl?

I am new to opencl. The task is:
Load an pre-existing image
Write Host code using opencl to send the image ptr to kernel
Calculate hsl threshold of the loaded image inside kernel
Display the threshold or binary image
I ve used opencv to load a pre-existing 2D image in my program. And I used open cl buffer objects to allocate memory and have send image pointer to the kernel. After kernel execution in order to display the calculated image from the kernel I need clEnqueueReadBuffer. Then I use opencv to display the image from the host. I ve attached code below
As this takes more time on GPU and CPU I thought to switch over to image memory.
But I like to know whether usage of images also need clenqueueReadImage to copy image from kernel to host or do we any way to display the threshold image in kernel itself?
//My code using opencl buffers
IplImage *src = cvLoadImage("../Input/im2.png",CV_LOAD_IMAGE_COLOR );
int a=src->height;
int b=src->width;
cl_context CreateContext()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_context context = NULL;
errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
std::cerr << "Failed to find any OpenCL platforms." << std::endl;
return NULL;
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cout << "Could not create GPU context, trying CPU..." << std::endl;
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU, NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
return NULL;
}
}
return context;
}
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int errNum;
cl_device_id *devices;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
return NULL;
}
if (deviceBufferSize <= 0)
{
std::cerr << "No devices available.";
return NULL;
}
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
delete [] devices;
std::cerr << "Failed to get device IDs";
return NULL;
}
commandQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE, &errNum );
if (commandQueue == NULL)
{
delete [] devices;
std::cerr << "Failed to create commandQueue for device 0";
return NULL;
}
*device = devices[0];
delete [] devices;
return commandQueue;
}
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
cl_int errNum;
cl_program program;
std::ifstream kernelFile(fileName, std::ios::in);
if (!kernelFile.is_open())
{
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource(context, 1,
(const char**)&srcStr,
NULL, NULL);
if (program == NULL)
{
std::cerr << "Failed to create CL program from source." << std::endl;
return NULL;
}
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
clReleaseProgram(program);
return NULL;
}
return program;
}
bool CreateMemObjects(cl_context context, cl_mem memObjects[2], unsigned char *src_ptr)
{
memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(unsigned char) *(a*b*3) , src_ptr , NULL);
memObjects[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned char) *(a*b) , NULL, NULL);
if (memObjects[0] == NULL || memObjects[1] == NULL)
{
std::cerr << "Error creating memory objects" << std::endl;
return false;
}
return true;
}
void Cleanup(cl_context context, cl_command_queue commandQueue, cl_program program, cl_kernel kernel, cl_mem memObjects[2])
{
for (int i = 0; i < 2; i++)
{
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (context != 0)
clReleaseContext(context);
}
int main()
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[2] = { 0,0 };
cl_int errNum;
cl_event myEvent;
cl_ulong start_time,end_time;
double kernelExecTimeNs;
IplImage *thres_img1 = cvCreateImage(cvGetSize(src), IPL_DEPTH_8U, 1);
unsigned char *tur_image1,*src_ptr;
tur_image1 = (unsigned char*) malloc((a*b) * sizeof(unsigned char));
src_ptr = (unsigned char*) malloc ((a*b*3) * sizeof(unsigned char));
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." <<std::endl;
return 1;
}
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
program = CreateProgram(context, device, "hsl_threshold.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
kernel = clCreateKernel(program, "HSL_threshold", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
printf("height:%d\n",a);//image height
printf("width:%d\n",b);//image width
cvShowImage("color image",src);
cvWaitKey(0);
memcpy(src_ptr,src->imageData,(a*b*3));
if (!CreateMemObjects(context, memObjects, src_ptr))
{
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments" << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
cout<<"Kernel arguments set successfully";
size_t globalWorkSize[1]={a*b};
size_t localWorkSize[1]={512};
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &myEvent);
clWaitForEvents(1,&myEvent);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
clFinish(commandQueue);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(start_time), &start_time, NULL);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(end_time), &end_time, NULL);
kernelExecTimeNs = end_time-start_time;
printf("\nExecution time in milliseconds = %0.3f ms\n",( kernelExecTimeNs / 1000000.0) );
cout<<"\n Kernel timings \n"<<kernelExecTimeNs<<"seconds";
errNum = clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE,
0, (a*b) * sizeof(unsigned char), tur_image1,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, memObjects);
return 1;
}
memcpy(thres_img1->imageData,tur_image1,sizeof(unsigned char)*(a*b));
cvShowImage( "hsl_thresh",thres_img1);
cvSaveImage( "../Output/hsl_threshold.png",thres_img1);
cvWaitKey(0);
std::cout<<std::endl;
std::cout<<"Image displayed Successfully"<<std::endl;
Cleanup(context,commandQueue,program,kernel,memObjects);
printf("\n Free opencl resources");
std::cin.get();
return 0;
}
There are ways to directly process data calculated by OpenCL via OpenGL. Your OCL implementation must support the extension cl_khr_gl_sharing.
This mode is called CL/GL-Interop Mode.
If you create an OpenGL-instance first and initialise OpenCL with the pointers to your GL-instance, it is possible for each implementation to access each others data.
(All snippets are taken from code using CL-C++-Bindings, I guess it is okay for the general understanding)
cl_context_properties properties[] =
// Take this line to create an OCL context in GL-CL-interop-mode.
// OpenGL must already be initialised.
// For interop init see: http://www.khronos.org/registry/cl/extensions/khr/cl_khr_gl_sharing.txt
// USING: CL_GL_CONTEXT_KHR: Rendering Context [Use your OGL-HGLRC variable or do wglGetCurrentContext(); ]
// AND: CL_WGL_HDC_KHR: Device Context [Use your OGL-HDC variable or do wglGetCurrentDC(); ]
{
CL_CONTEXT_PLATFORM, (cl_context_properties)(_platforms->at(0))(),
CL_GL_CONTEXT_KHR, (cl_context_properties)myGL->hRC,
CL_WGL_HDC_KHR, (cl_context_properties)myGL->hDC, 0
};
Now you can create OCL-images based on OGL textures
//The following data can be accessed both from OCL and OGL
cl::Image2D imageFromGL = new cl::Image2DGL(*_context, CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, myGL->textures[0]);
Before using the memory in OCL, you have to ask OGL to release it
//Ask OGL to release memory. All OGL actions must be finished before doing so!
_queue->enqueueAcquireGLObjects(&imageFromGL, NULL, &evt);
Now, do what you want, then give it back to OGL:
//Hand memory back to OGL. All OCL actions must be finished before doing so!
_queue->enqueueReleaseGLObjects(&imageFromGL, NULL, &evt);
And finally you can use OpenGL code to display the data on the screen.

Resources