OpenCL - save region of an image

OpenCL - save region of an image - opencl

I am using OpenCL C 1.2 and in order to read back an image on the host, I use this code:
// Read the output buffer back to the Host
char *buffer = malloc((width * height)*sizeof(char*)*4); //new char
[width * height * 4];
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { width, height, 1};
errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE,
origin, region, 0, 0, buffer,
0, NULL, NULL);
How can I read only a specific region of the image?
Thank you!

You would set origin and region that described the sub-image that you want to read. Only that sub-region of the image will be coped out into host memory.

Related

How to properly use QOpenGLBuffer.PixelPackBuffer with PyQt5

I am trying to read the color buffer content of the default framebuffer in PyQt5 using pixel buffer object given by the Qt OpenGL framework.
It looks like the reading is unsuccessful because the end image always contains all zeros. There's very little examples with pixel buffers and PyQt5, so I was mostly relying on this c++ tutorial explaining pixel buffers, specifically section Example: Asynchronous Read-back.
My code goes something like this:
class GLCanvas(QtWidgets.QOpenGLWidget):
# ...
def screenDump(self):
"""
Takes a screenshot and returns a pixmap.
:returns: A pixmap with the rendered content.
:rtype: QPixmap
"""
self.makeCurrent()
w = self.size().width()
h = self.size().height()
ppo = QtGui.QOpenGLBuffer(QtGui.QOpenGLBuffer.PixelPackBuffer)
ppo.setUsagePattern(QOpenGLBuffer.StaticRead)
ppo.create()
success = ppo.bind()
if success:
ppo.allocate(w * h * 4)
# Render the stuff
# ...
# Read the color buffer.
glReadBuffer(GL_FRONT)
glReadPixels(0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, 0)
# TRY1: Create an image with pixel buffer data - Doesn't work, image contains all zeros.
pixel_buffer_mapped = ppo.map(QOpenGLBuffer.ReadOnly)
image = QtGui.QImage(sip.voidptr(pixel_buffer_mapped), w, h, QtGui.QImage.Format_ARGB32)
ppo.unmap()
# TRY2: Create an image with pixel buffer data - Doesn't work, image contains all zeros.
# image = QtGui.QImage(w, h, QtGui.QImage.Format_ARGB32)
# bits = image.constBits()
# ppo.read(0, bits, w * h * 4)
ppo.release()
pixmap = QtGui.QPixmap.fromImage(image)
return pixmap
Any help would be greatly appreciated.

I didn't have any success after a couple of days, so I decided to implement color buffer fetching with pixel buffer object in C++, and then use SWIG to pass the data to Python.
I'm posting relevant code, maybe it will help somebody.
CPP side
// renderer.cpp
class Renderer{
// ...
void resize(int width, int height) {
// Set the viewport
glViewport(0, 0, width, height);
// Store width and height
width_ = width;
height_ = height;
// ...
}
// -------------------------------------------------------------------------- //
// Returns the color buffer data in GL_RGBA format.
GLubyte* screenDumpCpp(){
// Check if pixel buffer objects are available.
if (!GLInfo::pixelBufferSupported()){
return 0;
}
// Get the color buffer size in bytes.
int channels = 4;
int data_size = width_ * height_ * channels;
GLuint pbo_id;
// Generate pixel buffer for reading.
glGenBuffers(1, &pbo_id);
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
glBufferData(GL_PIXEL_PACK_BUFFER, data_size, 0, GL_STREAM_READ);
// Set the framebuffer to read from.
glReadBuffer(GL_FRONT);
// Read the framebuffer and store data in the pixel buffer.
glReadPixels(0, 0, width_, height_, GL_RGBA, GL_UNSIGNED_BYTE, 0);
// Map the pixel buffer.
GLubyte* pixel_buffer = (GLubyte*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
// Cleanup.
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
glDeleteBuffers(1, &pbo_id);
return pixel_buffer;
}
// Returns the color buffer data in RGBA format as a numpy array.
PyObject* screenDump(){
// Get screen dump.
GLubyte* cpp_image = screenDumpCpp();
int channels = 4;
int image_size = width_* height_ * channels;
// Setup dimensions for numpy vector.
PyObject * python_image = NULL;
int ndim = 1;
npy_intp dims[1] = {image_size};
// Set up numpy vector.
python_image = PyArray_SimpleNew(ndim, dims, NPY_UINT8);
GLubyte * data = static_cast<GLubyte *>(PyArray_DATA(toPyArrayObject(python_image)));
// Copy screen dump to python space.
memcpy(data, cpp_image, image_size);
// return screen dump to python.
return python_image;
}
};
// glinfo.cpp
const GLInt GLInfo::glVersionInt(){ ... }
GLV GLInt::GLV(int major, int minor){ ... }
bool GLInfo::pixelBufferSupported(){
const GLint version = GLInfo::glVersionInt();
bool supported = false;
if (version >= GLInfo::GLV(1, 5) && version < GLInfo::GLV(3, 0)){
supported = true;
}
else if (version >= GLInfo::GLV(3, 0)){
GLint extensions_number;
glGetIntegerv(GL_NUM_EXTENSIONS, &extensions_number);
std::string pixel_buffer_extension("GL_ARB_pixel_buffer_object");
while (extensions_number--) {
const auto extension_name = reinterpret_cast<const char *>(glGetStringi(GL_EXTENSIONS, extensions_number));
std::string extension_name_str(extension_name);
if (pixel_buffer_extension == extension_name) {
supported = true;
break;
}
}
}
return supported;
}
Python side
# ...
class MyCanvas(QOpenGLWidget):
def __init__(self):
# Get renderer from c++
self._renderer = Renderer()
def resizeGL(self, width, height):
self._renderer.resize(width, height)
# ...
if __name__ == '__main__':
# ...
canvas = MyCanvas()
canvas.show()
width = canvas.width()
height = canvas.height()
data = canvas._renderer().screenDump()
image = QtGui.QImage(data.data, width, height, QtGui.QImage.Format_RGBA8888)
new_image = image.mirrored()
pixmap = QtGui.QPixmap.fromImage(new_image)
pixmap.save(path)
sys.exit(app.exec_())

OpenCL, Understanding VectorAdd program

I'm new to OpenCL, with very limited background in C/C++.
I've been given this OpenCL program that adds two vectors, and supposed to figure out how it works. It comes from Intel:
https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/vector-addition.html
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Then these arrays are used by the kernel, where addition of the whole array is performed and stored as Z.
For example, say if the number of devices available is 1000, and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together - in other words each execution of the kernel adds 1000 elements?
Am I following anything, or totally wrong here?
The kernel is:
// ACL kernel for adding two input vectors
__kernel void vectorAdd(__global const float *x,
__global const float *y,
__global float *restrict z)
{
// get index of the work item
int index = get_global_id(0);
// add the vector elements
z[index] = x[index] + y[index];
}
The host (main) code is (sorry it is long, not sure what's not important):
///////////////////////////////////////////////////////////////////////////////////
// This host program executes a vector addition kernel to perform:
// C = A + B
// where A, B and C are vectors with N elements.
//
// This host program supports partitioning the problem across multiple OpenCL
// devices if available. If there are M available devices, the problem is
// divided so that each device operates on N/M points. The host program
// assumes that all devices are of the same type (that is, the same binary can
// be used), but the code can be generalized to support different device types
// easily.
//
// Verification is performed against the same computation on the host CPU.
///////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "CL/opencl.h"
#include "AOCL_Utils.h"
using namespace aocl_utils;
// OpenCL runtime configuration
cl_platform_id platform = NULL;
unsigned num_devices = 0;
scoped_array<cl_device_id> device; // num_devices elements
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
cl_program program = NULL;
scoped_array<cl_kernel> kernel; // num_devices elements
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements
scoped_array<cl_mem> output_buf; // num_devices elements
// Problem data.
const unsigned N = 1000000; // problem size
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
scoped_array<scoped_array<float> > ref_output; // num_devices elements
scoped_array<unsigned> n_per_device; // num_devices elements
// Function prototypes
float rand_float();
bool init_opencl();
void init_problem();
void run();
void cleanup();
// Entry point.
int main() {
// Initialize OpenCL.
if(!init_opencl()) {
return -1;
}
// Initialize the problem data.
// Requires the number of devices to be known.
init_problem();
// Run the kernel.
run();
// Free the resources allocated
cleanup();
return 0;
}
/////// HELPER FUNCTIONS ///////
// Randomly generate a floating-point number between -10 and 10.
float rand_float() {
return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
}
// Initializes the OpenCL objects.
bool init_opencl() {
cl_int status;
printf("Initializing OpenCL\n");
if(!setCwdToExeDir()) {
return false;
}
// Get the OpenCL platform.
platform = findPlatform("Altera");
if(platform == NULL) {
printf("ERROR: Unable to find Altera OpenCL platform.\n");
return false;
}
// Query the available OpenCL device.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %s\n", getPlatformName(platform).c_str());
printf("Using %d device(s)\n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf(" %s\n", getDeviceName(device[i]).c_str());
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create per-device objects.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Command queue.
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Kernel.
const char *kernel_name = "vectorAdd";
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
// Determine the number of elements processed by this device.
n_per_device[i] = N / num_devices; // number of elements handled by this device
// Spread out the remainder of the elements over the first
// N % num_devices.
if(i < (N % num_devices)) {
n_per_device[i]++;
}
// Input buffers.
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input A");
input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input B");
// Output buffer.
output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for output");
}
return true;
}
// Initialize the data for the problem. Requires num_devices to be known.
void init_problem() {
if(num_devices == 0) {
checkError(-1, "No devices");
}
input_a.reset(num_devices);
input_b.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]);
input_b[i].reset(n_per_device[i]);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
for(unsigned j = 0; j < n_per_device[i]; ++j) {
input_a[i][j] = rand_float();
input_b[i][j] = rand_float();
ref_output[i][j] = input_a[i][j] + input_b[i][j];
}
}
}
void run() {
cl_int status;
const double start_time = getCurrentTimestamp();
// Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
cl_event write_event[2];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// Set kernel arguments.
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel.
// Use a global work size corresponding to the number of elements to add
// for this device.
//
// We don't specify a local work size and let the runtime choose
// (it'll choose to use one work-group with the same size as the global
// work-size).
//
// Events are used to ensure that the kernel is not launched until
// the writes to the input buffers have completed.
const size_t global_work_size = n_per_device[i];
printf("Launching for device %d (%d elements)\n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
const double end_time = getCurrentTimestamp();
// Wall-clock time taken.
printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
}
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}
// Verify results.
bool pass = true;
for(unsigned i = 0; i < num_devices && pass; ++i) {
for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
printf("Failed verification # device %d, index %d\nOutput: %f\nReference: %f\n",
i, j, output[i][j], ref_output[i][j]);
pass = false;
}
}
}
printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Free the resources allocated during initialization
void cleanup() {
for(unsigned i = 0; i < num_devices; ++i) {
if(kernel && kernel[i]) {
clReleaseKernel(kernel[i]);
}
if(queue && queue[i]) {
clReleaseCommandQueue(queue[i]);
}
if(input_a_buf && input_a_buf[i]) {
clReleaseMemObject(input_a_buf[i]);
}
if(input_b_buf && input_b_buf[i]) {
clReleaseMemObject(input_b_buf[i]);
}
if(output_buf && output_buf[i]) {
clReleaseMemObject(output_buf[i]);
}
}
if(program) {
clReleaseProgram(program);
}
if(context) {
clReleaseContext(context);
}
}

There are a few sub-questions here, so let me try and address them individually. I'm going to be slightly pedantic on terminology; I'm not doing that to be snarky but hopefully this will help you make more sense of documentation, examples, etc.:
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
The kernel is just the code that will run on the OpenCL device. Typically, a kernel is scheduled to run (using clEnqueueNDRangeKernel()) with multiple work-items. With just one work item, there is not much point in bothering with OpenCL at all; the performance benefit comes from massive parallelism. In any case, your quoted statement is correct for each individual work-item processing this kernel. If you run this kernel with 1000 work items, 1000 elements from A will be processed with 1000 elements from B to calculate 1000 elements of Z. The order this happens in is deliberately undefined, and at least groups of elements will be operated on concurrently.
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Yes, it looks like that to me too.
For example, say if the number of devices available is 1000,
I would just like to point out that you will pretty much never have this many OpenCL devices in a system. The granularity of a single OpenCL device is typically "one GPU," or "all the CPU cores in the system," or "one FPGA accelerator card."
So a "normal" amount of devices on a desktop system is 1, 2, or maybe up to about 4 (e.g. CPU + iGPU + dual discrete GPUs). Big irons with many accelerator cards might have ~16 or so. If you're attempting to accelerate some code in a desktop (or small server) application, you'll usually just pick one device that's likely to be the most appropriate for your problem and run with that. Distributing workload evenly across heterogenous devices is a hard problem for anything but the most basic algorithms.
and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together -
Yes.
in other words each execution of the kernel adds 1000 elements?
Again, this is where using the term "kernel" isn't precise enough. In your example, you would enqueue 1000 work items to execute the kernel on each of the 1000 devices.

FFMpeg with X265

I am currently trying to encode raw RGB24 images via x265. I already successfully did this with the x264 library, but a few things have changed as compared to the x265 library.
Here the problem in short: I want to convert the image I have from RGB24 to YUV 4:2:0 via the sws_scale function of FFMPEG. The prototype of the function is:
int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
Assuming image contains my raw image, srcstride and `m_height' the corresponding RGB stride and height of my image, I made the following call with x264
sws_scale(convertCtx, &image, &srcstride, 0, m_height, pic_in.img.plane, pic_in.img.i_stride);
pic_in is of type x264_picture_t which looks (brief) as follows
typedef struct
{
...
x264_image_t img;
} x264_picture_t;
with x264_image_t
typedef struct
{
...
int i_stride[4];
uint8_t *plane[4];
} x264_image_t;
Now, in x265 the structures have slightly changed to
typedef struct x265_picture
{
...
void* planes[3];
int stride[3];
} x265_picture;
And I am now not quite sure how to call the same function
sws_scale(convertCtx, &image, &srcstride, 0, m_height, ????, pic_in.stride);
I tried creating a temporary array, and then copying back and recasting the array items, but it doesnt seem to work
pic.planes[i] = reinterpret_cast<void*>(tmp[i]) ;
Can someone help me out?
Thanks a lot :)
Edit
I figured it out now
outputSlice = sws_scale(convertCtx, &image, &srcstride, 0, m_height, reinterpret_cast<uint8_t**>(pic_in.planes), pic_in.stride);
This seems to do the trick :)
And btw, for other people who are experiment with x265:in x264 there was a x264_picture_alloc function which I didn't manage to find in x265. So here is a function which I used in my application and which does the trick.
void x265_picture_alloc_custom( x265_picture *pic, int csp, int width, int height, uint32_t depth) {
x265_picture_init(&mParam, pic);
pic->colorSpace = csp;
pic->bitDepth = depth;
pic->sliceType = X265_TYPE_AUTO;
uint32_t pixelbytes = depth > 8 ? 2 : 1;
uint32_t framesize = 0;
for (int i = 0; i < x265_cli_csps[csp].planes; i++)
{
uint32_t w = width >> x265_cli_csps[csp].width[i];
uint32_t h = height >> x265_cli_csps[csp].height[i];
framesize += w * h * pixelbytes;
}
pic->planes[0] = new char[framesize];
pic->planes[1] = (char*)(pic->planes[0]) + width * height * pixelbytes;
pic->planes[2] = (char*)(pic->planes[1]) + ((width * height * pixelbytes) >> 2);
pic->stride[0] = width;
pic->stride[1] = pic->stride[2] = pic->stride[0] >> 1;
}

And I am now not quite sure how to call the same function
sws_scale(convertCtx, &image, &srcstride, 0, m_height, ????,
pic_in.stride);
tried with?:
sws_scale(convertCtx, &image, &srcstride, 0, m_height, pic_in.planes,pic_in.stride);
what error do you have? have you initialized memory of x265_picture?

Opencl: GPU Execution Time is always Zero

I am trying to print the execution time for some functions on GPU. But timing on GPU is always comming out to be 0. Also when I choose CL_DEVICE_TYPE_CPU in the following it works fine.
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
This works fine and shows non-zero value of execution time but if I choose CL_DEVICE_TYPE_GPU, then it always shows 0, irrespective of total no. of data points and threads. please note that in both cases (CL_DEVICE_TYPE_CPU and CL_DEVICE_TYPE_GPU), I am printing the execution time in same way. That is my host code and my kernel code is same in both cases(thats what openCL is!). Following are some of the code section:
// openCL code to get platform and device ids
errcode = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
// to create context
clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
device_id, CL_QUEUE_PROFILING_ENABLE, &errcode);
// Setup device memory
d_instances= clCreateBuffer(clGPUContext,CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR,mem_size_i,instances->data, &errcode);
d_centroids = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_c, NULL, &errcode);
d_distance = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_d,NULL, &errcode);
// d_dist_X = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//d_dist_Y = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//to build program
clProgram = clCreateProgramWithSource(clGPUContext,1, (const char **)&source_str,(const
size_t*)&source_size, &errcode);
errcode = clBuildProgram(clProgram, 0,NULL, NULL, NULL, NULL);
if (errcode == CL_BUILD_PROGRAM_FAILURE)
{
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
&log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, log_size, log,
NULL);
// Print the log
printf("%s\n", log);
}
clKernel = clCreateKernel(clProgram,"distance_finding", &errcode);
// Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
if(num_instances >= 500)
{
localWorkSize[0] = 500;
float block1=num_instances/localWorkSize[0];
int block= (int)(ceil(block1));
globalWorkSize[0] = block*localWorkSize[0];
}
else
{
localWorkSize[0]=num_instances;
globalWorkSize[0]=num_instances;
}
int iteration=1;
while(iteration < MAX_ITERATIONS)
{
errcode = clEnqueueWriteBuffer(clCommandQue,d_centroids , CL_TRUE, 0,
mem_size_c, (void*)centroids->data, 0, NULL, NULL);
errcode = clEnqueueWriteBuffer(clCommandQue,d_distance , CL_TRUE, 0, mem_size_d,
(void*)distance->data, 0, NULL, NULL);
//set kernel arguments
errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_instances);
errcode = clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_centroids);
errcode = clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_distance);
errcode = clSetKernelArg(clKernel, 3,sizeof(unsigned int), (void *)
&num_instances);
errcode = clSetKernelArg(clKernel,4,sizeof(unsigned int),(void *)&clusters);
errcode = clSetKernelArg(clKernel,5,sizeof(unsigned int),(void *)&dimensions);
errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 1, NULL,
globalWorkSize,localWorkSize, 0, NULL, &myEvent);
clFinish(clCommandQue); // wait for all events to finish
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(cl_ulong),
&startTime, NULL);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(cl_ulong),
&endTime, NULL);
kernelExecTimeNs = endTime-startTime;
gpu_time+= kernelExecTimeNs;
// Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,d_distance, CL_TRUE, 0,
mem_size_d,distance->data, 0, NULL, NULL);
Printing the time in ms
printf("\n\n Time taken by GPU is %llu ms",gpu_time/1000000);
If the way I am calculating the GPU timing is wrong, why would it work on a CPU (by changing to CL_DEVICE_TYPE_CPU)? What is wrong here?
Edited:
System Information
AMD APP SDK 2.4
AMD ATI FirePro GL 3D, having 800 cores
Kerenel
#pragma OPENCL EXTENSION cl_khr_fp64:enable
double distance_cal(__local float* cent,float* data,int dimensions)
{
float dist1=0.00;
for(int i=0;i<dimensions;i++)
dist1 += ((data[i]-cent[i]) * (data[i]-cent[i]));
double sq_dist=sqrt(dist1);
return sq_dist;
}
void fetch_col(float* data,__constant float* x,int col,int dimension,int len)
{
//hari[i]=8;
for(int i=0;i<dimension;i++)
{
data[i]=x[col];
col=col+len;
}
}
void fetch_col_cen(__local float* data,__global float* x,int col,int dimension,int len)
{
//hari[i]=8;
for(int i=0;i<dimension;i++)
{
data[i]=x[col];
col=col+len;
}
}
__kernel void distance_finding(__constant float* data,__global float* cen,__global float*
dist,int inst,int clus,const int dimensions)
{
int idx=get_global_id(0);
float data_col[4];
fetch_col( data_col,data,idx,dimensions,inst);
for(int i=0;i<clus;i++)
{
int k=i*inst; // take each dimension value for each cluster data
__local float cent[4];
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
fetch_col_cen(cent,cen,i,dimensions,clus);
dist[idx+k]=distance_cal(cent,data_col,dimensions);// calculate distance wrt
each data n each centroid
}
}

clEnqueueNDRangeKernel() is asynchronous if it is using GPU and therefore you only see the time it took to enqueue the request but not to execution it.
That said, I could be wrong, but I usually write c++ code to do the timing and put the start_time before the instruction and end_time after the
clFinish(cmd_queue);
just like you did with C++ timing code, that would be a good test, if you're sure your GPU shouldn't be finishing by 0 seconds.

An easy way to check would be to introduce an abnormally long operation inside the kernel. If THAT shows up as zero when there a perceptible lag in actual execution - then you have your answer.
That said, I believe (even though the indicated thread is for Linux, it probably holds water on Windows too) you might need to install the instrumented drivers to even have the system write to the performance counters. You can also use the CUDA profiler on nVidia's OpenCL implementation because it sits on top of CUDA.

change to
clFinish(clCommandQue); // wait for all events to finish
// add this after clFinish()
// Ensure kernel execution is finished
clWaitForEvents(1 , &myEvent);
..
double gpu_time = endTime-startTime;
..
printf("\n\n Time taken by GPU is %0.3f ms", gpu_time/1000000.0);

OpenCL enqueueNDRangeKernel causes Access Violation error

I am continuously getting an Access Violation Error with a all my kernels which I am trying to build. Other kernels which I take from books seem to work fine.
https://github.com/ssarangi/VideoCL - This is where the code is.
Something seems to be missing in this. Could someone help me with this.
Thanks so much.
[James] - Thanks for the suggestion and you are right. I am doing it on Win 7 with a AMD Redwood card. I have the Catalyst 11.7 drivers with AMD APP SDK 2.5. I am posting the code below.
#include <iostream>
#include "bmpfuncs.h"
#include "CLManager.h"
void main()
{
float theta = 3.14159f/6.0f;
int W ;
int H ;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
float* ip = readImage(inputFile, &W, &H);
float *op = new float[W*H];
//We assume that the input image is the array “ip”
//and the angle of rotation is theta
float cos_theta = cos(theta);
float sin_theta = sin(theta);
try
{
CLManager* clMgr = new CLManager();
// Build the Source
unsigned int pgmID = clMgr->buildSource("rotation.cl");
// Create the kernel
cl::Kernel* kernel = clMgr->makeKernel(pgmID, "img_rotate");
// Create the memory Buffers
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
// Get the command Queue
cl::CommandQueue* queue = clMgr->getCmdQueue();
queue->enqueueWriteBuffer(*clIp, CL_TRUE, 0, W*H*sizeof(float), ip);
// Set the arguments to the kernel
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
kernel->setArg(2, W);
kernel->setArg(3, H);
kernel->setArg(4, sin_theta);
kernel->setArg(5, cos_theta);
// Run the kernel on specific NDRange
cl::NDRange globalws(W, H);
queue->enqueueNDRangeKernel(*kernel, cl::NullRange, globalws, cl::NullRange);
queue->enqueueReadBuffer(*clOp, CL_TRUE, 0, W*H*sizeof(float), op);
storeImage(op, outputFile, H, W, inputFile);
}
catch(cl::Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
}
I am getting the error at the queue->enqueueNDRangeKernel line.
I have the queue and the kernel stored in a class.
CLManager::CLManager()
: m_programIDs(-1)
{
// Initialize the Platform
cl::Platform::get(&m_platforms);
// Create a Context
cl_context_properties cps[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(m_platforms[0])(),
0
};
m_context = cl::Context(CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();
cl_int err;
m_queue = new cl::CommandQueue(m_context, m_devices[0], 0, &err);
}
cl::Kernel* CLManager::makeKernel(unsigned int programID, std::string kernelName)
{
cl::CommandQueue queue = cl::CommandQueue(m_context, m_devices[0]);
cl::Kernel* kernel = new cl::Kernel(*(m_programs[programID]), kernelName.c_str());
m_kernels.push_back(kernel);
return kernel;
}

I checked your code. I'm on Linux though. At runtime I'm getting Error -38, which means CL_INVALID_MEM_OBJECT. So I went and checked your buffers.
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
Then you pass the buffers as a Pointer:
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
But setArg is expecting a value, so the buffer pointers should be dereferenced:
kernel->setArg(0, *clOp);
kernel->setArg(1, *clIp);
After those changes the cat rotates ;)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex