Use HOST as a DEVICE - opencl

I am working with OpenCL, and the only hardware I have is a dual-core Intel Core i3 CPU, so I have exactly one OpenCL device (my CPU). So basically, I guess my HOST (the CPU) will also be the DEVICE. I tried to launch a kernel, but the task assigned to the DEVICE (which is also the HOST) never terminates. After thinking about this issue, it seemed obvious to me that a HOST waiting for the DEVICE (itself) to finish cannot work. Does anyone know a way to overcome this issue? Maybe by using clCreateSubDevices to subdivide my only device into a host and a real device?

You will find my Java code below so that you can point out my mistake. When I run the following code without the clFinish(commandQueue); call (near the bottom of the code), I get the following output:
I use the platform Intel(R) OpenCL
Enqueuing kernels...
Pause for 15000 ms.
Task INCOMPLETE
If I add clFinish(commandQueue) I have the output and my task is completed:
I use the platform Intel(R) OpenCL
Enqueuing kernels...
Event kernel status: CL_COMPLETE event ID: 10 runtime: 2.631ms
Pause for 15000 ms.
Task COMPLETE
So why does the single clFinish() call allow the task to complete? Thank you in advance for any explanation.
public class Test_CPU
{
private static String programSource0 =
"__kernel void vectorAdd(" +
" __global const float *a,"+
" __global const float *b, " +
" __global float *c)"+
"{"+
" int gid = get_global_id(0);"+
" c[gid] = a[gid]+b[gid];"+
"}";
/**
* The entry point of this sample
*
* #param args Not used
*/
public static void main(String args[])
{
/**
* Callback function that is called when the event ev has the event_status status and will display the runtime of execution kernel in seconds
* #param event: the event
* #param event_status: status of the event
* #param user_data: data given by the user is an integer tag that can be used to match profiling output to the associated kernel
* #return: none
*/
EventCallbackFunction kernelCommandEvent = new EventCallbackFunction()
{
#Override
public void function(cl_event event, int event_status, Object user_data)
{
int evID = (int)user_data;
long[] ev_start_time = new long[1];
Arrays.fill(ev_start_time, 0);
long[] ev_end_time = new long[1];
Arrays.fill(ev_end_time, 0);
long[] return_bytes = new long[1];
double run_time = 0.0;
clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_QUEUED, Sizeof.cl_long, Pointer.to(ev_start_time), return_bytes);
clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END , Sizeof.cl_long, Pointer.to(ev_end_time), return_bytes);
run_time = (double)(ev_end_time[0] - ev_start_time[0]);
System.out.println("Event kernel status: " + CL.stringFor_command_execution_status(event_status) + " event ID: " + evID + " runtime: " + String.format("%8.3f", (run_time*1.0e-6)) + " ms.");
}
};
// Initialize the input data
int n = 1000000;
float srcArrayA[] = new float[n];
float srcArrayB[] = new float[n];
float dstArray0[] = new float[n];
for (int i=0; i<srcArrayA.length; i++)
{
srcArrayA[i] = i;
srcArrayB[i] = i;
}
Pointer srcA = Pointer.to(srcArrayA);
Pointer srcB = Pointer.to(srcArrayB);
Pointer dst0 = Pointer.to(dstArray0);
// The platform, device type and device number that will be used
final int platformIndex = 1;
final long deviceType = CL_DEVICE_TYPE_CPU;
final int deviceIndex = 0;
// Enable exceptions and subsequently omit error checks in this sample
CL.setExceptionsEnabled(true);
// Obtain the number of platforms
int numPlatformsArray[] = new int[1];
clGetPlatformIDs(0, null, numPlatformsArray);
int numPlatforms = numPlatformsArray[0];
// Obtain a platform ID
cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
clGetPlatformIDs(platforms.length, platforms, null);
cl_platform_id platform = platforms[platformIndex];
long size[] = new long[1];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, null, size);
// Create a buffer of the appropriate size and fill it with the info
byte buffer[] = new byte[(int)size[0]];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, buffer.length, Pointer.to(buffer), null);
// Create a string from the buffer (excluding the trailing \0 byte)
System.out.println("I use the platform " + new String(buffer, 0, buffer.length-1));
// Initialize the context properties
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);
// Obtain the number of devices for the platform
int numDevicesArray[] = new int[1];
clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
int numDevices = numDevicesArray[0];
// Obtain a device ID
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
cl_device_id device = devices[deviceIndex];
// Create a context for the selected device
cl_context context = clCreateContext(contextProperties, 1, new cl_device_id[]{device}, null, null, null);
// Create a command-queue, with profiling info enabled
long properties = 0;
properties |= CL.CL_QUEUE_PROFILING_ENABLE;
cl_command_queue commandQueue = CL.clCreateCommandQueue(context, devices[0], properties, null);
// Allocate the buffer memory objects
cl_mem srcMemA = CL.clCreateBuffer(context, CL.CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcA, null);
cl_mem srcMemB = CL.clCreateBuffer(context, CL.CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcB, null);
cl_mem dstMem0 = CL.clCreateBuffer(context, CL.CL_MEM_READ_WRITE, Sizeof.cl_float * n, null, null);
// Create and build the the programs and the kernels
cl_program program0 = CL.clCreateProgramWithSource(context, 1, new String[]{ programSource0 }, null, null);
// Build the programs
CL.clBuildProgram(program0, 0, null, null, null, null);
// Create the kernels
cl_kernel kernel0 = CL.clCreateKernel(program0, "vectorAdd", null);
// Set the arguments
CL.clSetKernelArg(kernel0, 0, Sizeof.cl_mem, Pointer.to(srcMemA));
CL.clSetKernelArg(kernel0, 1, Sizeof.cl_mem, Pointer.to(srcMemB));
CL.clSetKernelArg(kernel0, 2, Sizeof.cl_mem, Pointer.to(dstMem0));
// Set work-item dimensions and execute the kernels
long globalWorkSize[] = new long[]{n};
System.out.println("Enqueueing kernels...");
cl_event[] myEventID = new cl_event[1];
myEventID[0] = new cl_event();
clEnqueueNDRangeKernel(commandQueue, kernel0, 1, null, globalWorkSize, null, 0, null, myEventID[0]);
int ID[] = new int[1];
ID[0] = 10;
clSetEventCallback(myEventID[0], CL_COMPLETE, kernelCommandEvent, ID[0]);
clFinish(commandQueue);
System.out.println("Pause for 15000 ms.");
try
{
Thread.sleep(15000);
}
catch(InterruptedException iEx)
{
iEx.printStackTrace();
}
// See if task completed
int[] ok = new int[1];
Arrays.fill(ok, 0);
clGetEventInfo(myEventID[0], CL_EVENT_COMMAND_EXECUTION_STATUS, Sizeof.cl_int, Pointer.to(ok), null);
if (ok[0] == CL_COMPLETE) System.out.println("Task COMPLETE");else System.out.println("Task INCOMPLETE");
}
}

I think my initial idea was not so far off: you do need to programmatically force the HOST to hand over to the DEVICE work when both HOST and DEVICE are the same hardware.
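In fact, it is possible to use the HOST as a DEVICE, but in order to let the DEVICE do its work, you need to invoke at least one blocking function (clFinish(), or clEnqueueReadBuffer(..., CL_TRUE, ...)). Otherwise, the HOST keeps running and the queued DEVICE work is never started. I tried adding a sleep() call instead, but it did not work; you really need a blocking OpenCL function. (Per the OpenCL specification, enqueued commands are only guaranteed to be submitted to the device once the queue is flushed, which clFinish() and blocking calls do implicitly; an explicit clFlush() should therefore also be enough.)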
Thanks at any rate!

Related

OpenCL, Understanding VectorAdd program

I'm new to OpenCL, with very limited background in C/C++.
I've been given this OpenCL program that adds two vectors, and supposed to figure out how it works. It comes from Intel:
https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/vector-addition.html
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Then these arrays are used by the kernel, where addition of the whole array is performed and stored as Z.
For example, say if the number of devices available is 1000, and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together - in other words each execution of the kernel adds 1000 elements?
Am I following anything, or totally wrong here?
The kernel is:
// ACL kernel: element-wise sum of two input vectors (z = x + y)
__kernel void vectorAdd(__global const float *x,
                        __global const float *y,
                        __global float *restrict z)
{
    // every work-item handles the element at its own global id
    const int gid = get_global_id(0);

    // read both operands, then store their sum
    const float lhs = x[gid];
    const float rhs = y[gid];
    z[gid] = lhs + rhs;
}
The host (main) code is (sorry it is long, not sure what's not important):
///////////////////////////////////////////////////////////////////////////////////
// This host program executes a vector addition kernel to perform:
// C = A + B
// where A, B and C are vectors with N elements.
//
// This host program supports partitioning the problem across multiple OpenCL
// devices if available. If there are M available devices, the problem is
// divided so that each device operates on N/M points. The host program
// assumes that all devices are of the same type (that is, the same binary can
// be used), but the code can be generalized to support different device types
// easily.
//
// Verification is performed against the same computation on the host CPU.
///////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "CL/opencl.h"
#include "AOCL_Utils.h"
using namespace aocl_utils;
// OpenCL runtime configuration
// Platform handle; assigned in init_opencl().
cl_platform_id platform = NULL;
// Number of devices reported for the platform; filled in by init_opencl().
unsigned num_devices = 0;
scoped_array<cl_device_id> device; // num_devices elements
// Single context created over all entries of device.
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
// Program object from which the kernel of every device is created.
cl_program program = NULL;
scoped_array<cl_kernel> kernel; // num_devices elements
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements
scoped_array<cl_mem> output_buf; // num_devices elements
// Problem data.
const unsigned N = 1000000; // problem size
// Host-side inputs, one array per device.
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
// Host-side destination of the results read back from each device.
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
// Sums computed on the host, used by run() to verify output.
scoped_array<scoped_array<float> > ref_output; // num_devices elements
// Share of the N elements assigned to each device.
scoped_array<unsigned> n_per_device; // num_devices elements
// Function prototypes
float rand_float();   // random value used to fill the inputs
bool init_opencl();   // creates the OpenCL objects; false on failure
void init_problem();  // allocates and fills the host data
void run();           // transfers, launches, times and verifies
void cleanup();       // releases the OpenCL objects
// Entry point: initialise OpenCL, prepare the inputs, execute and clean up.
// Returns -1 when OpenCL could not be initialised, 0 otherwise.
int main() {
  // Nothing else can be done when OpenCL could not be set up.
  const bool opencl_ready = init_opencl();
  if(!opencl_ready) {
    return -1;
  }

  // The problem data depends on the number of devices being known.
  init_problem();

  // Execute the kernel.
  run();

  // Release the allocated resources.
  cleanup();

  return 0;
}
/////// HELPER FUNCTIONS ///////
// Returns a pseudo-random float in the range [-10, 10], drawn from rand().
float rand_float() {
  // Map rand() onto [0, 1] first ...
  const float unit = float(rand()) / float(RAND_MAX);
  // ... then stretch to a width of 20 and shift down by 10.
  return unit * 20.0f - 10.0f;
}
// Initializes the OpenCL objects: platform, devices, context, program and,
// for every device, a command queue, a kernel and the three buffers.
// Also decides how many of the N elements each device processes.
// Returns false when the working directory, the platform or the devices
// cannot be set up; other failures are reported through checkError().
bool init_opencl() {
  cl_int status;
  printf("Initializing OpenCL\n");
  if(!setCwdToExeDir()) {
    return false;
  }
  // Get the OpenCL platform.
  platform = findPlatform("Altera");
  if(platform == NULL) {
    printf("ERROR: Unable to find Altera OpenCL platform.\n");
    return false;
  }
  // Query the available OpenCL device.
  device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
  printf("Platform: %s\n", getPlatformName(platform).c_str());
  // num_devices is unsigned, so it is printed with %u.
  printf("Using %u device(s)\n", num_devices);
  for(unsigned i = 0; i < num_devices; ++i) {
    printf(" %s\n", getDeviceName(device[i]).c_str());
  }
  // device[0] is used below as the representative device and num_devices is
  // used as a divisor, so stop here when no device is available.
  if(num_devices == 0) {
    printf("ERROR: No OpenCL devices found.\n");
    return false;
  }
  // Create the context.
  context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
  checkError(status, "Failed to create context");
  // Create the program for all device. Use the first device as the
  // representative device (assuming all device are of the same type).
  std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
  printf("Using AOCX: %s\n", binary_file.c_str());
  program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
  // Build the program that was just created.
  status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  checkError(status, "Failed to build program");
  // Create per-device objects.
  queue.reset(num_devices);
  kernel.reset(num_devices);
  n_per_device.reset(num_devices);
  input_a_buf.reset(num_devices);
  input_b_buf.reset(num_devices);
  output_buf.reset(num_devices);
  for(unsigned i = 0; i < num_devices; ++i) {
    // Command queue (profiling enabled so run() can query kernel times).
    queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
    checkError(status, "Failed to create command queue");
    // Kernel.
    const char *kernel_name = "vectorAdd";
    kernel[i] = clCreateKernel(program, kernel_name, &status);
    checkError(status, "Failed to create kernel");
    // Determine the number of elements processed by this device.
    n_per_device[i] = N / num_devices; // number of elements handled by this device
    // Spread out the remainder of the elements over the first
    // N % num_devices.
    if(i < (N % num_devices)) {
      n_per_device[i]++;
    }
    // Input buffers.
    input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        n_per_device[i] * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for input A");
    input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
        n_per_device[i] * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for input B");
    // Output buffer.
    output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        n_per_device[i] * sizeof(float), NULL, &status);
    checkError(status, "Failed to create buffer for output");
  }
  return true;
}
// Builds the host-side data set. num_devices and n_per_device must already
// be populated before this is called.
void init_problem() {
  if(0 == num_devices) {
    checkError(-1, "No devices");
  }

  input_a.reset(num_devices);
  input_b.reset(num_devices);
  output.reset(num_devices);
  ref_output.reset(num_devices);

  // Each device gets its own arrays (so that each device has an aligned
  // buffer); together they hold the N elements of A, B and the reference
  // result.
  for(unsigned dev = 0; dev < num_devices; ++dev) {
    const unsigned count = n_per_device[dev];
    input_a[dev].reset(count);
    input_b[dev].reset(count);
    output[dev].reset(count);
    ref_output[dev].reset(count);

    for(unsigned idx = 0; idx < count; ++idx) {
      // Draw A before B, then record the expected sum.
      const float a = rand_float();
      const float b = rand_float();
      input_a[dev][idx] = a;
      input_b[dev][idx] = b;
      ref_output[dev][idx] = a + b;
    }
  }
}
// Runs the vector addition on every device: transfers the inputs, launches
// the kernel, reads the result back, prints wall-clock and profiled kernel
// times and finally verifies the output against ref_output.
void run() {
  cl_int status;
  const double start_time = getCurrentTimestamp();
  // Launch the problem for each device.
  scoped_array<cl_event> kernel_event(num_devices);
  scoped_array<cl_event> finish_event(num_devices);
  for(unsigned i = 0; i < num_devices; ++i) {
    // Transfer inputs to each device. Each of the host buffers supplied to
    // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
    // for the host-to-device transfer.
    cl_event write_event[2];
    status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
    checkError(status, "Failed to transfer input A");
    status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
    checkError(status, "Failed to transfer input B");
    // Set kernel arguments.
    unsigned argi = 0;
    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);
    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);
    status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
    checkError(status, "Failed to set argument %d", argi - 1);
    // Enqueue kernel.
    // Use a global work size corresponding to the number of elements to add
    // for this device.
    //
    // We don't specify a local work size and let the runtime choose
    // (it'll choose to use one work-group with the same size as the global
    // work-size).
    //
    // Events are used to ensure that the kernel is not launched until
    // the writes to the input buffers have completed.
    const size_t global_work_size = n_per_device[i];
    // Both values are unsigned; print the element count from its unsigned
    // source rather than passing a size_t to a %d conversion.
    printf("Launching for device %u (%u elements)\n", i, n_per_device[i]);
    status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
        &global_work_size, NULL, 2, write_event, &kernel_event[i]);
    checkError(status, "Failed to launch kernel");
    // Read the result. This the final operation.
    status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
        0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
    // finish_event[i] is waited on below, so a failed read must not go
    // unnoticed.
    checkError(status, "Failed to read output");
    // Release local events.
    clReleaseEvent(write_event[0]);
    clReleaseEvent(write_event[1]);
  }
  // Wait for all devices to finish.
  clWaitForEvents(num_devices, finish_event);
  const double end_time = getCurrentTimestamp();
  // Wall-clock time taken.
  printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
  // Get kernel times using the OpenCL event profiling API.
  for(unsigned i = 0; i < num_devices; ++i) {
    cl_ulong time_ns = getStartEndTime(kernel_event[i]);
    printf("Kernel time (device %u): %0.3f ms\n", i, double(time_ns) * 1e-6);
  }
  // Release all events.
  for(unsigned i = 0; i < num_devices; ++i) {
    clReleaseEvent(kernel_event[i]);
    clReleaseEvent(finish_event[i]);
  }
  // Verify results; stop at the first mismatch.
  bool pass = true;
  for(unsigned i = 0; i < num_devices && pass; ++i) {
    for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
      if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
        printf("Failed verification # device %u, index %u\nOutput: %f\nReference: %f\n",
            i, j, output[i][j], ref_output[i][j]);
        pass = false;
      }
    }
  }
  printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Releases the OpenCL objects created during initialization. Each handle is
// released only when its array exists and the entry is non-null.
void cleanup() {
  for(unsigned dev = 0; dev < num_devices; ++dev) {
    // Per-device kernel and command queue.
    if(kernel) {
      if(kernel[dev]) {
        clReleaseKernel(kernel[dev]);
      }
    }
    if(queue) {
      if(queue[dev]) {
        clReleaseCommandQueue(queue[dev]);
      }
    }

    // Per-device buffers: both inputs, then the output.
    if(input_a_buf) {
      if(input_a_buf[dev]) {
        clReleaseMemObject(input_a_buf[dev]);
      }
    }
    if(input_b_buf) {
      if(input_b_buf[dev]) {
        clReleaseMemObject(input_b_buf[dev]);
      }
    }
    if(output_buf) {
      if(output_buf[dev]) {
        clReleaseMemObject(output_buf[dev]);
      }
    }
  }

  // Objects shared by all devices go last.
  if(program) {
    clReleaseProgram(program);
  }
  if(context) {
    clReleaseContext(context);
  }
}
There are a few sub-questions here, so let me try and address them individually. I'm going to be slightly pedantic on terminology; I'm not doing that to be snarky but hopefully this will help you make more sense of documentation, examples, etc.:
Would it be correct to say: each kernel uses 1 element from A and 1 element from B to calculate 1 element of Z?
The kernel is just the code that will run on the OpenCL device. Typically, a kernel is scheduled to run (using clEnqueueNDRangeKernel()) with multiple work-items. With just one work item, there is not much point in bothering with OpenCL at all; the performance benefit comes from massive parallelism. In any case, your quoted statement is correct for each individual work-item processing this kernel. If you run this kernel with 1000 work items, 1000 elements from A will be processed with 1000 elements from B to calculate 1000 elements of Z. The order this happens in is deliberately undefined, and at least groups of elements will be operated on concurrently.
To me, it looks like it determines the number of devices (num_devices), and essentially divides the problem size (N) by num_devices, to determine the number of elements per device (n_per_device[]). Then it creates arrays of random numbers for each device (input_a[] and input_b[]) with n_per_device number of elements.
Yes, it looks like that to me too.
For example, say if the number of devices available is 1000,
I would just like to point out that you will pretty much never have this many OpenCL devices in a system. The granularity of a single OpenCL device is typically "one GPU," or "all the CPU cores in the system," or "one FPGA accelerator card."
So a "normal" amount of devices on a desktop system is 1, 2, or maybe up to about 4 (e.g. CPU + iGPU + dual discrete GPUs). Big irons with many accelerator cards might have ~16 or so. If you're attempting to accelerate some code in a desktop (or small server) application, you'll usually just pick one device that's likely to be the most appropriate for your problem and run with that. Distributing workload evenly across heterogenous devices is a hard problem for anything but the most basic algorithms.
and problem size (N) is 1,000,000; the n_per_device is 1000 (and since there is no remainder it is the same for all), and it would generate 1000 arrays of input_a and input_b, with 1000 elements in each. Then a respective pair of arrays of 1000 elements are taken by the kernel and added together -
Yes.
in other words each execution of the kernel adds 1000 elements?
Again, this is where using the term "kernel" isn't precise enough. In your example, you would enqueue 1000 work items to execute the kernel on each of the 1000 devices.

How to dynamically fill the structure which is a pointer to pointer of arrays in C++ implementing xfs

Structure 1:
// Top-level cash-unit information block: a counted list of pointers to
// cash-unit structures. The sample code in this file passes it to WFSExecute
// with WFS_CMD_CDM_END_EXCHANGE.
typedef struct _wfs_cdm_cu_info
{
USHORT usTellerID; // set to 0 by the sample code; presumably a teller identifier - confirm against the XFS CDM specification
USHORT usCount; // number of entries in lppList (the sample sets 7 and allocates 7 pointers)
LPWFSCDMCASHUNIT * lppList; // array of usCount pointers, each to one WFSCDMCASHUNIT
} WFSCDMCUINFO, * LPWFSCDMCUINFO;
Structure 2:
// One cash unit, referenced from WFSCDMCUINFO::lppList. It carries its own
// counted list of physical cash units.
// NOTE(review): the field meanings below that are marked "presumably" are
// inferred from the names only - confirm against the XFS CDM specification.
typedef struct _wfs_cdm_cashunit
{
USHORT usNumber; // presumably the index number of the cash unit
USHORT usType; // presumably the type of the cash unit
LPSTR lpszCashUnitName; // pointer to a name string
CHAR cUnitID[5]; // fixed 5-character identifier field
CHAR cCurrencyID[3]; // fixed 3-character currency field
ULONG ulValues; // presumably the denomination; the sample code sets 50
ULONG ulInitialCount;
ULONG ulCount;
ULONG ulRejectCount;
ULONG ulMinimum;
ULONG ulMaximum;
BOOL bAppLock;
USHORT usStatus;
USHORT usNumPhysicalCUs; // number of entries in lppPhysical
LPWFSCDMPHCU * lppPhysical; // array of pointers, each to one WFSCDMPHCU
} WFSCDMCASHUNIT, * LPWFSCDMCASHUNIT;
Structure 3:
// One physical cash unit, referenced from WFSCDMCASHUNIT::lppPhysical.
// NOTE(review): field meanings are not established by the code in this file
// beyond their names - confirm against the XFS CDM specification.
typedef struct _wfs_cdm_physicalcu
{
LPSTR lpPhysicalPositionName; // pointer to a position name string
CHAR cUnitID[5]; // fixed 5-character identifier field
ULONG ulInitialCount;
ULONG ulCount;
ULONG ulRejectCount;
ULONG ulMaximum; // the sample code sets 2000
USHORT usPStatus;
BOOL bHardwareSensor;
} WFSCDMPHCU, * LPWFSCDMPHCU;
The code:
LPWFSCDMCUINFO lpWFSCDMCuinf = NULL;
LPWFSCDMCASHUNIT lpWFSCDMCashUnit = NULL;
LPWFSCDMPHCU lpWFSCDMPhcu = NULL;
int i=0;
try
{
hResult = WFMAllocateBuffer(sizeof(WFSCDMCUINFO),WFS_MEM_ZEROINIT|WFS_MEM_SHARE,(void**)&lpWFSCDMCuinf);
lpWFSCDMCuinf->usCount =7;
lpWFSCDMCuinf->usTellerID = 0;
hResult = WFMAllocateMore(7*sizeof(LPWFSCDMCASHUNIT),lpWFSCDMCuinf,(void**)&lpWFSCDMCuinf->lppList);
for(i=0;i<7;i++)
{
LPWFSCDMCASHUNIT lpWFSCDMCashUnit = NULL;
hResult = WFMAllocateMore(sizeof(WFSCDMCASHUNIT), lpWFSCDMCuinf, (void**)&lpWFSCDMCashUnit);
lpWFSCDMCuinf->lppList[i] = lpWFSCDMCashUnit;//store the pointer
//FILLING CASH UNIT
-----------------------------
lpWFSCDMCashUnit->ulValues =50;
-----------------------------
WFMAllocateMore(1* sizeof(LPWFSCDMPHCU), lpWFSCDMCuinf, (void**)&lpWFSCDMCashUnit->lppPhysical);// Allocate Physical Unit structure
for(int j=0;j<1;j++)
{
LPWFSCDMPHCU lpWFSCDMPhcu = NULL;
hResult = WFMAllocateMore(sizeof(WFSCDMPHCU), lpWFSCDMCuinf, (void**)&lpWFSCDMPhcu);
lpWFSCDMCashUnit->lppPhysical[j] = lpWFSCDMPhcu;
//FILLING Phy CASHUNIT
-------------------------------------------------------
lpWFSCDMPhcu->ulMaximum = 2000;
-----------------------------
}
}
//lpWFSCDMCuinf->lppList=&lpWFSCDMCashUnit;
hResult =WFSExecute (hService,WFS_CMD_CDM_END_EXCHANGE,(LPVOID)&lpWFSCDMCuinf,60000,&lppResult);
return (int)hResult;
I'm getting stuck while I retrieve all the values in structure 1.
I need to dynamically add the values into these structure and display Structure1 as output.An allocation of memory needs to be done for this.I have tried using the above code for allocating the memory but in spite of allocating the values are not properly stored in structure.
The value of usCount changes as per the denomination set. Based on this usNumPhysicalCUs is set.
Also when I send &lpWFSCDMCuinf within the WFSExecutemethod the lppPhysical seems to be empty.
I cant exactly figure out where I'm getting wrong.
First of all, you must allocate memory for each block separately.
For an array of pointers, you first allocate enough memory to hold the required number of pointers; then, for each pointer in that array, you must allocate memory for the structure it points to.
I rewrote your code in a shorter form. There is no error checking, and this code is a sample only.
LPWFSCDMCUINFO lpWFSCDMCuinf = NULL;
HRESULT hr = WFMAllocateBuffer(sizeof(WFSCDMCUINFO), WFS_MEM_ZEROINIT|WFS_MEM_SHARE, (void**)&lpWFSCDMCuinf);
// Allocate 7 times of WFSCDMCASHUNIT
const int cuCount = 7;
lpWFSCDMCuinf->usCount = cuCount;
hr = WFMAllocateMore(cuCount * sizeof(LPWFSCDMCASHUNIT), lpWFSCDMCuinf, (void**)&lpWFSCDMCuinf->lppList);
for (int i=0; i < cuCount; i++)
{
// for one entry
LPWFSCDMCASHUNIT currentCU = NULL;
hr = WFMAllocateMore(sizeof(WFSCDMCASHUNIT), lpWFSCDMCuinf, (void**)&currentCU);
// Store pinter
lpWFSCDMCuinf->lppList[i] = currentCU;
// Fill current CU data here
// ....
// Allocate Phisical Unit Pointers
const int phuCount = 1;
currentCU->usNumPhysicalCUs = phuCount;
WFMAllocateMore(phuCount * sizeof(LPWFSCDMPHCU), lpWFSCDMCuinf, (void**)&currentCU->lppPhysical);
// Allocate Phisical Unit structure
for (int j=0; j < phuCount; j++)
{
LPWFSCDMPHCU phuCurrent = NULL;
// Allocate Phisical Unit structure
WFMAllocateMore(sizeof(WFSCDMPHCU), lpWFSCDMCuinf, (void**)&phuCurrent);
currentCU->lppPhysical[j] = phuCurrent;
// Fill Phisical Unit here
// ..
// ..
}
}
In addition to this sample, I recommend writing some helper functions to allocate XFS structures like WFSCDMCUINFO. In my own project I used some macros to serialize XFS structures in memory with the WFMAllocateBuffer and WFMAllocateMore functions.
XFS structures are complex and differ from one device class to another. I wrote some macros to serialize and deserialize structures to and from memory streams and XFS memory buffers. In the application I use heap allocation to store XFS structures in memory, but when I need to return structures in another XFS message I have to transfer them into XFS memory using WFMAllocateBuffer and WFMAllocateMore.

Opencl: GPU Execution Time is always Zero

I am trying to print the execution time of some functions on the GPU, but the timing on the GPU always comes out as 0. When I choose CL_DEVICE_TYPE_CPU, as in the following line, it works fine.
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
This works fine and shows non-zero value of execution time but if I choose CL_DEVICE_TYPE_GPU, then it always shows 0, irrespective of total no. of data points and threads. please note that in both cases (CL_DEVICE_TYPE_CPU and CL_DEVICE_TYPE_GPU), I am printing the execution time in same way. That is my host code and my kernel code is same in both cases(thats what openCL is!). Following are some of the code section:
// openCL code to get platform and device ids
errcode = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
// to create context
clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
//Create a command-queue (profiling enabled so event timestamps are recorded)
clCommandQue = clCreateCommandQueue(clGPUContext,
                                    device_id, CL_QUEUE_PROFILING_ENABLE, &errcode);
// Setup device memory
d_instances= clCreateBuffer(clGPUContext,CL_MEM_READ_ONLY |
                            CL_MEM_COPY_HOST_PTR,mem_size_i,instances->data, &errcode);
d_centroids = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_c, NULL, &errcode);
d_distance = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_d,NULL, &errcode);
// d_dist_X = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//d_dist_Y = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//to build program
clProgram = clCreateProgramWithSource(clGPUContext,1, (const char **)&source_str,(const
                                      size_t*)&source_size, &errcode);
errcode = clBuildProgram(clProgram, 0,NULL, NULL, NULL, NULL);
if (errcode == CL_BUILD_PROGRAM_FAILURE)
{
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
                          &log_size);
    // Allocate memory for the log
    char *log = (char *) malloc(log_size);
    // Get the log
    clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, log_size, log,
                          NULL);
    // Print the log
    printf("%s\n", log);
    // The log buffer is no longer needed once printed.
    free(log);
}
clKernel = clCreateKernel(clProgram,"distance_finding", &errcode);
// Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
if(num_instances >= 500)
{
    // NOTE(review): a work-group size of 500 may exceed what the device
    // supports (see CL_DEVICE_MAX_WORK_GROUP_SIZE); in that case the enqueue
    // below fails, which is now reported instead of being ignored.
    localWorkSize[0] = 500;
    // NOTE(review): the division below is an integer division, so ceil() has
    // no effect and the global size is rounded DOWN to a multiple of 500.
    // Rounding up instead would require the kernel to ignore work-items past
    // num_instances, so the computation is deliberately left unchanged here.
    float block1=num_instances/localWorkSize[0];
    int block= (int)(ceil(block1));
    globalWorkSize[0] = block*localWorkSize[0];
}
else
{
    localWorkSize[0]=num_instances;
    globalWorkSize[0]=num_instances;
}
int iteration=1;
while(iteration < MAX_ITERATIONS)
{
    // Blocking uploads of the current centroids and distances
    errcode = clEnqueueWriteBuffer(clCommandQue,d_centroids , CL_TRUE, 0,
                                   mem_size_c, (void*)centroids->data, 0, NULL, NULL);
    errcode = clEnqueueWriteBuffer(clCommandQue,d_distance , CL_TRUE, 0, mem_size_d,
                                   (void*)distance->data, 0, NULL, NULL);
    //set kernel arguments
    errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_instances);
    errcode = clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_centroids);
    errcode = clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_distance);
    errcode = clSetKernelArg(clKernel, 3,sizeof(unsigned int), (void *)
                             &num_instances);
    errcode = clSetKernelArg(clKernel,4,sizeof(unsigned int),(void *)&clusters);
    errcode = clSetKernelArg(clKernel,5,sizeof(unsigned int),(void *)&dimensions);
    errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 1, NULL,
                                     globalWorkSize,localWorkSize, 0, NULL, &myEvent);
    if (errcode != CL_SUCCESS)
    {
        // No kernel was queued, so myEvent is not a valid event and the
        // profiling queries below would not return meaningful timestamps.
        printf("clEnqueueNDRangeKernel failed with error %d\n", errcode);
        break;
    }
    clFinish(clCommandQue); // wait for all events to finish
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(cl_ulong),
                            &startTime, NULL);
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(cl_ulong),
                            &endTime, NULL);
    // Timestamps are in nanoseconds.
    kernelExecTimeNs = endTime-startTime;
    gpu_time+= kernelExecTimeNs;
    // A new event is created on every iteration; release this one now that
    // its timestamps have been read.
    clReleaseEvent(myEvent);
    // Retrieve result from device
    errcode = clEnqueueReadBuffer(clCommandQue,d_distance, CL_TRUE, 0,
                                  mem_size_d,distance->data, 0, NULL, NULL);
Printing the time in ms:
printf("\n\n Time taken by GPU is %llu ms",gpu_time/1000000);
If the way I am calculating the GPU timing is wrong, why would it work on a CPU (by changing to CL_DEVICE_TYPE_CPU)? What is wrong here?
Edited:
System Information
AMD APP SDK 2.4
AMD ATI FirePro GL 3D, having 800 cores
Kerenel
#pragma OPENCL EXTENSION cl_khr_fp64:enable
// Euclidean distance between a centroid and a data point over the first
// `dimensions` components. The sum of squares is accumulated in float and
// the root is returned as double.
double distance_cal(__local float* cent,float* data,int dimensions)
{
    float sum_sq = 0.0f;
    for (int d = 0; d < dimensions; ++d)
    {
        const float diff = data[d] - cent[d];
        sum_sq += diff * diff;
    }
    return sqrt(sum_sq);
}
// Gathers one strided column from constant memory into `data`:
// data[d] = x[col + d * len] for d in [0, dimension).
void fetch_col(float* data,__constant float* x,int col,int dimension,int len)
{
    //hari[i]=8;
    int src = col;
    for (int d = 0; d < dimension; ++d, src += len)
    {
        data[d] = x[src];
    }
}
// Gathers one strided column from global memory into the local array `data`:
// data[d] = x[col + d * len] for d in [0, dimension).
void fetch_col_cen(__local float* data,__global float* x,int col,int dimension,int len)
{
    //hari[i]=8;
    int src = col;
    for (int d = 0; d < dimension; ++d, src += len)
    {
        data[d] = x[src];
    }
}
// For the data point owned by this work-item, computes the distance to every
// centroid and stores it at dist[idx + i * inst] for centroid i.
//   data       - input points; component d of point p is read at data[p + d * inst]
//   cen        - centroids; component d of centroid i is read at cen[i + d * clus]
//   dist       - output distances
//   inst       - number of points (stride of `data` and of `dist`)
//   clus       - number of centroids
//   dimensions - components per point
// NOTE(review): data_col and cent hold 4 floats, so dimensions must not
// exceed 4; there is also no idx < inst check, so the global size must not
// exceed inst.
__kernel void distance_finding(__constant float* data,__global float* cen,__global float* dist,
                               int inst,int clus,const int dimensions)
{
    // Local-memory variables have to be declared at kernel function scope,
    // not inside the loop.
    __local float cent[4];

    int idx=get_global_id(0);
    float data_col[4];
    fetch_col( data_col,data,idx,dimensions,inst);
    for(int i=0;i<clus;i++)
    {
        int k=i*inst; // take each dimension value for each cluster data
        // Wait until every work-item has finished reading the previous
        // centroid before it gets overwritten.
        barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
        // One work-item per work-group loads the shared centroid; the others
        // would only write the same values again.
        if (get_local_id(0) == 0)
        {
            fetch_col_cen(cent,cen,i,dimensions,clus);
        }
        // Make the freshly written centroid visible to the whole work-group
        // before anyone reads it.
        barrier(CLK_LOCAL_MEM_FENCE);
        // calculate distance wrt each data n each centroid
        dist[idx+k]=distance_cal(cent,data_col,dimensions);
    }
}
clEnqueueNDRangeKernel() is asynchronous when it is using the GPU, and therefore you only see the time it took to enqueue the request, not the time it took to execute it.
That said, I could be wrong, but I usually write c++ code to do the timing and put the start_time before the instruction and end_time after the
clFinish(cmd_queue);
just like you did with C++ timing code, that would be a good test, if you're sure your GPU shouldn't be finishing by 0 seconds.
An easy way to check would be to introduce an abnormally long operation inside the kernel. If THAT shows up as zero when there a perceptible lag in actual execution - then you have your answer.
That said, I believe (even though the indicated thread is for Linux, it probably holds water on Windows too) you might need to install the instrumented drivers to even have the system write to the performance counters. You can also use the CUDA profiler on nVidia's OpenCL implementation because it sits on top of CUDA.
change to
clFinish(clCommandQue); // wait for all events to finish
// add this after clFinish()
// Ensure kernel execution is finished
clWaitForEvents(1 , &myEvent);
..
double gpu_time = endTime-startTime;
..
printf("\n\n Time taken by GPU is %0.3f ms", gpu_time/1000000.0);

GMainContext have ref_count > 0 after unref

I am not getting ref_count to decrease properly for my GMainContext. The example program here is a small version of a large program (which uses threads, hence the need to create a context and push it on the thread).
/* Main loop run by main(); read_done_cb() quits it. */
GMainLoop *loop;
/* Context the loop is created on; main() pushes it as the thread default. */
GMainContext *ctx;
/* State for the single connection, passed to every callback as user_data. */
struct conn
{
GSocketClient *client; /* created in main(), used to start the connect */
GSocketConnection *conn; /* result of the connect, set in connect_done_cb() */
GInputStream *in; /* input stream obtained from conn */
GOutputStream *out; /* output stream obtained from conn */
gchar data[8192]; /* buffer the async reads write into */
unsigned int count; /* number of read completions seen so far */
};
/*
 * Completion callback for g_input_stream_read_async().
 * user_data is the struct conn the read was started on.
 * Finishes the read, immediately starts the next one, and quits the main
 * loop on the second invocation (when count is 1 on entry).
 */
static void
read_done_cb(GObject *source_object, GAsyncResult *res, gpointer user_data)
{
struct conn *c = (struct conn *)user_data;
/* len is not used afterwards; errors and end-of-stream are not examined. */
gssize len = g_input_stream_read_finish(c->in, res, NULL);
/* The next read is queued BEFORE the quit check below, so a read is still
 * outstanding when g_main_loop_quit() is called. */
g_input_stream_read_async(c->in, c->data, sizeof c->data / sizeof *c->data, G_PRIORITY_DEFAULT, NULL, read_done_cb, c);
if (c->count++ == 1) {
printf("End of life as I know it...\n");
g_main_loop_quit(loop);
}
}
/*
 * Completion callback passed to g_output_stream_write_async() in
 * connect_done_cb(). The body is empty: the outcome of the write is never
 * retrieved or checked.
 */
static void
write_done_cb(GObject *source_object, GAsyncResult *res, gpointer user_data)
{
}
/*
 * Completion callback for g_socket_client_connect_to_host_async().
 * user_data is the struct conn whose client started the connect.
 *
 * On success it stores the connection and its streams in the struct, sends
 * the HTTP request and starts the first asynchronous read. On failure it
 * reports the error and quits the main loop, because no further callback
 * would ever run to stop it.
 */
static void
connect_done_cb(GObject *source_object, GAsyncResult *res, gpointer user_data)
{
    /* Static storage: the buffer must stay valid until the async write ends. */
    static const char request[] = "GET /axis-cgi/mjpg/video.cgi HTTP/1.0\r\n\r\n";
    struct conn *c = (struct conn *)user_data;
    GError *error = NULL;

    printf("## %s\n", __FUNCTION__);
    c->conn = g_socket_client_connect_to_host_finish(c->client, res, &error);
    if (c->conn == NULL) {
        g_printerr("connect failed: %s\n", error != NULL ? error->message : "unknown error");
        g_clear_error(&error);
        g_main_loop_quit(loop);
        return;
    }

    /* The streams are owned by the connection; no extra reference is taken. */
    c->in = g_io_stream_get_input_stream(G_IO_STREAM(c->conn));
    c->out = g_io_stream_get_output_stream(G_IO_STREAM(c->conn));

    /* sizeof - 1 drops the terminating NUL, matching strlen(). */
    g_output_stream_write_async(c->out, request, sizeof request - 1, G_PRIORITY_DEFAULT, NULL, write_done_cb, c);
    g_input_stream_read_async(c->in, c->data, sizeof c->data / sizeof *c->data, G_PRIORITY_DEFAULT, NULL, read_done_cb, c);
}
/*
 * Creates a private GMainContext and loop, makes the context the thread
 * default, starts an asynchronous connect and runs the loop until one of the
 * callbacks quits it. Afterwards it releases everything it created.
 */
int
main(int argc, char **argv)
{
    struct conn *c;

    g_type_init(); /* needed before GLib 2.36; a deprecated no-op since then */
    c = g_malloc0(sizeof *c);

    ctx = g_main_context_new();
    loop = g_main_loop_new(ctx, FALSE);
    /* Async operations started from here on are dispatched on ctx. */
    g_main_context_push_thread_default(ctx);

    c->client = g_socket_client_new();
    g_socket_client_connect_to_host_async(c->client, "10.85.25.20", 80, NULL, connect_done_cb, c);
    g_main_loop_run(loop);

    /* conn stays NULL when the connect never succeeded. */
    if (c->conn != NULL)
        g_io_stream_close(G_IO_STREAM(c->conn), NULL, NULL);
    g_object_unref(c->client);
    if (c->conn != NULL)
        g_object_unref(c->conn);

    g_main_context_pop_thread_default(ctx);
    g_main_loop_unref(loop);
    g_main_context_unref(ctx);
    g_free(c);
    return 0;
}
Using gdb and inserting a breakpoint just before the return, I can see that ctx still has a ref count of one:
(gdb) p ctx->ref_count
$2 = 1
If I do another g_main_context_unref(ctx); everything shuts down as expected. I do not understand where I get this ownership though.
Thanks in advance for your help
I found the error. In read_done_cb I issued another g_input_stream_read_async and immediately afterwards quit the main loop. g_input_stream_read_async upped the ref_count, but the GMainLoop never got a chance to return to my callback (and so never decreased the ref_count on my GMainContext).
Moving the call to g_input_stream_read_async in my callback to below the if statement
/*
 * Completion callback for g_input_stream_read_async().
 * user_data is the struct conn the read was started on.
 *
 * Finishes the read, then either quits the main loop or queues the next
 * read, never both. A read queued after g_main_loop_quit() would stay
 * pending with nothing left to complete it.
 */
static void
read_done_cb(GObject *source_object, GAsyncResult *res, gpointer user_data)
{
    struct conn *c = (struct conn *)user_data;
    gssize len = g_input_stream_read_finish(c->in, res, NULL);

    /* Stop on the second completion, or as soon as the stream reports an
     * error (len < 0) or end of stream (len == 0). */
    if (len <= 0 || c->count++ == 1) {
        printf("End of life as I know it...\n");
        g_main_loop_quit(loop);
        return;
    }

    g_input_stream_read_async(c->in, c->data, sizeof c->data / sizeof *c->data, G_PRIORITY_DEFAULT, NULL, read_done_cb, c);
}
correctly resolved the number of ref counts on my main context.
Silly mistake. Hopefully someone will find some use of my post at least.
g_main_context_new(), g_main_loop_new(), and g_main_context_push_thread_default() all ref the context. g_main_context_pop_thread_default(), g_main_loop_unref(), and g_main_context_unref() all unref it. So your intuition is sound.
I would use a watchpoint in gdb: watch ctx->ref_count to find out where the extra reference is being added.

OpenCL enqueueNDRangeKernel causes Access Violation error

I am continuously getting an Access Violation error with all the kernels that I am trying to build myself. Other kernels which I take from books seem to work fine.
https://github.com/ssarangi/VideoCL - This is where the code is.
Something seems to be missing in this. Could someone help me with this.
Thanks so much.
[James] - Thanks for the suggestion and you are right. I am doing it on Win 7 with a AMD Redwood card. I have the Catalyst 11.7 drivers with AMD APP SDK 2.5. I am posting the code below.
#include <iostream>
#include "bmpfuncs.h"
#include "CLManager.h"
// Rotates "input.bmp" by theta on an OpenCL device and writes "output.bmp".
// Steps: read the image, build rotation.cl, create the img_rotate kernel,
// upload the input, run one work-item per pixel, read back and store the result.
// NOTE(review): standard C++ requires main to return int; `void main` is a
// compiler extension.
void main()
{
float theta = 3.14159f/6.0f;
// W and H are filled in by readImage() through the pointers passed below.
int W ;
int H ;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
float* ip = readImage(inputFile, &W, &H);
// Host-side output image, one float per pixel; never deleted in this function.
float *op = new float[W*H];
//We assume that the input image is the array "ip"
//and the angle of rotation is theta
float cos_theta = cos(theta);
float sin_theta = sin(theta);
try
{
// clMgr is allocated with new and not deleted in this function.
CLManager* clMgr = new CLManager();
// Build the Source
unsigned int pgmID = clMgr->buildSource("rotation.cl");
// Create the kernel
cl::Kernel* kernel = clMgr->makeKernel(pgmID, "img_rotate");
// Create the memory Buffers
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
// Get the command Queue
cl::CommandQueue* queue = clMgr->getCmdQueue();
// Blocking write (CL_TRUE): returns once ip has been copied to the device buffer.
queue->enqueueWriteBuffer(*clIp, CL_TRUE, 0, W*H*sizeof(float), ip);
// Set the arguments to the kernel
// NOTE(review): clOp and clIp are cl::Buffer* here, so setArg receives the
// host pointer itself rather than the buffer object; the buffers are
// dereferenced everywhere else in this function (*clIp, *clOp).
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
kernel->setArg(2, W);
kernel->setArg(3, H);
kernel->setArg(4, sin_theta);
kernel->setArg(5, cos_theta);
// Run the kernel on specific NDRange
// 2-D global range of W x H work-items; the local size is left to the runtime.
cl::NDRange globalws(W, H);
queue->enqueueNDRangeKernel(*kernel, cl::NullRange, globalws, cl::NullRange);
// Blocking read (CL_TRUE): op holds the result when this call returns.
queue->enqueueReadBuffer(*clOp, CL_TRUE, 0, W*H*sizeof(float), op);
// NOTE(review): arguments are passed as (H, W) here - presumably rows then
// columns; confirm against bmpfuncs.h.
storeImage(op, outputFile, H, W, inputFile);
}
catch(cl::Error error)
{
// Prints the failing call's description and the OpenCL error code.
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
}
I am getting the error at the queue->enqueueNDRangeKernel line.
I have the queue and the kernel stored in a class.
// Sets up the OpenCL state shared by the manager: queries the platforms,
// creates a GPU context on the first platform, fetches that context's devices
// and creates a command queue on the first device.
// Throws cl::Error when no platform or device is available or when the queue
// cannot be created.
CLManager::CLManager()
    : m_programIDs(-1)
{
    // Initialize the Platform
    cl::Platform::get(&m_platforms);
    if (m_platforms.empty())
        throw cl::Error(CL_INVALID_PLATFORM, "CLManager: no OpenCL platform found");

    // Create a Context on the first platform, restricted to GPU devices
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(m_platforms[0])(),
        0
    };
    m_context = cl::Context(CL_DEVICE_TYPE_GPU, cps);

    // Get a list of devices on this platform
    m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();
    if (m_devices.empty())
        throw cl::Error(CL_DEVICE_NOT_FOUND, "CLManager: context has no devices");

    // The wrapper reports failures through err when it does not throw itself,
    // so the code has to be checked rather than discarded.
    cl_int err = CL_SUCCESS;
    m_queue = new cl::CommandQueue(m_context, m_devices[0], 0, &err);
    if (err != CL_SUCCESS)
    {
        delete m_queue;
        m_queue = NULL;
        throw cl::Error(err, "CLManager: clCreateCommandQueue");
    }
}
// Creates the kernel named `kernelName` from the program stored under
// `programID`, records it in m_kernels and returns it.
// The returned pointer is also kept in m_kernels - presumably the manager
// releases it later; confirm against the destructor.
cl::Kernel* CLManager::makeKernel(unsigned int programID, std::string kernelName)
{
    // No command queue is needed to create a kernel, so the throw-away queue
    // the earlier version constructed on every call has been dropped.
    cl::Kernel* kernel = new cl::Kernel(*(m_programs[programID]), kernelName.c_str());
    m_kernels.push_back(kernel);
    return kernel;
}
I checked your code. I'm on Linux though. At runtime I'm getting Error -38, which means CL_INVALID_MEM_OBJECT. So I went and checked your buffers.
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
Then you pass the buffers as a Pointer:
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
But setArg is expecting a value, so the buffer pointers should be dereferenced:
kernel->setArg(0, *clOp);
kernel->setArg(1, *clIp);
After those changes the cat rotates ;)

Resources