I'm creating an application with OpenCL 1.2 as a test for a bigger application. The test adds 1 to each value of a 4x4 matrix on every kernel execution. The idea is to get double buffering to work. I created two kernels that do essentially the same thing; they share the same READ_WRITE buffer so each kernel execution can continue where the last one left off, but they differ in that each has its own output buffer, allowing one output buffer to be used by a kernel while the data of the other is being read, like this:
Diagram
The pieces of code I think are relevant or could be problematic are the following. I include the queues, buffers and events just in case, but I have tried changing everything related to them:
Queues
compute_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err);
data_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err);
Buffers
input_Parametros = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double) * 5, Parametros, NULL);
input_matA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double) * 4, matA_1, NULL); // The 4x4 matrix
output_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , sizeof(double) * 4 * iteraciones_por_kernel, NULL, NULL);
output_buffer_2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY , sizeof(double) * 4 * iteraciones_por_kernel, NULL, NULL);
Argument set for each kernel
clSetKernelArg(kernel_1, 0, sizeof(cl_mem), &input_matA);
clSetKernelArg(kernel_1, 1, sizeof(cl_mem), &input_Parametros);
clSetKernelArg(kernel_1, 3, sizeof(cl_mem), &output_buffer);
clSetKernelArg(kernel_2, 0, sizeof(cl_mem), &input_matA);
clSetKernelArg(kernel_2, 1, sizeof(cl_mem), &input_Parametros);
clSetKernelArg(kernel_2, 3, sizeof(cl_mem), &output_buffer_2);
Events
cl_event event_1, event_2, event_3, event_4;
Kernel and Read enqueue
////////////////////////////////////////////////////////////////
// START
////////////////////////////////////////////////////////////////
clEnqueueNDRangeKernel(compute_queue, kernel_1, 1, NULL, global, local, 0, 0, &event_1);
clEnqueueNDRangeKernel(compute_queue, kernel_2, 1, NULL, global, local, 0, 0, &event_2);
clEnqueueReadBuffer(data_queue, output_buffer, CL_FALSE, 0, sizeof(double)*4*iteraciones_por_kernel, datos_salida, 1 , &event_1, &event_3);
////////////////////////////////////////////////////////////////
// ENQUEUE LOOP
////////////////////////////////////////////////////////////////
for (int i = 1; i <= (n_iteraciones_int - 2); i++){
////////////////////////////////////////////////////////////////
// LOOP PART 1
////////////////////////////////////////////////////////////////
if (i % 2 != 0){
clEnqueueNDRangeKernel(compute_queue, kernel_1, 1, NULL, global, local, 1, &event_3, &event_1);
clEnqueueReadBuffer(data_queue, output_buffer_2, CL_FALSE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[i*iteraciones_por_kernel_int*4], 1, &event_2, &event_4);
}
////////////////////////////////////////////////////////////////
// LOOP PART 2
////////////////////////////////////////////////////////////////
if (i % 2 == 0){
clEnqueueNDRangeKernel(compute_queue, kernel_2, 1, NULL, global, local, 1, &event_4, &event_2);
clEnqueueReadBuffer(data_queue, output_buffer, CL_FALSE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[i*iteraciones_por_kernel_int * 4], 1, &event_1, &event_3);
}
}
////////////////////////////////////////////////////////////////
// END
////////////////////////////////////////////////////////////////
clEnqueueReadBuffer(data_queue, output_buffer_2, CL_TRUE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[(n_iteraciones_int - 1) * 4], 1, &event_2, 0);
I just can't get this to work even though everything seems fine. The first read gives the expected values, but from then on it's as if the kernels don't execute anymore, since I get 0's from output_buffer_2 and the same values as in the first read from the first output_buffer.
This works perfectly fine with the same kernels and just one queue that does it all with a single data transfer at the end, but I don't want that.
I reviewed everything and investigated as much as I could, and tried every variation I could imagine. This should be simple and possible, I think... where is the problem?
I'm using an AMD HD7970 as the device, Windows 10 and Visual Studio Community 2013, if that is any help.
Thanks to huseyin tugrul buyukisik's help, the program worked with the following variations:
Events
cl_event event[20]; //adjust this to your needs
Kernel and read enqueue
////////////////////////////////////////////////////////////////
// START
////////////////////////////////////////////////////////////////
clEnqueueNDRangeKernel(compute_queue, kernel_1, 1, NULL, global, local, 0, 0, &event[0]);
clEnqueueNDRangeKernel(compute_queue, kernel_2, 1, NULL, global, local, 0, 0, &event[1]);
clEnqueueReadBuffer(data_queue, output_buffer, CL_FALSE, 0, sizeof(double)*4*iteraciones_por_kernel, datos_salida, 1 , &event[0], &event[2]);
////////////////////////////////////////////////////////////////
// LOOP
////////////////////////////////////////////////////////////////
for (int i = 1; i <= (n_iteraciones_int - 2); i++){
////////////////////////////////////////////////////////////////
// LOOP PART 1
////////////////////////////////////////////////////////////////
if (i % 2 == 1){
clEnqueueNDRangeKernel(compute_queue, kernel_1, 1, NULL, global, local, 1, &event[2+2*(i - 1)], &event[4 + 2 * (i - 1)]);
clEnqueueReadBuffer(data_queue, output_buffer_2, CL_FALSE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[i*(iteraciones_por_kernel_int) * 4], 1, &event[1+2*(i - 1)], &event[3 + 2 * (i - 1)]);
}
////////////////////////////////////////////////////////////////
// LOOP PART 2
////////////////////////////////////////////////////////////////
if (i % 2 == 0){
clEnqueueNDRangeKernel(compute_queue, kernel_2, 1, NULL, global, local, 1, &event[3 + 2 * (i - 2)], &event[5 + 2 * (i - 2)]);
clEnqueueReadBuffer(data_queue, output_buffer, CL_FALSE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[i*(iteraciones_por_kernel_int) * 4], 1, &event[4 + 2 * (i - 2)], &event[6 + 2 * (i - 2)]);
}
}
////////////////////////////////////////////////////////////////
// END
////////////////////////////////////////////////////////////////
clFlush(compute_queue);
clFlush(data_queue);
clEnqueueReadBuffer(data_queue, output_buffer_2, CL_TRUE, 0, sizeof(double) * 4 * iteraciones_por_kernel, &datos_salida[(n_iteraciones_int-1)*(iteraciones_por_kernel_int) * 4], 1, &event[5+2*(n_iteraciones_int-4)], 0);
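For completeness, once the final blocking read has returned, the event objects created above should be released so they do not leak. A minimal cleanup sketch, assuming event[] was zero-initialized (e.g. cl_event event[20] = {0};) so that unused slots can be skipped:
clFinish(compute_queue);              // make sure no enqueued command still references the events
clFinish(data_queue);
for (int e = 0; e < 20; e++) {        // 20 matches the declared size of event[]
    if (event[e] != NULL)
        clReleaseEvent(event[e]);     // drop the reference created by the enqueue calls
}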
In this code I am trying to broadcast using non-blocking sends and receives as practice. I have multiple questions and issues:
1. Should I pair Isend() and Irecv() to use the same request?
2. When the message is an array, how should it be passed? In this case, message or &message?
3. Why can I not run this code on fewer or more than 8 processors? If the rank doesn't exist, shouldn't it just go on without executing that piece of code?
4. The snippet at the bottom is there in order to print the total time once, but the Waitall() does not work, and I do not understand why.
5. When passing arrays longer than 2^12, I get a segmentation fault, although I have checked the limits of Isend() and Irecv() and they are supposed to handle even longer messages.
6. I used long double to record the time; is this a common or good practice? When I used smaller types like float or double I would get NaN.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<mpi.h>
int main(int argc, char *argv[]){
MPI_Init(&argc, &argv);
int i, rank, size, ready;
long int N = pow(2, 10);
float* message = (float *)malloc(sizeof(float *) * N + 1);
long double start, end;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
//MPI_Request* request = (MPI_Request *)malloc(sizeof(MPI_Request *) * size);
MPI_Request request[size-1];
/*Stage I: -np 8*/
if(rank == 0){
for(i = 0; i < N; i++){
message[i] = N*rand();
message[i] /= rand();
}
start = MPI_Wtime();
MPI_Isend(&message, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[0]);
MPI_Isend(&message, N, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &request[1]);
MPI_Isend(&message, N, MPI_FLOAT, 4, 0, MPI_COMM_WORLD, &request[3]);
printf("Processor root-rank %d- sent the message...\n", rank);
}
if (rank == 1){
MPI_Irecv(&message, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request[0]);
MPI_Wait(&request[0], MPI_STATUS_IGNORE);
printf("Processor rank 1 received the message.\n");
MPI_Isend(&message, N, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &request[2]);
MPI_Isend(&message, N, MPI_FLOAT, 5, 0, MPI_COMM_WORLD, &request[4]);
}
if(rank == 2){
MPI_Irecv(&message, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request[1]);
MPI_Wait(&request[1], MPI_STATUS_IGNORE);
printf("Processor rank 2 received the message.\n");
MPI_Isend(&message, N, MPI_FLOAT, 6, 0, MPI_COMM_WORLD, &request[5]);
}
if(rank == 3){
MPI_Irecv(&message, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[2]);
MPI_Wait(&request[2], MPI_STATUS_IGNORE);
printf("Processor rank 3 received the message.\n");
MPI_Isend(&message, N, MPI_FLOAT, 7, 0, MPI_COMM_WORLD, &request[6]);
}
if(rank == 4){
MPI_Irecv(&message, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request[3]);
MPI_Wait(&request[3], MPI_STATUS_IGNORE);
printf("Processor rank 4 received the message.\n");
}
if(rank == 5){
MPI_Irecv(&message, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[4]);
MPI_Wait(&request[4], MPI_STATUS_IGNORE);
printf("Processor rank 5 received the message.\n");
}
if(rank == 6){
MPI_Irecv(&message, N, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &request[5]);
MPI_Wait(&request[5], MPI_STATUS_IGNORE);
printf("Processor rank 6 received the message.\n");
}
if(rank == 7){
MPI_Irecv(&message, N, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &request[6]);
MPI_Wait(&request[6], MPI_STATUS_IGNORE);
printf("Processor rank 7 received the message.\n");
}
/*MPI_Testall(size-1,request,&ready, MPI_STATUS_IGNORE);*/
/* if (ready){*/
end = MPI_Wtime();
printf("Total Time: %Lf\n", end - start);
/*}*/
MPI_Finalize();
}
1. Each MPI task runs in its own address space, so there is no correlation between request[1] on rank 0 and request[1] on rank 2. That means you do not have to "pair" the requests. That being said, if you think "pairing" the requests improves the readability of your code, you might want to do so even though it is not required.
2. The buffer parameter of MPI_Isend() and MPI_Irecv() is a pointer to the start of the data; that is message (and not &message) here.
3. If you run with, say, 2 MPI tasks, MPI_Isend(..., dest=2, ...) on rank 0 will fail because 2 is an invalid rank in the MPI_COMM_WORLD communicator.
4. Many requests are uninitialized when MPI_Waitall() (well, MPI_Testall() here) is invoked. One option is to first initialize all of them to MPI_REQUEST_NULL.
5. Using &message results in memory corruption, and that likely explains the crash.
6. From the MPI standard, the prototype is double MPI_Wtime(), so you should rather use double here (the NaN values likely come from the memory corruption described above).
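Putting those points together, here is a hedged sketch of just the affected lines (not the full corrected program; the request count of 7 mirrors the size-1 array used with 8 ranks):
/* allocate by element size: sizeof(float), not sizeof(float *) */
float *message = malloc(sizeof(float) * N);

/* initialize every request so MPI_Waitall()/MPI_Testall() only ever sees valid handles */
MPI_Request request[7];
for (int r = 0; r < 7; r++)
    request[r] = MPI_REQUEST_NULL;

/* the buffer argument is the pointer itself: message, not &message */
MPI_Isend(message, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[0]);

/* MPI_Wtime() returns double, so double is the natural type for the timestamps */
double start = MPI_Wtime();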
I am starting out with OpenCL by converting existing C code to OpenCL. I am getting strange results from both the CPU and GPU calculations: their values change every time I run the code. When I compare with the plain C version, I get somewhat acceptable results from the CPU (though the results are still not identical to those of native C or even other languages), but when I run the exact same code on the GPU, I get gibberish results.
Here is my host code:
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
global double *gamma,
global double *delnu,
global double *nu,
global double *del_air,
global double *f)
{
double pie = 4.0*atan(1.0);
int i = get_global_id(0);
int j = get_global_id(1);
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
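One way to realize that "intermediate results plus reduction" idea is sketched below. It assumes a scratch buffer of lines * delnu_size doubles fits in device memory, and the second kernel uses a plain per-frequency loop rather than a tree reduction, which is the simplest form of the idea; treat it as an illustration, not tested code.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// First pass: each work-item of a 2D NDRange (lines x delnu_size) writes one term to scratch.
__kernel void abs_terms(__global const double *inten,
                        __global const double *gamma,
                        __global const double *delnu,
                        __global const double *nu,
                        __global const double *del_air,
                        __global double *partial,      // scratch of size lines * delnu_size
                        const int delnu_size)
{
    double pie = 4.0 * atan(1.0);
    int i = get_global_id(0);          // line index
    int j = get_global_id(1);          // frequency index
    double d = delnu[j] - nu[i] + del_air[i] * 950.0 / 1013.0;
    partial[i * delnu_size + j] = inten[i] * (1.0 / pie) * (gamma[i] / (gamma[i] * gamma[i] + d * d));
}

// Second pass: one work-item per frequency sums its column of the scratch buffer into f.
__kernel void abs_reduce(__global const double *partial,
                         __global double *f,
                         const int lines,
                         const int delnu_size)
{
    int j = get_global_id(0);
    double acc = 0.0;
    for (int i = 0; i < lines; i++)
        acc += partial[i * delnu_size + j];
    f[j] = acc;
}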
I am using an AMD W2100, and yes, I have printed out all the supported extensions and they included the cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following:
for(i=0;i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write the OpenCL kernel as below, without using atomics and with only a single work dimension:
global_work_size = delnu_size
There could be a better way, but this is the simplest one.
__kernel void test(__global double *gamma,
                   __global double *inten,
                   __global double *delnu,
                   __global double *nu,
                   __global double *del_air,
                   __global double *f,
                   const int lines)
{
    double pie = 4.0*atan(1.0);
    int j = get_global_id(0);
    f[j] = 0;
    for(int i = 0; i < lines; i++)
    {
        f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
    }
}
You need to understand how an OpenCL kernel is executed. You can think of it as a large number of threads executing concurrently, each of which can be identified with get_global_id.
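On the host side, wiring up this 1D kernel would look roughly like the sketch below. It reuses the buffer and variable names from the question, assumes the kernel above was compiled into the same program object under the name "test", and passes lines as a plain int; treat it as an illustration rather than tested code.
// Sketch of the host-side launch for the 1D kernel above (names reused from the question).
cl_kernel kernel = clCreateKernel(program, "test", &status);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
status = clSetKernelArg(kernel, 6, sizeof(int), &lines);        // lines is passed by value

size_t globalWorkSize[1] = { (size_t)delnu_size };              // one work-item per output frequency
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
status = clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);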
Is it possible to use the clGetEventProfilingInfo function to get the time taken by a clEnqueueWriteBuffer call? I tried using it and I got the same values for CL_PROFILING_COMMAND_START and CL_PROFILING_COMMAND_END. I want to know whether this is because of some error in my code or because profiling won't work for clEnqueueWriteBuffer.
My code snippet is:
//Writing to input buffers
clEnqueueWriteBuffer(queue[0], buf_A, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_A + (PARTITION_SIZE * j), 0, NULL, &eventList[0]);
checkErr(cl_err, "clEnqueueWriteBuffer : buf_A");
clWaitForEvents(1, eventList);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time_data += total_time;
final_time_queued_data += total_time_queued;
cout<<"\nTime: "<<final_time_data<<" "<<final_time_queued_data;
And the output I get is:
Time : 0 0.288444
Since the value returned is 0, I am assuming it's not able to find the time taken for the data transfer.
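For reference, one thing worth checking is that the command queue was created with profiling enabled, since clGetEventProfilingInfo only reports valid timestamps on such queues. A minimal sketch (context, device and err are placeholder names):
// Profiling data is only recorded if the queue has CL_QUEUE_PROFILING_ENABLE set.
cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, props, &err);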
I am offloading work to the GPU using OpenCL (a variant of matrix multiplication). The matrix code itself works fantastically well, but the cost of moving data to the GPU is prohibitive.
I've moved from using clEnqueueReadBuffer/clEnqueueWriteBuffer to memory-mapped buffers as follows:
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
sizeof(char) * queryVector_size,
NULL, NULL);
checkErr(err,"Buf A");
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
sizeof(char) * segment_size,
NULL, NULL);
checkErr(err,"Buf B");
err = clSetKernelArg(ko_smat, 0, sizeof(cl_mem), &d_c);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 1, sizeof(cl_mem), &d_a);
checkErr(err,"Compute Kernel");
err = clSetKernelArg(ko_smat, 2, sizeof(cl_mem), &d_b);
checkErr(err,"Compute Kernel");
query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
checkErr(err,"Write A");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
checkErr(err,"Write B");
// code which initialises buffers using ptrs (segment_data and queryV)
err = clEnqueueUnmapMemObject(commands,
d_a,
query_vector, 0, NULL, NULL);
checkErr(err,"Unmap Buffer");
err = clEnqueueUnmapMemObject(commands,
d_b,
segment_data, 0, NULL, NULL);
checkErr(err,"Unmap Buff");
err = clEnqueueNDRangeKernel(commands, ko_smat, 2, NULL, globalWorkItems, localWorkItems, 0, NULL, NULL);
err = clFinish(commands);
checkErr(err, "Execute Kernel");
result = (char*) clEnqueueMapBuffer(commands, d_c, CL_TRUE,CL_MAP_WRITE, 0, sizeof(char) * result_size, 0, NULL, NULL, &err);
checkErr(err,"Write C");
printMatrix(result, result_row, result_col);
This code works fine when I use the clEnqueueReadBuffer/clEnqueueWriteBuffer calls and initialise d_a, d_b and d_c through them, but when I use the mapped buffers, result is 0 because d_a and d_b are null when the kernel runs.
What is the appropriate way to map/unmap buffers?
EDIT:
The core problem seems to come from here:
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0, NULL, NULL, &err);
// INITIALISE
printMatrix(segment_data, segment_length, segment_width);
// ALL GOOD
err = clEnqueueUnmapMemObject(commands,
d_b,
segment_data, 0, NULL, NULL);
checkErr(err,"Unmap Buff");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_width * segment_length, 0, NULL, NULL, &err);
printMatrix(segment_data, segment_length, segment_width);
// ALL ZEROs again
The first printMatrix() returns the correct output; once I unmap and remap it, segment_data becomes all 0s (its initial value). I suspect I'm using an incorrect flag somewhere, but I can't figure out where.
query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);
checkErr(err,"Write A");
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE,CL_MAP_READ, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);
checkErr(err,"Write B");
The buffers are mapped with CL_MAP_READ, but the code then writes to them. Unlike buffer creation, these flags do not describe a device view of the memory but a host view, so the buffers should be mapped with the CL_MAP_WRITE flag; otherwise any changes will simply be discarded when the buffer is unmapped.
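Concretely, only the flag needs to change in the two mapping calls quoted above; a sketch using the question's own names:
query_vector = (char*) clEnqueueMapBuffer(commands, d_a, CL_TRUE, CL_MAP_WRITE, 0, sizeof(char) * queryVector_size, 0, NULL, NULL, &err);   // host writes the query here
segment_data = (char*) clEnqueueMapBuffer(commands, d_b, CL_TRUE, CL_MAP_WRITE, 0, sizeof(char) * segment_size, 0, NULL, NULL, &err);       // host writes the segment here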
From the OpenCL 1.2 spec:
5.4.3 Accessing mapped regions of a memory object
...
If a memory object is currently mapped for reading, the application must ensure that the memory object is unmapped before any enqueued kernels or commands that write to this memory object or any of its associated memory objects (sub-buffer or 1D image buffer objects) or its parent object (if the memory object is a sub-buffer or 1D image buffer object) begin execution; otherwise the behavior is undefined.
So, you need to map the results buffer after you've enqueued the kernel. Similarly, you need to unmap the input buffers before you enqueue the kernel. The timeline for mapping/unmapping buffers should be roughly as follows:
Create input buffers
Create output buffers
Map input buffers
Write input data
Unmap input buffers
Enqueue kernel
Map output buffers
Read output data
Unmap output buffers
Clearly the best way of speeding up your code is using mapped buffers. You can create the buffers using CL_MEM_ALLOC_HOST_PTR, and this basically takes some of the transfer burden off the CPU by initiating DMA transfers.
Here is an example of using the mapped buffers:
// pointer to hold the result
int * host_ptr = malloc(size * sizeof(int));
d_mem = clCreateBuffer(context,CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,
size*sizeof(cl_int), NULL, &ret);
int * map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_WRITE,
0,size*sizeof(int),0,NULL,NULL,&ret);
// initialize data
for (i=0; i<size;i++) {
map_ptr[i] = i;
}
ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);
//Set OpenCL Kernel Parameters
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_mem);
size_t global_work[1] = { size };
//Execute OpenCL Kernel
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
global_work, NULL, 0, 0, NULL);
map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_READ,
0,size*sizeof(int),0,NULL,NULL,&ret);
// copy the data to result array
for (i=0; i<size;i++){
host_ptr[i] = map_ptr[i];
}
ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);
// cl finish etc
It is taken from this post.
I am trying to implement an OpenCL project that should take as input two "grey" channel images and return a "grey" channel buffer.
My OpenCL code:
__kernel void motion_detection(__global uchar* bgImage, __global uchar* motionImage,__global uchar* outputImage)
{
int i = get_global_id(0);
int tsh = 7;
uchar bgEl = bgImage[i];
uchar motionEl = motionImage[i];
float bg = convert_float(bgEl);
float mo = convert_float(motionEl);
float temp = fabs(bg-mo);
if(temp>tsh){temp=255.0;
}else
{temp=0.0;
}
outputImage[i] = convert_uchar(bg);
}
My host code:
// get width and height of input image
height = inputImage->rows;
width = inputImage->cols;
// get the pointer to pixel data
cl_uchar* inputImageData = inputImage->data;
cl_uchar* motionImageData = motionImage->data;
cl_uchar* outputImageData = outputImage->data;
After that, setting up the buffers:
// Create memory object for input Image
inputImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width * height * greyPixelSize,
inputImageData,
&status);
CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)");
motionImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width * height * greyPixelSize,
motionImageData,
&status);
CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)");
// Create memory objects for output Image
outputImageBuffer = clCreateBuffer(
context,
CL_MEM_READ_WRITE,
width * height * greyPixelSize,
0,
&status);
Setting up arguments and executing the OpenCL kernel:
// input buffer image
status = clSetKernelArg(
kernl,
0,
sizeof(cl_mem),
&inputImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");
status = clSetKernelArg(
kernl,
1,
sizeof(cl_mem),
&motionImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputImageBuffer)");
// outBuffer imager
status = clSetKernelArg(
kernl,
2,
sizeof(cl_mem),
&outputImageBuffer);
CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (outputImageBuffer)");
status = clEnqueueNDRangeKernel(
commandQueue,
kernl,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
&ndrEvt);
clFinish(commandQueue);
Here is my problem: when this instruction is executed, it returns outputImageData as "\0" and nothing else:
status = clEnqueueReadBuffer(
commandQueue,
outputImageBuffer,
CL_TRUE,
0,
width * height * greyPixelSize,
outputImageData,
0,
NULL,
NULL);
CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");