I have an AMD A10 APU with Radeon R7 GPU. I believe this device supportes concurrent kernel execution. But when i wrote the following code and obtained profiling information it doesnt seem like the kernels are executing concurrently. My openCL code is given below (The kernels within each iteration is added to the same queue and kernels in different iteration are added to different queues and hence should be running in parallel).
for(j = 0; j < 8; j++){
cl_err = clEnqueueNDRangeKernel(queue[4 + j],kernel[Q6_PROGRAM_ID][FILTER1_KERNEL],1,NULL,&globalSize,&localSize,4,eventList,&eventList[4 + j * 4]); //Invoking the first filter kernel
cl_err = clEnqueueNDRangeKernel(queue[4 + j],kernel[Q6_PROGRAM_ID][FILTER2_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 4 + 4 * j,&eventList[5 + j * 4]); //Invoking the second filter kernel
cl_err = clEnqueueNDRangeKernel(queue[4 + j],kernel[Q6_PROGRAM_ID][FILTER3_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 5 + 4 * j,&eventList[6 + j * 4]); //Invoking the third filter kernel
cl_err = clEnqueueNDRangeKernel(queue[4 + j],kernel[Q6_PROGRAM_ID][AGGREGATE_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 6 + 4 * j,&eventList[7 + j * 4]); //Invoking the aggregate kernel
}
The code i used for profiling is :
for(j = 0; j < 8; j++){
//Code for obtaining the profiling data
clWaitForEvents(4 + 4*j, eventList+4);
clGetEventProfilingInfo(eventList[4 + j * 4], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[4 + j * 4], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[4 + j * 4], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time += total_time;
final_time_queued += total_time_queued;
cout<<"\n1 : "<<time_start<<" "<<time_end;
clGetEventProfilingInfo(eventList[5 + j * 4], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[5 + j * 4], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[5 + j * 4], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time += total_time;
final_time_queued += total_time_queued;
cout<<"\n2 : "<<time_start<<" "<<time_end;
clGetEventProfilingInfo(eventList[6 + j * 4], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[6 + j * 4], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[6 + j * 4], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time += total_time;
final_time_queued += total_time_queued;
cout<<"\n3 : "<<time_start<<" "<<time_end;
clGetEventProfilingInfo(eventList[7 + j * 4], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[7 + j * 4], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[7 + j * 4], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time += total_time;
final_time_queued += total_time_queued;
cout<<"\n4 : "<<time_start<<" "<<time_end;
}
The output of my profiling code is :
1 : 3989633359630 3989657015190
2 : 3989657016860 3989683273450
3 : 3989683275090 3989708840030
4 : 3989708841760 3989734915610
1 : 3989800219990 3989824648510
2 : 3989824650240 3989850888860
3 : 3989850890610 3989876392210
4 : 3989876393890 3989902432920
1 : 3989954275546 3989978865766
2 : 3989978867476 3990005037296
3 : 3990005038976 3990030592876
4 : 3990030594566 3990056566896
1 : 3990113144067 3990137315217
2 : 3990137316937 3990163458337
3 : 3990163460057 3990189007267
4 : 3990189008967 3990215129227
1 : 3990274589700 3990299102730
2 : 3990299104430 3990325570980
3 : 3990325572730 3990351050810
4 : 3990351052550 3990377255070
1 : 3990424871514 3990448828874
2 : 3990448830524 3990475309034
3 : 3990475310744 3990500849914
4 : 3990500851664 3990526839444
1 : 3990584574567 3990608802017
2 : 3990608803727 3990635102497
3 : 3990635104427 3990660647987
4 : 3990660649697 3990686716887
1 : 3990733269328 3990757174868
2 : 3990757176588 3990783429448
3 : 3990783431118 3990809003598
4 : 3990809005298 3990835207128
I figured out why the kernels are not executing concurrently. For concurrent execution the GPU should have enough free resources available to support concurrent execution. In my code the number of work groups of each kernel was quite high and so the GPU only had enough resources to support 1 concurrent kernel. So keep the number of workgroups to a minimum to allow concurrent execution
You need to use an out-of-order command queue (if supported by your driver) or multiple in-order command queues in order to get concurrent kernel execution.
Related
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 1 year ago.
Improve this question
I am working to implement CUDA for the following code. The first version has been written serially and the second version is written with CUDA. I am sure about its results in serial version. I expect that the second version that I have added CUDA functionality also give me the same result, but it seems that kernel function does not do any thing and it gives me the initial value of u and v. I know due to lack of my experience, the bug may be obvious, but I cannot figure it out. Also, please do not recommend using flatten array, because it is harder for me to understand the indexing in code.
First version:
#include <fstream>
#include <iostream>
#include <math.h>
#include <vector>
#include <chrono>
#include <omp.h>
using namespace std;
const int M = 1024;
const int N = 1024;
const double A = 1;
const double B = 3;
const double Du = 5 * pow(10, -5);
const double Dv = 5 * pow(10, -6);
const int Max_Itr = 1000;
const double h = 1.0 / static_cast<double>(M - 1);
const double delta_t = 0.0025;
const double s1 = (Du * delta_t) / pow(h, 2);
const double s2 = (Dv * delta_t) / pow(h, 2);
int main() {
double** u=new double* [M];
double** v=new double* [M];
for (int i=0; i<M; i++){
u[i]=new double [N];
v[i]=new double [N];
}
for (int j = 0; j < M; j++) {
for (int i = 0; i < N;i++) {
u[i][j]=0.02;
v[i][j]=0.02;
}
}
for (int k = 1; k < Max_Itr; k++) {
for (int i = 1; i < N - 1; i++) {
for (int j = 1; j < M - 1; j++) {
u[i][j] = ((1 - (4 * s1)) * u[i][j]) + (s1 * (u[i + 1][j] + u[i - 1][j] + u[i][j + 1] + u[i][j - 1])) +
(A * delta_t) + (delta_t * pow(u[i][j], 2) * v[i][j]) - (delta_t * (B + 1) * u[i][j]);
v[i][j] = ((1 - (4 * s2)) * v[i][j]) + (s2 * (v[i + 1][j] + v[i - 1][j] + v[i][j + 1] + v[i][j - 1])) + (B * delta_t * u[i][j])
- (delta_t * pow(u[i][j], 2) * v[i][j]);
}
}
}
cout<<"u: "<<u[512][512]<<" v: "<<v[512][512]<<endl;
return 0;
}
Second version:
#include <fstream>
#include <iostream>
#include <math.h>
#include <vector>
using namespace std;
#define M 1024
#define N 1024
__global__ void my_kernel(double** v, double** u){
int i= blockIdx.y * blockDim.y + threadIdx.y;
int j= blockIdx.x * blockDim.x + threadIdx.x;
double A = 1;
double B = 3;
int Max_Itr = 1000;
double delta_t = 0.0025;
double Du = 5 * powf(10, -5);
double Dv = 5 * powf(10, -6);
double h = 1.0 / (M - 1);
double s1 = (Du * delta_t) / powf(h, 2);
double s2 = (Dv * delta_t) / powf(h, 2);
for (int k = 1; k < Max_Itr; k++) {
u[i][j] = ((1 - (4 * s1))
* u[i][j]) + (s1 * (u[i + 1][j] + u[i - 1][j] + u[i][j + 1] + u[i][j - 1])) +
(A * delta_t) + (delta_t * pow(u[i][j], 2) * v[i][j]) - (delta_t * (B + 1) * u[i][j]);
v[i][j] = ((1 - (4 * s2))
* v[i][j]) + (s2 * (v[i + 1][j] + v[i - 1][j] + v[i][j + 1] + v[i][j - 1])) + (B * delta_t * u[i][j])
- (delta_t * pow(u[i][j], 2) * v[i][j]);
__syncthreads();
}
}
int main() {
double** u=new double* [M];
double** v=new double* [M];
for (int i=0; i<M; i++){
u[i]=new double [N];
v[i]=new double [N];
}
dim3 blocks(32,32);
dim3 grids(M/32 +1, N/32 + 1);
for (int j = 0; j < M; j++) {
for (int i = 0; i < N;i++) {
u[i][j]=0.02;
v[i][j]=0.02;
}
}
double **u_d, **v_d;
int d_size = N * M * sizeof(double);
cudaMalloc(&u_d, d_size);
cudaMalloc(&v_d, d_size);
cudaMemcpy(u_d, u, d_size, cudaMemcpyHostToDevice);
cudaMemcpy(v_d, v, d_size, cudaMemcpyHostToDevice);
my_kernel<<<grids, blocks>>> (v_d,u_d);
cudaDeviceSynchronize();
cudaMemcpy(v, v_d, d_size, cudaMemcpyDeviceToHost);
cudaMemcpy(u, u_d, d_size, cudaMemcpyDeviceToHost);
cout<<"u: "<<u[512][512]<<" v: "<<v[512][512]<<endl;
return 0;
}
What I expect from the second version is :
u: 0.2815 v: 1.7581
Your two-dimensional array - in the first version of the program - is implemented using an array of pointers, each of which to a separately-allocated array of double values.
In your second version, you are using the same pointer-to-pointer-to-double type, but - you're not allocating any space for the actual data, just for the array of pointers (and not copying any of the data to the GPU - just the pointers; which are useless to copy anyway, since they're pointers to host-side memory.)
What is most likely happening is that your kernel attempts to access memory at an invalid address, and its execution is aborted.
If you were to properly check for errors, as #njuffa noted, you would know that is what happened.
Now, you could avoid having to make multiple memory allocations if you were to use a single data area instead of separate allocations for each second-dimension 1D array; and that is true both for the first and the second version of your program. That would not quite be array flattening. See an explanation of how to do this (C-language-style) on this page.
Note, however, that double-dereferencing, which you insist on performing in your kernel, is likely slowing it down significantly.
Is it possible to use the clGetEventProfilingInfo function to get the time taken by clEnqueueWriteBuffer function. I tried using it and i got the same values for CL_PROFILING_COMMAND_START and CL_PROFILING_COMMAND_END. I want to know whether this is because of some error in my code or because profiling wont work for clEnqueueWriteBuffer.
My code snippet is:
//Writing to input buffers
clEnqueueWriteBuffer(queue[0], buf_A, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_A + (PARTITION_SIZE * j), 0, NULL, &eventList[0]);
checkErr(cl_err, "clEnqueueWriteBuffer : buf_A");
clWaitForEvents(1, eventList);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(eventList[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = ((double)time_end - time_start)/1000000;
total_time_queued = ((double)time_end - time_start_queued)/1000000;
final_time_data += total_time;
final_time_queued_data += total_time_queued;
cout<<"\nTime: "<<final_time_data<<" "<<final_time_queued_data;
And the output i get is:
Time : 0 0.288444
Since the value returned is 0 I am assuming its not able to find the time taken for data transfer.
I'm Trying to convert a code written in Cuda to openCL and run into some trouble. My final goal is to implement the code on an Odroid XU3 board with a Mali T628 GPU.
In order to simplify the transition and save time trying to debug openCL kernels I've done the following steps:
Implement the code in Cuda and test it on a Nvidia GeForce 760
Implement the code in openCL and test it on a Nvidia GeForce 760
test the openCL code on an Odroid XU3 board with a Mali T628 GPU.
I know that different architectures may have different optimizations but that isn't my main concern for now. I manged to run the openCL code on my Nvidia GPU with no apparent issues but keep getting strange errors when trying to run the code on the Odroid board. I know that different architectures have different handling of exceptions etc. but I'm not sure how to solve those.
Since the openCL code works on my Nvidia I assume that I managed to do the correct transition between thread/blocks -> workItems/workGroups etc.
I already fixed several issues that relate to the cl_device_max_work_group_size issue so that can't be the cuase.
When running the code i'm getting a "CL_OUT_OF_RESOURCES" error. I've narrowed the cause of the error to 2 lines in the code but not sure to fix those issues.
the error is caused by the following lines:
lowestDist[pixelNum] = partialDiffSumTemp; both variables are private variables of the kernel and therefor I don't see any potential issue.
d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 0] = bestDisparity[0];
Here I guess the cause is "OUT_OF_BOUND" but not sure how to debug it since the original code doesn't have any issue.
My Kernel code is is:
#define ALIGN_IMAGE_WIDTH 64
#define NUM_PIXEL_PER_THREAD 4
#define MIN_DISPARITY 0
#define MAX_DISPARITY 55
#define WINDOW_SIZE 19
#define WINDOW_RADIUS (WINDOW_SIZE / 2)
#define TILE_SHARED_MEM_WIDTH 96
#define TILE_SHARED_MEM_HEIGHT 32
#define TILE_BOUNDARY_WIDTH 64
#define TILE_BOUNDARY_HEIGHT (2 * WINDOW_RADIUS)
#define BLOCK_WIDTH (TILE_SHARED_MEM_WIDTH - TILE_BOUNDARY_WIDTH)
#define BLOCK_HEIGHT (TILE_SHARED_MEM_HEIGHT - TILE_BOUNDARY_HEIGHT)
#define THREAD_NUM_WIDTH 8
#define THREADS_NUM_HEIGHT TILE_SHARED_MEM_HEIGHT
//TODO fix input arguments
__kernel void hello_kernel( __global unsigned char* d_leftImage,
__global unsigned char* d_rightImage,
__global float* d_disparityLeft) {
int blockX = get_group_id(0);
int blockY = get_group_id(1);
int threadX = get_local_id(0);
int threadY = get_local_id(1);
__local unsigned char leftImage [TILE_SHARED_MEM_WIDTH * TILE_SHARED_MEM_HEIGHT];
__local unsigned char rightImage [TILE_SHARED_MEM_WIDTH * TILE_SHARED_MEM_HEIGHT];
__local unsigned int partialDiffSum [BLOCK_WIDTH * TILE_SHARED_MEM_HEIGHT];
int alignedImageWidth = 640;
int partialDiffSumTemp;
float bestDisparity[4] = {0,0,0,0};
int lowestDist[4];
lowestDist[0] = 214748364;
lowestDist[1] = 214748364;
lowestDist[2] = 214748364;
lowestDist[3] = 214748364;
// Read image blocks into shared memory. read is done at 32bit integers on a uchar array. each thread reads 3 integers(12byte) 96/12=8threads
int sharedMemIdx = threadY * TILE_SHARED_MEM_WIDTH + 4 * threadX;
int globalMemIdx = (blockY * BLOCK_HEIGHT + threadY) * alignedImageWidth + blockX * BLOCK_WIDTH + 4 * threadX;
for (int i = 0; i < 4; i++) {
leftImage [sharedMemIdx + i ] = d_leftImage [globalMemIdx + i];
leftImage [sharedMemIdx + 4 * THREAD_NUM_WIDTH + i ] = d_leftImage [globalMemIdx + 4 * THREAD_NUM_WIDTH + i];
leftImage [sharedMemIdx + 8 * THREAD_NUM_WIDTH + i ] = d_leftImage [globalMemIdx + 8 * THREAD_NUM_WIDTH + i];
rightImage[sharedMemIdx + i ] = d_rightImage[globalMemIdx + i];
rightImage[sharedMemIdx + 4 * THREAD_NUM_WIDTH + i ] = d_rightImage[globalMemIdx + 4 * THREAD_NUM_WIDTH + i];
rightImage[sharedMemIdx + 8 * THREAD_NUM_WIDTH + i ] = d_rightImage[globalMemIdx + 8 * THREAD_NUM_WIDTH + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
int imageIdx = sharedMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS;
int partialSumIdx = threadY * BLOCK_WIDTH + 4 * threadX;
for(int dispLevel = MIN_DISPARITY; dispLevel <= MAX_DISPARITY; dispLevel++) {
// horizontal partial sum
partialDiffSumTemp = 0;
#pragma unroll
for(int i = imageIdx - WINDOW_RADIUS; i <= imageIdx + WINDOW_RADIUS; i++) {
//partialDiffSumTemp += calcDiff(leftImage [i], rightImage[i - dispLevel]);
partialDiffSumTemp += abs(leftImage[i] - rightImage[i - dispLevel]);
}
partialDiffSum[partialSumIdx] = partialDiffSumTemp;
barrier(CLK_LOCAL_MEM_FENCE);
for (int pixelNum = 1, i = imageIdx - WINDOW_RADIUS; pixelNum < NUM_PIXEL_PER_THREAD; pixelNum++, i++) {
partialDiffSum[partialSumIdx + pixelNum] = partialDiffSum[partialSumIdx + pixelNum - 1] +
abs(leftImage[i + WINDOW_SIZE] - rightImage[i - dispLevel + WINDOW_SIZE]) -
abs(leftImage[i] - rightImage[i - dispLevel]);
}
barrier(CLK_LOCAL_MEM_FENCE);
// vertical sum
if(threadY >= WINDOW_RADIUS && threadY < TILE_SHARED_MEM_HEIGHT - WINDOW_RADIUS) {
for (int pixelNum = 0; pixelNum < NUM_PIXEL_PER_THREAD; pixelNum++) {
int rowIdx = partialSumIdx - WINDOW_RADIUS * BLOCK_WIDTH;
partialDiffSumTemp = 0;
for(int i = -WINDOW_RADIUS; i <= WINDOW_RADIUS; i++,rowIdx += BLOCK_WIDTH) {
partialDiffSumTemp += partialDiffSum[rowIdx + pixelNum];
}
if (partialDiffSumTemp < lowestDist[pixelNum]) {
lowestDist[pixelNum] = partialDiffSumTemp;
bestDisparity[pixelNum] = dispLevel - 1;
}
}
}
}
if (threadY >= WINDOW_RADIUS && threadY < TILE_SHARED_MEM_HEIGHT - WINDOW_RADIUS && blockY < 32) {
d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 0] = bestDisparity[0];
d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 1] = bestDisparity[1];
d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 2] = bestDisparity[2];
d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 3] = bestDisparity[3];
}
}
Thanks for all the help
Yuval
From my experience NVidia GPUs not always crash on out of bound access and many times kernel still returns expected results.
Use printf to check the indexes. If you have Nvidia OpenCL 1.2 driver installed printf should be available as a core function. As far as I checked Mali-T628 uses OpenCL 1.1 then check if printf is available as a vendor extension. Also you can run your kernel on AMD/Intel CPU where printf is available (OpenCL 1.2 / 2.0).
Alternative way of checking indexes can be passing __global int* debug array where you would store indexes and then check them on the host. Make sure to allocate it big enough so that out of bound index will be recorded.
i download the scilab sourcecode, i am interested how the conv2 works and want translate it to c# code, but i don't know what the meaning of "C2F(ddot)" and how it works. if i tranfer the "C2F(ddot)" into c or c# code how i should implement it. here are some piece of source code in scilab
extern double C2F(ddot)(int *n, double *A, int *iA, double *B, int *iB);
/*--------------------------------------------------------------------------*/
void conv2_separable_R(double *R, int nR, double *C, int mC, double *A, int mA, int nA, double *Out, int mOut, int nOut, int edgM, int edgN, double *T)
{
int ai = 0, tj = 0, ci = 0, rj = 0; /*current index over A,T,C and R */
int i = 0, j = 0; /* loop variables*/
int l = 0;
int one = 1, minusone = -1;
for (i = 0; i < mOut; i++ )
{
/*Compute the 1-D conv A(i,:) and C in T */
ai = Max(0, i - edgM) ;
ci = mC - 1 - Max(0, edgM - i);
l = Min(ci + 1, mA - ai);
for (j = 0; j < nA; j++ )
{
T[j] = C2F(ddot)(&l, A + ai + mA * j, &one, C + ci - l + 1, &minusone);
}
/*1-D convolution of T and R */
for (j = 0; j < nOut; j++ )
{
rj = nR - 1 - Max(0, edgN - j);
tj = Max(0, j - edgN) ;
l = Min(rj + 1, nA - tj);
Out[i + j * mOut] = C2F(ddot)(&l, T + tj, &one, R + rj - l + 1, &minusone);
}
}
}
if i want tranform the code:" T[j] = C2F(ddot)(&l, A + ai + mA * j, &one, C + ci - l + 1, &minusone);" into c or c# ,how i should do ?
In scilab's sources, C2F is a macro used to call Fortran function from C.
It's declared in machine.h.
Actually this code probably make a call to the Blass ddot function.
That it, the dot product of two vector is done in Fortran...
I have a problem with loops in opencl when I want to use counters in my loop to generate values, It seems like the counters values remain constants during the iterations. My OpenCL function code is the following :
void my_loop( __global unsigned int* tab1, unsigned int* tab2){
uint i, j;
uint global_id_y = get_global_id(1);
uint global_size_y = get_global_size(1);
uint loop = global_size_y - global_id_y - 1
uint length = get_global_size(0);
for(i = 0; i < loop; i++){
sidx = i * length;
for(j = 0; j < length; j++){
tab2[sidx + j ] = i*10 + j;
}
}
The code that calls the clEnqueueNDRangeKernel function
global_work_size[0] = ncols; //ncols = 5
global_work_size[1] = nbfrequents; //nbfrequents = 10
local_work_size[0] = ncols;
local_work_size[1] = nbfrequents;
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size,
local_work_size, 0, NULL, &event);
After execution, the results I obtained is something like this :
0 1 2 3 4
0 1 2 3 4
0 1 2 3 4
0 1 2 3 4
...
0 1 2 3 4
Is it normal ? How can I do to use the differents values of my counters ?