Related
The function CLBlastSgemmStridedBatched() in CLBlast (https://github.com/CNugteren/CLBlast) performs matrix-matrix multiplication on a batch of matrices. Its c_stride argument is the (fixed) stride between two consecutive batches of the C matrix and therefore determines where each output matrix C is stored. In other words, the function performs:
for (int i = 0; i < batch_count; i++) {
(C + i * c_stride) = alpha * (A + i * a_stride) * (B + i * b_stride) + beta * (C + i * c_stride)
}
When c_stride = 0, I get different results on two different devices. By contrast, when c_stride = M*N, I get the same results on both devices. Does CLBlastSgemmStridedBatched() in CLBlast support c_stride = 0 or not?
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "math.h"
#include <clblast_c.h>
/* OpenCL context; assigned in init_CL(), used to create buffers in clSBatchedSgemm() and released in main(). */
cl_context ctx;
/* Command queue; assigned in init_CL(), used for all transfers and the CLBlast call, released in main(). */
cl_command_queue queue;
/*
 * Wall-clock time in seconds (with microsecond resolution) as reported by
 * gettimeofday(). Returns 0 when gettimeofday() reports a failure.
 */
double what_time_is_it_now()
{
    struct timeval now;
    const int failed = gettimeofday(&now, NULL);
    /* Seconds plus the microsecond part scaled down to seconds. */
    return failed ? 0 : (double)now.tv_sec + (double)now.tv_usec * .000001;
}
void init_CL(){
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0 };
/* Setup OpenCL environment. */
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
props[1] = (cl_context_properties) platform;
ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(ctx, device, 0, &err);
}
double clSBatchedSgemm(int M, int N, int K, int batch_count, int a_stride){
int i,j,b;
// Allocate host storage for batch_count A,B,C matrices
float* A = malloc(sizeof(float) * M * K * batch_count);
float* B = malloc(sizeof(float) * K * N * batch_count);
float* C = malloc(sizeof(float) * M * N * batch_count);
for(b=0; b<batch_count; b++) {
for(j=0; j<M; j++) {
for(i=0; i<K; i++) {
int index = i*M + j + b*M*K;
A[index] = index*index + 0.0f;
B[index] = index + 1.0f;
C[index] = 0.0f;
}
}
}
cl_float alpha = 1;
cl_float beta = 2;
size_t lda = K;
size_t ldb = N;
size_t ldc = N;
const size_t b_stride = N*K, c_stride = M*N;
if(c_stride != M*N){
printf("[Error!] c_stride must be the size of matrix A, the function CLBlastSgemmStridedBatched does not support that c_stride is zero \n");
}
cl_mem bufA, bufB, bufC;
cl_event event = NULL;
cl_int err;
/* Prepare OpenCL memory objects and place matrices inside them. */
if(a_stride==M*K){
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,batch_count*M * K * sizeof(*A), A, 0, NULL, NULL);
}else{
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
}
bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*K * N * sizeof(*B),NULL, &err);
bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, batch_count*M * N * sizeof(*C),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,batch_count*K * N * sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
double start_t = what_time_is_it_now();
/* Call clBLAS extended function.*/
CLBlastStatusCode status = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
M, N, K,
alpha,
bufA, 0, lda, a_stride,
bufB, 0, ldb, b_stride,
beta,
bufC, 0, ldc, c_stride,
batch_count,
&queue, &event);
/* Wait for calculations to be finished. */
if (status == CLBlastSuccess) {
err = clWaitForEvents(1, &event);
clReleaseEvent(event);
}
clFinish(queue);
double btime = what_time_is_it_now()-start_t;
printf("Batch_size: %6d, batchedGEMMTime: %3.6f ms, time/GEMM: %3.6f \n", batch_count, btime, btime/batch_count);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);
free(A);
free(B);
free(C);
return btime/batch_count;
}
/*
 * Benchmark driver: times the strided-batched SGEMM for batch sizes
 * 1, 2, 4, ... (powers of two below batch_Num) and prints the speed-up of
 * each size relative to the previous one and to a batch of one.
 */
int main(void) {
    init_CL();
    int M=128, N=5184, K=256;
    int batch_Num = 256;
    int strideA = 0;

    /* times[] is indexed directly by batch size. It holds doubles, so it
       must be sized with sizeof(double); sizing it with sizeof(float) made
       times[128] a write past the end of the allocation. */
    double* times = (double*)calloc((size_t)batch_Num, sizeof(*times));
    if (times == NULL) {
        fprintf(stderr, "out of memory\n");
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    for(int batch_index=1;batch_index<batch_Num;batch_index=batch_index*2){
        times[batch_index]= clSBatchedSgemm(M,N,K,batch_index,strideA);
    }
    /* Re-measure the single-GEMM baseline once everything has been run at least once. */
    times[1]= clSBatchedSgemm(M,N,K,1,strideA);

    for(int i=1;i<batch_Num/2-1;i=i*2){
        printf("i=%3d \t i*2=%3d \t %f, \t %f, \t speedup: %.3f, \t speedup/1: %.3f\n", i, i*2, times[i],times[i*2], times[i]/times[i*2], times[1]/times[i*2]);
    }

    free(times);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}
I am starting out with OpenCL by converting existing C code to OpenCL. I am getting strange results from both the CPU and the GPU calculation: the values change every time I run the code. Compared with plain C, the CPU gives "somewhat" acceptable results (although they are still not identical to those of native C or of other languages), but when I run the exact same code on the GPU, I get gibberish.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/*
 * Accumulates, for every wavenumber delnu[j], the contributions of all
 * spectral lines into f[j]:
 *   f[j] += inten[i] * (1/pi) * gamma[i] / (gamma[i]^2 + (delnu[j] - nu[i] + del_air[i]*950/1013)^2)
 *
 * Launch geometry expected by this kernel (2-D NDRange):
 *   dimension 0 size = number of spectral lines
 *   dimension 1 size = number of wavenumber points
 *
 * Having every (i, j) work-item execute `f[j] +=` makes all work-items that
 * share j update the same element concurrently, which is a data race.
 * Therefore only the work-item with get_global_id(0) == 0 does the work for
 * its j: it walks all lines serially (the count is taken from the size of
 * dimension 0) and performs a single store. A cheaper launch would pass the
 * line count as an argument and use a 1-D range, but that needs a matching
 * host-side change.
 */
kernel void abs_cross(global double *inten,
                      global double *gamma,
                      global double *delnu,
                      global double *nu,
                      global double *del_air,
                      global double *f)
{
    if (get_global_id(0) != 0)
    {
        return;
    }

    const size_t lines = get_global_size(0);
    const size_t j = get_global_id(1);
    const double pie = 4.0*atan(1.0);
    const double wavenumber = delnu[j];

    /* Start from the current f[j] so the result matches a serial
       `f[j] += term` loop over i in ascending order. */
    double acc = f[j];
    for (size_t i = 0; i < lines; i++)
    {
        const double width = gamma[i];
        const double offset = wavenumber - nu[i] + del_air[i] * 950.0/1013.0;
        acc += inten[i] * ((1.0/pie) * (width / (width*width + offset*offset)));
    }
    f[j] = acc;
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
/*
 * One work-item per wavenumber point j (1-D NDRange of size delnu_size).
 * Each work-item sums the contributions of all `lines` spectral lines
 * serially, so no two work-items ever write the same f[j].
 *
 * Arguments (by index):
 *   0 gamma    per-line width
 *   1 inten    per-line intensity
 *   2 delnu    wavenumber grid
 *   3 del_air  per-line shift coefficient
 *   4 f        output, one value per wavenumber point (overwritten)
 *   5 lines    number of spectral lines
 *   6 nu       per-line centre; the formula needs it, so it is appended
 *              here to leave the indices of the other arguments unchanged
 */
__kernel void test(__global double *gamma,
                   __global double *inten,
                   __global double *delnu,
                   __global double *del_air,
                   __global double *f,
                   const int lines,
                   __global double *nu)
{
    const double pie = 4.0*atan(1.0);
    const int j = get_global_id(0);

    /* Accumulate in a private variable and store once at the end. */
    double sum = 0;
    for (int i = 0; i < lines; i++)
    {
        const double width = gamma[i];
        const double offset = delnu[j] - nu[i] + del_air[i] * 950.0/1013.0;
        sum += inten[i] * ((1.0/pie) * (width / (width*width + offset*offset)));
    }
    f[j] = sum;
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id
Before I start I am a C beginner and I am trying to do some openCL work which might have been a mistake. Below is my kernel code:
/*
 * For each work-item, counts how many Collatz steps (n -> n/2 for even n,
 * n -> 3n+1 for odd n) it takes to bring n = get_global_id(0) down to 1 or
 * below, and stores that count in out[id]. The `in` buffer is not read.
 * When n equals 1572066143 the value before and after the 3n+1 step is
 * printed for debugging.
 */
__kernel void collatz(__global int* in, __global int* out)
{
    const uint gid = get_global_id(0);
    unsigned long n = (unsigned long)gid;
    uint steps = 0;

    for (; n > 1; steps = steps + 1) {
        const bool is_odd = (n % 2) != 0;
        if (!is_odd) {
            n = n / 2;
            continue;
        }

        /* Debug trace for the one value under investigation. */
        if (n == 1572066143) {
            unsigned long test = n;
            printf("BEFORE - %lu\n", n);
            test = (3 * test) + 1;
            printf("AFTER - %lu\n", test);
        }
        n = (3 * n) + 1;
    }

    out[gid] = steps;
}
and the output:
BEFORE - 1572066143
AFTER - 421231134
To me it looks like n is overflowing but I can't figure out why it is happening.
The interesting thing is if I create a new variable to store the same value as n then it seems to work correctly.
unsigned long test = 1572066143;
printf("BEFORE - %lu\n", test);
test = (3 * test) + 1;
printf("AFTER - %lu\n", test);
Output:
BEFORE - 1572066143
AFTER - 4716198430
As I said I am a C beginner so I could be doing something very stupid! Any help would be appreciated as I have been pulling my hair out for hours now!
Thanks,
Stephen
Update:
Here is my host code in case I am doing something stupid on that end:
int _tmain(int argc, _TCHAR* argv[])
{
/*Step1: Getting platforms and choose an available one.*/
cl_uint numPlatforms; //the NO. of platforms
cl_platform_id platform = NULL; //the chosen platform
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform = platforms[0];
free(platforms);
/*Step 2:Query the platform and choose the first GPU device if has one.*/
cl_device_id *devices;
devices = (cl_device_id*)malloc(1 * sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
/*Step 3: Create context.*/
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, NULL);
/*Step 4: Creating command queue associate with the context.*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
/*Step 5: Create program object */
const char *filename = "HelloWorld_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] = { strlen(source) };
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
cl_ulong max = 2000000;
cl_ulong *numbers = NULL;
numbers = new cl_ulong[max];
for (int i = 1; i <= max; i++) {
numbers[i] = i;
}
int *output = (int*)malloc(sizeof(cl_ulong) * max);
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, max * sizeof(cl_ulong), (void *)numbers, NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, max * sizeof(cl_ulong), NULL, NULL);
/*Step 8: Create kernel object */
cl_kernel kernel = clCreateKernel(program, "collatz", NULL);
/*Step 9: Sets Kernel arguments.*/
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
/*Step 10: Running the kernel.*/
size_t global_work_size[] = { max };
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
/*Step 11: Read the data put back to host memory.*/
status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, max * sizeof(cl_ulong), output, 0, NULL, NULL);
return SUCCESS;
}
I finally got to the bottom of the issue.
I was running the code on my Intel HD Graphics 4600 chip and it was producing the strange behaviour shown in the original question. I switched to using my AMD card and then it started working as expected!
Very strange. Thanks to everyone for their help!
Host-side and device-side values can have different sizes. On the host, long can be 32 or 64 bits wide, depending on the platform. On the device, long is always 64 bits.
The printf() function, as defined in C, uses %ld to print long (host-side long) numbers. You are calling printf in a kernel, so it could be that a C-like format parser is used and the variable is therefore printed as a 32-bit long.
Can you try printing it as %lld or as a floating point?
I have faced the same problem as here: How to effectively swap OpenCL memory buffers?. My first implementation was the same as has been described in the question, at each cycle it writes/reads memory buffers to/from the device. As pointed out this introduces useless read/write buffer overhead. The code (with memory overhead) below works fine:
// Working (but slow) variant: both f buffers are written to the device and
// read back in every iteration of the loop.
//THIS WORKS!!!
// f0_mem: read-write device buffer of capacity + 1 ints.
f0_mem = clCreateBuffer(
context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof (int)*(capacity + 1),
NULL,
&err);
// f1_mem: placeholder in the original post, created like f0_mem.
f1_mem = (..."the same as above"...);
// m_d_mem: write-only device buffer of capacity ints (remaining arguments elided in the post).
m_d_mem = clCreateBuffer(..., CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof (int)*capacity,...);
// One kernel launch per item k.
for (int k = 0; k < numelem; k++) {
// sumK: running total of weights with weight[k] now removed.
sumK = sumK - weight[k];
// The assignment to 0 is immediately overwritten by the next line.
cmax = 0;
cmax = max(capacity - sumK, weight[k]);
// Number of elements processed this iteration.
total_elements = (size_t) (capacity - cmax + 1);
// Even k: f0 is passed first and f1 second; odd k: the two are exchanged.
if (k % 2 == 0) {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f0_mem, f1_mem, f0, f1);
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
//clEnqueueWriteBuffer of cl_mem buffers
writeBufferToDevice(f1_mem, f0_mem, f1, f0);
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer of cl_mem buffers
readBufferFromDevice(f0_mem, f1_mem, m_d_mem, f0, f1, m_d);
// Store this iteration's m_d as row k of M (capacity ints per row).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
EDIT: My kernel:
/*
 * One step of the knapsack recurrence for the capacity c = global id + cmax.
 * Work-items whose global id is not below maxelem do nothing. For the rest,
 * output_f[c] receives the larger of input_f[c] and input_f[c - weightk] + pk;
 * when the second candidate is strictly larger, m_d[c-1] is set to 1
 * (m_d is left untouched otherwise).
 */
void kernel knapsack(global int *input_f, global int *output_f, global int *m_d, int cmax, int weightk, int pk, int maxelem){
    if (!(get_global_id(0) < maxelem)) {
        return;
    }

    int c = get_global_id(0)+cmax;
    const int without_item = input_f[c];
    const int with_item = input_f[c - weightk] + pk;

    if (without_item < with_item) {
        output_f[c] = with_item;
        m_d[c-1] = 1;
        return;
    }
    output_f[c] = without_item;
}
After I have tried to implement the two suggested solutions:
simply swapping setKernelArgs(...)
create two kernels
For the first one this my code:
// Variant 1: upload the buffers once, then only exchange the order of the
// two f buffers in the kernel arguments on every iteration.
//ARGUMENTS SWAP
// Buffer creation elided in the original post.
f0_mem = ...
f1_mem = ...
m_d_mem = ...
// clEnqueueWriteBuffer occurs here: single upload before the loop
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
// Even k: f0 first, f1 second; odd k: exchanged.
if (k % 2 == 0) {
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
} else {
setKernelArgs(f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
}
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
// Blocking read (CL_TRUE) of m_d only; the f buffers stay on the device.
// NOTE(review): m_d_mem is not cleared between iterations in this fragment - confirm whether stale entries from earlier k can remain.
err = clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
// Store this iteration's m_d as row k of M.
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
The second solution is implemented in this way:
// Variant 2: two kernel objects (kernel0, kernel1), one per buffer order.
//TWO KERNELS
// Buffer creation elided in the original post.
f0_mem = ...
f1_mem = ...
m_d_mem = ...
// clEnqueueWriteBuffer occurs here: single upload before the loop
writeBufferToDevice( (cl_mem&) f0_mem, (cl_mem&) f1_mem, (cl_mem&) m_d_mem, (int*) f0, (int*) f1, (int*) m_d);
for (int k = 0; k < numelem; k++) {
/*
The same code block
*/
if (k % 2 == 0) {
// NOTE(review): this call does not pass kernel0, whereas the odd branch passes kernel1 as the
// first argument - confirm which kernel object these arguments are actually set on.
setKernelArgs(f0_mem, f1_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
} else {
setKernelArgs(kernel1, f1_mem, f0_mem, weight[k], value[k], (int) total_elements);
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, global_work_items, NULL, 0, NULL, NULL);
}
// Blocking read (CL_TRUE) of m_d only; return codes are not checked in this variant.
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity, m_d, 0, NULL, NULL);
// Store this iteration's m_d as row k of M.
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
}
Neither of the two solutions work for me (it seems to me, no swapping occur at all!), what am I doing wrong?
Sub-question: in the last two solutions, is it possible to have memory buffers filled with zeroes without using writeBufferToDevice( f0_mem, f1_mem, m_d_mem...) before the for cycle?
This work is based on this article:
Solving knapsack problems on GPU by V. Boyera, D. El Baza, M. Elkihel
related work: Accelerating the knapsack problem on GPUs by Bharath Suri
Both attempted solutions look correct to me, but there may be dependencies between iterations - you would have to post your kernel so that this can be checked.
Your original solution probably works because it writes and reads the buffers in every iteration; that is slower, which leaves enough time for everything to synchronize.
You can try to add clFinish(command); after each OpenCL API call to see if that makes a difference.
Apart from that there is 3rd solution you could try: swapping pointers in the kernel. You will need to move your loop from CPU to GPU.
/* Exchanges the two __global double pointers that A and B point to. */
inline void swap_pointers(__global double **A, __global double **B)
{
    __global double *const first = *A;
    __global double *const second = *B;
    *A = second;
    *B = first;
}
/*
 * Sketch of a kernel that runs the whole k-loop on the device and exchanges
 * its two buffer pointers after every iteration instead of having the host
 * swap kernel arguments. The remaining parameters are elided ("...").
 */
__kernel void my_kernel(
__global double *pA,
__global double *pB,
...
)
{
/* NOTE(review): numelem is not declared in this sketch - presumably one of the elided parameters. */
for (int k = 0; k < numelem; k++)
{
// some stuff here
/* Exchanges this work-item's own copies of the two pointers. */
swap_pointers(&pA, &pB);
/* NOTE(review): barrier() synchronises the work-items of one work-group only - confirm the launch uses a single work-group, or that iterations do not depend on other groups' writes. */
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
}
}
Then read everything in one go on the host (m_d_mem must be big enough to store data from all iterations):
// Single blocking read (CL_TRUE) of capacity*numelem ints from m_d_mem into m_d.
clEnqueueReadBuffer(queue, m_d_mem, CL_TRUE, 0, sizeof (int)*capacity*numelem, m_d, 0, NULL, NULL);
Solution:
In each cycle, after copying m_d to M, m_d should be reset and written back to the m_d_mem buffer object with Knapsack::writeBuffer_m_d_ToDevice():
// Fetch this iteration's m_d from the device.
ksack.readBuffer_m_d_FromDevice();
// Store it as row k of M (capacity ints per row).
memcpy(M + k*capacity, m_d, sizeof (int)*capacity);
// Write m_d back to the device buffer before the next iteration.
// NOTE(review): no host-side clearing of m_d is visible here - presumably the helper resets it; confirm.
ksack.writeBuffer_m_d_ToDevice();//resets m_d_mem
I'm trying to dig into OpenCL. At the moment I'm asking myself why a scalar differs from an array when it comes to transferring it to the GPU.
Below there are two inputs: a scalar and an array. Why is there such a big difference in how they are transferred to the GPU?
Thanks in advance!
// Scalar argument: passed by value, so clSetKernelArg receives the size of
// the value and a pointer to it. The names below are used consistently
// (the fragment previously declared input_size/input/output_size but then
// referred to _input_size/_input/_output_size).
const int input_size = 4;
err = clSetKernelArg(kernel, 0, sizeof(input_size), &input_size);
if (err != CL_SUCCESS) {
    throw std::runtime_error("Failed to set kernel arguments!");
}

// Array argument: the data lives in a device buffer; the kernel argument is
// the cl_mem handle, and the contents are transferred separately.
int input[input_size] = {1, 2, 3, 4};
cl_mem cl_input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * input_size, NULL, NULL);
if (!cl_input) {
    throw std::runtime_error("Failed to allocate device memory!");
}
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input);
if (err != CL_SUCCESS) {
    clReleaseMemObject(cl_input);  // do not leak the buffer when bailing out
    throw std::runtime_error("Failed to set kernel arguments!");
}
// Blocking write (CL_TRUE): input[] may be reused as soon as this returns.
err = clEnqueueWriteBuffer(commands, cl_input, CL_TRUE, 0, sizeof(int) * input_size, input, 0, NULL, NULL);
if (err != CL_SUCCESS) {
    clReleaseMemObject(cl_input);
    throw std::runtime_error("Failed to write to source array!");
}

// Output array: a write-only device buffer, again passed as a cl_mem handle.
const int output_size = 4;
int output[output_size];
cl_mem cl_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * output_size, NULL, NULL);
if (!cl_output) {
    clReleaseMemObject(cl_input);
    throw std::runtime_error("Failed to allocate device memory!");
}
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_output);
if (err != CL_SUCCESS) {
    clReleaseMemObject(cl_output);
    clReleaseMemObject(cl_input);
    throw std::runtime_error("Failed to set kernel arguments!" );
}
In OpenCL, scalars are passed by value and are constant for each thread. Arrays are passed by reference (address) and can be used as input (read only), output (write only) or input/output (read/write). Arrays require a cl_mem object to be passed as the argument; cl_mem is a meta object that describes the array. Scalars are so simple that they only require their size as metadata.
So in your example above the kernel should look like this:
/*
 * Copies arg1[i] to arg2[i] for every work-item whose global id i is below
 * arg0. arg0 is a scalar passed by value; arg1 and arg2 are device buffers.
 */
kernel void mykernel( const int arg0, global int * arg1, global int * arg2 /* , ... further arguments */ ) {
    int i = get_global_id(0);
    // In every thread arg0 is constant value and cannot be changed
    if (i < arg0)
        arg2[i] = arg1[i];
}