System crashing (unresponsive screen) when running OpenCL code - opencl

I am writing this simple opencl code and everytime I execute the code it causes system to crash, only my mouse cursor is responsive and there is a one screen blank. Similar thing happened to me when I was using clinfo with up-to-date AMDGPU-PRO driver. I am currently using slightly older version which is 20.40, and I had not had any problem with this version for running other OpenCL codes. I did try downgrading the driver to even older one, 20.30, and I am having the same problem.
One edit that I made for my code is changing the arg_size paramters in kernel arguments, clSetKernelArg(), from sizeof(cl_mem) to sizeof(cl_int) for the first four variables and sizeof(cl_mem) to sizeof(cl_float) or sizeof(float) for the last four variables. I made the changes because I was getting CL_INVALID_KERNEL_ARGS error. After the changes were made, I've noticed that I was not getting any CL errors anymore, so I thought I finally fixed the code, but now it is crashing my system.... Looking at journalctl log, it seems that it has to do with some shared libraries. Other than that I have no idea what the log is saying.
Below is the main code, kernel code, and the journalctl log.
Main code
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include "cl_err.h"
// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define clerrchk(arg, e) {printf(" %-40s : %s\n",arg, geterrstr(e));}
#define fname "leap3d.nc"
// Variable sizes and dimensions (constants)
#define ndims 4
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
int main()
{
int i,j,k;
int nx = 128,
ny = 128,
nz = 16,
nt = 1000;
int *p_nx = &nx,
*p_ny = &ny,
*p_nz = &nz,
*p_nt = &nt;
float u = 0.0,
v = 5.0,
w = 0.0,
c = 0.01;
float *p_u = &u,
*p_v = &v,
*p_w = &w,
*p_c = &c;
// p_tf : p at future
// p_tn : p at now
// p_tp : p at past
float q_tf[nz+2][ny+2][nx+2];
float q_tn[nz+2][ny+2][nx+2];
float q_tp[nz+2][ny+2][nx+2];
float (*p_tf)[ny+2][nx+2] = q_tf;
float (*p_tn)[ny+2][nx+2] = q_tn;
float (*p_tp)[ny+2][nx+2] = q_tp;
size_t p_siz = sizeof(float) * (nx+2) * (ny+2) * (nz+2);
size_t n_siz = sizeof(int) ,
c_siz = sizeof(float) ;
int ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int dimids[ndims];
size_t start[ndims], count[ndims];
// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
err(retval);
// Define dimensions
if ((retval = nc_def_dim(ncid, "z", nz+2, &z_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "y", ny+2, &y_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "x", nx+2, &x_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
err(retval);
// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;
// Variable for writing netCDF data one timestep at a time
count[0] = 1; // For time dimension : 1 timestep
count[1] = nz+2; // For z : write everything
count[2] = ny+2; // For y : write everything
count[3] = nx+2; // For x : write everything
start[1] = 0; // For z : don't do anything
start[2] = 0; // For y : don't do anything
start[3] = 0; // For x : don't do anything
if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
err(retval);
if ((retval = nc_enddef(ncid)))
err(retval);
data_init(nx,ny,nz,(float*)p_tf);
data_init(nx,ny,nz,(float*)p_tn);
data_init(nx,ny,nz,(float*)p_tp);
// Euler scheme for the first time step
for(k=1;k<nz+1;k++)
for(j=1;j<ny+1;j++)
for(i=1;i<nx+1;i++)
{
p_tf[k][j][i] = p_tn[k][j][i]
- u * c * (p_tn[k][j][i] - p_tn[k][j][i-1])
- v * c * (p_tn[k][j][i] - p_tn[k][j-1][i])
- w * c * (p_tn[k][j][i] - p_tn[k-1][j][i]);
}
pbndry(nx,ny,nz,(float*)p_tf);
p_tp = p_tn;
p_tn = p_tf;
start[0] = 0;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
// OpenCL part //
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
clerrchk("get number of devices", status);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
clerrchk("get device ids", status);
// for(i=0;i<numDevices;i++)
// {
// status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
//
// printf("Device Name #%d: %s\n", i, name_data);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(comp_units), &comp_units, NULL);
//
// printf("Max Work-Group %d\n", comp_units);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
//
// printf("Double FP config = %llu, Support? = %d\n", cfg, status);
// }
// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
clerrchk("create context", status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
clerrchk("create cmd queue", status);
cl_mem buf_p_tf = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
clerrchk("create buffer buf_p_tf", status);
cl_mem buf_p_tn = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
clerrchk("create buffer buf_p_tn", status);
cl_mem buf_p_tp = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
clerrchk("create buffer buf_p_tp", status);
cl_mem buf_nx = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nx", status);
cl_mem buf_ny = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_ny", status);
cl_mem buf_nz = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nz", status);
cl_mem buf_nt = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
clerrchk("create buffer buf_nt", status);
cl_mem buf_u = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
clerrchk("create buffer buf_u", status);
cl_mem buf_v = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
clerrchk("create buffer buf_v", status);
cl_mem buf_w = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
clerrchk("create buffer buf_w", status);
cl_mem buf_c = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
clerrchk("create buffer buf_c", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, p_tf ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tf", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tn , CL_FALSE, 0, p_siz, p_tn ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tn", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tp , CL_FALSE, 0, p_siz, p_tp ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_p_tp", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_nx , CL_FALSE, 0, n_siz, p_nx ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nx", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_ny , CL_FALSE, 0, n_siz, p_ny ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_ny", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_nz , CL_FALSE, 0, n_siz, p_nz ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nz", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_nt , CL_FALSE, 0, n_siz, p_nt ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_nt", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_u , CL_FALSE, 0, c_siz, p_u ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_u", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_v , CL_FALSE, 0, c_siz, p_v ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_v", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_w , CL_FALSE, 0, c_siz, p_w ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_w", status);
status = clEnqueueWriteBuffer(cmdQueue, buf_c , CL_FALSE, 0, c_siz, p_c ,0, NULL, NULL);
clerrchk("enqueue write buffer for buf_c", status);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '\0';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
clerrchk("create program", status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
//printf("Code : %d\n",status);
//printf("Program 1 %s\n",getErrorString(status));
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
// Create a kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "leapfrog3d", &status);
clerrchk("create kernel", status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &buf_nx );
clerrchk("set kernel buf_nx", status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &buf_ny );
clerrchk("set kernel buf_ny", status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &buf_nz );
clerrchk("set kernel buf_nz", status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &buf_nt );
clerrchk("set kernel buf_nz", status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_p_tf);
clerrchk("set kernel buf_p_tf", status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_p_tn);
clerrchk("set kernel buf_p_tn", status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buf_p_tp);
clerrchk("set kernel buf_p_tp", status);
status = clSetKernelArg(kernel, 7, sizeof(cl_float), &buf_u );
clerrchk("set kernel buf_u", status);
status = clSetKernelArg(kernel, 8, sizeof(cl_float), &buf_v );
clerrchk("set kernel buf_v", status);
status = clSetKernelArg(kernel, 9, sizeof(cl_float), &buf_w );
clerrchk("set kernel buf_w", status);
status = clSetKernelArg(kernel,10, sizeof(cl_float), &buf_c );
clerrchk("set kernel buf_c", status);
// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t glbworksiz[3] = {nx,ny,nz};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 3, NULL, glbworksiz, NULL, 0, NULL, NULL);
clerrchk("enqueue nd range kernel", status);
// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tf, CL_TRUE, 0, p_siz, p_tf, 0, NULL, NULL);
clerrchk("enqueue read buffer", status);
start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
if ((retval = nc_close(ncid)))
err(retval);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseMemObject(buf_nt);
clReleaseMemObject(buf_u);
clReleaseMemObject(buf_v);
clReleaseMemObject(buf_w);
clReleaseMemObject(buf_c);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
printf("\nDone. . .\n");
return 0;
}
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
int i_min = 50,
i_max = 70,
j_min = 50,
j_max = 70;
for(k=0;k<in_z_siz+2;k++)
for(j=0;j<in_y_siz+2;j++)
for(i=0;i<in_x_siz+2;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 0.0;
for(k=1;k<in_z_siz+1;k++)
for(j=j_min;j<j_max;j++)
for(i=i_min;i<i_max;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 3.0;
}
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
Kernel
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, global float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
kernel void leapfrog3d(
const int x_siz,
const int y_siz,
const int z_siz,
const int t_siz,
global float *in_p_tf,
global float *in_p_tn,
global float *in_p_tp,
const float u_vel,
const float v_vel,
const float w_vel,
const float c
)
{
int nx = x_siz;
int ny = y_siz;
int nz = z_siz;
int nt = t_siz;
float u = u_vel;
float v = v_vel;
float w = w_vel;
float C = c ;
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;
for(int t=1;t<t_siz;t++)
{
idx0 = i + j * (nx+2) + k * (nx+2) * (ny+2);
idx_i0 = (i+1) + j * (nx+2) + k * (nx+2) * (ny+2);
idx_j0 = i + (j+1) * (nx+2) + k * (nx+2) * (ny+2);
idx_k0 = i + j * (nx+2) + (k+1) * (nx+2) * (ny+2);
idx_i1 = (i-1) + j * (nx+2) + k * (nx+2) * (ny+2);
idx_j1 = i + (j-1) * (nx+2) + k * (nx+2) * (ny+2);
idx_k1 = i + j * (nx+2) + (k-1) * (nx+2) * (ny+2);
in_p_tf[idx0] = in_p_tp[idx0]
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
pbndry(nx,ny,nz,in_p_tf);
in_p_tp = in_p_tn;
in_p_tn = in_p_tf;
}
}
journalctl log
Aug 19 13:17:12 Angke systemd-coredump[121514]: Process 121510 (lpf.gpu) of user 1000 dumped core.
Found module /home/rangke/prog_works/opencl/adv/lpf.gpu with build-id: 8036638bab286ce6a1e81a4983bb68d19a7145fd
Found module linux-vdso.so.1 with build-id: 27d9b8c0c25b172c86a29351e47701c1d0676035
Found module libamdocl12cl64.so without build-id.
Found module libgcc_s.so.1 with build-id: 7f8508bb914546ada778809b64b99d234337d835
Found module libstdc++.so.6 with build-id: 8ab0e57054dd1dcba681f217016afc6a4e639783
Found module libamd_comgr.so.1 with build-id: 438b1fbf7c58fd6a5b555a7283a58ee1eb1808f0
Found module libdrm.so.2 with build-id: 3aeff5403ca8d7589eabc05752eb613937f454a1
Found module libdrm_amdgpu.so.1 with build-id: a89ceb7c9082e5276f39023716eb4d194e75f6b8
Found module libamdocl-orca64.so without build-id.
Found module librt.so.1 with build-id: 75484da2d6f1515189eefa076e0a40328834cd16
Found module libamdocl64.so without build-id.
Found module libresolv.so.2 with build-id: c915c72668282861a813f7ea3c0780f37b681dc0
Found module libkeyutils.so.1 with build-id: ac405ddd17be10ce538da3211415ee50c8f8df79
Found module libkrb5support.so.0 with build-id: c4ee4ad1dc2da57487bc2419b88f1b6873184582
Found module libcom_err.so.2 with build-id: eb61ef71c8b97846db759fb89a115405cff6dd30
Found module libk5crypto.so.3 with build-id: 632a59ed7c35704d84645e6d1e9873348d1eb802
Found module libkrb5.so.3 with build-id: c61cb4da63b8a839ee7df99eaf9dbd3d0968534c
Found module libunistring.so.2 with build-id: 015ac6d6bcb60b7d8bea31a80d1941b06e8636ab
Found module libpthread.so.0 with build-id: 07c8f95b4f3251d08550217ad8a1f31066229996
Found module libzstd.so.1 with build-id: 4b10444c1560ebc574af4d5f488b7408b22d450e
Found module libgssapi_krb5.so.2 with build-id: 9be9d3348399b72b76161a64e6d9fd760b77163a
Found module libcrypto.so.1.1 with build-id: 81b77a8e0b6e1c0db19644a5f120890f02762021
Found module libssl.so.1.1 with build-id: 99394a6653d9c107f2e9b730bbbfd18ed43ae3b9
Found module libpsl.so.5 with build-id: 0229a201aaf5652186c9fdc192ebe52baf19d7f1
Found module libssh2.so.1 with build-id: 7f6d9edd2e793b266cae4f22fc1ba7b6b401c08c
Found module libidn2.so.0 with build-id: 1ce2b50ad9f9821c2c629b521cf5a3c99593d332
Found module libnghttp2.so.14 with build-id: 5ca39b42e7cb2770878644d57e88677df6336815
Found module libz.so.1 with build-id: 81bf6e728a6d6f5b105b0f8b25f6c614ce10452a
Found module libsz.so.2 with build-id: c114ff6d6bb52989ad7099aacac51780e5ef418e
Found module libcurl.so.4 with build-id: 49c0cb842d0e0dad11c435b7fb88b3d88b8a43ac
Found module libm.so.6 with build-id: 2b8fd1f869ecab4e0b55e92f2f151897f6818acf
Found module libhdf5.so.200 with build-id: 553f354452b0af3e7232580b8dff9e0c6584830b
Found module libhdf5_hl.so.200 with build-id: 2b0926fbab5318a556eb524497edb6e78099ff60
Found module ld-linux-x86-64.so.2 with build-id: 040cc3dd10461562f177df39e3be2f3704258c3c
Found module libdl.so.2 with build-id: 5abc547e7b0949f89f3c0e21ab0c8331a7440a8a
Found module libc.so.6 with build-id: 4b406737057708c0e4c642345a703c47a61c73dc
Found module libnetcdf.so.18 with build-id: f68d6c8120acfeaaa265d3b0750d24b669671124
Found module libOpenCL.so.1 with build-id: 4f566e048bc3b8112ba357ef29a3affd5858ccdf
Stack trace of thread 121510:
#0 0x00007efc82a9ad71 clEnqueueWriteBuffer (libamdocl64.so + 0xc8ad71)
#1 0x00007efc88a2d28d clEnqueueWriteBuffer (libOpenCL.so.1 + 0x1228d)
#2 0x000055d84a146dcd n/a (/home/rangke/prog_works/opencl/adv/lpf.gpu + 0x2dcd)
#3 0x00007efc88726b25 __libc_start_main (libc.so.6 + 0x27b25)
#4 0x000055d84a14528e n/a (/home/rangke/prog_works/opencl/adv/lpf.gpu + 0x128e)
Mind you that lpf.gpu is the executable that I ran.

Related

Dealing with 3D array with OpenCL, and program build error with invalid operand error

I am writing this OpenCL code that solves an advection equation using leapfrog scheme. I think I've setup the host code and the kernel code correctly but I am getting CL_BUILD_PROGRAM_FAILURE during the kernel compilation.
I did look into the kernel compilation log and here is what I get
/tmp/OCL114018T1.cl:72:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:73:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:74:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
~~~~~ ^ ~
/tmp/OCL114018T1.cl:76:32: error: passing '__global float *' to parameter of type 'float *' changes address space of pointer
pbndry(x_siz,y_siz,z_siz,in_p_tf);
^~~~~~~
/tmp/OCL114018T1.cl:1:62: note: passing argument to parameter 'in_arr' here
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
^
4 errors generated.
error: Clang front-end compilation failed!
Frontend phase failed compilation.
Error: Compiling CL to IR
seems to me that u_vel and C are both float so that it should not be a problem. What am I doing wrong here?
Below is the host code and the kernel code.
Host code
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include "cl_err.h"
// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define fname "/home/rangke/temp/leap3d.nc"
// Variable sizes and dimensions (constants)
#define ndims 4
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
int main()
{
int i,j,k;
int Nx = 128,
Ny = 128,
Nz = 16,
Nt = 1000;
int *p_nx = &Nx,
*p_ny = &Ny,
*p_nz = &Nz,
*p_nt = &Nt;
float u = 0.0,
v = 5.0,
w = 0.0,
dtdl = 0.01;
float *p_u = &u,
*p_v = &v,
*p_w = &w,
*p_dtdl = &dtdl;
// p_tf : p at future
// p_tn : p at now
// p_tp : p at past
float q_tf[Nz+2][Ny+2][Nx+2];
float q_tn[Nz+2][Ny+2][Nx+2];
float q_tp[Nz+2][Ny+2][Nx+2];
float (*p_tf)[Ny+2][Nx+2] = q_tf;
float (*p_tn)[Ny+2][Nx+2] = q_tn;
float (*p_tp)[Ny+2][Nx+2] = q_tp;
size_t p_siz = sizeof(float) * (Nx+2) * (Ny+2) * (Nz+2);
size_t n_siz = sizeof(int) * 1,
c_siz = sizeof(float) * 1;
int ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int dimids[ndims];
size_t start[ndims], count[ndims];
// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
err(retval);
// Define dimensions
if ((retval = nc_def_dim(ncid, "z", Nz+2, &z_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "y", Ny+2, &y_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "x", Nx+2, &x_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
err(retval);
// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;
// Variable for writing netCDF data one timestep at a time
count[0] = 1; // For time dimension : 1 timestep
count[1] = Nz+2; // For z : write everything
count[2] = Ny+2; // For y : write everything
count[3] = Nx+2; // For x : write everything
start[1] = 0; // For z : don't do anything
start[2] = 0; // For y : don't do anything
start[3] = 0; // For x : don't do anything
printf("line 231\n");
if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
err(retval);
if ((retval = nc_enddef(ncid)))
err(retval);
data_init(Nx,Ny,Nz,(float*)p_tf);
data_init(Nx,Ny,Nz,(float*)p_tn);
data_init(Nx,Ny,Nz,(float*)p_tp);
// for(i=1;i<123;i++)
// printf("",p_tf[])
// Euler scheme for the first time step
for(k=1;k<Nz+1;k++)
for(j=1;j<Ny+1;j++)
for(i=1;i<Nx+1;i++)
{
p_tf[k][j][i] = p_tn[k][j][i]
- u * dtdl * (p_tn[k][j][i] - p_tn[k][j][i-1])
- v * dtdl * (p_tn[k][j][i] - p_tn[k][j-1][i])
- w * dtdl * (p_tn[k][j][i] - p_tn[k-1][j][i]);
}
pbndry(Nx,Ny,Nz,(float*)p_tf);
p_tp = p_tn;
p_tn = p_tf;
start[0] = 0;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
// OpenCL part //
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("line 299\n");
// for(i=0;i<numDevices;i++)
// {
// status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
//
// printf("Device Name #%d: %s\n", i, name_data);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(comp_units), &comp_units, NULL);
//
// printf("Max Work-Group %d\n", comp_units);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
//
// printf("Double FP config = %llu, Support? = %d\n", cfg, status);
// }
printf("line 313\n");
// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
printf("line 317\n");
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("line 323\n");
cl_mem buf_p_tf = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
cl_mem buf_p_tn = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_p_tp = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_nx = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_ny = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nz = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nt = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_u = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_v = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_w = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_c = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
printf("line 335\n");
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, p_tf ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tn , CL_FALSE, 0, p_siz, p_tn ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tp , CL_FALSE, 0, p_siz, p_tp ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nx , CL_FALSE, 0, n_siz, p_nx ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_ny , CL_FALSE, 0, n_siz, p_ny ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nz , CL_FALSE, 0, n_siz, p_nz ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nt , CL_FALSE, 0, n_siz, p_nt ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_u , CL_FALSE, 0, c_siz, p_u ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_v , CL_FALSE, 0, c_siz, p_v ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_w , CL_FALSE, 0, c_siz, p_w ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_c , CL_FALSE, 0, c_siz, p_dtdl,0, NULL, NULL);
printf("line 348\n");
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog.cl","r");
printf("line 357\n");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '\0';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);
printf("line 366\n");
program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
printf("line 370\n");
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
printf("Code : %d\n",status);
printf("Program 1 %s\n",getErrorString(status));
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
// Create a kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "leapfrog3d", &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &buf_nx );
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &buf_ny );
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &buf_nz );
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nt );
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_p_tf);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_p_tn);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buf_p_tp);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buf_u );
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buf_v );
status = clSetKernelArg(kernel, 9, sizeof(cl_mem), &buf_w );
status = clSetKernelArg(kernel,10, sizeof(cl_mem), &buf_c );
// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t glbworksiz[3] = {Nx,Ny,Nz};
printf("\nLine 395\n");
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 3, NULL, glbworksiz, NULL, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 401\n");
// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tf, CL_TRUE, 0, p_siz, p_tf, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 407\n");
start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
if ((retval = nc_close(ncid)))
err(retval);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseMemObject(buf_nt);
clReleaseMemObject(buf_u);
clReleaseMemObject(buf_v);
clReleaseMemObject(buf_w);
clReleaseMemObject(buf_c);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
printf("\nDone. . .\n");
return 0;
}
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
int i_min = 50,
i_max = 70,
j_min = 50,
j_max = 70;
for(k=0;k<in_z_siz+2;k++)
for(j=0;j<in_y_siz+2;j++)
for(i=0;i<in_x_siz+2;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 0.0;
for(k=1;k<in_z_siz+1;k++)
for(j=j_min;j<j_max;j++)
for(i=i_min;i<i_max;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 3.0;
}
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
Kernel code
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
kernel void leapfrog3d(
const int x_siz,
const int y_siz,
const int z_siz,
const int t_siz,
global float *in_p_tf,
global float *in_p_tn,
global float *in_p_tp,
global float *u_vel,
global float *v_vel,
global float *w_vel,
global float *C
)
{
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;
for(int t=1;t<t_siz;t++)
{
idx0 = i + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_i0 = (i+1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j0 = i + (j+1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k0 = i + j * (x_siz+2) + (k+1) * (x_siz+2) * (y_siz+2);
idx_i1 = (i-1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j1 = i + (j-1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k1 = i + j * (x_siz+2) + (k-1) * (x_siz+2) * (y_siz+2);
in_p_tf[idx0] = in_p_tp[idx0]
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
pbndry(x_siz,y_siz,z_siz,in_p_tf);
in_p_tp = in_p_tn;
in_p_tn = in_p_tf;
}
}
Two things:
C is an array or a pointer, but you access it as if it were a scalar value. Use C[some_index] for accessing the array elements. If it is just a constant, use (*C) or C[0].
x_siz/y_siz/z_siz/t_siz are all in global memory space because they are kernel arguments, regardless if you explicitly write global const int x_siz or const int x_siz. You need to make private kernel variables, set these to the global variables, and pass the private ones to the function, because function parameters are private by default. So in the kernel, make a variable int x_siz_private = x_siz; and pass that to the function call. Variables declared in a kernel are in private memory space by default, so you don't need to write private explicitly. In the assembly, this corresponds to a ld (load from global memory to register) instruction.

How to make an OpenCL program run for large data set?

I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on NVIDIA GPU.
Here is the code :
OpenCL file is :
#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h"
int main()
{
cl_uint ret_num_platforms;
cl_uint ret_num_devices;
cl_platform_id platform_id = NULL;
cl_kernel kernel2 = NULL;
cl_program program2 = NULL;
cl_command_queue command_queue = NULL;
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_int ret;
FILE * fp2;
char fileName2[]="./kernel.cl";
int for_var=0;
char * source_str2;
size_t source_size2;
size_t globalWorkSize[1];
size_t localWorkSize[1];
cl_mem cl_buffer3;
cl_mem cl_buffer2;
cl_mem cl_buffer1;
cl_mem cl_buffer0;
int *A;
int *B;
int *C;
int *n;
int i;
n = ((int *)(malloc((sizeof(int )))));
printf("Enter the number of elements of vector : \n");
scanf("%d",n);
A = ((int *)(malloc((( *n) * sizeof(int )))));
B = ((int *)(malloc((( *n) * sizeof(int )))));
C = ((int *)(malloc((( *n) * sizeof(int )))));
printf("\nInput Vector1 :\n");
for (i = 0; i <= *n - 1; i += 1) {
A[i] = (2 * i);
printf("%d ",A[i]);
}
printf("\n\nInput Vector2 :\n");
for (i = 0; i <= *n - 1; i += 1) {
B[i] = (3 * i);
printf("%d ",B[i]);
}
ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
if (ret != CL_SUCCESS) {
printf("Platform error");
}
ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
if (ret != CL_SUCCESS)
printf("device err");
context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
if (!context)
printf("context err");
command_queue = clCreateCommandQueue(context,device_id,0,&ret);
if (!command_queue)
printf("command queue error");
localWorkSize[0] = 16;
globalWorkSize[0] =16400;
cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &ret);
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
fp2 = fopen(fileName2,"r");
if (!fp2) {
fprintf(stderr,"Failed");
exit(1);
}
source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
fclose(fp2);
program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
if(!program2)
printf("error creating program2");
ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
if (ret)
printf("error building program2");
kernel2 = clCreateKernel(program2, "ADD" , &ret);
ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
printf("\n\nAddition of vectors :\n");
for (i = 0; i <= *n - 1; i += 1) {
printf("%d ",C[i]);
}
clReleaseMemObject(cl_buffer0);
clReleaseMemObject(cl_buffer1);
clReleaseMemObject(cl_buffer2);
clReleaseMemObject(cl_buffer3);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
Kernel file is(kernel.cl) :
__kernel void ADD(__constant int *A,__constant int *B,__global int *C,__constant int *n)
{
int i = get_global_id(0);
if (i <= *n - 1) {
C[i] = (A[i] + B[i]);
}
}
The program works fine if I give 16384 as total vector elements but it gives 0 as output for values more than that. I want to run this program with large data set so that I can compare its performance with the one running on CPU.
Please guide me how can I do so?
There's at least one bug in your code - you're copying MEM_SIZE * sizeof(int) bytes from n to buffer 3:
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
however, n is only sizeof(int) bytes long:
n = ((int *)(malloc((sizeof(int )))));
I don't know what problems this might be causing, and it's entirely possible there are other, more severe bugs, but this one certainly isn't helping.

OpenCL generate SHA-256 hash

I need help with OpenCL.
The task is as follows:
There is an input parameter of type string. It is necessary to generate a SHA-256 hash using the resources of the video card.
It is necessary to create a cycle to select a hash. Each time add some postfix to the original string.
Result*Hash should start with 5 zeros "00000 ...".
For example, the entrance. parameter: "strela".
SHA-256: "7d7ceecdee08ea1c0ac46b27657a79395af36526b3214b59a92f8351ccf8f762"
Next, you need to add a postfix. For example, "strela1"
Here the hash will be: a2afd15651f44f19f3e4e216bf3ead22d5f5937e9f9dc250382ff1f764ba219f
then continue to add the postfix until the resulting hash begins to start with "00000.."
It is necessary to use all the cores of the video card, i.e. use parallelization. Each core will use its postfix.
As soon as some kernel computes the hash we need, interrupt all calculations on the cores and display the hash we need.
Source:
main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
#include <stdio.h>
#include < string.h >
void crypt_and_print(char input[])
{
char result[65];
char diff[65] = "00000";
char *istr;
char buffer2[20];
int temp;
char str2[20];
for (int i = 0; i < 1; i++)
{
char string[] = "1qqq";
sprintf(buffer2, "%d", i);
temp = 8 - strlen(buffer2);
str2[0] = '\0';
while (strlen(str2) != temp)
strcat(str2, "0");
strcat(str2, buffer2);
strcat(string, str2);
sha256_crypt(string, result);
istr = strstr(result, diff);
if (istr != NULL) {
printf(istr);
break;
}
}
}
int main()
{
char result[65];
sha256_init(2048);
crypt_and_print((char*)"");
}
sha256.c
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size=3;
static size_t local_work_size=1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void crypt_all();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
void sha256_crypt(char input[], char* output)
{
int i;
string_len = strlen(input);
global_work_size = 3;
datai[0] = SHA256_PLAINTEXT_LENGTH;
datai[1] = global_work_size;
datai[2] = string_len;
memcpy(saved_plain, input, string_len+1);
crypt_all();
for(i=0; i<SHA256_RESULT_SIZE; i++)
{
sprintf(output+i*8,"%08x", partial_hashes[i]);
}
printf("'%s':\n%s\n", input, output);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE *fp;
fp = fopen("/sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
}
void create_clobj(){
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
}
sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
//printf(get_global_id(0));
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0] = H0;
digest[1] = H1;
digest[2] = H2;
digest[3] = H3;
digest[4] = H4;
digest[5] = H5;
digest[6] = H6;
digest[7] = H7;
for(item=0; item<total; item++)
{
A = digest[0];
B = digest[1];
C = digest[2];
D = digest[3];
E = digest[4];
F = digest[5];
G = digest[6];
H = digest[7];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop+get_global_id(0) ; t++){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3];
// printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
// printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
digest[5] += F;
digest[6] += G;
digest[7] += H;
}
printf("hi");
}
How can i use here paralelism (all GPU cores) to calculate needed hash code?
Is it real to do task like this using OPENCL ?

OpenCL gemm kernel local memory going slower

EDIT: It was my cards fault... The local memory kernel goes a few times faster, sorry all!!
I am writing a simple sgemm (square, alpha=1, beta=0) that is supposed to take advantage of local memory, but it performs at half the speed of a naive version.
Here are the kernels:
const char* matrixMultiplySource =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int i = get_local_id(0);\n"
" int j = get_local_id(1);\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n"
" __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n"
" float val=0.0f;\n"
" for ( int index = 0; index < sizeG0; index += BLOCK_SIZE )\n"
" {\n"
" localA[j][i] = A[ig + sizeG0 * (index+j)];\n"
" localB[j][i] = B[index+i + sizeG0 * jg];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" #pragma unroll\n"
" for ( int kk = 0; kk < BLOCK_SIZE; ++kk)\n"
" {\n"
" val = val + localA[kk][i] * localB[j][kk];\n"
" }\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
const char* matrixMultiplySource2 =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" float val=0;\n"
" for ( int k = 0; k < sizeG0; k++)\n"
" {\n"
" val = val + A[ig + k * sizeG0] * B[k + jg * sizeG0];\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
BLOCK_SIZE is 16 and I am using 1024x1024 matrices as well as warming up.
// Create OpenCL context
context = mycl::myclCreateContext( NULL, ret_num_devices, devices, NULL, NULL, &ret);
// Create Command Queue
command_queue = mycl::myclCreateCommandQueue(context, devices[0], 0, &ret);
// Create Memory Buffer
memobjA = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = mycl::myclCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = mycl::myclEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = mycl::myclEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
// Create Kernel Program from the source
program = mycl::myclCreateProgramWithSource(context, 1, (const char **)&matrixMultiplySource,
NULL, &ret);
// Build Kernel Program
ret = mycl::myclBuildProgram(program, ret_num_devices, devices, "-D BLOCK_SIZE=16", NULL, NULL);
if(ret != CL_SUCCESS){cout << "PROBREM! " << ret << endl;return -1;}
// Create OpenCL Kernel
kernel = mycl::myclCreateKernel(program, "matrixMultiply", &ret);
size_t globalThreads[2] = {heightA, widthB};
size_t localThreads[2] = {BLOCK_SIZE, BLOCK_SIZE};
// Set OpenCL Kernel Arguments
ret = mycl::myclSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = mycl::myclSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = mycl::myclSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
// Time the kernel
struct timeval timev1, timev2;
float time_seconds = 0.0f;
mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
mycl::myclFinish(command_queue);
gettimeofday(&timev1, NULL);
ret = mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
ret = mycl::myclFinish(command_queue);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
gettimeofday(&timev2,NULL);
time_seconds=(timev2.tv_sec-timev1.tv_sec)+0.000001*(timev2.tv_usec- timev1.tv_usec);
Have you looked at the two kernels in the AMD APP KernelAnalyzer or equivalent tools? These tools compile the Kernels and show their predicted performance characteristics
You use
barrier(CLK_GLOBAL_MEM_FENCE);
where I would expect to see
barrier(CLK_LOCAL_MEM_FENCE);
as you write in the loop to local memory.
Further I doubt that the copy to localA does help you -- at one time every items there is only accessed once.

OpenCL Error Computing Matrix Multiplication during Runtime

I have been debugging for the past few days and cannot get this OpenCL matrix multiplication kernel to run. Whenever I run the program, the output from the GPU results in large negative numbers similar to -198746573.0000. I was wondering if someone with HPC experience could point out an error in my code or if it is an error with the driver.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define widthA 2
#define heightA 2
#define widthB heightA
#define heightB 2
#define widthC widthA
#define heightC heightB
#ifdef __APPLE__
#include < OpenCL/opencl.h >
#else
#include <opencl.h>
#endif
#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
float * A = (float *)malloc(sizeof(float)*widthA*heightA);
float * B = (float *)malloc(sizeof(float)*widthB*heightB);
float * C = (float *)malloc(sizeof(float)*widthC*heightC);
float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
float * D= (float *)malloc(sizeof(float)*widthC*heightC);
float ref[widthC][heightC];
int i, j, k;
FILE * fp1 = fopen("matAdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightA; j++) {
float p=(rand()%100)/7.0;
//*(A+i*heightA+j)=rand()%100 + p;
*(A+i*heightA+j)=4.0;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
fp1 = fopen("matBdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthB; i++)
{
for(j=0; j < heightB; j++) {
float p=(rand()%100)/7.0;
//*((B+i*heightB+j))=rand()%100 + p;
*((B+i*heightB+j))=4.0;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobjA = NULL;
cl_mem memobjB = NULL;
cl_mem memobjC = NULL;
cl_mem rowA = NULL;
cl_mem colC = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id[10];
cl_platform_id platform = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event GPUDone[0];
//char string[MEM_SIZE];
FILE *fp;
char fileName[] = "matrixMultiplication.cl";
char *source_str;
size_t source_size;
int row = widthA;
int col = heightC;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
/* Get Platform and Device Info */
ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms);
char cBuffer[1024];
cl_uint c;
for(c = 0; c < ret_num_platforms; c++)
{
clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
if (strstr(cBuffer, "NVIDIA") != NULL)
{
platform = platform_id[c];
break;
}
}
printf("Found Platform %s\n", cBuffer);
ret = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Found %d devices.\n", ret_num_devices);
/* Create OpenCL context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "matrixMultiplication", &ret);
/* Set OpenCL Kernel Arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
/* Execute OpenCL Kernel */
//ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
size_t globalThreads[2] = {widthA, heightB};
size_t localThreads[2] = {16,16};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
//clFlush(command_queue);
//clFinish(command_queue);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]);
printf("Buffer Read ended with %d.\n", ret);
clWaitForEvents(1, GPUDone);
fp1 = fopen("matGPURes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(Res+i*heightC+j));
ref[i][j] = *(Res+i*heightC+j);
printf("GPU Output: %f\n", *(Res+i*heightC+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobjA);
ret = clReleaseMemObject(memobjB);
ret = clReleaseMemObject(memobjC);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
ret = clReleaseEvent(GPUDone[0]);
free(source_str);
float sum=0.0;
for(i = 0;i < widthA; i++)
{
for(j = 0; j < heightC; j++)
{
sum = 0;
for(k = 0; k < widthB; k++)
{
sum += A[i*col+k] * B[k*row+j];
printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]);
}
D[i*heightC+j] = sum;
}
}
fp1 = fopen("matNormalMultiplicationRes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n");
exit(1);
}
for(i = 0; i<widthA; i++)
{
for(j = 0; j<heightA; j++)
{
if (ref[i][j] != D[i*heightA+j])
{
printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]);
}
}
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(D+i*heightC+j));
}
fprintf(fp1, "\n");
}
free(A);
free(B);
free(C);
free(D);
free(Res);
return 0;
}
Here is the kernel
#define BLOCK_SIZE 16
__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB )
{
//int i = get_global_id(0);
//int j = get_global_id(1);
float Csub = 0.0f;
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
for (int a = aBegin, b=bBegin;
a <= aEnd;
a += aStep, b+=bStep)
{
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
for( int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
/*
float value=0;
for ( int k = 0; k < widthA; k++)
{
value = value + A[k + j * widthA] * B[k*widthB + i];
}
C[i + widthA * j] = value;
*/
}
I have double checked over and over again but simply cannot find any errors. I want to make sure its not a code error before I conclude its a driver issue.
Thanks!
Do you really need a complex kernel like that ? if you really want to do simple matrix multiplication
you can write a simple kernel like this, which is easy to debug.
__kernel void matrixMultiplication (__global float* A,
__global float* B,
__global float* C,
int widthA, int widthB )
{
//y direction
int row = get_global_id(1);
int col = get_global_id(0);
float cSum = 0.0f;
//calculate the result
for (int i=0; i<widthA; i++)
{
cSum += A[row*widthA+ i] * B[i*widthB+col];
}
C[row*widthB+col] = cSum;
}
Case is probably closed already, but for the sake of google-comers:
Shouldnt shared memory be explicitly declared on host and passed as kernel argument to the source? __local keyword is not the one you are looking for in this case.
See post on How to declare local memory in OpenCL? for the detailed explanation.
Check the functionality of your host. Here a few things to get you started ...
1) You don't need to create a buffer and enqueue it for a scalar constant Int like row and col. Just set it as a kernel arg.
2) Wait for the clEnqueueNDRangeKernel with an event. You want to be sure the calc has completed.
3) Add a printf statement in the kernel to print selected values to see that the input and output values are what you expect.
try
if ( get_local_id(0) % 8 == 0)
{
printf some useful value of a,b,c
}
3) Try the host code with a dumb kernel that copies an input array to an output array. That will confirm it you have the handling of buffer creation and the enqeue read/write code correct!

Resources