OpenCL generate SHA-256 hash - opencl

I need help with OpenCL.
The task is as follows:
There is an input parameter of type string. It is necessary to generate a SHA-256 hash using the resources of the video card.
It is necessary to create a cycle to select a hash. Each time add some postfix to the original string.
Result*Hash should start with 5 zeros "00000 ...".
For example, the entrance. parameter: "strela".
SHA-256: "7d7ceecdee08ea1c0ac46b27657a79395af36526b3214b59a92f8351ccf8f762"
Next, you need to add a postfix. For example, "strela1"
Here the hash will be: a2afd15651f44f19f3e4e216bf3ead22d5f5937e9f9dc250382ff1f764ba219f
then continue to add the postfix until the resulting hash begins to start with "00000.."
It is necessary to use all the cores of the video card, i.e. use parallelization. Each core will use its postfix.
As soon as some kernel computes the hash we need, interrupt all calculations on the cores and display the hash we need.
Source:
main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
#include <stdio.h>
#include < string.h >
void crypt_and_print(char input[])
{
char result[65];
char diff[65] = "00000";
char *istr;
char buffer2[20];
int temp;
char str2[20];
for (int i = 0; i < 1; i++)
{
char string[] = "1qqq";
sprintf(buffer2, "%d", i);
temp = 8 - strlen(buffer2);
str2[0] = '\0';
while (strlen(str2) != temp)
strcat(str2, "0");
strcat(str2, buffer2);
strcat(string, str2);
sha256_crypt(string, result);
istr = strstr(result, diff);
if (istr != NULL) {
printf(istr);
break;
}
}
}
int main()
{
char result[65];
sha256_init(2048);
crypt_and_print((char*)"");
}
sha256.c
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size=3;
static size_t local_work_size=1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void crypt_all();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
void sha256_crypt(char input[], char* output)
{
int i;
string_len = strlen(input);
global_work_size = 3;
datai[0] = SHA256_PLAINTEXT_LENGTH;
datai[1] = global_work_size;
datai[2] = string_len;
memcpy(saved_plain, input, string_len+1);
crypt_all();
for(i=0; i<SHA256_RESULT_SIZE; i++)
{
sprintf(output+i*8,"%08x", partial_hashes[i]);
}
printf("'%s':\n%s\n", input, output);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE *fp;
fp = fopen("/sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
}
void create_clobj(){
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
}
sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
//printf(get_global_id(0));
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0] = H0;
digest[1] = H1;
digest[2] = H2;
digest[3] = H3;
digest[4] = H4;
digest[5] = H5;
digest[6] = H6;
digest[7] = H7;
for(item=0; item<total; item++)
{
A = digest[0];
B = digest[1];
C = digest[2];
D = digest[3];
E = digest[4];
F = digest[5];
G = digest[6];
H = digest[7];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop+get_global_id(0) ; t++){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3];
// printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
// printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
digest[5] += F;
digest[6] += G;
digest[7] += H;
}
printf("hi");
}
How can i use here paralelism (all GPU cores) to calculate needed hash code?
Is it real to do task like this using OPENCL ?

Related

How to make an OpenCL program run for large data set?

I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on NVIDIA GPU.
Here is the code :
OpenCL file is :
#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h"
int main()
{
cl_uint ret_num_platforms;
cl_uint ret_num_devices;
cl_platform_id platform_id = NULL;
cl_kernel kernel2 = NULL;
cl_program program2 = NULL;
cl_command_queue command_queue = NULL;
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_int ret;
FILE * fp2;
char fileName2[]="./kernel.cl";
int for_var=0;
char * source_str2;
size_t source_size2;
size_t globalWorkSize[1];
size_t localWorkSize[1];
cl_mem cl_buffer3;
cl_mem cl_buffer2;
cl_mem cl_buffer1;
cl_mem cl_buffer0;
int *A;
int *B;
int *C;
int *n;
int i;
n = ((int *)(malloc((sizeof(int )))));
printf("Enter the number of elements of vector : \n");
scanf("%d",n);
A = ((int *)(malloc((( *n) * sizeof(int )))));
B = ((int *)(malloc((( *n) * sizeof(int )))));
C = ((int *)(malloc((( *n) * sizeof(int )))));
printf("\nInput Vector1 :\n");
for (i = 0; i <= *n - 1; i += 1) {
A[i] = (2 * i);
printf("%d ",A[i]);
}
printf("\n\nInput Vector2 :\n");
for (i = 0; i <= *n - 1; i += 1) {
B[i] = (3 * i);
printf("%d ",B[i]);
}
ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
if (ret != CL_SUCCESS) {
printf("Platform error");
}
ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
if (ret != CL_SUCCESS)
printf("device err");
context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
if (!context)
printf("context err");
command_queue = clCreateCommandQueue(context,device_id,0,&ret);
if (!command_queue)
printf("command queue error");
localWorkSize[0] = 16;
globalWorkSize[0] =16400;
cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &ret);
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
fp2 = fopen(fileName2,"r");
if (!fp2) {
fprintf(stderr,"Failed");
exit(1);
}
source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
fclose(fp2);
program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
if(!program2)
printf("error creating program2");
ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
if (ret)
printf("error building program2");
kernel2 = clCreateKernel(program2, "ADD" , &ret);
ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
printf("\n\nAddition of vectors :\n");
for (i = 0; i <= *n - 1; i += 1) {
printf("%d ",C[i]);
}
clReleaseMemObject(cl_buffer0);
clReleaseMemObject(cl_buffer1);
clReleaseMemObject(cl_buffer2);
clReleaseMemObject(cl_buffer3);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
Kernel file is(kernel.cl) :
__kernel void ADD(__constant int *A,__constant int *B,__global int *C,__constant int *n)
{
int i = get_global_id(0);
if (i <= *n - 1) {
C[i] = (A[i] + B[i]);
}
}
The program works fine if I give 16384 as total vector elements but it gives 0 as output for values more than that. I want to run this program with large data set so that I can compare its performance with the one running on CPU.
Please guide me how can I do so?
There's at least one bug in your code - you're copying MEM_SIZE * sizeof(int) bytes from n to buffer 3:
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
however, n is only sizeof(int) bytes long:
n = ((int *)(malloc((sizeof(int )))));
I don't know what problems this might be causing, and it's entirely possible there are other, more severe bugs, but this one certainly isn't helping.

How do I send/read data from device in OpenCL?

How do I just create some data on device and then send/read it on the host?
I tried following but does not seem to work.
#include <stdio.h>
#include <stdlib.h>
#include "vectors.h"
#include "sphere.h"
#include "shading.h"
#include "ray.h"
#include "stdbool.h"
#include <CL/cl.h>
#if defined __APPLE__
#include <OpenGL/gl.h>
#include <OpenGL/glu.h>
#include <GLUT/glut.h>
#elif defined (WIN32)
#include <GL/freeglut.h>
#else
#include <GL/gl.h>
#include <GL/glu.h>
#include <GL/freeglut_std.h>
#endif
#include <time.h>
VECTOR3D light;
SPHERE sphere[NSPHERES];
static PIXEL pixel;
VIEWPORT viewport;
VECTOR3D view_point;
VEC_BASIS camera_frame;
cl_double focal_distance;
double color;
//double kd_red, kd_green, kd_blue;
//double ks_red, ks_green, ks_blue;
double red, green, blue;
double light_intensity, ambi_light_intensity;
double theta, reflected_theta;
int bShadow = 0;
int direction[NSPHERES];
int intersection_object = -1; // none
double current_lambda = 0x7fefffffffffffff; // maximum positive double
double current_reflected_lambda = 0x7fefffffffffffff; // maximum positive double
// window identifier:
static int win;
void Timer (int obsolete) {
glutPostRedisplay();
glutTimerFunc(10, Timer, 0);
}
// opencl stuff
typedef struct cl_struct {
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue queue;
} cl_struct;
#define MAX_SOURCE_SIZE (0x100000)
void disp2(void) {
int i,j;
PIXEL* pCurrentPixel;
PIXEL* pPixels;
int VPWIDTH = viewport.xvmax - viewport.xvmin;
int VPHEIGHT = viewport.yvmax - viewport.yvmin;
pPixels = (PIXEL*)(viewport.pPixels);
//clear all pixels:
glClear(GL_COLOR_BUFFER_BIT);
// For all pixels:
for (i=0; i<VPWIDTH; i++) {
for (j=0; j<VPHEIGHT; j++) {
pCurrentPixel = (PIXEL*)(pPixels + VPWIDTH*i + j);
//set color for the current pixel:
glColor3f(pCurrentPixel->rgb[0] , pCurrentPixel->rgb[1], pCurrentPixel->rgb[2]);
// draw pixel
glBegin(GL_POINTS);
glVertex2i(i, j);
glEnd();
} // j
} //i
//glFlush();
glutSwapBuffers();
}
void init(void) {
direction[0] = 1;
direction[1] = 0;
direction[2] = 1;
pixel.i = 0;
pixel.j = 0;
// set scene:
// 1. define viewport
viewport.xvmin = -VIEWPLANE;
viewport.yvmin = -VIEWPLANE;
viewport.xvmax = VIEWPLANE;
viewport.yvmax = VIEWPLANE;
// 2. allocate enough space for pixels in viewport
viewport.pPixels = (PIXEL *) malloc(sizeof(PIXEL) * (viewport.xvmax - viewport.xvmin) * (viewport.yvmax- viewport.yvmin));
// 3. set camera:
camera_frame.u.x = 1.0;
camera_frame.u.y = 0.0;
camera_frame.u.z = 0.0;
camera_frame.v.x = 0.0;
camera_frame.v.y = 1.0;
camera_frame.v.z = 0.0;
camera_frame.n.x = 0.0;
camera_frame.n.y = 0.0;
camera_frame.n.z = 1.0;
view_point.x = (viewport.xvmax - viewport.xvmin) / 2.0 ;
view_point.y = (viewport.yvmax - viewport.yvmin) / 2.0 ;
view_point.z = 0.0;
// 4. set light:
light.x = view_point.x - 1300;
light.y = view_point.y + 1300 ;
light.z = view_point.z - 300;
ambi_light_intensity = 1.0;
light_intensity = 1.0;
focal_distance = FOCALDIST;
// 5. put spheres behind the viewport:
sphere[0].radius = RADIUS/1.5;
sphere[0].center.x = view_point.x - (RADIUS+30);
sphere[0].center.y = view_point.y ;
sphere[0].center.z = view_point.z - focal_distance - (2*RADIUS+20);
// the first sphere is blue:
set_rgb_array(sphere[0].kd_rgb, 0.0, 0.0, 0.8);
set_rgb_array(sphere[0].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[0].ka_rgb, 0.0, 0.0, 0.2);
sphere[0].shininess = 100.0;
sphere[0].mirror = false;
sphere[1].radius = RADIUS/1.2;
sphere[1].center.x = view_point.x + 0;
sphere[1].center.y = view_point.y + 50;
sphere[1].center.z = view_point.z - focal_distance - (3*RADIUS+20);
// the second sphere is green:
set_rgb_array(sphere[1].kd_rgb, 0.0, 0.5, 0.0);
set_rgb_array(sphere[1].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[1].ka_rgb, 0.0, 0.2, 0.0);
sphere[1].shininess = 10.0;
sphere[1].mirror = false;
sphere[2].radius = RADIUS;
sphere[2].center.x = view_point.x + (2*RADIUS+30);
sphere[2].center.y = view_point.y + 100;
sphere[2].center.z = view_point.z - focal_distance - (4*RADIUS+20);
// the third sphere is red:
set_rgb_array(sphere[2].kd_rgb, 1.0, 0.0, 0.0);
set_rgb_array(sphere[2].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[2].ka_rgb, 0.2, 0.0, 0.0);
sphere[2].shininess = 100.0;
sphere[2].mirror = false;
sphere[3].radius = 1*RADIUS;
sphere[3].center.x = view_point.x ;
sphere[3].center.y = view_point.y - 100*RADIUS-130;
sphere[3].center.z = view_point.z - focal_distance - (4*RADIUS+20);
// the third sphere is red:
set_rgb_array(sphere[3].kd_rgb, 0.5, 0.5, 0.5);
set_rgb_array(sphere[3].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[3].ka_rgb, 0.5, 0.5, 0.5);
sphere[3].shininess = 100.0;
sphere[3].mirror = false;
// set clearing (background) color to white:
glClearColor(0.0, 0.0, 0.0, 0.0);
// specify that ortgogonal 2D projection is to be used to
// map context of 2D world coordinats to the screen. We use the
// world-coordinate rectangle of the same aspect ratio as the display window
// so ther is no distortion:
glMatrixMode(GL_PROJECTION);
gluOrtho2D(0.0, WINDOW, 0.0, WINDOW);
}
int main(int argc, const char * argv[]) {
clock_t startCPU, endCPU, startGPU, endGPU;
// init glut:
glutInit (&argc, argv);
// specify the display mode to be RGB and single buffering:
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
// specify the initial window position:
glutInitWindowPosition(100, 100);
// specify the initial window size:
glutInitWindowSize(WINDOW,WINDOW);
// create the window and set title:
win = glutCreateWindow("Basic Ray Tracer by Pa3cio, UL FRI");
init();
// Create the two input vectors
int i, j, k, l;
int VPWIDTH = viewport.xvmax - viewport.xvmin;
int VPHEIGHT = viewport.yvmax - viewport.yvmin;
// PIXEL* pixels = (PIXEL*) malloc(sizeof(PIXEL) * VPWIDTH * VPHEIGHT);
PIXEL* pPixelsFromGPU = (PIXEL*) malloc(sizeof(PIXEL) * VPWIDTH * VPHEIGHT);
PIXEL* pCurrentPixel;
PIXEL* pPixels;
RAY ray, shadow_ray;
SPHERE_INTERSECTION intersection, current_intersection, shadow_ray_intersection;
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("/home/rokj/sula/vpsa/seminarska/kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR,
VPWIDTH * VPHEIGHT * sizeof(PIXEL), NULL, &ret);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "compute_ray", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 1\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 1, sizeof(VECTOR3D), &view_point);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 2\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 2, sizeof(VECTOR3D), &camera_frame.n);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 3\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 3, sizeof(VECTOR3D), &camera_frame.u);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 4\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 4, sizeof(VECTOR3D), &camera_frame.v);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 5\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 5, sizeof(cl_int), &viewport.xvmin);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 6\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 6, sizeof(cl_int), &viewport.yvmin);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 7\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 7, sizeof(cl_double), &focal_distance);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 7\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 8, sizeof(cl_int), &VPWIDTH);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 9\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 9, sizeof(cl_int), &VPHEIGHT);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 10\n", stderr);
exit(1);
}
ret = clFinish(command_queue);
// Execute the OpenCL kernel on the list
size_t global_item_size = VPWIDTH * VPHEIGHT; // Process the entire lists
size_t local_item_size = 1024; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0,
VPWIDTH * VPHEIGHT * sizeof(PIXEL), pPixelsFromGPU, 0, NULL, NULL);
// Display the result to the screen
//for(i = 0; i < LIST_SIZE; i++)
// printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
// ret = clFlush(command_queue);
// ret = clFinish(command_queue);
// ret = clReleaseKernel(kernel);
// ret = clReleaseProgram(program);
// ret = clReleaseMemObject(b_mem_obj);
// ret = clReleaseCommandQueue(command_queue);
// ret = clReleaseContext(context);
pPixels = (PIXEL*) (pPixelsFromGPU);
// For all pixels:
for (i=0; i<VPWIDTH; i++) {
for (j=0; j<VPHEIGHT; j++) {
//pCurrentPixel = (PIXEL*)(pPixels + VPWIDTH*i + j);
// here I try to get pixel set on GPU, but it does not work
pCurrentPixel = &pPixels[i*VPWIDTH+j];
} //j
} //i
viewport.pPixels = (PIXEL*) (pPixelsFromGPU);
// register callback function to display graphics:
glutDisplayFunc(disp2);
// call Timer():
Timer(0);
// enter tha main loop and process events:
glutMainLoop();
free(pPixelsFromGPU);
return 0;
}
Definitions on host:
#include <CL/cl.h>
#ifndef DEFS_H
#define DEFS_H
#define _BLINNPHONG
//#define _LAMBERT
//#define _NOSHADING
#define NSPHERES 4
#define VIEWPLANE 500
#define WINDOW VIEWPLANE*2
#define FOCALDIST 1000
#define RADIUS 200
//#define _ANIMATE
// typedef enum {false=0, true=1} BOOL;
// typedef enum {down=0, up=1} DIRECTION;
#define CRED 0
#define CGREEN 1
#define CBLUE 2
#define true 1
#define false 0
/* --------------- VECTORS -------------------- */
typedef struct Vector3D{
cl_double x;
cl_double y;
cl_double z;
cl_double dummy1;
} VECTOR3D;
typedef struct ray {
VECTOR3D origin;
VECTOR3D direction;
} RAY;
/* ------------------- PIXEL ------------------ */
typedef struct pixel {
RAY ray;
cl_double rgb[4];
cl_int i;
cl_int j;
cl_int dummy1;
cl_int dummy2;
cl_int dummy3;
cl_int dummy4;
cl_int dummy5;
cl_int dummy6;
} PIXEL;
/* ----------------- VIEWPORT ----------------- */
typedef struct vp {
cl_int xvmin;
cl_int yvmin;
cl_int xvmax;
cl_int yvmax;
PIXEL* pPixels;
} VIEWPORT;
/* ---------------- SPHERE -------------------- */
typedef struct sp_intersection {
cl_double lambda_in;
cl_double lambda_out;
VECTOR3D normal;
VECTOR3D point;
cl_int valid;
} SPHERE_INTERSECTION;
typedef struct sph {
VECTOR3D center;
cl_double radius;
cl_double kd_rgb[3];
cl_double ks_rgb[3];
cl_double ka_rgb[3];
cl_double shininess;
cl_int mirror;
}SPHERE;
/* ------------------- RAY --------------------- */
/* --------------- VECTOR BASIS ---------------- */
typedef struct vb {
VECTOR3D u;
VECTOR3D v;
VECTOR3D n;
} VEC_BASIS;
#endif
In kernel:
typedef struct pixel {
RAY ray;
double rgb[4];
int i;
int j;
int dummy1;
int dummy2;
int dummy3;
int dummy4;
int dummy5;
int dummy6;
} PIXEL;
And kernel:
typedef struct Vector3D {
double x;
double y;
double z;
double dummy1;
} VECTOR3D;
typedef struct ray {
VECTOR3D origin;
VECTOR3D direction;
} RAY;
typedef struct pixel {
RAY ray;
double rgb[4];
int i;
int j;
int dummy1;
int dummy2;
int dummy3;
int dummy4;
int dummy5;
int dummy6;
} PIXEL;
void vec_sub(VECTOR3D *v1, VECTOR3D *v2, VECTOR3D *v3) {
v1->x = v2->x - v3->x;
v1->y = v2->y - v3->y;
v1->z = v2->z - v3->z;
}
void vec_add(VECTOR3D *v1, VECTOR3D *v2, VECTOR3D *v3) {
v1->x = v2->x + v3->x;
v1->y = v2->y + v3->y;
v1->z = v2->z + v3->z;
}
void vec_scale(double scale, VECTOR3D *v1, VECTOR3D *v2) {
v1->x = scale * v2->x;
v1->y = scale * v2->y;
v1->z = scale * v2->z;
}
double dotproduct(VECTOR3D *v1, VECTOR3D *v2) {
return v1->x * v2->x + v1->y * v2->y + v1->z * v2->z;
}
void normalize_vector(VECTOR3D *v) {
double magnitude;
// 1. calculate the magnitude (length):
magnitude = sqrt( dotproduct(v, v) );
// 2. normalize the vector:
v->x = v->x / magnitude;
v->y = v->y / magnitude;
v->z = v->z / magnitude;
}
__kernel void compute_ray(
write_only global PIXEL *output,
VECTOR3D view_point,
VECTOR3D camera_frame_n,
VECTOR3D camera_frame_u,
VECTOR3D camera_frame_v,
const int viewport_xvmin,
const int viewport_yvmin,
const double distance,
const int w, const int h
)
{
float u, v;
VECTOR3D v1, v2, v3, v4, dir;
RAY ray;
PIXEL pixel;
int gi = get_global_id(0);
int i = gi / w;
int j = gi % w;
u = (float)(viewport_xvmin) + (float)(i) + 0.5f;
v = (float)(viewport_yvmin) + (float)(j) + 0.5f;
vec_scale(-distance, &v1, &camera_frame_n);
vec_scale(u, &v2, &camera_frame_u);
vec_scale(v, &v3, &camera_frame_v);
ray.origin.x = 22;
ray.origin.y = 22;
ray.origin.z = 22;
vec_add(&v4, &v1, &v2);
vec_add(&dir, &v4, &v3);
normalize_vector(&dir);
ray.direction.x = 11;
ray.direction.y = 11;
ray.direction.z = 11;
pixel.ray = ray;
pixel.i = 33;
pixel.j = 33;
output[i*w*j] = pixel;
}
I intentionally set i, j, origin and direction structures to fixed number so I could see if numbers are set.
Then I try to get pixel set on GPU in line
pCurrentPixel = &pPixels[i*VPWIDTH+j];
but pCurrentPixel->i for example is 0 instead of 33.
Code compiles with following commands:
gcc -c main.c -o main.o
gcc -c shading.c -o shading.o
gcc -c sphere.c -o sphere.o
gcc -c ray.c -o ray.o
gcc -c vectors.c -o vectors.o
gcc -I/usr/include -L/usr/lib/x86_64-linux-gnu main.o shading.o sphere.o ray.o vectors.o -lGL -lglut -lGLU -lX11 -lm -lrt -lOpenCL -o main
Besides setting the right VPWIDTH and VPHEIGHT output[i*w*j] = pixel; had to be changed to output[i*w+j] = pixel; in C code.

clEnqueueNDRangeKernel throws CL_OUT_OF_RESOURCES

I have a kernel running very well on Intel HD graphics card. But, when I want to run the kernel on my GeForce 960 it gives the CL_OUT_OF_RESOURCES error.
I have tried for different local sizes and made sure to not go beyond the array indices, but still have no clue why this error is happening. Do you know why my code runs fine on Intel and doesn't work on NVIDIA?
One weird thing that is happening in my code is that I have a 13 itrations of similar operations. For performance purposes, I have repeated the same operations for 13 times and avoided writing a loop just to save some additional operations that loops have. The code works on NVIDIA when I reach to the 11th operation. But, when I include the 12th operation in the code it gives the above error and the 11th and 12th operations are similar! Any ideas why such thing is happening?
Here is the kernel:
float2 projectCube(float3 axis, float3 vertex){
float voxelSize = 0.5f;
float2 projection = (float2)(0.0f, 0.0f);
float temp;
//1
temp = axis.x;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//2
temp = axis.x + axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//3
temp = axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//4
temp = axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//5
temp = axis.x + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//6
temp = axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//7
temp = axis.x + axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
float product = dot(axis, vertex);
projection.x = voxelSize * projection.x + product;
projection.y = voxelSize * projection.y + product;
return projection;
}
float2 projectTriangle(float3 axis, float3 v0, float3 v1, float3 v2){
float2 projection;
projection.x = dot(axis, v0);
projection.y = projection.x;
float temp = dot(axis, v1);
if(projection.x > temp){
projection.x = temp;
}
else if(projection.y < temp){
projection.y = temp;
}
temp = dot(axis, v2);
if (projection.x > temp){
projection.x = temp;
}
else if (projection.y < temp){
projection.y = temp;
}
return projection;
}
float tester(float3 axis, float3 voxel, float3 v0, float3 v1, float3 v2){
float2 voxelProjection = projectCube(axis, voxel);
float2 faceProjection = projectTriangle(axis, v0, v1, v2);
float minProjection = fmin(voxelProjection.x, faceProjection.x);
float maxProjection = fmax(voxelProjection.y, faceProjection.y);
float testResult = maxProjection - minProjection - voxelProjection.y + voxelProjection.x
- faceProjection.y + faceProjection.x;
return testResult;
}
__kernel void voxelizer(size_t global_size,
float h_voxelSize,
__global float* h_minBoundsGrid,
__global int *h_dimGrid,
__global float* coords,
__global int* density)
{
//printf("local size is: %d\n", get_num_groups(0));
int i = get_global_id(0) * 9;
if (i <= global_size * 9){
float voxelSize = h_voxelSize;
float3 minBoundsGrid;
minBoundsGrid.x = h_minBoundsGrid[0];
minBoundsGrid.y = h_minBoundsGrid[1];
minBoundsGrid.z = h_minBoundsGrid[2];
int3 dimGrid;
dimGrid.x = h_dimGrid[0];
dimGrid.y = h_dimGrid[1];
dimGrid.z = h_dimGrid[2];
if ( i %9 == 0){
/*Triangle vertices*/
float3 v0;
v0 = (float3)(coords[i], coords[i + 1], coords[i + 2]);
float3 v1;
v1 = (float3)(coords[i + 3], coords[i + 4], coords[i + 5]);
float3 v2;
v2 = (float3)(coords[i + 6], coords[i + 7], coords[i + 8]);
//printf("i = %d. v0: %f, %f, %f\n", i, v0.x, v0.y, v0.z);
//printf("i = %d. v1: %f, %f, %f\n", i, v1.x, v1.y, v1.z);
//printf("i = %d. v2: %f, %f, %f\n", i, v2.x, v2.y, v2.z);
/*Normal vectors of the each voxel*/
float3 e0;
e0 = (float3)(0.5f, 0.0f, 0.0f);
float3 e1;
e1 = (float3)(0.0f, 0.5f, 0.0f);
float3 e2;
e2 = (float3)(0.0f, 0.0f, 0.5f);
/*Edges of a traingle*/
float3 f0;
f0 = v1 - v0;
float3 f1;
f1 = v2 - v1;
float3 f2;
f2 = v0 - v2;
float3 minLocalGrid;
minLocalGrid.x = fmin(v0.x, fmin(v1.x, v2.x));
minLocalGrid.y = fmin(v0.y, fmin(v1.y, v2.y));
minLocalGrid.z = fmin(v0.z, fmin(v1.z, v2.z));
minLocalGrid.x = voxelSize * floor(minLocalGrid.x / voxelSize);
minLocalGrid.y = voxelSize * floor(minLocalGrid.y / voxelSize);
minLocalGrid.z = voxelSize * floor(minLocalGrid.z / voxelSize);
//printf("i = %d. minLocalGrid = %f, %f, %f.\n", i, minLocalGrid.x, minLocalGrid.y, minLocalGrid.z);
float3 maxLocalGrid;
maxLocalGrid.x = fmax(v0.x, fmax(v1.x, v2.x));
maxLocalGrid.y = fmax(v0.y, fmax(v1.y, v2.y));
maxLocalGrid.z = fmax(v0.z, fmax(v1.z, v2.z));
maxLocalGrid.x = voxelSize * ceil(maxLocalGrid.x / voxelSize);
maxLocalGrid.y = voxelSize * ceil(maxLocalGrid.y / voxelSize);
maxLocalGrid.z = voxelSize * ceil(maxLocalGrid.z / voxelSize);
if (maxLocalGrid.x == minLocalGrid.x){ maxLocalGrid.x += voxelSize; }
if (maxLocalGrid.y == minLocalGrid.y){ maxLocalGrid.y += voxelSize; }
if (maxLocalGrid.z == minLocalGrid.z){ maxLocalGrid.z += voxelSize; }
//printf("i = %d. maxLocalGrid = %f, %f, %f.\n", i, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("i = %d\n v0 = %f, %f, %f\n v1 = %f, %f, %f\n v2 = %f, %f, %f\n minLocalGrid = %f, %f, %f\n===============\n",
// i, v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
float j = minLocalGrid.z;
while(j < maxLocalGrid.z){
float k = minLocalGrid.y;
while(k < maxLocalGrid.y){
float l = minLocalGrid.x;
while (l < maxLocalGrid.x){
float3 firstVertexOfVoxel = (float3)(l, k, j);
//printf("l,k,j: %f, %f, %f\n", l, k, j);
float3 globalCoordOffset = (firstVertexOfVoxel - minBoundsGrid) / voxelSize;
int3 globalDimOffset = convert_int3_rtz(globalCoordOffset);
//printf("i = %d. globalCoordOffset: %f, %f, %f\n", i, globalCoordOffset.x, globalCoordOffset.y, globalCoordOffset.z);
//printf("i = %d. globalDimOffset: %d, %d, %d\n", i, globalDimOffset.x, globalDimOffset.y, globalDimOffset.z);
int voxelIndexGlobalGrid = globalDimOffset.x + dimGrid.x * (globalDimOffset.y +
dimGrid.y * globalDimOffset.z);
//printf("i = %d. voxelIndexGlobalGrid = %d\n", i, voxelIndexGlobalGrid);
if (density[voxelIndexGlobalGrid] != 1){
/*The famous 13-axes test*/
float3 axis;
float testResult = 0;
int overlapCount = 0;
//1
testResult = tester(e0, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//2
testResult = tester(e1, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//3
testResult = tester(e2, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//4
//axis = ;
testResult = tester(cross(-f2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//5
/*axis = cross(e0, f0);*/
testResult = tester(cross(e0, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//6
//axis = cross(e0, f0);
testResult = tester(cross(e0, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//7
//axis = cross(e0, f0);
testResult = tester(cross(e0, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//8
//axis = cross(e1, f0);
testResult = tester(cross(e1, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//9
//axis = cross(e1, f1);
testResult = tester(cross(e1, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//10
//axis = cross(e1, f2);
testResult = tester(cross(e1, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//11
//axis = cross(e2, f0);
testResult = tester(cross(e2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//12
//axis = cross(e2, f1);
testResult = tester(cross(e2, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//13
//axis = cross(e2, f2);
testResult = tester(cross(e2, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
if (overlapCount == 13){
density[voxelIndexGlobalGrid] = 1;
}
}
l = l + voxelSize;
}// while for l
k = k + voxelSize;
}// while for k
j = j + voxelSize;
}//while for j
//printf("Here are the max of the %d-th face: %f, %f, %f\n", i / 9, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e1.x, e1.y, e1.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e2.x, e2.y, e2.z);
//printf("\n==================KERNEL COMPUTED==================\n");
//barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
And this is the c-code:
#define DEVICE_SELECTOR 1 //0 for Intel and 1 for Nvidia in my computer
#define _CRT_SECURE_NO_WARNINGS
#define KERNEL_FILE "..\\voxelizerKernel.cl"
#define WORK_DIM 1
#define VOXEL_SIZE 0.5f
#define HALF_VOXEL_SIZE VOXEL_SIZE/2.0f;
//C header files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <algorithm>
//OpenCL header files
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
printf("Error code: %d. Couldn't identify a platform\n", err);
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[DEVICE_SELECTOR], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't access any devices\n", err);
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
printf("Couldn't find the program file\n");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the program\n", err);
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void print_device_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main()
{
/*OpenCL variables*/
cl_int i, j, err, num_groups;
size_t local_size, max_local_size, global_size, processed_global_size;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_device_id device;
cl_kernel voxelization_kernel, reduction_kernel, reduction_complete_kernel;
cl_mem coords_buffer, density_buffer, dimGrid_buffer, h_minBoundsGrid_buffer, fullVxelsCount_buffer, group_sums_buffer;
void *density_mapped_memory;
cl_event prof_event;
cl_ulong time_start, time_end, total_time;
float h_voxelSize = VOXEL_SIZE;
float fullVxelsCount = 0;
/*Read mesh data*/
float coords[54] =
{ 0.300500,
1.300000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
0.600000,
0.000500,
0.300500,
1.300000,
0.000500,
0.500500,
1.900000,
0.000500,
1.200500,
1.600000,
0.000500,
0.300500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500,
0.100500,
0.700000,
0.000500,
0.100500,
0.700000,
0.000500,
1.600500,
0.600000,
0.000500,
0.000500,
0.200000,
0.000500,
0.000500,
0.200000,
0.000500,
1.600500,
0.600000,
0.000500,
1.600500,
0.100000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500 };
/*Get the voxel count*/
float boundsGrid[6] = {0,2,0,2,0,0.5};
int dimGrid[3] = {
(boundsGrid[1] - boundsGrid[0]) / VOXEL_SIZE,
(boundsGrid[3] - boundsGrid[2]) / VOXEL_SIZE,
(boundsGrid[5] - boundsGrid[4]) / VOXEL_SIZE
};
if (dimGrid[0] == 0) dimGrid[0] = 1;
if (dimGrid[1] == 0) dimGrid[1] = 1;
if (dimGrid[2] == 0) dimGrid[2] = 1;
float h_minBoundsGrid[3];
h_minBoundsGrid[0] = boundsGrid[0];
h_minBoundsGrid[1] = boundsGrid[2];
h_minBoundsGrid[2] = boundsGrid[4];
int voxelCounts = dimGrid[0] * dimGrid[1] * dimGrid[2];
/*Prepare kernel output : build an array for storing voxles' density info*/
int *density = (int*)malloc(sizeof(int)*voxelCounts);
for (int i = 0; i < voxelCounts; i++){
density[i] = 0;
}
/*OpenCL essentials*/
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
//print_device_info(device);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a context\n", err);
exit(1);
}
program = build_program(context, device, KERNEL_FILE);
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a command queue\n", err);
exit(1);
};
voxelization_kernel = clCreateKernel(program, "voxelizer", &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a kernel\n", err);
exit(1);
};
int numberOfFaces = 6;
global_size = numberOfFaces;
local_size = max_local_size;
if (global_size % local_size != 0){
processed_global_size = (global_size / local_size + 1) * local_size;
//int padding = processed_global_size - global_size;
//int *working_data = (int*)malloc((voxelCounts + padding)*sizeof(int));
//memcpy(working_data, density, voxelCounts);
//memset(working_data + voxelCounts, 0.0, padding);
}
else{
processed_global_size = global_size;
}
/* Create host-device data exchange interface*/
dimGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, dimGrid, &err);
h_minBoundsGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, h_minBoundsGrid, &err);
coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float) * 54, coords, &err);
density_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * voxelCounts, density, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a buffer\n", err);
exit(1);
};
err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);
err |= clSetKernelArg(voxelization_kernel, 1, sizeof(h_voxelSize), &h_voxelSize);
err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &h_minBoundsGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dimGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
if (err < 0) {
printf("Error code: %d. Couldn't create an argument for voxelization_kernel\n", err);
exit(1);
}
/* Do the voxelization magic */
err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &processed_global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
printf("Error code: %d. Couldn't enqueue the voxelization_kernel\n", err);
exit(1);
}
/* Read the results */
density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(density), 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(density, density_mapped_memory, sizeof(density)* voxelCounts);
err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
exit(1);
}
for (int i = 0; i < voxelCounts; i++){
printf("%d\n", density[i]);
}
/*Clean up*/
clReleaseKernel(voxelization_kernel);
clReleaseMemObject(dimGrid_buffer);
clReleaseMemObject(h_minBoundsGrid_buffer);
clReleaseMemObject(coords_buffer);
clReleaseMemObject(density_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}

Using clEnqueueNDRangeKernel in OpenCL

I need help with one function in OpenCL. When I'm starting using clEnqueueNDRangeKernel instead of clEnqueueTask it takes much more time for program to succeed. Why so? As I understand, the program should use data parallel model and it will work faster, am I wrong? And if I am, how I can change code to see the actual work of data parallel model?
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
for (int i = 0, j = 0; i < InSize; i+=4, j++)
{
unsigned char Value = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
pDataOut[j] = Value;
}
}
int iWidth, iHeight, iBpp;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
int err = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
if (err != 0 || pDataIn.size() == 0 || iBpp != 32)
{
std::cout << "error load input file!\n";
}
pDataOut.resize(pDataIn.size()/4);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
unsigned int SizeIn, SizeOut;
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
FILE *fp;
char fileName[] = "./kernel.cl";
char *source_str;
size_t source_size;
//Loading kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
system("PAUSE");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
//Getting Platform and Device
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//Create context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
//create kernel program
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
//build it
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//create queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//create bufer
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, pDataIn.size(), NULL, &ret);
memobj1 = clCreateBuffer(context, CL_MEM_READ_WRITE,pDataOut.size(), NULL, &ret);
//copy buffer to kernel
ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, pDataIn.size(), pDataIn.data(), 0, NULL, NULL);
//create opencl kernel
kernel = clCreateKernel(program, "red_to_green", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
ret = clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&SizeIn);
ret = clSetKernelArg(kernel, 3, sizeof(unsigned int), (void *)&SizeOut);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = 8;
size_t local_item_size = 4;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
//copy from buffer
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
ret = clFinish(command_queue);
float end_time = clock();
search_time = end_time - start_time;
//float last_time = last_time + search_time;
cout << search_time << endl;
}
clock_t t1 = clock();
double time_seconds = (t1-t0)*CLOCKS_PER_SEC/cycles_max;
cout << time_seconds/1000 <<endl;
WriteBmpFile(L"3840x2160_wb.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);
system("PAUSE");
from the docs page:
The kernel is executed using a single work-item.
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with
work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1,
and local_work_size[0] set to 1.
When you use clEnqueueNDRangeKernel, you are using 2 work groups of 4 work items, but they are all doing the same work. They all read from the same global memory, but more importantly, they all try to write to the same locations in global memory.
You need to take into account the worker's global id when doing your computations.
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
int gid = get_global_id(0);
int gsize = get_global_size(0);
for (int j = gid; j < (InSize >> 2); j+= gsize)
{
unsigned char Value = (pDataIn[j*4] + pDataIn[j*4 + 1] + pDataIn[j*4 + 2]) / 3;
pDataOut[j] = Value;
}
}
It looks like you are iterating over all pixels of an input image in your kernel. This will cause all threads to calculate the image intensity for all pixels. Try to launch a single thread for each pixel instead. To do so, change your kernel source code to only calculate the output value for one pixel:
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut) {
int j = get_global_id(0);
int i = j*4;
pDataOut[i] = (pDataIn[j] + pDataIn[j + 1] + pDataIn[j + 2]) / 3;
}
This code will now perform the averaging over the RGB values of your RGBA input image for the single pixel at location i. Now all you need to do is launch as many threads as your image has pixels. Relevant changes:
//create opencl kernel
kernel = clCreateKernel(program, "black_white_img", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = iWidth * iHeight;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);
This should give a considerable speedup comparing to your code.

OpenCL Error Computing Matrix Multiplication during Runtime

I have been debugging for the past few days and cannot get this OpenCL matrix multiplication kernel to run. Whenever I run the program, the output from the GPU results in large negative numbers similar to -198746573.0000. I was wondering if someone with HPC experience could point out an error in my code or if it is an error with the driver.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define widthA 2
#define heightA 2
#define widthB heightA
#define heightB 2
#define widthC widthA
#define heightC heightB
#ifdef __APPLE__
#include < OpenCL/opencl.h >
#else
#include <opencl.h>
#endif
#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
float * A = (float *)malloc(sizeof(float)*widthA*heightA);
float * B = (float *)malloc(sizeof(float)*widthB*heightB);
float * C = (float *)malloc(sizeof(float)*widthC*heightC);
float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
float * D= (float *)malloc(sizeof(float)*widthC*heightC);
float ref[widthC][heightC];
int i, j, k;
FILE * fp1 = fopen("matAdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightA; j++) {
float p=(rand()%100)/7.0;
//*(A+i*heightA+j)=rand()%100 + p;
*(A+i*heightA+j)=4.0;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
fp1 = fopen("matBdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthB; i++)
{
for(j=0; j < heightB; j++) {
float p=(rand()%100)/7.0;
//*((B+i*heightB+j))=rand()%100 + p;
*((B+i*heightB+j))=4.0;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobjA = NULL;
cl_mem memobjB = NULL;
cl_mem memobjC = NULL;
cl_mem rowA = NULL;
cl_mem colC = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id[10];
cl_platform_id platform = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event GPUDone[0];
//char string[MEM_SIZE];
FILE *fp;
char fileName[] = "matrixMultiplication.cl";
char *source_str;
size_t source_size;
int row = widthA;
int col = heightC;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
/* Get Platform and Device Info */
ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms);
char cBuffer[1024];
cl_uint c;
for(c = 0; c < ret_num_platforms; c++)
{
clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
if (strstr(cBuffer, "NVIDIA") != NULL)
{
platform = platform_id[c];
break;
}
}
printf("Found Platform %s\n", cBuffer);
ret = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Found %d devices.\n", ret_num_devices);
/* Create OpenCL context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "matrixMultiplication", &ret);
/* Set OpenCL Kernel Arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
/* Execute OpenCL Kernel */
//ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
size_t globalThreads[2] = {widthA, heightB};
size_t localThreads[2] = {16,16};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
//clFlush(command_queue);
//clFinish(command_queue);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]);
printf("Buffer Read ended with %d.\n", ret);
clWaitForEvents(1, GPUDone);
fp1 = fopen("matGPURes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(Res+i*heightC+j));
ref[i][j] = *(Res+i*heightC+j);
printf("GPU Output: %f\n", *(Res+i*heightC+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobjA);
ret = clReleaseMemObject(memobjB);
ret = clReleaseMemObject(memobjC);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
ret = clReleaseEvent(GPUDone[0]);
free(source_str);
float sum=0.0;
for(i = 0;i < widthA; i++)
{
for(j = 0; j < heightC; j++)
{
sum = 0;
for(k = 0; k < widthB; k++)
{
sum += A[i*col+k] * B[k*row+j];
printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]);
}
D[i*heightC+j] = sum;
}
}
fp1 = fopen("matNormalMultiplicationRes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n");
exit(1);
}
for(i = 0; i<widthA; i++)
{
for(j = 0; j<heightA; j++)
{
if (ref[i][j] != D[i*heightA+j])
{
printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]);
}
}
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(D+i*heightC+j));
}
fprintf(fp1, "\n");
}
free(A);
free(B);
free(C);
free(D);
free(Res);
return 0;
}
Here is the kernel
#define BLOCK_SIZE 16
__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB )
{
//int i = get_global_id(0);
//int j = get_global_id(1);
float Csub = 0.0f;
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
for (int a = aBegin, b=bBegin;
a <= aEnd;
a += aStep, b+=bStep)
{
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
for( int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
/*
float value=0;
for ( int k = 0; k < widthA; k++)
{
value = value + A[k + j * widthA] * B[k*widthB + i];
}
C[i + widthA * j] = value;
*/
}
I have double checked over and over again but simply cannot find any errors. I want to make sure its not a code error before I conclude its a driver issue.
Thanks!
Do you really need a complex kernel like that ? if you really want to do simple matrix multiplication
you can write a simple kernel like this, which is easy to debug.
__kernel void matrixMultiplication (__global float* A,
__global float* B,
__global float* C,
int widthA, int widthB )
{
//y direction
int row = get_global_id(1);
int col = get_global_id(0);
float cSum = 0.0f;
//calculate the result
for (int i=0; i<widthA; i++)
{
cSum += A[row*widthA+ i] * B[i*widthB+col];
}
C[row*widthB+col] = cSum;
}
Case is probably closed already, but for the sake of google-comers:
Shouldnt shared memory be explicitly declared on host and passed as kernel argument to the source? __local keyword is not the one you are looking for in this case.
See post on How to declare local memory in OpenCL? for the detailed explanation.
Check the functionality of your host. Here a few things to get you started ...
1) You don't need to create a buffer and enqueue it for a scalar constant Int like row and col. Just set it as a kernel arg.
2) Wait for the clEnqueueNDRangeKernel with an event. You want to be sure the calc has completed.
3) Add a printf statement in the kernel to print selected values to see that the input and output values are what you expect.
try
if ( get_local_id(0) % 8 == 0)
{
printf some useful value of a,b,c
}
3) Try the host code with a dumb kernel that copies an input array to an output array. That will confirm it you have the handling of buffer creation and the enqeue read/write code correct!

Resources