OpenCL gemm kernel local memory going slower

OpenCL gemm kernel local memory going slower - opencl

EDIT: It was my cards fault... The local memory kernel goes a few times faster, sorry all!!
I am writing a simple sgemm (square, alpha=1, beta=0) that is supposed to take advantage of local memory, but it performs at half the speed of a naive version.
Here are the kernels:
const char* matrixMultiplySource =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int i = get_local_id(0);\n"
" int j = get_local_id(1);\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n"
" __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n"
" float val=0.0f;\n"
" for ( int index = 0; index < sizeG0; index += BLOCK_SIZE )\n"
" {\n"
" localA[j][i] = A[ig + sizeG0 * (index+j)];\n"
" localB[j][i] = B[index+i + sizeG0 * jg];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" #pragma unroll\n"
" for ( int kk = 0; kk < BLOCK_SIZE; ++kk)\n"
" {\n"
" val = val + localA[kk][i] * localB[j][kk];\n"
" }\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
const char* matrixMultiplySource2 =
"__kernel\n"
" void matrixMultiply(__global float* A, __global float* B, __global float* C)\n"
" {\n"
" int ig = get_global_id(0);\n"
" int jg = get_global_id(1);\n"
" int sizeG0 = get_global_size(0);\n"
" float val=0;\n"
" for ( int k = 0; k < sizeG0; k++)\n"
" {\n"
" val = val + A[ig + k * sizeG0] * B[k + jg * sizeG0];\n"
" }\n"
" C[ig + sizeG0 * jg] = val;\n"
"}\n"
;
BLOCK_SIZE is 16 and I am using 1024x1024 matrices as well as warming up.
// Create OpenCL context
context = mycl::myclCreateContext( NULL, ret_num_devices, devices, NULL, NULL, &ret);
// Create Command Queue
command_queue = mycl::myclCreateCommandQueue(context, devices[0], 0, &ret);
// Create Memory Buffer
memobjA = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = mycl::myclCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = mycl::myclCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = mycl::myclEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = mycl::myclEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
// Create Kernel Program from the source
program = mycl::myclCreateProgramWithSource(context, 1, (const char **)&matrixMultiplySource,
NULL, &ret);
// Build Kernel Program
ret = mycl::myclBuildProgram(program, ret_num_devices, devices, "-D BLOCK_SIZE=16", NULL, NULL);
if(ret != CL_SUCCESS){cout << "PROBREM! " << ret << endl;return -1;}
// Create OpenCL Kernel
kernel = mycl::myclCreateKernel(program, "matrixMultiply", &ret);
size_t globalThreads[2] = {heightA, widthB};
size_t localThreads[2] = {BLOCK_SIZE, BLOCK_SIZE};
// Set OpenCL Kernel Arguments
ret = mycl::myclSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = mycl::myclSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = mycl::myclSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
// Time the kernel
struct timeval timev1, timev2;
float time_seconds = 0.0f;
mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
mycl::myclFinish(command_queue);
gettimeofday(&timev1, NULL);
ret = mycl::myclEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, 0, NULL);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
ret = mycl::myclFinish(command_queue);
if(ret != CL_SUCCESS){cout << "fail! " << ret << endl;}
gettimeofday(&timev2,NULL);
time_seconds=(timev2.tv_sec-timev1.tv_sec)+0.000001*(timev2.tv_usec- timev1.tv_usec);

Have you looked at the two kernels in the AMD APP KernelAnalyzer or equivalent tools? These tools compile the Kernels and show their predicted performance characteristics

You use
barrier(CLK_GLOBAL_MEM_FENCE);
where I would expect to see
barrier(CLK_LOCAL_MEM_FENCE);
as you write in the loop to local memory.
Further I doubt that the copy to localA does help you -- at one time every items there is only accessed once.

Related

How to make an OpenCL program run for large data set?

I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on NVIDIA GPU.
Here is the code :
OpenCL file is :
#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h"
int main()
{
cl_uint ret_num_platforms;
cl_uint ret_num_devices;
cl_platform_id platform_id = NULL;
cl_kernel kernel2 = NULL;
cl_program program2 = NULL;
cl_command_queue command_queue = NULL;
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_int ret;
FILE * fp2;
char fileName2[]="./kernel.cl";
int for_var=0;
char * source_str2;
size_t source_size2;
size_t globalWorkSize[1];
size_t localWorkSize[1];
cl_mem cl_buffer3;
cl_mem cl_buffer2;
cl_mem cl_buffer1;
cl_mem cl_buffer0;
int *A;
int *B;
int *C;
int *n;
int i;
n = ((int *)(malloc((sizeof(int )))));
printf("Enter the number of elements of vector : \n");
scanf("%d",n);
A = ((int *)(malloc((( *n) * sizeof(int )))));
B = ((int *)(malloc((( *n) * sizeof(int )))));
C = ((int *)(malloc((( *n) * sizeof(int )))));
printf("\nInput Vector1 :\n");
for (i = 0; i <= *n - 1; i += 1) {
A[i] = (2 * i);
printf("%d ",A[i]);
}
printf("\n\nInput Vector2 :\n");
for (i = 0; i <= *n - 1; i += 1) {
B[i] = (3 * i);
printf("%d ",B[i]);
}
ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
if (ret != CL_SUCCESS) {
printf("Platform error");
}
ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
if (ret != CL_SUCCESS)
printf("device err");
context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
if (!context)
printf("context err");
command_queue = clCreateCommandQueue(context,device_id,0,&ret);
if (!command_queue)
printf("command queue error");
localWorkSize[0] = 16;
globalWorkSize[0] =16400;
cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &ret);
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
fp2 = fopen(fileName2,"r");
if (!fp2) {
fprintf(stderr,"Failed");
exit(1);
}
source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
fclose(fp2);
program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
if(!program2)
printf("error creating program2");
ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
if (ret)
printf("error building program2");
kernel2 = clCreateKernel(program2, "ADD" , &ret);
ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
printf("\n\nAddition of vectors :\n");
for (i = 0; i <= *n - 1; i += 1) {
printf("%d ",C[i]);
}
clReleaseMemObject(cl_buffer0);
clReleaseMemObject(cl_buffer1);
clReleaseMemObject(cl_buffer2);
clReleaseMemObject(cl_buffer3);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
Kernel file is(kernel.cl) :
__kernel void ADD(__constant int *A,__constant int *B,__global int *C,__constant int *n)
{
int i = get_global_id(0);
if (i <= *n - 1) {
C[i] = (A[i] + B[i]);
}
}
The program works fine if I give 16384 as total vector elements but it gives 0 as output for values more than that. I want to run this program with large data set so that I can compare its performance with the one running on CPU.
Please guide me how can I do so?

There's at least one bug in your code - you're copying MEM_SIZE * sizeof(int) bytes from n to buffer 3:
ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
however, n is only sizeof(int) bytes long:
n = ((int *)(malloc((sizeof(int )))));
I don't know what problems this might be causing, and it's entirely possible there are other, more severe bugs, but this one certainly isn't helping.

OpenCL generate SHA-256 hash

I need help with OpenCL.
The task is as follows:
There is an input parameter of type string. It is necessary to generate a SHA-256 hash using the resources of the video card.
It is necessary to create a cycle to select a hash. Each time add some postfix to the original string.
Result*Hash should start with 5 zeros "00000 ...".
For example, the entrance. parameter: "strela".
SHA-256: "7d7ceecdee08ea1c0ac46b27657a79395af36526b3214b59a92f8351ccf8f762"
Next, you need to add a postfix. For example, "strela1"
Here the hash will be: a2afd15651f44f19f3e4e216bf3ead22d5f5937e9f9dc250382ff1f764ba219f
then continue to add the postfix until the resulting hash begins to start with "00000.."
It is necessary to use all the cores of the video card, i.e. use parallelization. Each core will use its postfix.
As soon as some kernel computes the hash we need, interrupt all calculations on the cores and display the hash we need.
Source:
main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
#include <stdio.h>
#include < string.h >
void crypt_and_print(char input[])
{
char result[65];
char diff[65] = "00000";
char *istr;
char buffer2[20];
int temp;
char str2[20];
for (int i = 0; i < 1; i++)
{
char string[] = "1qqq";
sprintf(buffer2, "%d", i);
temp = 8 - strlen(buffer2);
str2[0] = '\0';
while (strlen(str2) != temp)
strcat(str2, "0");
strcat(str2, buffer2);
strcat(string, str2);
sha256_crypt(string, result);
istr = strstr(result, diff);
if (istr != NULL) {
printf(istr);
break;
}
}
}
int main()
{
char result[65];
sha256_init(2048);
crypt_and_print((char*)"");
}
sha256.c
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size=3;
static size_t local_work_size=1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void crypt_all();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
void sha256_crypt(char input[], char* output)
{
int i;
string_len = strlen(input);
global_work_size = 3;
datai[0] = SHA256_PLAINTEXT_LENGTH;
datai[1] = global_work_size;
datai[2] = string_len;
memcpy(saved_plain, input, string_len+1);
crypt_all();
for(i=0; i<SHA256_RESULT_SIZE; i++)
{
sprintf(output+i*8,"%08x", partial_hashes[i]);
}
printf("'%s':\n%s\n", input, output);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE *fp;
fp = fopen("/sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
}
void create_clobj(){
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
}
sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
//printf(get_global_id(0));
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0] = H0;
digest[1] = H1;
digest[2] = H2;
digest[3] = H3;
digest[4] = H4;
digest[5] = H5;
digest[6] = H6;
digest[7] = H7;
for(item=0; item<total; item++)
{
A = digest[0];
B = digest[1];
C = digest[2];
D = digest[3];
E = digest[4];
F = digest[5];
G = digest[6];
H = digest[7];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop+get_global_id(0) ; t++){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3];
// printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
// printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
digest[5] += F;
digest[6] += G;
digest[7] += H;
}
printf("hi");
}
How can i use here paralelism (all GPU cores) to calculate needed hash code?
Is it real to do task like this using OPENCL ?

OpenCL uses sub-buffer's parent for constant parameter

I have an OpenCL (1.2) kernel that takes a constant argument, which is a sub-buffer. When I run this kernel, it seems like the parent buffer is used instead. If I use a global const argument, it works as expected.
I would put this down to a driver bug, except I can reproduce it on both Intel (Linux, beignet git) and nVidia (Linux, 367.44-3) implementations on different machines, which makes me think I've made a mistake somewhere.
Below is a working example. The expected output is 1, 1025, 1, 1025,, but instead 1, 1, 1, 1025, is printed.
#include <CL/cl.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#define NELEMS(x) (sizeof(x) / sizeof(*x))
#define PLATFORM 0
#define DEVICE 0
const char src[] =
"kernel void test1(constant int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n"
"\n"
"kernel void test2(global const int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n";
const size_t src_len = sizeof(src);
const char * const kernels[] = {"test1", "test2"};
int main(void) {
cl_int err = -1;
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
assert(num_platforms > PLATFORM);
cl_platform_id * platforms = malloc(sizeof(*platforms) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_uint num_devices;
clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
assert(num_devices >= DEVICE);
cl_device_id * devices = malloc(sizeof(*devices) * num_devices);
clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_context_properties context_properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[PLATFORM], 0
};
cl_context context = clCreateContext(context_properties, 1, &devices[DEVICE], NULL, NULL, &err);
assert(err == CL_SUCCESS);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
cl_command_queue queue = clCreateCommandQueue(context, devices[DEVICE], 0, &err);
#pragma GCC diagnostic pop
assert(err == CL_SUCCESS);
cl_program program;
{
// Crashes if directly using src[]
char * source = malloc(src_len);
memcpy(source, src, src_len);
program = clCreateProgramWithSource(context, 1, (const char **) &source, &src_len, &err);
assert(err == CL_SUCCESS);
free(source);
}
err = clBuildProgram(program, 1, &devices[DEVICE], "", NULL, NULL);
assert(err == CL_SUCCESS);
size_t buffer_size = 8192;
size_t subbuffer_size = buffer_size / 2;
{
cl_uint align;
err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align), &align, NULL);
assert(err == CL_SUCCESS);
assert(subbuffer_size % align == 0);
cl_ulong constbuf_size;
err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(constbuf_size), &constbuf_size, NULL);
assert(err == CL_SUCCESS);
assert(constbuf_size > subbuffer_size);
}
cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, &err);
assert(err == CL_SUCCESS);
cl_mem sub_buffers[2];
for (size_t i = 0; i < NELEMS(sub_buffers); i++){
cl_buffer_region region = {
.origin = i * subbuffer_size,
.size = subbuffer_size,
};
sub_buffers[i] = clCreateSubBuffer(buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
assert(err == CL_SUCCESS);
}
{
cl_int * data = clEnqueueMapBuffer(queue, buffer, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, 0, NULL, NULL, &err);
assert(err == CL_SUCCESS);
for (size_t i = 0; i < buffer_size / sizeof(cl_int); i++)
data[i] = i;
cl_event unmap_event;
err = clEnqueueUnmapMemObject(queue, buffer, data, 0, NULL, &unmap_event);
assert(err == CL_SUCCESS);
err = clWaitForEvents(1, &unmap_event);
assert(err == CL_SUCCESS);
}
for (size_t k = 0; k < NELEMS(kernels); k++) {
cl_kernel kernel = clCreateKernel(program, kernels[k], &err);
assert(err == CL_SUCCESS);
cl_event run_event;
for (size_t i = 0; i < NELEMS(sub_buffers); i++){
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &sub_buffers[i]);
assert(err == CL_SUCCESS);
size_t work_size[] = {subbuffer_size / sizeof(cl_int)};
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, &run_event);
assert(err == CL_SUCCESS);
err = clWaitForEvents(1, &run_event);
assert(err == CL_SUCCESS);
err = clFinish(queue);
assert(err == CL_SUCCESS);
}
clReleaseKernel(kernel);
}
puts("");
for (size_t i = 0; i < NELEMS(sub_buffers); i++)
clReleaseMemObject(sub_buffers[i]);
clReleaseMemObject(buffer);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(devices);
free(platforms);
return 0;
}

This is interesting. I try it on difference devices, in a MacBookPro there are 3 devices include Nvidia IRIS and Intel, all get correct output. In the windows 10 in this MBP with Nvidia driver, the output is exactly same wrong.
I think it's a Nvidia bug, but not limit to Nvidia.

Using clEnqueueNDRangeKernel in OpenCL

I need help with one function in OpenCL. When I'm starting using clEnqueueNDRangeKernel instead of clEnqueueTask it takes much more time for program to succeed. Why so? As I understand, the program should use data parallel model and it will work faster, am I wrong? And if I am, how I can change code to see the actual work of data parallel model?
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
for (int i = 0, j = 0; i < InSize; i+=4, j++)
{
unsigned char Value = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
pDataOut[j] = Value;
}
}
int iWidth, iHeight, iBpp;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
int err = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
if (err != 0 || pDataIn.size() == 0 || iBpp != 32)
{
std::cout << "error load input file!\n";
}
pDataOut.resize(pDataIn.size()/4);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
unsigned int SizeIn, SizeOut;
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
FILE *fp;
char fileName[] = "./kernel.cl";
char *source_str;
size_t source_size;
//Loading kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
system("PAUSE");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
//Getting Platform and Device
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
//Create context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
//create kernel program
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
//build it
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//create queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//create bufer
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, pDataIn.size(), NULL, &ret);
memobj1 = clCreateBuffer(context, CL_MEM_READ_WRITE,pDataOut.size(), NULL, &ret);
//copy buffer to kernel
ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, pDataIn.size(), pDataIn.data(), 0, NULL, NULL);
//create opencl kernel
kernel = clCreateKernel(program, "red_to_green", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
ret = clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&SizeIn);
ret = clSetKernelArg(kernel, 3, sizeof(unsigned int), (void *)&SizeOut);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = 8;
size_t local_item_size = 4;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
//copy from buffer
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
ret = clFinish(command_queue);
float end_time = clock();
search_time = end_time - start_time;
//float last_time = last_time + search_time;
cout << search_time << endl;
}
clock_t t1 = clock();
double time_seconds = (t1-t0)*CLOCKS_PER_SEC/cycles_max;
cout << time_seconds/1000 <<endl;
WriteBmpFile(L"3840x2160_wb.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);
system("PAUSE");

from the docs page:
The kernel is executed using a single work-item.
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with
work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1,
and local_work_size[0] set to 1.
When you use clEnqueueNDRangeKernel, you are using 2 work groups of 4 work items, but they are all doing the same work. They all read from the same global memory, but more importantly, they all try to write to the same locations in global memory.
You need to take into account the worker's global id when doing your computations.
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
int gid = get_global_id(0);
int gsize = get_global_size(0);
for (int j = gid; j < (InSize >> 2); j+= gsize)
{
unsigned char Value = (pDataIn[j*4] + pDataIn[j*4 + 1] + pDataIn[j*4 + 2]) / 3;
pDataOut[j] = Value;
}
}

It looks like you are iterating over all pixels of an input image in your kernel. This will cause all threads to calculate the image intensity for all pixels. Try to launch a single thread for each pixel instead. To do so, change your kernel source code to only calculate the output value for one pixel:
__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut) {
int j = get_global_id(0);
int i = j*4;
pDataOut[i] = (pDataIn[j] + pDataIn[j + 1] + pDataIn[j + 2]) / 3;
}
This code will now perform the averaging over the RGB values of your RGBA input image for the single pixel at location i. Now all you need to do is launch as many threads as your image has pixels. Relevant changes:
//create opencl kernel
kernel = clCreateKernel(program, "black_white_img", &ret);
//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){
float start_time = clock();
float search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = iWidth * iHeight;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);
This should give a considerable speedup comparing to your code.

OpenCL Error Computing Matrix Multiplication during Runtime

I have been debugging for the past few days and cannot get this OpenCL matrix multiplication kernel to run. Whenever I run the program, the output from the GPU results in large negative numbers similar to -198746573.0000. I was wondering if someone with HPC experience could point out an error in my code or if it is an error with the driver.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define widthA 2
#define heightA 2
#define widthB heightA
#define heightB 2
#define widthC widthA
#define heightC heightB
#ifdef __APPLE__
#include < OpenCL/opencl.h >
#else
#include <opencl.h>
#endif
#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
float * A = (float *)malloc(sizeof(float)*widthA*heightA);
float * B = (float *)malloc(sizeof(float)*widthB*heightB);
float * C = (float *)malloc(sizeof(float)*widthC*heightC);
float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
float * D= (float *)malloc(sizeof(float)*widthC*heightC);
float ref[widthC][heightC];
int i, j, k;
FILE * fp1 = fopen("matAdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightA; j++) {
float p=(rand()%100)/7.0;
//*(A+i*heightA+j)=rand()%100 + p;
*(A+i*heightA+j)=4.0;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
fp1 = fopen("matBdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthB; i++)
{
for(j=0; j < heightB; j++) {
float p=(rand()%100)/7.0;
//*((B+i*heightB+j))=rand()%100 + p;
*((B+i*heightB+j))=4.0;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobjA = NULL;
cl_mem memobjB = NULL;
cl_mem memobjC = NULL;
cl_mem rowA = NULL;
cl_mem colC = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id[10];
cl_platform_id platform = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event GPUDone[0];
//char string[MEM_SIZE];
FILE *fp;
char fileName[] = "matrixMultiplication.cl";
char *source_str;
size_t source_size;
int row = widthA;
int col = heightC;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
/* Get Platform and Device Info */
ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms);
char cBuffer[1024];
cl_uint c;
for(c = 0; c < ret_num_platforms; c++)
{
clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
if (strstr(cBuffer, "NVIDIA") != NULL)
{
platform = platform_id[c];
break;
}
}
printf("Found Platform %s\n", cBuffer);
ret = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Found %d devices.\n", ret_num_devices);
/* Create OpenCL context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "matrixMultiplication", &ret);
/* Set OpenCL Kernel Arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
/* Execute OpenCL Kernel */
//ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
size_t globalThreads[2] = {widthA, heightB};
size_t localThreads[2] = {16,16};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
//clFlush(command_queue);
//clFinish(command_queue);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]);
printf("Buffer Read ended with %d.\n", ret);
clWaitForEvents(1, GPUDone);
fp1 = fopen("matGPURes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(Res+i*heightC+j));
ref[i][j] = *(Res+i*heightC+j);
printf("GPU Output: %f\n", *(Res+i*heightC+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobjA);
ret = clReleaseMemObject(memobjB);
ret = clReleaseMemObject(memobjC);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
ret = clReleaseEvent(GPUDone[0]);
free(source_str);
float sum=0.0;
for(i = 0;i < widthA; i++)
{
for(j = 0; j < heightC; j++)
{
sum = 0;
for(k = 0; k < widthB; k++)
{
sum += A[i*col+k] * B[k*row+j];
printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]);
}
D[i*heightC+j] = sum;
}
}
fp1 = fopen("matNormalMultiplicationRes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n");
exit(1);
}
for(i = 0; i<widthA; i++)
{
for(j = 0; j<heightA; j++)
{
if (ref[i][j] != D[i*heightA+j])
{
printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]);
}
}
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(D+i*heightC+j));
}
fprintf(fp1, "\n");
}
free(A);
free(B);
free(C);
free(D);
free(Res);
return 0;
}
Here is the kernel
#define BLOCK_SIZE 16
__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB )
{
//int i = get_global_id(0);
//int j = get_global_id(1);
float Csub = 0.0f;
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
for (int a = aBegin, b=bBegin;
a <= aEnd;
a += aStep, b+=bStep)
{
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
for( int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
/*
float value=0;
for ( int k = 0; k < widthA; k++)
{
value = value + A[k + j * widthA] * B[k*widthB + i];
}
C[i + widthA * j] = value;
*/
}
I have double checked over and over again but simply cannot find any errors. I want to make sure its not a code error before I conclude its a driver issue.
Thanks!

Do you really need a complex kernel like that ? if you really want to do simple matrix multiplication
you can write a simple kernel like this, which is easy to debug.
__kernel void matrixMultiplication (__global float* A,
__global float* B,
__global float* C,
int widthA, int widthB )
{
//y direction
int row = get_global_id(1);
int col = get_global_id(0);
float cSum = 0.0f;
//calculate the result
for (int i=0; i<widthA; i++)
{
cSum += A[row*widthA+ i] * B[i*widthB+col];
}
C[row*widthB+col] = cSum;
}

Case is probably closed already, but for the sake of google-comers:
Shouldnt shared memory be explicitly declared on host and passed as kernel argument to the source? __local keyword is not the one you are looking for in this case.
See post on How to declare local memory in OpenCL? for the detailed explanation.

Check the functionality of your host. Here a few things to get you started ...
1) You don't need to create a buffer and enqueue it for a scalar constant Int like row and col. Just set it as a kernel arg.
2) Wait for the clEnqueueNDRangeKernel with an event. You want to be sure the calc has completed.
3) Add a printf statement in the kernel to print selected values to see that the input and output values are what you expect.
try
if ( get_local_id(0) % 8 == 0)
{
printf some useful value of a,b,c
}
3) Try the host code with a dumb kernel that copies an input array to an output array. That will confirm it you have the handling of buffer creation and the enqeue read/write code correct!

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

OpenCL gemm kernel local memory going slower - opencl

Have you looked at the two kernels in the AMD APP KernelAnalyzer or equivalent tools? These tools compile the Kernels and show their predicted performance characteristics

You use barrier(CLK_GLOBAL_MEM_FENCE); where I would expect to see barrier(CLK_LOCAL_MEM_FENCE); as you write in the loop to local memory. Further I doubt that the copy to localA does help you -- at one time every items there is only accessed once.

Related

How to make an OpenCL program run for large data set?

OpenCL generate SHA-256 hash

OpenCL uses sub-buffer's parent for constant parameter

Using clEnqueueNDRangeKernel in OpenCL

OpenCL Error Computing Matrix Multiplication during Runtime

Categories

Resources