This function CLBlastSgemmStridedBatched() in CLBlast https://github.com/CNugteren/CLBlast performs the matrix-matrix multiplication of a batch of matrices, c_stride denotes The (fixed) stride between two batches of the C matrix and determines the locations of output matrix C. Thus, this functions perform
for(int i=0; i< batch_count; i++){
C + i * c_stride = αlpha* ( A + i * a_stride ) *( B + i * b_stride ) + β* ( C + i * c_stride )
}
When c_stride=0, they get different results on two different devices. By contract, when c_stride=M*N, they get the same results on the two devices. Whether CLBlastSgemmStridedBatched() of CLBLast supports c_stride =0 or not?
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "math.h"
#include <clblast_c.h>
cl_context ctx;
cl_command_queue queue;
double what_time_is_it_now()
{
struct timeval time;
if (gettimeofday(&time, NULL)) {
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * .000001;
}
void init_CL(){
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0 };
/* Setup OpenCL environment. */
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
props[1] = (cl_context_properties) platform;
ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(ctx, device, 0, &err);
}
double clSBatchedSgemm(int M, int N, int K, int batch_count, int a_stride){
int i,j,b;
// Allocate host storage for batch_count A,B,C matrices
float* A = malloc(sizeof(float) * M * K * batch_count);
float* B = malloc(sizeof(float) * K * N * batch_count);
float* C = malloc(sizeof(float) * M * N * batch_count);
for(b=0; b<batch_count; b++) {
for(j=0; j<M; j++) {
for(i=0; i<K; i++) {
int index = i*M + j + b*M*K;
A[index] = index*index + 0.0f;
B[index] = index + 1.0f;
C[index] = 0.0f;
}
}
}
cl_float alpha = 1;
cl_float beta = 2;
size_t lda = K;
size_t ldb = N;
size_t ldc = N;
const size_t b_stride = N*K, c_stride = M*N;
if(c_stride != M*N){
printf("[Error!] c_stride must be the size of matrix A, the function CLBlastSgemmStridedBatched does not support that c_stride is zero \n");
}
cl_mem bufA, bufB, bufC;
cl_event event = NULL;
cl_int err;
/* Prepare OpenCL memory objects and place matrices inside them. */
if(a_stride==M*K){
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,batch_count*M * K * sizeof(*A), A, 0, NULL, NULL);
}else{
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
}
bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*K * N * sizeof(*B),NULL, &err);
bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, batch_count*M * N * sizeof(*C),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,batch_count*K * N * sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
double start_t = what_time_is_it_now();
/* Call clBLAS extended function.*/
CLBlastStatusCode status = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
M, N, K,
alpha,
bufA, 0, lda, a_stride,
bufB, 0, ldb, b_stride,
beta,
bufC, 0, ldc, c_stride,
batch_count,
&queue, &event);
/* Wait for calculations to be finished. */
if (status == CLBlastSuccess) {
err = clWaitForEvents(1, &event);
clReleaseEvent(event);
}
clFinish(queue);
double btime = what_time_is_it_now()-start_t;
printf("Batch_size: %6d, batchedGEMMTime: %3.6f ms, time/GEMM: %3.6f \n", batch_count, btime, btime/batch_count);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);
free(A);
free(B);
free(C);
return btime/batch_count;
}
int main(void) {
init_CL();
int M=128, N=5184, K=256;
int batch_Num = 256;
int strideA = 0;
double* times = malloc(sizeof(float) * batch_Num);
for(int batch_index=1;batch_index<batch_Num;batch_index=batch_index*2){
times[batch_index]= clSBatchedSgemm(M,N,K,batch_index,strideA);
}
times[1]= clSBatchedSgemm(M,N,K,1,strideA);
for(int i=1;i<batch_Num/2-1;i=i*2){
printf("i=%3d \t i*2=%3d \t %f, \t %f, \t speedup: %.3f, \t speedup/1: %.3f\n", i, i*2, times[i],times[i*2], times[i]/times[i*2], times[1]/times[i*2]);
}
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
return 0;
}
Related
I decided to study OpenCL myself and write a brute-force password for the TEA algorithm, did I understand OpenCL correctly? can you improve something in the direction of speed? what mistakes have I made?
I prepare the first 5 bytes in cycles, the remaining 3 bytes are sorted out by the kernel, 255 threads at 65535 each
in the main program:
for (int x5 = KEY[0]; x5 >= 0; x5--) {
KEY[0]=x5;
for (int x4 = KEY[1]; x4 >= 0; x4--) {
KEY[1]=x4;
for (int x3 = KEY[2]; x3 >= 0; x3--) {
KEY[2]=x3;
for (int x2 = KEY[3]; x2 >= 0; x2--) {
KEY[3]=x2;
for (int x = KEY[4]; x >= 0; x--) {
KEY[4]=x;
ret = clEnqueueWriteBuffer(command_queue, key_mem_obj, CL_TRUE, 0,
8 * sizeof(int), KEY, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
1 * sizeof(int), CADR, 0, NULL, NULL);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&key_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&cadr_mem_obj);
NDRange = 0x0100;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&NDRange, NULL, 0, NULL, NULL);
if (ret != CL_SUCCESS) {
break;
}
ret = clEnqueueReadBuffer(command_queue, cadr_mem_obj, CL_TRUE, 0,
1 * sizeof(int), CADR, 0, NULL, NULL);
if (CADR[0]>0) {
uint16_t k=CADR[0];
ret = clEnqueueReadBuffer(command_queue, retc_mem_obj, CL_TRUE, 0,
524280 * sizeof(int), RETC, 0, NULL, NULL);
for ((i = 0); i < k; i++) {
Form1->Memo1->Lines->BeginUpdate();
Form1->Memo1->Lines->Add(IntToHex(RETC[i*8],2)+IntToHex(RETC[i*8+1],2)+
IntToHex(RETC[i*8+2],2)+IntToHex(RETC[i*8+3],2)+IntToHex(RETC[i*8+4],2)+
IntToHex(RETC[i*8+5],2)+IntToHex(RETC[i*8+6],2)+IntToHex(RETC[i*8+7],2));
Form1->Memo1->Lines->EndUpdate();
Form1->Label6->Caption=IntToStr(Form1->Memo1->Lines->Count-1);
}
CADR[0]=0;
}
KEY2[0]=KEY[0];
KEY2[1]=KEY[1];
KEY2[2]=KEY[2];
KEY2[3]=KEY[3];
KEY2[4]=KEY[4];
KEY2[5]=KEY[5];
KEY2[6]=KEY[6];
KEY2[7]=KEY[7];
if(Terminated){
break;
}
}
KEY[4]=0xFF;
}
KEY[3]=0xFF;
}
KEY[2]=0xFF;
}
KEY[1]=0xFF;
}
KEY[0]=0xFF;`
Kernel:
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
__kernel void brute(__global const int *KEY, __global const int *DAT, __global int
*CADR,__global int *RETC)
{
int i = get_global_id(0);
ushort Data[2];
ushort Key[4];
Key[0]=(KEY[0]<<8)+KEY[1];
Key[1]=(KEY[2]<<8)+KEY[3];
// Key[2]=(KEY[4]<<8)+KEY[5];
Key[3]=(KEY[6]<<8)+KEY[7];
Key[2] = (KEY[4]<<8) + i;
for (int j=0xFFFF; j>=0; j--){
Key[3]=j;
Data[0]=(DAT[0]<<8)+DAT[1];
Data[1]=(DAT[2]<<8)+DAT[3];
ushort delta = 0x9e37;
ushort sum = (delta<<5);
for (uint n = 0;n < 32; ++n){
Data[1]-=(((Data[0])+Key[2])^(Data[0]+sum)^((Data[0]>>5)+Key[3]));
Data[0]-=(((Data[1]<<4)+Key[0])^(Data[1]+sum)^(Data[1]+Key[1]));
sum -= delta;
}
if ((Data[0]==0x0000) && (Data[1]==0x0000)){
int a=CADR[0];
atomic_inc(CADR);
RETC[a*8]=(Key[0] >> 8);
RETC[a*8+1]=(Key[0] & 0xFF);
RETC[a*8+2]=(Key[1] >> 8);
RETC[a*8+3]=(Key[1] & 0xFF);
RETC[a*8+4]=(Key[2] >> 8);
RETC[a*8+5]=(Key[2] & 0xFF);
RETC[a*8+6]=(Key[3] >> 8);
RETC[a*8+7]=(Key[3] & 0xFF);
}
}
}
If you only launch 256 threads that each do 65536 iterations of the same thing, your GPU will not be saturated and performance will be very poor. GPUs have thousands of "cores", and if you launch 256 threads you will only use 256 of them while the rest remains idle.
The idea of GPU parallelization is to split the work up into as many imdependent problems as there are. In your case this means: Lauch 256*65536 threads that do one Iteration each. Then performance will be much better.
Imagine a binary operation (lets name it "+") with associative property. When you can compute a1 + a2 + a3 + a4 + ... in parallel, first computing
b1 = a1 + a2
b2 = a3 + a4
then
c1 = b1 + b2
c2 = b3 + b4
then doing the same thing for results of previous step, and so on, until there is one element left.
I'am learning OpenCL and trying to implement this approach to summarize all elements in array. I am a total newbie in this technology, so the program might look something weird.
This is the kernel:
__kernel void reduce (__global float *input, __global float *output)
{
size_t gl = get_global_id (0);
size_t s = get_local_size (0);
int i;
float accum = 0;
for (i=0; i<s; i++) {
accum += input[s*gl+i];
}
output[gl] = accum;
}
This is the main program:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <CL/cl.h>
#define N (64*64*64*64)
#include <sys/time.h>
#include <stdlib.h>
double gettime ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
int main()
{
int i, fd, res = 0;
void* kernel_source = MAP_FAILED;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input, output;
size_t global, local;
cl_float *array = malloc (sizeof (cl_float)*N);
cl_float *array2 = malloc (sizeof (cl_float)*N);
for (i=0; i<N; i++) array[i] = i;
fd = open ("kernel.cl", O_RDONLY);
if (fd == -1) {
perror ("Cannot open kernel");
res = 1;
goto cleanup;
}
struct stat s;
res = fstat (fd, &s);
if (res == -1) {
perror ("Cannot stat() kernel");
res = 1;
goto cleanup;
}
kernel_source = mmap (NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (kernel_source == MAP_FAILED) {
perror ("Cannot map() kernel");
res = 1;
goto cleanup;
}
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms) != CL_SUCCESS) {
printf("Unable to get platform_id\n");
res = 1;
goto cleanup;
}
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
res = 1;
goto cleanup;
}
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err);
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
char buffer[4096];
size_t len;
printf("Error building program\n");
clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len);
printf ("%s\n", buffer);
res = 1;
goto cleanup;
}
kernel = clCreateKernel(program, "reduce", &err);
if (err != CL_SUCCESS) {
printf("Unable to create kernel\n");
res = 1;
goto cleanup;
}
// create buffers for the input and ouput
input = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * N, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * N, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0,
sizeof(cl_float) * N, array, 0, NULL, NULL);
size_t size = N;
cl_mem tmp;
double time = gettime();
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, NULL);
clFinish(command_queue);
size = size/64;
tmp = output;
output = input;
input = tmp;
}
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
time = gettime() - time;
printf ("%f %f\n", array[0], time);
cleanup:
free (array);
free (array2);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
if (kernel_source != MAP_FAILED) munmap (kernel_source, s.st_size);
if (fd != -1) close (fd);
_Exit (res); // Kludge
return res;
}
So I re-run kernel until there is only one element in the buffer. Is this correct approach to compute sum of elements in OpenCL? The time which I measure with gettime is about 10 times slower when execution time of a simple loop on CPU (compiled clang 4.0.0 and -O2 -ffast-math flags). Hardware I use: Amd Ryzen 5 1600X and Amd Radeon HD 6950.
There's a couple of things you can do to try to improve performance.
Firstly, get rid of the clFinish call inside your loop. This forces individual executions of the kernels to be dependent on the entire state of the Command Queue reaching a synchronization point with the Host before continuing, which is unnecessary. The only synchronization required is that the kernels execute in order, and even if you have an out-of-order queue (which your program isn't requesting anyways), you can guarantee that with simple use of event objects.
size_t size = N;
size_t total_expected_events = 0;
for(size_t event_count = size; event_count > 1; event_count /= 64)
total_expected_events++;
cl_event * events = malloc(total_expected_events * sizeof(cl_event));
cl_mem tmp;
double time = gettime();
size_t event_index = 0;
while (size > 1)
{
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
global = size;
local = 64;
if(event_index == 0)
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 0, NULL, events);
else
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global,
&local, 1, events + (event_index - 1), events + event_index);
size = size/64;
tmp = output;
output = input;
input = tmp;
event_index++;
}
clFinish(command_queue);
for(; event_index > 0; event_index--)
clReleaseEvent(events[event_index-1]);
free(events);
cl_float answer[1];
clEnqueueReadBuffer(command_queue, tmp, CL_TRUE, 0,
sizeof(cl_float), array, 0, NULL, NULL);
The other thing to potentially look into is performing the reduction all in one kernel, instead of spreading it out over multiple invocations of the same kernel. This is one potential example, though it may be more complicated than you need it to be.
I am starting out OpenCL by converting existing C codes to an OpenCL. I am getting strange results with the both CPU and GPU calculation. Their values change 'every time' when I run the code. When I compare with the normal C, I would get 'somewhat' acceptable results from the CPU (but, still the results are not identical with the that of native C or even other languages), but when I run the 'exact same' code with GPU, I get gibberish results.
Here is my code on the Host
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
double *arange(double start, double end, double step)
{
// 'arange' routine.
int i;
int arr_size = ((end - start) / step) + 1;
double *output = malloc(arr_size * sizeof(double));
for(i=0;i<arr_size;i++)
{
output[i] = start + (step * i);
}
return output;
}
int main()
{
// This code executes on the OpenCL Host
// Host data
double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
double *delnu = arange(nu_ini, nu_end, nu_step);
double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
double *gamma, *f;
double prs = 950.0;
int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
FILE *fp = fopen("h2o_HITRAN.par","r");
char string[320];
while(!feof(fp))
{
dum = fgetc(fp);
if(dum == '\n')
{
lines++;
}
}
rewind(fp);
nu = malloc(lines * sizeof(double));
inten = malloc(lines * sizeof(double));
gam_air = malloc(lines * sizeof(double));
n_air = malloc(lines * sizeof(double));
del_air = malloc(lines * sizeof(double));
gamma = malloc(lines * sizeof(double));
f = malloc(delnu_size * sizeof(double));
i=0;
while(fgets(string, 320, fp))
{
sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
i++;
}
size_t line_siz = sizeof(double) * lines;
size_t delnu_siz = sizeof(double) * delnu_size;
// gamma calculation
for(i=0;i<lines;i++)
{
gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
}
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = NULL;
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
// Create a context and associate it with the devices
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = NULL;
cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);
// Create a buffer objects that will contain the data from the host array 'buf_xxxx'
cl_mem buf_inten = NULL;
cl_mem buf_gamma = NULL;
cl_mem buf_delnu = NULL;
cl_mem buf_nu = NULL;
cl_mem buf_del_air = NULL;
cl_mem buf_f = NULL;
buf_inten = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_gamma = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_delnu = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
buf_nu = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
buf_f = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
// Write input array A to the Device buffer 'buf_xxx'
status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_Source;
FILE *program_handle = fopen("abs_calc.cl","r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_Source = (char*)malloc(program_size+1);
program_Source[program_size] = '\0';
fread(program_Source, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// Create the vector addition kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "abs_cross", &status);
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);
// Define index space (global work size) of work items for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[2] = {lines, delnu_size};
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
// Read the Device output buffer to the host output array
clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);
// Verify the output
FILE *file = fopen("opencl_output","w");
for(i=0;i<delnu_size;i++)
{
fprintf(file, "%le %le\n", delnu[i], f[i]);
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(buf_nu);
clReleaseMemObject(buf_inten);
clReleaseMemObject(buf_del_air);
clReleaseMemObject(buf_gamma);
clReleaseMemObject(buf_f);
clReleaseMemObject(buf_delnu);
clReleaseContext(context);
// Free host resources
free(nu);
free(inten);
free(gam_air);
free(n_air);
free(del_air);
free(delnu);
free(gamma);
free(f);
free(platforms);
free(devices);
fclose(fp);
fclose(file);
return 0;
}
and this is my kernel code
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
global double *gamma,
global double *delnu,
global double *nu,
global double *del_air,
global double *f)
{
double pie = 4.0*atan(1.0);
int i = get_global_id(0);
int j = get_global_id(1);
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
Am I doing something wrong?
Thank you.
You appear to be running a 2D global work size, but storing into a location based only on dimension 1 (not 0). Therefore multiple work items are storing into the same location using +=. You have a race condition. You could use atomics to solve this, but it will likely slow the performance down too much. Therefore, you should store intermediate results and then do a parallel reduction operation.
I am using AMD W2100, and yes, I have printed out all the supported extension and it included cl_khr_fp64 extension.
Sorry, I forgot to include the original calculation. The actual calculation goes like the following..
for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
I would write OpenCL kernel as below,
Without using atomics and only single work dimension.
global_work_size = delnu_size
There could be a better way but its the simplest one.
__kernel void test(__global double *gamma,
__global double *inten,
__global double *delnu,
__global double *delair,
__global double *f,
const int lines)
{
double pie = 4.0*atan(1.0);
int j = get_global_id(0);
f[j] = 0;
for(i=0,i<lines;i++)
{
f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}
You need to understand how OpenCL kernel is executed.
You can think of it as large number of threads executing concurrently
and each thread could be identified with get_global_id
I'm a beginner in OpenCL and am trying to run the sample codes of the "OpenLC in Action" book. I have the following code to get the preferred vector width of my device. The platforms detected on my computer are from Intel Core i7 and HD graphics and another one from NVIDIA GeForce 940M. Whenever I run the code, it gives "1" for vector width of every type unless type double which is zero because it is not supported. Even when I change the platform in my computer to check its devices, results are the same. I ran the code on an AMD computer and it seemed to work properly because it gave me different numbers for different types. But, I am not sure why this code keeps giving me "1" for every type on different platforms of my computer. Any ideas?
Here is the output:
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>
int main(){
cl_int err, i, j;
cl_platform_id *platforms;
cl_device_id *devices;
cl_uint num_platforms, num_devices, vector_width;
size_t plat_name_size, devi_name_size;
char *plat_name_data, *devi_name_data;
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0){
perror("No platform is found");
exit(1);
}
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
printf("Number of found platforms is %d\n ", num_platforms);
for (i = 0; i < num_platforms; i++){
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
if (err < 0){
perror("Couldn't read platform name.");
exit(1);
}
plat_name_data = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL);
printf("Platform No.%d is: %s\n", i, plat_name_data);
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices);
if (err < 0){
perror("No device is found in this platform");
exit(1);
}
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices));
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
printf("Number of devices found in this platform is: %d\n", num_devices);
for (j = 0; j < num_devices; j++){
err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size);
if (err < 0){
perror("Couldn't read the device name.");
exit(1);
}
devi_name_data = (char*)malloc(devi_name_size);
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL);
printf("Device No.%d name is: %s\n", j + 1, devi_name_data);
if (strstr(devi_name_data, "GeForce 940M")){
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in chars: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in shorts: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in ints: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in longs: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in floats: %u\n", vector_width);
clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(cl_uint), &vector_width, NULL);
printf("Preferred vector width in doubles: %u\n", vector_width);
}
}
}
return 0;
}
Short answer: You are querying it correctly, and the platform compiler knows what is the best vector width size. So yes, it is correct the value of 1.
Long answer: For a CPU (any type of CPU) it is likely to prefer non vectored. Specially on Intel CPU + Compiler, since the Intel compiler does the vectorization as part of the optimization process, so it prefers the user NOT to vectorize the code in the first place.
Indeed it looks like nVIDIA also prefers the user to input non vectorized code. It does not mean code will run slower if vectorized already. It just means the compiler (due to the optimization techniques it has) prefers the code to be unvectorized.
Updates to the OpenCL drivers may lead to a change of these values.
Also, you should take them as orientative. Other factors as: local memory usage, coalesced global access, local size, etc... are way more important usually.
Here is one experiment that I've done to see how vectorized operations perform in a device which prefers to do scalar operations. I have implemented the reduction algorithm with two different kernels. The first kernel treats data as scalars while the second treats data as float4 vectors (Codes are given below). Here is the execution results. It is clear that although the NVIDIA device prefers non-vectorized operation, vectorized operation is faster.
Preferred vector width: 1
reduction_scalar: Check passed.
Total time = 4471424
reduction_vector: Check passed.
Total time = 1723776
And here is the code:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"
#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err == CL_DEVICE_NOT_FOUND) {
printf(" GPU is not first! Going on CPU :(");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel[NUM_KERNELS];
cl_command_queue queue;
cl_event prof_event;
cl_int i, j, err, preferred_width;
size_t local_size, global_size;
char kernel_names[NUM_KERNELS][20] =
{ "reduction_scalar", "reduction_vector" };
/* Data and buffers */
float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);
//float data[ARRAY_SIZE];
float sum, actual_sum, *scalar_sum, *vector_sum;
cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
cl_int num_groups;
cl_ulong time_start, time_end, total_time;
/* Initialize data */
for (i = 0; i<ARRAY_SIZE; i++) {
data[i] = 1.0f*i;
}
/* Create device and determine local size */
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(preferred_width), &preferred_width, NULL);
printf("Preferred vector width: %d\n", preferred_width);
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(local_size), &local_size, NULL);
if (err < 0) {
perror("Couldn't obtain device information");
exit(1);
}
/* Allocate and initialize output arrays */
num_groups = ARRAY_SIZE / local_size;
scalar_sum = (float*)malloc(num_groups * sizeof(float));
vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
for (i = 0; i<num_groups; i++) {
scalar_sum[i] = 0.0f;
}
for (i = 0; i<num_groups / 4; i++) {
vector_sum[i] = 0.0f;
}
/* Create a context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
if (err < 0) {
perror("Couldn't create a buffer");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
for (i = 0; i<NUM_KERNELS; i++) {
/* Create a kernel */
kernel[i] = clCreateKernel(program, kernel_names[i], &err);
if (err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create kernel arguments */
err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
if (i == 0) {
global_size = ARRAY_SIZE;
err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
}
else {
global_size = ARRAY_SIZE / 4;
err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
}
if (err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Finish processing the queue and get profiling information */
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
/* Read the result */
if (i == 0) {
err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups; j++) {
sum += scalar_sum[j];
}
}
else {
err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
sum = 0.0f;
for (j = 0; j<num_groups / 4; j++) {
sum += vector_sum[j];
}
}
/* Check result */
printf("%s: ", kernel_names[i]);
actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
if (fabs(sum - actual_sum) > 0.01*fabs(sum))
printf("Check failed.\n");
else
printf("Check passed.\n");
printf("Total time = %lu\n\n", total_time);
/* Deallocate event */
clReleaseEvent(prof_event);
}
/* Deallocate resources */
free(scalar_sum);
free(vector_sum);
for (i = 0; i<NUM_KERNELS; i++) {
clReleaseKernel(kernel[i]);
}
clReleaseMemObject(scalar_sum_buffer);
clReleaseMemObject(vector_sum_buffer);
clReleaseMemObject(data_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
and the kernels:
__kernel void reduction_scalar(__global float* data,
__local float* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = partial_sums[0];
}
}
__kernel void reduction_vector(__global float4* data,
__local float4* partial_sums, __global float* output) {
int lid = get_local_id(0);
int group_size = get_local_size(0);
partial_sums[lid] = data[get_global_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = group_size/2; i>0; i >>= 1) {
if(lid < i) {
partial_sums[lid] += partial_sums[lid + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0) {
output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
}
}
im new with OpenCL, I have a problem in clCreateKernel, it throws CL_INVALID_PROGRAM_EXECUTABLE, could anybody help, the code is based on http://www.cs.bris.ac.uk/home/simonm/workshops/OpenCL_lecture3.pdf , the last optimization
Here is the code:
#define ORDER 10 // Order of the square matrices A, B, and C
#define AVAL 3.0 // A elements are constant and equal to AVAL
#define BVAL 5.0 // B elements are constant and equal to BVAL
#define TOL (0.001) // tolerance used in floating point comparisons
#define DIM 2 // Max dim for NDRange
#define COUNT 1 // number of times to do each multiplication
#define SUCCESS 1
#define FAILURE 0
// Funciones Auxiliares
void initmat(int Mdim, int Ndim, int Pdim, float *A, float *B, float *C)
{
int i, j;
/* Initialize matrices */
for (i = 0; i < Ndim; i++)
for (j = 0; j < Pdim; j++)
A[i*Ndim+j] = AVAL;
for (i = 0; i < Pdim; i++)
for (j = 0; j < Mdim; j++)
B[i*Pdim+j] = BVAL;
for (i = 0; i < Ndim; i++)
for (j = 0; j < Mdim; j++)
C[i*Ndim+j] = 0.0f;
}
// Definicion de la funcion:
char * readKernel(void)
{
size_t *source_length;
FILE *fp = fopen("kernel.cl", "r");
if (fp == NULL)
{
printf("Cannot Open Kernel.cl\n");
}
else
{
printf("Kernel.cl Opened\n");
}
fseek(fp, 0, SEEK_END);
source_length[0] = ftell(fp);
if (source_length[0] == 0)
{
printf("Kernel.cl is empty\n");
}
else
{
printf("Kernel.cl length: %zu bytes\n", source_length[0]);
}
char *source = (char*) calloc(source_length[0] + 1, 1);
if (source == 0)
{
printf("Memory allocation failed");
}
fseek(fp, 0, SEEK_SET);
fread(source, 1, source_length[0], fp);
printf("Kernel.cl Read\n");
return source;
}
int main(int argc, char **argv)
{
// Declare and iniciate data
float *A, *B, *C;
int Mdim, Ndim, Pdim;
int err, szA, szB, szC;
size_t global[DIM];
size_t local[DIM];
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_uint nd;
cl_mem a_in, b_in, c_out;
Ndim = ORDER;
Pdim = ORDER;
Mdim = ORDER;
szA = Ndim*Pdim;
szB = Pdim*Mdim;
szC = Ndim*Mdim;
A = (float *)malloc(szA*sizeof(float));
B = (float *)malloc(szB*sizeof(float));
C = (float *)malloc(szC*sizeof(float));
const char* C_elem_KernelSource =
"__kernel \n"
"void mmul( \n"
" const int Mdim, \n"
" const int Ndim, \n"
" const int Pdim, \n"
" __global float* A, \n"
" __global float* B, \n"
" __global float* C, \n"
" __local float* Bwrk) \n"
"{ \n"
" int k,j; \n"
" int i = get_global_id(0); \n"
" int iloc = get_local_id(0); \n"
" int nloc = get_local_size(0); \n"
" float Awrk[10]; \n"
" float tmp; \n"
" for (k=0; k<Pdim; k++) \n"
" Awrk[k] = A[i*Ndim+k]; \n"
" for (j=0; j<Mdim; j++){ \n"
" for (k=iloc; k<Pdim; k=k+nloc) \n"
" Bwrk[k] = B[k*Pdim+j]; \n"
" barrier(CLK_LOCAL_MEM_FENCE); \n"
" tmp = 0.0f; \n"
" for (k=0; k<Pdim; k++) \n"
" tmp += Awrk[k] * Bwrk[k]; \n"
" C[i*Ndim+j] += tmp; \n"
"} \n"
;
initmat(Mdim, Ndim, Pdim, A, B, C);
// Setup the plataform
cl_uint num_platforms;
if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS)
{
printf("Unable to get platform!\n");
}else{
printf("Plataformas Disponibles: %u \n", num_platforms);
}
//Identificador
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, &num_platforms);
printf("Plataformas creada\n");
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err==CL_SUCCESS){
printf("Device creado \n");
}else {
printf("Error %d \n", err);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
if (err==CL_SUCCESS){
printf("Contexto creado \n");
}else {
printf("Error creando contexto \n");
}
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (err==CL_SUCCESS){
printf("cola de comandos creadas \n");
}else {
printf("Error creando cola de comandos \n");
}
// Setup buffers and write A and B matrices to the device memory
a_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szA, NULL, NULL);
b_in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * szB, NULL, NULL);
c_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, NULL);
err = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, sizeof(float) * szA, A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, sizeof(float) * szB, B, 0, NULL, NULL);
// Build the program, define the kernel and setup arguments
program = clCreateProgramWithSource(context, 1, (const char **) &C_elem_KernelSource, NULL, &err);
if (err==CL_SUCCESS){
printf("programa creado \n");
}else {
printf("Error generado %d creando programa\n", err);
}
//Compila el programa en el dispositivo elegido
clBuildProgram(program, 1, &device_id, NULL, NULL, NULL );
if (err==CL_SUCCESS){
printf("programa compilado 1\n");
}else {
printf("Error generado %d compilando programa 1\n", err);
}
kernel = clCreateKernel(program, "mmul", &err);
if (err==CL_SUCCESS){
printf("Kernel creado \n");
}else {
printf("Error generado %d creando kernel\n", err);
}
err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &a_in);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_in);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &c_out);
err |= clSetKernelArg(kernel, 6, sizeof(float)*Pdim, NULL);
if (err==CL_SUCCESS){
printf("Argumentos del Kernel configurados \n");
}else {
printf("Error configurando argumentos del kernel \n");
}
//Run the kernel and collect results
// 1D ND Range set to dimensions of C matrix
//Local Dim set to 250 so number of work-groups match number of
//compute units (4 in this case) for our order 1000 matrices
//Pass local memory to kernels. This requires a change to the kernel
//argument list … a new call to clSetKernelArg is needed
printf("Encolando Kernel:\n");
global[0] = (size_t) Ndim; global[1] = (size_t) Mdim; local[0] = (size_t) 2;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global, local, 0, NULL, NULL);
if (err==CL_SUCCESS){
printf("Kernel enviado a device \n");
}else {
printf("Error enviando kernel a device \n");
}
clFinish(commands);
err = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, sizeof(float) * szC, C, 0, NULL, NULL );
//test_results(A, B, c_out);
}
Thanks
The main problem is that the open brace on line 112 has no matching closing brace:
" for (j=0; j<Mdim; j++){ \n"
Also note that the pointer declared on line 34 is used without initialization:
size_t *source_length;
On line 170, an err= should be added to the clBuildProgram() call so that the error checking works as intended. Then you can add logic to use clGetProgramBuildInfo() to get details in the case of a build fail.