Related
I'm new to openCL program and this is the problem I'm facing while executing a simple vector addition.
I have the following kernel code
#include <CL/cl.hpp>
#include<iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
I have integrated gpu and amd gpus on my system. I'm trying to perform vector addition on my intel gpu and for which I have installed the intel opencl drivers (i7 3rd gen processor with hd graphics).
I have the below openCL code
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
int i;
const int LIST_SIZE = 50;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//std::cout<<source_str<<std::endl;
// Get platform and device information
cl_platform_id* platforms1 = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, platforms1, &ret_num_platforms);
platforms1= (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);
clGetPlatformIDs(ret_num_platforms, platforms1, NULL);
/*
* Platform 0: Intel Graphics
* Platform 1 : AMD Graphics
*/
//CHANGE THE PLATFORM ACCORDING TO YOUR SYSTEM!!!!
ret = clGetDeviceIDs( platforms1[0], CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 16; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
//FREE
return 0;
}
If the LISTSIZE is 50, it prints only till 48 that is 16*3. It prints only the multiple of LISTSIZE and I'm not able to figure out why?.
OpenCL kernels execute only for a multiple of the local thread block size (local Range, in your code local_item_size), which should not be smaller than 32 and must be a multiple of 2, (so it can be (32, 64, 128, 256, ...). If you set it to 16, half of the GPU will be idle at any time. global_item_size must be a multiple of local_item_size. You need at least 32 data items for the kernel to function and a lot more for it to yield good performance.
Also the part
#include <CL/cl.hpp>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define MAX_SOURCE_SIZE (0x100000)
int main() {
is not OpenCL C code and does not belong in the .cl source file. If it is not too lengthy, you can write the OpenCL C code directly in the .cpp file as a raw string:
const string kernel_code = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
int i = get_global_id(0);
C[i] = A[i] + B[i];
}
)";
char* source_str = kernel_code.c_str();
I want to do a sub-matrix multiplication. Say I have a function:
void MatMul(cl_mem A, cl_mem B, cl_mem C, int M, int K, int N)
where A is M*K, B is K*N, C is M*N, and A, B, C are all row major 1 dimension array passed by host memory float *h_A, *h_B, *hC with the following function:
void ocl_push_array(cl_mem d_x, float *h_x, int n){
size_t data_size = sizeof(float)*n;
err = clEnqueueWriteBuffer(queue, d_x, CL_TRUE, 0, data_size, h_x, 0, NULL, NULL);
}
I want to ask:
if I want to do sub-matrix multiplication, say slicing A by row:
// cl_mem A, B, C;
for(int x=0; x<M; x+=16)
{
cl_mem A_sub = (cl_mem)((float *)A+x*K);
cl_mem C_sub = (cl_mem)((float *)C+x*N);
if((M-x+1)>=16)
MatMul(A_sub, B, C_sub, 16, K, N);
else
MatMul(A_sub, B, C_sub, M-x+1, K, N);
}
Is it the right code to do this operation? I got a run time error says: "CL_INVALID_MEM_OBJECT" (-38) when it assigns arguments to OpenCL kernel (clSetKernelArg).
The reason I want to do this operation is that I found the matrix multiplication got wrong answers when my input matrix A and B become big.
My OpenCL kernel is:
#define BLOCK_SIZE 16
#define AS(i, j) As[j + i * BLOCK_SIZE]
#define BS(i, j) Bs[j + i * BLOCK_SIZE]
__kernel void
matrixMul(__global float* A, __global float* B, __global float* C,
__local float* As, __local float* Bs, int uiWA, int uiWB)
{
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = uiWA * BLOCK_SIZE * by;
int aEnd = aBegin + uiWA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * uiWB;
float Csub = 0.0f;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
AS(ty, tx) = A[a + uiWA * ty + tx];
BS(ty, tx) = B[b + uiWB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += AS(ty, k) * BS(k, tx);
barrier(CLK_LOCAL_MEM_FENCE);
}
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
and the size is:
#define BLOCK_SIZE 16
size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, N), shrRoundUp(BLOCK_SIZE, M)};
size_t shrRoundUp(int group_size, int global_size)
{
int r = global_size % group_size;
if(r == 0)
{
return global_size;
} else
{
return global_size + group_size - r;
}
}
the code is adopted from Nvidia OpenCL matrix multiplication sample. My GPU is: Intel(R) HD Graphics 4600.
Thanks!
I don't think you can do this:
cl_mem A_sub = (cl_mem)((float *)A+x*K);
Because cl_mem is an object in OpenCL, which is actually a complex data structure instead of just a data pointer. It maintains information such as data pointer to the actually memory, reference to the object, memory properties and so on. Different run-times may even have different implementations of cl_mem object. That's why you got the CL_INVALID_MEM_OBJECT error message.
What you can do to get wanted data for the sub matrix is one of the followings:
define two new cl_mem objects, and use a separate kernel to do
the copy work.
use clEnqueueCopyBuffer function to copy the data at the host
code domain.
use CL_MEM_ALLOC_HOST_PTR to memory buffer, and then use
clEnqueueMapBuffer map the GPU memory to host memory pointer, and
then modify the memory content by using the mapped host memory
pointer, when you finish, unmap the pointer to return the GPU memory
to the device memory domain.
My program, I have set
size_t global_item_size = 12000;
size_t local_item_size = 600;
cl_mem arr_M_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, 12000 * sizeof(int), NULL, &ret);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
and my kernel:
__kernel void mykernel(__global int* arrM)
{
int n = get_global_id(0);
arrM[n] = n;
}
But my result has an error, I use a loop and print arrM after copying from device to host, and my result
arrM[0] = 0
arrM[1] = 0
arrM[2] = 1
arrM[3] = 0
arrM[4] = 2
arrM[5] = 0
arrM[6] = 3
...
arrM[11998] = 5999
arrM[11999] = 0
Can you help me solve it?
Yes, the problem is quite easy if you didn't omit anything in your posting - you forgot to set the argument to the kernel - the buffer arr_M_obj.
Take a look here for the function used to set the argument.
cl_mem arr_M_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, nmax * NMAX * sizeof(cl_ulong), NULL, &ret);
__kernel void PatternBranching(__global int4* arrlmer, __global int* arrMscore, __global int4* arrM, int k, int l, int n1, int sqelenght, int a1)
PatternBranching(arr_lmer_obj, arr_Mscore_obj, arr_M_obj, k, l, n1, sqelength, a1)
notice anything bad about the declaration of arr_Mscore_obj and the signature? Change the cl_ulong to cl_int
I made an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR) to get a higher transfer rate from device to host.
The transfer rate is increased as I expected (get transfer rate using AMD APP Profiler 2.4).
The problem is the transfer rate is higher than PCIe bandwidth (93703 GB /s) for matrix 4096 x 4096 (64 MB).
It happened too when I use zero copy buffer ( CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer).
I search some information that it is true if pinned memory and zero copy buffer have high transfer rate but it still limited with PCIe bandwidth for discrete GPU.
So, is it normal if the transfer rate exceed PCIe bandwidth (using PCIe bandwidth 2.0 x 16)?
My OS is Windows 7 64 bit.
I use AMD APP SDK 2.6 and discrete GPU AMD HD 6630M.
Edit:
Here is the code:
#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;
// Create Kernel Object Bound To Kernel Function
naive = clCreateKernel(program, "naiveAlgorithm", &err);
//Set size of global work item and work tem in each work goups
int globalsize = size;
int localsize;
if(globalsize >= 16)
{
localsize =16;
}else
{
localsize = globalsize;
}
size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items [2] = {localsize, localsize};
// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);
// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);
//Release Kernel
err = clReleaseKernel(naive);
}
void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);
MatrixMul(d_A, d_B, d_C, size);
err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);
err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}
//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096;
//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;
//Allocate and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));
//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);
//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);
cl_int err; // error code
cl_platform_id* platforms;
cl_uint platformCount;
cl_device_id device;
int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk
clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices
const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties> (platforms [platformtype]),
0, 0
};
context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
//Load Kernel Source
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Create Program Object
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *),
&source_size, &err);
// Build Program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
Naive(matrixA, matrixB, matrixC, size);
//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);
// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;
free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);
return 0;
}
And here is the kernel code:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {
int tx = get_global_id(0); //2D Thread IDx
int ty = get_global_id(1); //2D Thread IDy
float sum = 0;
//Calculate result of one element of Matrix C
for (int k = 0; k < size; k++) {
sum += A[ty*size+k] * B[k*size+tx];
}
C[ty*size+tx] = sum;
}
And here is the image:
I see that your output array is actually located in host memory because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
This means that you should be using clEnqueueMapBuffer, followed by using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject. There is no need for the array matrixC since d_C is already in host memory.
The data transfer from GPU to host actually happens while your kernel is running. The map call makes sure that all data has finished moving from the GPU to the CPU. That is why the transfer times are actually so small.
I can't find any documentation on whether clEnqueueReadBuffer works for pinned memory or not. I also see that you are retrieving the error codes of each operation but do not check these error codes, hence your code may be silently failing.
Regarding the large difference between the time taken by clEnqueueReadBuffer and the time spent transferring data, note that all queued operations don't immediately get dispatched to the GPU. One source of delay is the Windows display driver model (WDDM) for graphics cards. The +-20 micro-seconds used for the clEnqueueReadBuffer sounds right for this delay (I've actually seen longer delays).
Doing simple matrix multiplication using OpenCL:
// Multiply two matrices A * B = C
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <oclUtils.h>
#define WA 3
#define HA 3
#define WB 3
#define HB 3
#define WC 3
#define HC 3
// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
// set seed for rand()
srand(2006);
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// 5. Initialize OpenCL
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
//shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// Setup device memory
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 6. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My comment\n",
&kernelLength);
//shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
//shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
//shrCheckError(errcode, CL_SUCCESS);
// 7. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
//shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 3;
localWorkSize[1] = 3;
globalWorkSize[0] = 3;
globalWorkSize[1] = 3;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 8. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
//shrCheckError(errcode, CL_SUCCESS);
// 9. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
{
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
printf("\n");
}
printf("\n");
// 10. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);
}
In the above code I keep getting error at the place :
/**********************/ / Initialize OpenCL
/ /**********************/
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS);
The error code being returned is -32 that means: CL_INVALID_PLATFORM"
How do I remove this error?
OS: Windows 7, 32 bit, NVIDIA GPU GeForce 610
The Nvidia drivers expect you to provide a non-NULL properties pointer as first argument to the clCreateContextFromType call.
The Khronos specification for clCreateContextFromType states that if NULL is passed for the properties parameter, the platform that is selected is implementation dependent. In case of Nvidia the choice seems to be that no platform at all is selected if a NULL pointer is passed. See clCreateContextFromType for more information.
On the other hand, this behavior is consistent with Issue #3 in the cl_khr_icd extension, which would apply if you are using OpenCL through the ICD, and which states:
3: How will the ICD handle a NULL cl_platform_id?
RESOLVED: The NULL platform is not supported by the ICD.
To pass the properties to clCreateContextFromType, first query the platforms with clGetPlatformIDs. Then construct a properties array with the desired platform ID and pass it to clCreateContextFromType. Something along the following lines should work with a C99 compliant compiler:
// query the number of platforms
cl_uint numPlatforms;
errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
shrCheckError(errcode, CL_SUCCESS);
// now get all the platform IDs
cl_platform_id platforms[numPlatforms];
errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
shrCheckError(errcode, CL_SUCCESS);
// set platform property - we just pick the first one
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};
clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);