Mandelbrot in OpenCL - opencl

I have this Mandelbrot Kernel written for an OpenCL program. For test I've decided to have all my complex plane on a vector. My problem is when I print the output I obtain a list of 1 (like the initialization of the results array) and not the result of the kernel work.
Where can I have the problem?
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
int main(){
using namespace std;
int xPixel=100;
int yPixel=100;
float ics[xPixel];
for(int i=0;i<xPixel;++i)
ics[i]=-2+i*((float)4/xPixel);
float ypsilon[yPixel];
for(int i=0;i<yPixel;++i)
ypsilon[i]=-2+i*((float)4/yPixel);
int results[xPixel*yPixel];
for(int i=0;i<xPixel*yPixel;++i)
results[i]=1;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem memX, memY, memOutput;
size_t global;
const char *KernelSource =
"__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output){\n"\
"size_t id=get_global_id(0);\n"\
"int yPixel=100;\n"\
"for(int i=0;i<yPixel;i++){\n"\
"float x=0;\n"\
"float y=0;\n"\
"int counter=0;\n"\
"while(counter<1000){\n"\
"if(x*x+y*y>2*2){\n"\
"output[(id*yPixel)+i]=counter;\n"\
"break;\n"\
"}\n"\
"float xTemp=x*x-y*y+ics[id];\n"\
"y=2*x*y+ypsilon[i];\n"\
"x=xTemp;\n"\
"counter++;\n"\
"}\n"\
"}\n"\
"}\n";
// retreives a list of platforms available
if (clGetPlatformIDs(1, &platform_id, &num_of_platforms)!= CL_SUCCESS){
cout<<"Unable to get platform_id\n"<<endl;;
return 1;
}
// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices) != CL_SUCCESS){
cout<<"Unable to get device_id\n"<<endl;
return 1;
}
//context properties list - nust be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties)platform_id;
properties[2]=0;
//create a context with the GPU device
context=clCreateContext(properties,1,&device_id,NULL,NULL,&err);
//create a command queue using the context and device
command_queue=clCreateCommandQueue(context,device_id,0,&err);
//create a program from the kernel source code
program=clCreateProgramWithSource(context,1,(const char**)&KernelSource,NULL,&err);
//compile the program
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
cout<<"Error building program"<<endl;
return 1;
}
//specify which kernel from the program to execute
kernel=clCreateKernel(program,"mandelbrot",&err);
//create buffers for input and output
memX=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*xPixel,NULL,NULL);
memY=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*yPixel,NULL,NULL);
memOutput=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(int)*(xPixel*yPixel),NULL,NULL);
//load data into the input buffer
clEnqueueWriteBuffer(command_queue,memX,CL_TRUE,0,sizeof(float)*xPixel,ics,0,NULL,NULL);
clEnqueueWriteBuffer(command_queue,memY,CL_TRUE,0,sizeof(float)*yPixel,ypsilon,0,NULL,NULL);
//set the argument list for the kernel command
clSetKernelArg(kernel,0,sizeof(cl_mem),&memX);
clSetKernelArg(kernel,1,sizeof(cl_mem),&memY);
clSetKernelArg(kernel,2,sizeof(cl_mem),&memOutput);
global=xPixel*yPixel;
//enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue,kernel,1,NULL,&global,NULL,0,NULL,NULL);
clFinish(command_queue);
//copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue,memOutput,CL_TRUE,0,sizeof(int)*(xPixel*yPixel),results,0,NULL,NULL);
//print output
for(int i=0;i<xPixel;++i){
for(int j=0;j<yPixel;++j){
cout<<results[(i*yPixel)+j]<<" ";
}
cout<<endl;
}
//cleanup - release OpenCL resources
clReleaseMemObject(memX);
clReleaseMemObject(memY);
clReleaseMemObject(memOutput);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}

I'm not seeing the exact reason, but I do have a question: If you're running this on every element then what is the "i" looping over "yPixel" for? It seems like you're doing X*Y*Y work instead of X*Y work (your global size is X*Y then the kernel loops on Y again).
If you add "output[(id*yPixel)+i]=42" before the "i" loop then what does your output buffer hold? That will tell you if the problem lies in your kernel or your host code.
To help anyone else looking at this, I've reformatted the kernel code:
__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output)
{
size_t id=get_global_id(0);
int yPixel=100;
for(int i=0;i<yPixel;i++)
{
float x=0;
float y=0;
int counter=0;
while(counter<1000)
{
if(x*x+y*y>2*2)
{
output[(id*yPixel)+i]=counter;
break;
}
float xTemp=x*x-y*y+ics[id];
y=2*x*y+ypsilon[i];
x=xTemp;
counter++;
}
}
}

Related

OpenCl cannot compile kernel with 'printf'

I am getting the error error: implicit declaration of function 'printf' is invalid in OpenCL when I try to build an OpenCL kernel. The kernel code is this
__kernel void conj_grad(int dim, int num_vals, __local float *r,
__local float *x, __local float* A_times_p, __local float *p,
__global int *rows, __global int *cols, __global float *A,
__global float *b, __global float *result) {
local float alpha, r_length, old_r_dot_r, new_r_dot_r;
local int iteration;
int id = get_local_id(0);
int start_index = -1;
int end_index = -1;
float Ap_dot_p;
printf("OpenCL Kernel ID: %d\n", id);
This gives me the error below
input.cl:14:4: error: implicit declaration of function 'printf' is invalid in OpenCL
input.cl:14:4: note: did you mean 'rint'?
/usr/include/clc/math/unary_decl.inc:1:39: note: 'rint' declared here
/usr/include/clc/math/rint.h:2:24: note: expanded from macro '__CLC_FUNCTION'
input.cl:46:45: warning: double precision constant requires cl_khr_fp64, casting to single precision
I am getting a negative return code from this function err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
I have already tried the solutions from these questions Using printf() in OpenCL kernel and printf function doesn't work in OpenCL kernel, and neither of these solutions fix it. When I try these solutions, I see a warning such as input.cl:1:26: warning: unknown OpenCL extension 'cl_amd_printf' - ignoring
Conclusion
Looks like my system does not support the printf extensions. The code below (stealing from pmdj's answer) gives me the following output. Looks like a classic story of don't depend on vendor specific extensions to a standard.
#include <stdio.h>
#include <CL/cl.h>
int main(void) {
char ext_str[1024] = "";
size_t ext_str_len = sizeof(ext_str) - 1;
cl_device_id device_id;
cl_int err;
cl_platform_id platform;
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(ext_str), ext_str, &ext_str_len);
if(err < 0) {
perror("Couldn't get device info");
exit(1);
}
printf("CL extensions (%lu): '%s'\n", ext_str_len, ext_str);
return 0;
}
CL extensions (248): 'cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp64 cl_khr_fp16'
As you've found, printf isn't part of standard OpenCL, but some implementations offer extensions which enable it.
To check for extensions supported by your implementation, try something like this:
char ext_str[1024] = "";
size_t ext_str_len = sizeof(ext_str) - 1;
err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(ext_str), ext_str, &ext_str_len);
printf("CL extensions (%lu): '%s'\n", ext_str_len, ext_str);
If there aren't any cl_*_printf extensions listed, you are probably out of luck. If one is listed, you'll need to enable it as described in the other answers you linked. You might also want to check the specification for the specific extension yours supports in case it exhibits any particular quirks.

OpenCL: Basic example not working. clSetKernelArg -38 Error

I am attempting a very simple OpenCL example. I have developed the following code below. It compiles a simple kernel, and then I create a simple float* buffer and set it to a cl::Buffer. However, when I attempt to call the kernel.setArg() function, it crashes, with an error -38. This error states that my cl::Buffer is invalid. I have no idea why this is happening:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#define MULTI_LINE_STRING(ARG) #ARG
namespace op
{
const char *resizeAndMergeKernel = MULTI_LINE_STRING(
__kernel void testKernel(__global float* image)
{
}
);
}
void testCL(){
cl::Device device;
cl::Context context;
cl::CommandQueue queue;
int deviceId = 0;
// Load Device
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
std::string deviceName;
cl_uint i, type;
cl::Platform::get(&platforms);
type = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
if( type == CL_SUCCESS)
{
// Get only relavent device
cl::Context allContext(devices);
std::vector<cl::Device> gpuDevices;
gpuDevices = allContext.getInfo<CL_CONTEXT_DEVICES>();
bool deviceFound = false;
for(int i=0; i<gpuDevices.size(); i++){
if(i == deviceId){
device = gpuDevices[i];
context = cl::Context(device);
queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
deviceFound = true;
cout << "Made new GPU Instance: " << deviceId << endl;
break;
}
}
if(!deviceFound)
{
throw std::runtime_error("Error: Invalid GPU ID");
}
}
// Create Kernel
cl::Program program = cl::Program(context, op::resizeAndMergeKernel, true);
cl::Kernel kernel = cl::Kernel(program, "testKernel");
// Simple Buffer
cl_int err;
float* test = new float[3*224*224];
cl::Buffer x = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * 3 * 224 * 224, (void*)test, &err);
cout << err << endl;
kernel.setArg(0,x); // CRASHES WITH cl::Error -38
}
As you can see the last line kernel.setArg(0,x) crashes with error -38.
It's not a "crash", it's an error code. OpenCL error -38 is CL_INVALID_MEM_OBJECT. It means the cl_mem_obj is not valid. It is because you are passing a cl::Buffer object to setArg, but you need to instead pass the cl_mem handle which represents that buffer. The cl::Buffer operator() method returns that. So use kernel.setArg(0,x()). Note the () are the added part (yes, it's subtle).

Why am I not getting I/O-compute overlap with this code?

The following program:
#include <iostream>
#include <array>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
__global__ void dummy(clock_value_t duration_in_cycles)
{
gpu_sleep(duration_in_cycles);
}
int main()
{
const clock_value_t duration_in_clocks = 1e7;
const size_t buffer_size = 5e7;
constexpr const auto num_streams = 2;
std::array<char*, num_streams> host_ptrs;
std::array<char*, num_streams> device_ptrs;
std::array<cudaStream_t, num_streams> streams;
for (auto i=0; i<num_streams; i++) {
cudaMallocHost(&host_ptrs[i], buffer_size);
cudaMalloc(&device_ptrs[i], buffer_size);
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaDeviceSynchronize();
for (auto i=0; i<num_streams; i++) {
cudaMemcpyAsync(device_ptrs[i], host_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
dummy<<<128, 128, 0, streams[i]>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptrs[i], device_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
}
for (auto i=0; i<num_streams; i++) { cudaStreamSynchronize(streams[i]); }
for (auto i=0; i<num_streams; i++) {
cudaFreeHost(host_ptrs[i]);
cudaFree(device_ptrs[i]);
}
}
should result in overlapping I/O and Compute between the work on the first and second streams: When the first stream's Host-to-Device ends, the first stream's kernel can start, but so can the second stream's Host-to-Device transfer. Instead, I get the following timeline, with no overlap:
I think I've covered my bases to ensure overlap. The streams are non-blocking (and indeed the enqueueing of work concludes well before the first HtoD does); the host memory is pinned... so what's missing for me to see overlap?
Using CUDA 8.0.61 on GNU/Linux Mint 18.2 with an NVIDIA GTX 650 Ti Boost. But the driver is v384.59.
Ok, it must be something with my GPU model, because with Fedora 25, and a GTX Titan X, I get:

clBuildProgram yields AccessViolationException when building this specific kernel

This is a part of some sort of parallel reduction/extremum kernel. I have reduced it to the minimum code that still gets clBuildProgram crashing (note that it really crashes, and doesn't just return an error code):
EDIT: It seems like this also happens when local_value is declared global instead of local.
EDIT2 / SOLUTION: The problem was that there was an infinite loop. I should have written remaining_items >>= 1 instead of remaining_items >> 1. As has been said in the answers, the nvidia compiler seems not very robust when it comes to compile/optimization errors.
kernel void testkernel(local float *local_value)
{
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1)
{
// throw away the right half of the threads
remaining_items >> 1; // <-- SPOTTED THE BUG
if (thread_id > remaining_items)
{
return;
}
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
{
local_value[thread_id] = right_value;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Removing the lines return; and/or local_value[thread_id] = right_value; causes clBuildProgram to finish successfully.
I can reproduce this problem on all of my computers (NVIDIA GTX 560, GT 555M, GT 540M, they're all Fermi 2.1 architecture). It's apparent on the NVIDIA CUDA Toolkit SDK versions 4.0, 4.1 and 4.2, when using either x64 or x86 libraries.
Does anyone have an idea what could be the problem?
Is it possible, that local (aka shared) memory is automatically assumed to be (WORK_GROUP_SIZE) * siezof(its_base_type)? That would explain why it works when the lines I mentioned above are removed.
Minimal host code (C99 compatible) for reproduction:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
int main(int argc, char **argv)
{
// Load the kernel source code into the array source_str
FILE *fp;
fp = fopen("testkernel.cl", "rb");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
int filesize = ftell(fp);
rewind(fp);
char *source_str = (char*)calloc(filesize, sizeof(char));
size_t bytes_read = fread(source_str, 1, filesize, fp);
source_str[bytes_read] = 0;
fclose(fp);
// Get platform information
cl_uint num_platforms;
RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));
cl_device_id selected_device_id = NULL;
printf("available platforms:\n");
for (cl_uint i = 0; i < num_platforms; i++)
{
char platform_name[50];
RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
printf("%s\n", platform_name);
// get devices for this platform
cl_uint num_devices;
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));
// select first nvidia device
if (strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
{
selected_device_id = device_ids[0];
}
}
if (selected_device_id == NULL)
{
printf("No NVIDIA device found\n");
exit(1);
}
// Create an OpenCL context
cl_context context;
REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));
// Create a program from the kernel source
cl_program program;
REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));
// Build the program
cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
if (ret)
{
printf("BUILD ERROR\n");
// build error - get build log and display it
size_t build_log_size;
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
char *build_log = new char[build_log_size];
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
printf("%s\n", build_log);
exit(1);
}
printf("build finished successfully\n");
return 0;
}
In my experience the nvidia compiler isn't very robust when it comes to handling build errors, so you probably have a compile error somewhere.
I think your problem is indeed the return, or more to the point its combination with barrier. According to the opencl spec about barriers:
All work-items in a work-group executing the kernel on a processor
must execute this function before any are allowed to continue
execution beyond the barrier. This function must be encountered by all
work-items in a work-group executing the kernel.
If barrier is inside a conditional statement, then all work-items must enter the
onditional if any work-item enters the conditional statement and
executes the barrier.
If barrer is inside a loop, all work-items
must execute the barrier for each iteration of the loop before any are
allowed to continue execution beyond the barrier.
So I think your problem is probably that a lot of threads would return before getting to the barrier, making this code invalid. Maybe you should try something like this:
kernel void testkernel(local float *local_value) {
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1) {
remaining_items >>= 1;// throw away the right half of the threads
if (thread_id <= remaining_items) {
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
local_value[thread_id] = right_value;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Edit: Furthermore as noted in the comments it needs to be remaining_items>>=1 instead of remaining_items>>1 in order to avoid producing an infinite loop.

How to create a Queue of unsigned char array in Qt?

I am new in Queue (FIFO) and Qt. I want to create a Queue of unsigned char array in Qt. How to do it? Please help
unsigned char buffer[1024];
If you want to use the Qt API, then you can use the QQueue class -
QQueue<unsigned char> queue;
queue.enqueue(65);
queue.enqueue(66);
queue.enqueue(67);
while (!queue.isEmpty())
cout << queue.dequeue() << endl;
If you want to build the queue on your own, then I guess you can declare a Queue class like this -
class Queue
{
private:
enum{SIZE=1024, EMPTY=0};
unsigned char buffer[SIZE];
int readHead, writeHead;
public:
Queue()
{
readHead = writeHead = EMPTY;
}
void push(unsigned char data);
unsigned char pop();
unsigned char peek();
bool isEmpty();
};
void Queue::push(unsigned char data)
{
if((readHead - writeHead) >= SIZE)
{
// You should handle Queue overflow the way you want here.
return;
}
buffer[writeHead++ % SIZE] = data;
}
unsigned char Queue::pop()
{
unsigned char item = peek();
readHead++;
return item;
}
unsigned char Queue::peek()
{
if(isEmpty())
{
// You should handle Queue underflow the way you want here.
return;
}
return buffer[readHead % SIZE];
}
bool Queue::isEmpty()
{
return (readHead == writeHead);
}
If you want to maintain a Queue of unsigned char array, then you will have to maintain a queue of unsigned char pointers -
QQueue<unsigned char *> queue;
unsigned char *array1 = new unsigned char[10]; // array of 10 items
array1[0] = 65;
array1[1] = 66;
queue.enqueue(array1);
unsigned char *array2 = new unsigned char[20]; // an array of 20 items
queue.enqueue(array2);
unsigned char *arr = queue.dequeue();
qDebug() << arr[0] << ", " << arr[1];
Note: You should take care of the memory cleanup after you are done with this queue. IMHO, you better avoid this type of design though.

Resources