I am writing a path tracer for the GPU using CUDA 10.2. The entire program ran fine until I added a recursive call to the trace function. nvcc still compiles it, although with the warning "Stack size for entry function '' cannot be statically determined". When the GPU reaches that point it stops, and the next time the CPU gets a cudaError from an API call it is CUDA error 715, which is cudaErrorIllegalInstruction. I tried recreating the issue by writing another recursive kernel/function pair; the compiler gave the same warning, but it executed as expected. Unfortunately this means I have to dump my entire function here (if there are any questions about the functions and types used, I will happily answer them):
__device__ Vec3 trace(
    const Settings& settings,
    const Ray& r,
    const Shape* shapes,
    const size_t nshapes,
    uint8_t bounces,
    curandState& randState) {
    if (bounces >= settings.maxBounces) {
        return Vec3(0.0f);
    }
    const Shape* shape = nullptr;
    float t = inf;
    bool flipNormal;
    float dist;
    for (size_t i = 0; i < nshapes; i++) {
        if (shapes[i].intersect(r, dist, flipNormal) && dist < t) {
            shape = shapes + i;
            t = dist;
        }
    }
    if (shape == nullptr)
        return settings.background;
    const Vec3 hitPos = r.ori + t * r.dir;
    const Vec3 normal = flipNormal ? -shape->normal(hitPos) : shape->normal(hitPos);
    const Vec3 hemiDir = cosineSample(normal, randState);
    const Vec3 traceCol = trace(
        settings,
        Ray(hitPos + normal * settings.bias, hemiDir),
        shapes,
        nshapes,
        bounces + 1,
        randState
    );
    return shape->surface.emittance + shape->surface.color * traceCol;
}
Has anyone else had this issue, and if so, how was it fixed? I could probably redesign this to be non-recursive, although that wouldn't be an optimal solution.
I don't even know where to start debugging this issue, so any ideas are greatly appreciated.
The problem is that CUDA usually selects a fitting maximum stack size for a kernel launch, but it is unable to here because nvcc cannot statically predict the necessary stack size for a recursive function. Here is a minimal kernel/function pair that reproduces the behaviour:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdint.h>
__device__ int recurse(uint64_t n, uint64_t max) {
if (n < max)
return recurse(n + 1, max);
else
return n;
}
__global__ void start(uint64_t max) {
uint32_t idx = threadIdx.x + (blockIdx.x * blockDim.x);
if(idx == 256 * 256 - 1)
printf("%i: %i\n", idx, recurse(0, max));
return;
}
int main() {
cudaError_t status;
status = cudaSetDevice(0);
if (status != cudaSuccess) {
std::cerr << "failed: " << cudaGetErrorString(status) << std::endl;
return status;
}
cudaThreadSetLimit(cudaLimitStackSize, 2048);
start<<<256, 256>>>(126);
status = cudaDeviceSynchronize();
if (status != cudaSuccess) {
std::cerr << "failed: " << cudaGetErrorString(status) << std::endl;
return status;
}
return 0;
}
This program runs, but if 2048 is replaced with 1024, it fails with cudaErrorIllegalInstruction, presumably because the 126 recursive stack frames no longer fit into 1024 bytes of per-thread stack.
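So if you keep the recursion, the practical fix is to raise cudaLimitStackSize yourself before the first launch, sized to the worst-case recursion depth. A minimal sketch of how that could look for the path tracer; the bytes-per-frame figure is an assumption you would read off from the ptxas output (nvcc --ptxas-options=-v) for your actual trace function:

#include "cuda_runtime.h"
#include <iostream>

// Sketch only: bytesPerFrame and maxDepth are assumptions, not measured values.
int main() {
    const size_t bytesPerFrame = 256; // assumed stack bytes per trace() frame (read from ptxas -v)
    const size_t maxDepth = 16;       // assumed value of settings.maxBounces
    size_t defaultSize = 0;
    cudaDeviceGetLimit(&defaultSize, cudaLimitStackSize);
    std::cout << "default per-thread stack: " << defaultSize << " bytes\n";
    // Double the estimate for slack; the limit applies per resident thread,
    // so it costs device memory proportional to occupancy.
    cudaError_t status = cudaDeviceSetLimit(cudaLimitStackSize, 2 * maxDepth * bytesPerFrame);
    if (status != cudaSuccess)
        std::cerr << "failed: " << cudaGetErrorString(status) << std::endl;
    // ... launch the path-tracing kernel here ...
    return 0;
}

That said, since settings.maxBounces already bounds the depth, rewriting trace as a loop that accumulates the emittance/color products iteratively would sidestep the stack-size question entirely.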
I want to parallelize a function, and I have the problem that after a few hours my memory is exhausted.
The test program calculates something simple and works so far; only the memory usage is constantly increasing.
Qt project file:
QT -= gui
QT += concurrent widgets
CONFIG += c++11 console
CONFIG -= app_bundle
DEFINES += QT_DEPRECATED_WARNINGS
SOURCES += main.cpp
Qt program file (main.cpp):
#include <QCoreApplication>
#include <qdebug.h>
#include <qtconcurrentrun.h>
double parallel_function(int instance){
    return (double)(instance)*10.0;
}

int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);
    int nr_of_threads = 8;
    double result_sum = 0.0, temp_var = 0.0; // initialized to avoid summing garbage
    for(qint32 i = 0; i < 100000000; i++){
        QFuture<double> *future = new QFuture<double>[nr_of_threads];
        for(int thread = 0; thread < nr_of_threads; thread++){
            future[thread] = QtConcurrent::run(parallel_function, thread);
        }
        for(int thread = 0; thread < nr_of_threads; thread++){
            future[thread].waitForFinished();
            temp_var = future[thread].result();
            qDebug() << "result: " << temp_var;
            result_sum += temp_var;
        }
    }
    qDebug() << "total: " << result_sum;
    return a.exec();
}
As I have observed, QtConcurrent::run(parallel_function, thread) allocates memory, but does not release it after future[thread].waitForFinished().
What's wrong here?
You have a memory leak because the future array is never deleted. Add delete[] future at the end of the outer for loop:
for(qint32 i = 0; i < 100000000; i++)
{
    QFuture<double> *future = new QFuture<double>[nr_of_threads];
    for(int thread = 0; thread < nr_of_threads; thread++){
        future[thread] = QtConcurrent::run(parallel_function, thread);
    }
    for(int thread = 0; thread < nr_of_threads; thread++){
        future[thread].waitForFinished();
        temp_var = future[thread].result();
        qDebug() << "result: " << temp_var;
        result_sum += temp_var;
    }
    delete[] future; // <--
}
Here's how this might look - note how much simpler everything can be! You're dead set on doing manual memory management: why? First of all, QFuture is a value. You can store it very efficiently in any vector container that will manage the memory for you. You can iterate such a container using range-for. Etc.
QT = concurrent # dependencies are automatic, you don't use widgets
CONFIG += c++14 console
CONFIG -= app_bundle
SOURCES = main.cpp
Even though the example is synthetic and the map_function is very simple, it's worth considering how to do things most efficiently and expressively. Your algorithm is a typical map-reduce operation, and blockingMappedReduced has half the overhead of doing all of the work manually.
First of all, let's recast the original problem in C++, instead of some C-with-pluses Frankenstein.
// https://github.com/KubaO/stackoverflown/tree/master/questions/future-ranges-49107082
/* QtConcurrent will include QtCore as well */
#include <QtConcurrent>
#include <algorithm>
#include <iterator>
using result_type = double;
static result_type map_function(int instance){
    return instance * result_type(10);
}
static void sum_modifier(result_type &result, result_type value) {
    result += value;
}
static result_type sum_function(result_type result, result_type value) {
    return result + value;
}
result_type sum_approach1(int const N) {
    QVector<QFuture<result_type>> futures(N);
    int id = 0;
    for (auto &future : futures)
        future = QtConcurrent::run(map_function, id++);
    return std::accumulate(futures.cbegin(), futures.cend(), result_type{}, sum_function);
}
There is no manual memory management, and no explicit splitting into "threads" - that was pointless, since the concurrent execution platform is aware of how many threads there are. So this is already better!
But this seems quite wasteful: each future internally allocates at least once (!).
Instead of using futures explicitly for each result, we can use the map-reduce framework. To generate the sequence, we can define an iterator that provides the integers we wish to work on. The iterator can be a forward or a bidirectional one, and its implementation is the bare minimum needed by QtConcurrent framework.
#include <iterator>
template <typename tag> class num_iterator : public std::iterator<tag, int, int, const int*, int> {
    int num = 0;
    using self = num_iterator;
    using base = std::iterator<tag, int, int, const int*, int>;
public:
    explicit num_iterator(int num = 0) : num(num) {}
    self &operator++() { num++; return *this; }
    self &operator--() { num--; return *this; }
    self &operator+=(typename base::difference_type d) { num += d; return *this; }
    friend typename base::difference_type operator-(self lhs, self rhs) { return lhs.num - rhs.num; }
    bool operator==(self o) const { return num == o.num; }
    bool operator!=(self o) const { return !(*this == o); }
    typename base::reference operator*() const { return num; }
};
using num_f_iterator = num_iterator<std::forward_iterator_tag>;
result_type sum_approach2(int const N) {
    auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_f_iterator{0}, num_f_iterator{N}, map_function);
    return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
using num_b_iterator = num_iterator<std::bidirectional_iterator_tag>;
result_type sum_approach3(int const N) {
    auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_b_iterator{0}, num_b_iterator{N}, map_function);
    return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
Could we drop the std::accumulate and use blockingMappedReduced instead? Sure:
result_type sum_approach4(int const N) {
    return QtConcurrent::blockingMappedReduced(num_b_iterator{0}, num_b_iterator{N},
                                               map_function, sum_modifier);
}
We can also try a random access iterator:
using num_r_iterator = num_iterator<std::random_access_iterator_tag>;
result_type sum_approach5(int const N) {
    return QtConcurrent::blockingMappedReduced(num_r_iterator{0}, num_r_iterator{N},
                                               map_function, sum_modifier);
}
Finally, we can switch from using range-generating iterators, to a precomputed range:
#include <numeric>
result_type sum_approach6(int const N) {
    QVector<int> sequence(N);
    std::iota(sequence.begin(), sequence.end(), 0);
    return QtConcurrent::blockingMappedReduced(sequence, map_function, sum_modifier);
}
Of course, our point is to benchmark it all:
template <typename F> void benchmark(F fun, double const N) {
    QElapsedTimer timer;
    timer.start();
    auto result = fun(N);
    qDebug() << "sum:" << fixed << result << "took" << timer.elapsed()/N << "ms/item";
}
int main() {
    const int N = 1000000;
    benchmark(sum_approach1, N);
    benchmark(sum_approach2, N);
    benchmark(sum_approach3, N);
    benchmark(sum_approach4, N);
    benchmark(sum_approach5, N);
    benchmark(sum_approach6, N);
}
On my system, in release build, the output is:
sum: 4999995000000.000000 took 0.015778 ms/item
sum: 4999995000000.000000 took 0.003631 ms/item
sum: 4999995000000.000000 took 0.003610 ms/item
sum: 4999995000000.000000 took 0.005414 ms/item
sum: 4999995000000.000000 took 0.000011 ms/item
sum: 4999995000000.000000 took 0.000008 ms/item
Note how using map-reduce on a random-iterable sequence has over 3 orders of magnitude lower overhead than using QtConcurrent::run, and is 2 orders of magnitude faster than non-random-iterable solutions.
I am attempting a very simple OpenCL example. I have developed the code below. It compiles a simple kernel, then I create a simple float* buffer and wrap it in a cl::Buffer. However, when I attempt to call the kernel.setArg() function, it crashes with error -38. This error states that my cl::Buffer is invalid. I have no idea why this is happening:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#include <iostream> // added: needed for the cout/endl calls below
using namespace std;

#define MULTI_LINE_STRING(ARG) #ARG

namespace op
{
    const char *resizeAndMergeKernel = MULTI_LINE_STRING(
        __kernel void testKernel(__global float* image)
        {
        }
    );
}
void testCL(){
    cl::Device device;
    cl::Context context;
    cl::CommandQueue queue;
    int deviceId = 0;

    // Load Device
    std::vector<cl::Platform> platforms;
    std::vector<cl::Device> devices;
    std::string deviceName;
    cl_uint i, type;
    cl::Platform::get(&platforms);
    type = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if( type == CL_SUCCESS)
    {
        // Get only relevant device
        cl::Context allContext(devices);
        std::vector<cl::Device> gpuDevices;
        gpuDevices = allContext.getInfo<CL_CONTEXT_DEVICES>();
        bool deviceFound = false;
        for(int i = 0; i < gpuDevices.size(); i++){
            if(i == deviceId){
                device = gpuDevices[i];
                context = cl::Context(device);
                queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
                deviceFound = true;
                cout << "Made new GPU Instance: " << deviceId << endl;
                break;
            }
        }
        if(!deviceFound)
        {
            throw std::runtime_error("Error: Invalid GPU ID");
        }
    }

    // Create Kernel
    cl::Program program = cl::Program(context, op::resizeAndMergeKernel, true);
    cl::Kernel kernel = cl::Kernel(program, "testKernel");

    // Simple Buffer
    cl_int err;
    float* test = new float[3*224*224];
    cl::Buffer x = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * 3 * 224 * 224, (void*)test, &err);
    cout << err << endl;
    kernel.setArg(0, x); // CRASHES WITH cl::Error -38
}
As you can see the last line kernel.setArg(0,x) crashes with error -38.
It's not a "crash", it's an error code. OpenCL error -38 is CL_INVALID_MEM_OBJECT. It means the cl_mem_obj is not valid. It is because you are passing a cl::Buffer object to setArg, but you need to instead pass the cl_mem handle which represents that buffer. The cl::Buffer operator() method returns that. So use kernel.setArg(0,x()). Note the () are the added part (yes, it's subtle).
I am building a simple "hello triangle" program to start with OpenGL ES 2.0 development, and I am stuck with this tricky error: it says that it cannot link the shaders. I have tested shader compilation in RenderMonkey and it is OK, but in my actual application it fails to link.
void COpenGLWidget::initializeGL()
{
    const size_t nMaxLength = 255;
    glClearColor(1.0f, 0.0f, 0.0, 1.0f);
    glEnable(GL_DEPTH_TEST);
    glEnable(GL_CULL_FACE);
    char lpszVertexBuffer[][nMaxLength] =
    {
        "uniform mat4 g_MatViewProjection;\n",
        "attribute vec4 rm_Vertex;\n",
        "void main(void)\n",
        "{\n",
        "gl_Position = rm_Vertex;\n",
        "}"
    };
    char lpszFragmentBuffer[][nMaxLength] =
    {
        "precision mediump float; \n",
        "void main(void)\n",
        "{\n",
        "gl_FragColor = vec4(1.0, 0.0, 0.0, 1.0 );\n",
        "}\n"
    };
    m_nVertexShader = glCreateShader(GL_VERTEX_SHADER);
    m_nPixelShader = glCreateShader(GL_FRAGMENT_SHADER);
    int iVertexShaderLength = 6;
    int iPixelShaderLength = 5;
    glShaderSource(m_nVertexShader, 1, (const char**)lpszVertexBuffer, &iVertexShaderLength);
    glShaderSource(m_nPixelShader, 1, (const char**)lpszFragmentBuffer, &iPixelShaderLength);
    glCompileShader(m_nVertexShader);
    int iIsOk = 0;
    glGetShaderiv(m_nVertexShader, GL_COMPILE_STATUS, &iIsOk);
    if(!iIsOk)
    {
        GLint infoLen = 0;
        glGetShaderiv(m_nVertexShader, GL_INFO_LOG_LENGTH, &infoLen);
        if(infoLen > 1)
        {
            char* infoLog = (char*)malloc(sizeof(char) * infoLen);
            glGetShaderInfoLog(m_nVertexShader, infoLen, NULL, infoLog);
            QMessageBox::warning(this, QString("Error"),
                QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
            free(infoLog);
        }
        glDeleteShader(m_nVertexShader);
        return;
    }
    glCompileShader(m_nPixelShader);
    glGetShaderiv(m_nPixelShader, GL_COMPILE_STATUS, &iIsOk);
    if(!iIsOk)
    {
        GLint infoLen = 0;
        glGetShaderiv(m_nPixelShader, GL_INFO_LOG_LENGTH, &infoLen);
        if(infoLen > 1)
        {
            char* infoLog = (char*)malloc(sizeof(char) * infoLen);
            glGetShaderInfoLog(m_nPixelShader, infoLen, NULL, infoLog);
            QMessageBox::warning(this, QString("Error"),
                QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
            free(infoLog);
        }
        glDeleteShader(m_nPixelShader);
        return;
    }
    m_nProgram = glCreateProgram();
    glAttachShader(m_nProgram, m_nVertexShader);
    glAttachShader(m_nProgram, m_nPixelShader);
    glBindAttribLocation(m_nProgram, 0, "rm_Vertex");
    glLinkProgram(m_nProgram);
    glGetProgramiv(m_nProgram, GL_LINK_STATUS, &iIsOk);
    // Fail to pass status validation
    if(!iIsOk)
    {
        GLint infoLen = 0;
        glGetProgramiv(m_nProgram, GL_INFO_LOG_LENGTH, &infoLen);
        if(infoLen > 1)
        {
            char* infoLog = (char*)malloc(sizeof(char) * infoLen);
            glGetProgramInfoLog(m_nProgram, infoLen, NULL, infoLog);
            QMessageBox::warning(this, QString("Error"),
                QString(infoLog), QMessageBox::Yes | QMessageBox::Cancel, QMessageBox::Yes);
            free(infoLog);
        }
        glDeleteProgram(m_nProgram);
        return;
    }
    glUseProgram(m_nProgram);
    connect(&m_Timer, SIGNAL(timeout()), this, SLOT(update()));
    m_Timer.start(1);
}
You got the data structures for the shader source code totally wrong. glShaderSource accepts an array of pointers to char, so a sequence of strings. You store your shader source code in a 2-dimensional array of char. Contrary to a somewhat common myth, arrays are not pointers in C/C++.
Furthermore, you are telling the GL that there is a single string with a length of 6 characters (5 for the fragment shader, respectively): the length parameter counts chars per string, not lines. The shader compiler only sees the very first part of your source code, and hence reports that it can't find a main function.
It is unclear why you even try to split your shader sources into several strings. You do not get any benefit from doing that as long as you don't recombine the different bits and pieces. I suggest you just use a single string, so something like
const char *source =
    "uniform mat4 g_MatViewProjection;\n"
    "attribute vec4 rm_Vertex;\n"
    "void main(void)\n"
    "{\n"
    "gl_Position = rm_Vertex;\n"
    "}";
In C/C++, you can concatenate string literals by simply writing them after each other, and this also works across lines.
Then, you can simply take the address of your source pointer and feed it into the GL:
glShaderSource(shaderName, 1, &source, NULL);
There is also no need to pre-calculate any lengths; the GL will handle 0-terminated C strings just as well.
If you really want to go the route of different source strings, I strongly recommend you learn the basics of arrays, pointers, arrays of pointers and multidimensional arrays first; a sketch of that variant follows below.
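For illustration only - a minimal sketch of the multi-string variant, assuming the member names from the question are kept. Each array element is a separate 0-terminated string, and passing NULL for the lengths lets the GL find the terminators itself:

// One pointer per source string; this is what glShaderSource actually expects.
const char *vertexSources[] = {
    "uniform mat4 g_MatViewProjection;\n",
    "attribute vec4 rm_Vertex;\n",
    "void main(void)\n",
    "{\n",
    "gl_Position = rm_Vertex;\n",
    "}"
};
// Count the strings instead of hard-coding a magic number.
const GLsizei count = sizeof(vertexSources) / sizeof(vertexSources[0]);
glShaderSource(m_nVertexShader, count, vertexSources, NULL);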
This is a part of some sort of parallel reduction/extremum kernel. I have reduced it to the minimum code that still gets clBuildProgram crashing (note that it really crashes, and doesn't just return an error code):
EDIT: It seems like this also happens when local_value is declared global instead of local.
EDIT2 / SOLUTION: The problem was that there was an infinite loop. I should have written remaining_items >>= 1 instead of remaining_items >> 1. As has been said in the answers, the nvidia compiler does not seem very robust when it comes to compile/optimization errors.
kernel void testkernel(local float *local_value)
{
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1)
    {
        // throw away the right half of the threads
        remaining_items >> 1; // <-- SPOTTED THE BUG
        if (thread_id > remaining_items)
        {
            return;
        }
        // look for a greater value in the right half of the memory space
        int right_index = thread_id + remaining_items;
        float right_value = local_value[right_index];
        if (right_value > local_value[thread_id])
        {
            local_value[thread_id] = right_value;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
Removing the lines return; and/or local_value[thread_id] = right_value; causes clBuildProgram to finish successfully.
I can reproduce this problem on all of my computers (NVIDIA GTX 560, GT 555M, GT 540M; they're all Fermi 2.1 architecture). It occurs with NVIDIA CUDA Toolkit SDK versions 4.0, 4.1 and 4.2, using either the x64 or x86 libraries.
Does anyone have an idea what could be the problem?
Is it possible that local (aka shared) memory is automatically assumed to be (WORK_GROUP_SIZE) * sizeof(its_base_type)? That would explain why it works when the lines I mentioned above are removed.
Minimal host code (C99 compatible) for reproduction:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
int main(int argc, char **argv)
{
    // Load the kernel source code into the array source_str
    FILE *fp;
    fp = fopen("testkernel.cl", "rb");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    int filesize = ftell(fp);
    rewind(fp);
    char *source_str = (char*)calloc(filesize + 1, sizeof(char)); // +1 for the 0 terminator
    size_t bytes_read = fread(source_str, 1, filesize, fp);
    source_str[bytes_read] = 0;
    fclose(fp);

    // Get platform information
    cl_uint num_platforms;
    RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
    cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
    RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));

    cl_device_id selected_device_id = NULL;
    printf("available platforms:\n");
    for (cl_uint i = 0; i < num_platforms; i++)
    {
        char platform_name[50];
        RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
        printf("%s\n", platform_name);

        // get devices for this platform
        cl_uint num_devices;
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
        cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));

        // select first nvidia device
        if (strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
        {
            selected_device_id = device_ids[0];
        }
    }
    if (selected_device_id == NULL)
    {
        printf("No NVIDIA device found\n");
        exit(1);
    }

    // Create an OpenCL context
    cl_context context;
    REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));

    // Create a program from the kernel source
    cl_program program;
    REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));

    // Build the program
    cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
    if (ret)
    {
        printf("BUILD ERROR\n");
        // build error - get build log and display it
        size_t build_log_size;
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        char *build_log = (char *)malloc(build_log_size); // malloc, not new, to stay C99 compatible
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
        printf("%s\n", build_log);
        exit(1);
    }
    printf("build finished successfully\n");
    return 0;
}
In my experience the nvidia compiler isn't very robust when it comes to handling build errors, so you probably have a compile error somewhere.
I think your problem is indeed the return, or more to the point its combination with barrier. According to the OpenCL spec about barriers:
All work-items in a work-group executing the kernel on a processor must execute this function before any are allowed to continue execution beyond the barrier. This function must be encountered by all work-items in a work-group executing the kernel.
If barrier is inside a conditional statement, then all work-items must enter the conditional if any work-item enters the conditional statement and executes the barrier.
If barrier is inside a loop, all work-items must execute the barrier for each iteration of the loop before any are allowed to continue execution beyond the barrier.
So I think your problem is probably that a lot of threads would return before getting to the barrier, making this code invalid. Maybe you should try something like this:
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        remaining_items >>= 1; // throw away the right half of the threads
        if (thread_id <= remaining_items) {
            // look for a greater value in the right half of the memory space
            int right_index = thread_id + remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id])
                local_value[thread_id] = right_value;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
Edit: Furthermore, as noted in the comments, it needs to be remaining_items >>= 1 instead of remaining_items >> 1 in order to avoid producing an infinite loop.
I am continuously getting an Access Violation Error with all the kernels I am trying to build. Other kernels which I take from books seem to work fine.
https://github.com/ssarangi/VideoCL - This is where the code is.
Something seems to be missing in this. Could someone help me with this?
Thanks so much.
[James] - Thanks for the suggestion, and you are right. I am doing it on Win 7 with an AMD Redwood card. I have the Catalyst 11.7 drivers with AMD APP SDK 2.5. I am posting the code below.
#include <iostream>
#include "bmpfuncs.h"
#include "CLManager.h"
void main()
{
    float theta = 3.14159f/6.0f;
    int W;
    int H;
    const char* inputFile = "input.bmp";
    const char* outputFile = "output.bmp";
    float* ip = readImage(inputFile, &W, &H);
    float* op = new float[W*H];

    // We assume that the input image is the array "ip"
    // and the angle of rotation is theta
    float cos_theta = cos(theta);
    float sin_theta = sin(theta);

    try
    {
        CLManager* clMgr = new CLManager();
        // Build the Source
        unsigned int pgmID = clMgr->buildSource("rotation.cl");
        // Create the kernel
        cl::Kernel* kernel = clMgr->makeKernel(pgmID, "img_rotate");
        // Create the memory Buffers
        cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
        cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
        // Get the command Queue
        cl::CommandQueue* queue = clMgr->getCmdQueue();
        queue->enqueueWriteBuffer(*clIp, CL_TRUE, 0, W*H*sizeof(float), ip);
        // Set the arguments to the kernel
        kernel->setArg(0, clOp);
        kernel->setArg(1, clIp);
        kernel->setArg(2, W);
        kernel->setArg(3, H);
        kernel->setArg(4, sin_theta);
        kernel->setArg(5, cos_theta);
        // Run the kernel on specific NDRange
        cl::NDRange globalws(W, H);
        queue->enqueueNDRangeKernel(*kernel, cl::NullRange, globalws, cl::NullRange);
        queue->enqueueReadBuffer(*clOp, CL_TRUE, 0, W*H*sizeof(float), op);
        storeImage(op, outputFile, H, W, inputFile);
    }
    catch(cl::Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }
}
I am getting the error at the queue->enqueueNDRangeKernel line.
I have the queue and the kernel stored in a class.
CLManager::CLManager()
    : m_programIDs(-1)
{
    // Initialize the Platform
    cl::Platform::get(&m_platforms);

    // Create a Context
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(m_platforms[0])(),
        0
    };
    m_context = cl::Context(CL_DEVICE_TYPE_GPU, cps);

    // Get a list of devices on this platform
    m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();
    cl_int err;
    m_queue = new cl::CommandQueue(m_context, m_devices[0], 0, &err);
}

cl::Kernel* CLManager::makeKernel(unsigned int programID, std::string kernelName)
{
    cl::CommandQueue queue = cl::CommandQueue(m_context, m_devices[0]);
    cl::Kernel* kernel = new cl::Kernel(*(m_programs[programID]), kernelName.c_str());
    m_kernels.push_back(kernel);
    return kernel;
}
I checked your code. I'm on Linux though. At runtime I'm getting Error -38, which means CL_INVALID_MEM_OBJECT. So I went and checked your buffers.
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
Then you pass the buffers as a pointer:
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
But setArg is expecting a value, so the buffer pointers should be dereferenced:
kernel->setArg(0, *clOp);
kernel->setArg(1, *clIp);
After those changes the cat rotates ;)
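As an aside (my suggestion, not part of the fix above): cl::Buffer is a reference-counted handle around cl_mem, so there is little reason to heap-allocate it in the first place. Holding the buffers by value avoids both the pointer/value confusion and the leak. A sketch, assuming a hypothetical getContext() accessor on CLManager:

// Copies of cl::Buffer just retain the same underlying cl_mem, so value
// semantics are cheap here. getContext() is assumed, not in the question.
cl::Buffer clIp(clMgr->getContext(), CL_MEM_READ_ONLY,  W*H*sizeof(float));
cl::Buffer clOp(clMgr->getContext(), CL_MEM_READ_WRITE, W*H*sizeof(float));
kernel->setArg(0, clOp); // value passed directly - no dereference needed
kernel->setArg(1, clIp);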