I've created a simple kernel in OpenCL:
__kernel void test(int x){
    printf("test\n");
}
I'm calling it in a very simple program:
int main(){
    cl_int err;
    cl_int i = 0;
    cl_kernel testKernel;

    CL cl = initCL(1, (const char*[2]){"kernel.c"});
    testKernel = clCreateKernel(cl.program, "test", &err);
    clSetKernelArg(testKernel, 0, sizeof(cl_int), &i);
    clEnqueueTask(cl.queue, testKernel, 0, NULL, NULL);
    clEnqueueTask(cl.queue, testKernel, 0, NULL, NULL);
    clEnqueueTask(cl.queue, testKernel, 0, NULL, NULL);
    clFinish(cl.queue);
    return 0;
}
I was expecting "test" to be printed 3 times, but instead, the output of that program is:
test
test
test
test
test
test
test
test
test
test
Why?
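One way to narrow this down (a diagnostic sketch, not a fix, reusing cl.queue and testKernel from the code above) is to enqueue each launch as an explicit one-work-item NDRange and tag every line with the kernel argument and work-item ID, so you can tell whether the extra lines come from extra work-items or from the launches themselves:
/* Diagnostic sketch only: one explicit work-item per enqueue, each launch
   tagged via the kernel's int argument. Assumes the cl.queue and testKernel
   objects from the question. */
size_t global = 1, local = 1;
for (cl_int launch = 0; launch < 3; launch++) {
    clSetKernelArg(testKernel, 0, sizeof(cl_int), &launch);
    clEnqueueNDRangeKernel(cl.queue, testKernel, 1, NULL, &global, &local, 0, NULL, NULL);
    clFinish(cl.queue); /* also flushes the printf buffer between launches */
}
With the kernel changed to printf("test x=%d gid=%d\n", x, (int)get_global_id(0)), each output line then identifies its launch and its work-item.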
My use case is a main process that compiles a bunch of OpenCL kernels. Later in the program, several subprocesses are forked that each execute one or more of the kernels. For some reason the subprocesses hit device errors.
I have determined the problem has nothing to do with compilation and have reproduced it with the following simple script:
import multiprocessing
import pyopencl as cl
def printme():
    platforms = cl.get_platforms()
    for p in platforms:
        print 75*'!'
        print p
        print 75*':'

printme()
p = multiprocessing.Process(target = printme)
p.start()
p.join()
This seems to be tied to the NVIDIA OpenCL implementation and not related to PyOpenCL, as I initially thought. The same problem occurs with the C example below.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <CL/cl.h>

#define CHECK(A) if ((status = A) != 0) { \
    fprintf(stderr, "failed status %d at line %d\n", status, __LINE__); \
    exit(1); \
}

static void
runit() {
    int i;
    cl_int status;
    cl_platform_id *platforms;
    cl_uint num_platforms;
    cl_uint num_devices;

    CHECK(clGetPlatformIDs(0, NULL, &num_platforms));
    if (num_platforms == 0) {
        fprintf(stderr, "no platforms\n");
        exit(1);
    }
    platforms = malloc(sizeof(cl_platform_id)*num_platforms);
    CHECK(clGetPlatformIDs(num_platforms, platforms, NULL));
    CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL,
                         &num_devices));
    free(platforms);
}

int main(void) {
    runit();
    if (fork() == 0) {
        runit();
    }
    else {
        wait(NULL);
    }
    return 0;
}
It exits with failed status -33 (CL_INVALID_DEVICE). There seems to be some stored state inside the implementation. Note this is running on an NVIDIA driver.
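For what it's worth, the usual workaround for this class of problem (it does not change the driver behaviour, just sidesteps it) is to avoid touching OpenCL before fork(), so that every process builds its platform/device/context state from scratch. A minimal sketch, assuming the same runit() as in the example above:
/* Workaround sketch: fork FIRST, then let each process do its own OpenCL
   initialization. Relies on the runit() defined in the example above. */
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
    pid_t pid = fork();      /* no OpenCL calls have been made yet */
    if (pid == 0) {
        runit();             /* child initializes its own OpenCL state */
        _exit(0);
    }
    runit();                 /* parent initializes its own OpenCL state */
    wait(NULL);
    return 0;
}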
I am attempting a very simple OpenCL example with the code below. It compiles a simple kernel, and then I create a plain float* buffer and wrap it in a cl::Buffer. However, when I call kernel.setArg(), it crashes with error -38, which states that my cl::Buffer is invalid. I have no idea why this is happening:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#include <iostream>
using namespace std;

#define MULTI_LINE_STRING(ARG) #ARG

namespace op
{
    const char *resizeAndMergeKernel = MULTI_LINE_STRING(
        __kernel void testKernel(__global float* image)
        {
        }
    );
}

void testCL(){
    cl::Device device;
    cl::Context context;
    cl::CommandQueue queue;
    int deviceId = 0;

    // Load Device
    std::vector<cl::Platform> platforms;
    std::vector<cl::Device> devices;
    std::string deviceName;
    cl_uint i, type;
    cl::Platform::get(&platforms);
    type = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if( type == CL_SUCCESS)
    {
        // Get only the relevant device
        cl::Context allContext(devices);
        std::vector<cl::Device> gpuDevices;
        gpuDevices = allContext.getInfo<CL_CONTEXT_DEVICES>();
        bool deviceFound = false;
        for(int i=0; i<gpuDevices.size(); i++){
            if(i == deviceId){
                device = gpuDevices[i];
                context = cl::Context(device);
                queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
                deviceFound = true;
                cout << "Made new GPU Instance: " << deviceId << endl;
                break;
            }
        }
        if(!deviceFound)
        {
            throw std::runtime_error("Error: Invalid GPU ID");
        }
    }

    // Create Kernel
    cl::Program program = cl::Program(context, op::resizeAndMergeKernel, true);
    cl::Kernel kernel = cl::Kernel(program, "testKernel");

    // Simple Buffer
    cl_int err;
    float* test = new float[3*224*224];
    cl::Buffer x = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * 3 * 224 * 224, (void*)test, &err);
    cout << err << endl;
    kernel.setArg(0,x); // CRASHES WITH cl::Error -38
}
As you can see the last line kernel.setArg(0,x) crashes with error -38.
It's not a "crash", it's an error code. OpenCL error -38 is CL_INVALID_MEM_OBJECT, which means the cl_mem object is not valid. It happens because you are passing the cl::Buffer object to setArg, but you need to pass the cl_mem handle that represents the buffer instead. The cl::Buffer operator() method returns exactly that, so use kernel.setArg(0, x()). Note the () is the added part (yes, it's subtle).
In the code given below, d_slot is a double pointer initialized to NULL in main. Its value is changed in the kernel test1. The code I am going to implement requires the value of d_slot to be carried forward, not reverted back to NULL, as happens after test1 completes (presumably because the double pointer is passed by value rather than by reference).
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <helper_cuda.h>
#include <unistd.h>
#include <stdlib.h>

struct radix_tree_root {
    unsigned int height;
    struct radix_tree_node *rnode;
};

struct radix_tree_node {
    unsigned int count;
    void *slots[64];
};

__global__ void test1(struct radix_tree_node **d_slot, struct radix_tree_root *d_root)
{
    (d_slot) = &d_root->rnode;
    printf("From test1: d_slot = %p\t*d_slot = %p\n", d_slot, *d_slot);
}

__global__ void test2(struct radix_tree_node **d_slot)
{
    printf("From test2: d_slot = %p\n", d_slot);
}

__global__ void test3(struct radix_tree_node ***d_slot, struct radix_tree_root *d_root)
{
    (*d_slot) = &d_root->rnode;
}

int
main(void)
{
    struct radix_tree_root *root, *d_root;
    struct radix_tree_node **d_slot = NULL;
    cudaError_t err = cudaSuccess;

    root = (struct radix_tree_root *) malloc(sizeof(struct radix_tree_root));
    root->height = 0;
    root->rnode = NULL;

    // allocate memory for d_root on the GPU
    err = cudaMalloc((void **)&d_root, sizeof(struct radix_tree_root));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device d_root (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // copy root to d_root
    err = cudaMemcpy(d_root, root, (sizeof(struct radix_tree_root)), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy root from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("\nFrom the main: d_root = %p\n", d_root);

    test1<<<1,1>>>(d_slot, d_root);
    err = cudaGetLastError(); // returns the last error from a runtime call
    cudaDeviceSynchronize();

    test2<<<1,1>>>(d_slot);
    err = cudaGetLastError(); // returns the last error from a runtime call
    cudaDeviceSynchronize();

    //test3<<<1,1>>>(&d_slot, d_root);
    err = cudaGetLastError(); // returns the last error from a runtime call
    cudaDeviceSynchronize();

    //test2<<<1,1>>>(d_slot);
    err = cudaGetLastError(); // returns the last error from a runtime call
    cudaDeviceSynchronize();

    err = cudaFree(d_root);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device d_root (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    free(root);

    printf("successful execution of entire program\n");
    return 0;
}
The output of this code is:
From the main: d_root = 0x900ca0000
From test1: d_slot = 0x900ca0008 *d_slot = (nil)
From test2: d_slot = (nil)
successful execution of entire program
This was all fine. But when I uncommented the test3 and second test2 launches shown in the code above, I expected the value of d_slot to be carried forward. Instead, an error was encountered. The output of the code with test3 and test2 uncommented is:
From the main: d_root = 0x900ca0000
From test1: d_slot = 0x900ca0008 *d_slot = (nil)
From test2: d_slot = (nil)
Failed to free device d_root (error code an illegal memory access was encountered)!
So my question is: how do I successfully assign a value to d_slot (a double pointer) in the kernel without losing that value after the kernel finishes executing?
There needs to be some location in graphics memory that test1 can write to and test2 and test3 can read from. You could use cudaMalloc a second time to allocate space for a struct radix_tree_node * like so:
cudaMalloc((void **)&d_slot, sizeof(struct radix_tree_node *));
Then test1 can write a pointer value to *d_slot and test2 and test3 can read the value that test1 wrote from *d_slot.
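One way to flesh that out while keeping the goal of carrying &d_root->rnode between launches (a sketch only; the names slot_store, set_slot and show_slot are mine, not from the original code):
// Sketch: give the carried pointer a home in device memory. Assumes the
// question's struct definitions and that d_root has been cudaMalloc'd and
// filled exactly as in the question's main().
__global__ void set_slot(struct radix_tree_node ***slot_store, struct radix_tree_root *d_root)
{
    *slot_store = &d_root->rnode;   // stored in device memory, so it persists
}

__global__ void show_slot(struct radix_tree_node ***slot_store)
{
    printf("slot_store holds %p, which points at %p\n", *slot_store, **slot_store);
}

// host side, inside main() after d_root has been set up:
struct radix_tree_node ***slot_store;                       // device address of the stored pointer
cudaMalloc((void **)&slot_store, sizeof(struct radix_tree_node **));
set_slot<<<1, 1>>>(slot_store, d_root);
cudaDeviceSynchronize();
show_slot<<<1, 1>>>(slot_store);                            // sees the value set_slot stored
cudaDeviceSynchronize();
cudaFree(slot_store);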
This is part of a parallel reduction/extremum kernel. I have reduced it to the minimum code that still makes clBuildProgram crash (note that it really crashes, it doesn't just return an error code):
EDIT: This also seems to happen when local_value is declared global instead of local.
EDIT2 / SOLUTION: The problem was an infinite loop. I should have written remaining_items >>= 1 instead of remaining_items >> 1. As has been said in the answers, the NVIDIA compiler does not seem very robust when it comes to compile/optimization errors.
kernel void testkernel(local float *local_value)
{
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1)
    {
        // throw away the right half of the threads
        remaining_items >> 1; // <-- SPOTTED THE BUG
        if (thread_id > remaining_items)
        {
            return;
        }
        // look for a greater value in the right half of the memory space
        int right_index = thread_id + remaining_items;
        float right_value = local_value[right_index];
        if (right_value > local_value[thread_id])
        {
            local_value[thread_id] = right_value;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
Removing the lines return; and/or local_value[thread_id] = right_value; causes clBuildProgram to finish successfully.
I can reproduce this problem on all of my computers (NVIDIA GTX 560, GT 555M, GT 540M, they're all Fermi 2.1 architecture). It's apparent on the NVIDIA CUDA Toolkit SDK versions 4.0, 4.1 and 4.2, when using either x64 or x86 libraries.
Does anyone have an idea what could be the problem?
Is it possible that local (aka shared) memory is automatically assumed to be WORK_GROUP_SIZE * sizeof(its_base_type)? That would explain why it works when the lines mentioned above are removed.
Minimal host code (C99 compatible) for reproduction:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)

int main(int argc, char **argv)
{
    // Load the kernel source code into the array source_str
    FILE *fp;
    fp = fopen("testkernel.cl", "rb");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    int filesize = ftell(fp);
    rewind(fp);
    char *source_str = (char*)calloc(filesize + 1, sizeof(char));
    size_t bytes_read = fread(source_str, 1, filesize, fp);
    source_str[bytes_read] = 0;
    fclose(fp);

    // Get platform information
    cl_uint num_platforms;
    RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
    cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
    RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));

    cl_device_id selected_device_id = NULL;
    printf("available platforms:\n");
    for (cl_uint i = 0; i < num_platforms; i++)
    {
        char platform_name[50];
        RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
        printf("%s\n", platform_name);

        // get devices for this platform
        cl_uint num_devices;
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
        cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));

        // select first nvidia device
        if (strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
        {
            selected_device_id = device_ids[0];
        }
    }
    if (selected_device_id == NULL)
    {
        printf("No NVIDIA device found\n");
        exit(1);
    }

    // Create an OpenCL context
    cl_context context;
    REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));

    // Create a program from the kernel source
    cl_program program;
    REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));

    // Build the program
    cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
    if (ret)
    {
        printf("BUILD ERROR\n");
        // build error - get build log and display it
        size_t build_log_size;
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        char *build_log = (char *)malloc(build_log_size);
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
        printf("%s\n", build_log);
        exit(1);
    }
    printf("build finished successfully\n");

    return 0;
}
In my experience the NVIDIA compiler isn't very robust when it comes to handling build errors, so you probably have a compile error somewhere.
I think your problem is indeed the return, or more to the point its combination with barrier. According to the OpenCL spec about barriers:
All work-items in a work-group executing the kernel on a processor must execute this function before any are allowed to continue execution beyond the barrier. This function must be encountered by all work-items in a work-group executing the kernel.
If barrier is inside a conditional statement, then all work-items must enter the conditional if any work-item enters the conditional statement and executes the barrier.
If barrier is inside a loop, all work-items must execute the barrier for each iteration of the loop before any are allowed to continue execution beyond the barrier.
So I think your problem is probably that a lot of threads would return before getting to the barrier, making this code invalid. Maybe you should try something like this:
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        remaining_items >>= 1; // throw away the right half of the threads
        if (thread_id <= remaining_items) {
            // look for a greater value in the right half of the memory space
            int right_index = thread_id + remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id])
                local_value[thread_id] = right_value;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
Edit: Furthermore, as noted in the comments, it needs to be remaining_items >>= 1 instead of remaining_items >> 1 in order to avoid producing an infinite loop.
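On the side question in the post (whether the local buffer is sized automatically from the work-group size): it is not. For a local float * kernel argument, the host picks the size explicitly by passing the byte count and a NULL value to clSetKernelArg. A rough sketch, assuming a kernel built from testkernel.cl and a command queue exist (the reproduction host code above stops at clBuildProgram):
/* Sketch: the storage behind "local float *local_value" is sized by the host,
   not derived from the work-group size. kernel and queue are assumed here. */
size_t group_size = 1024;   /* matches the kernel's hard-coded item count */
size_t global_size = 1024;
clSetKernelArg(kernel, 0, group_size * sizeof(cl_float), NULL); /* local arg: size only, value must be NULL */
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &group_size, 0, NULL, NULL);
clFinish(queue);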
I am continuously getting an access violation error with all of the kernels I try to build. Other kernels that I take from books seem to work fine.
https://github.com/ssarangi/VideoCL - This is where the code is.
Something seems to be missing here. Could someone help me with this? Thanks so much.
[James] - Thanks for the suggestion, and you are right. I am doing this on Windows 7 with an AMD Redwood card, Catalyst 11.7 drivers, and AMD APP SDK 2.5. I am posting the code below.
#include <iostream>
#include <cmath>
#include "bmpfuncs.h"
#include "CLManager.h"

int main()
{
    float theta = 3.14159f/6.0f;
    int W;
    int H;
    const char* inputFile = "input.bmp";
    const char* outputFile = "output.bmp";
    float* ip = readImage(inputFile, &W, &H);
    float* op = new float[W*H];

    // We assume that the input image is the array "ip"
    // and the angle of rotation is theta
    float cos_theta = cos(theta);
    float sin_theta = sin(theta);

    try
    {
        CLManager* clMgr = new CLManager();

        // Build the source
        unsigned int pgmID = clMgr->buildSource("rotation.cl");

        // Create the kernel
        cl::Kernel* kernel = clMgr->makeKernel(pgmID, "img_rotate");

        // Create the memory buffers
        cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
        cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));

        // Get the command queue
        cl::CommandQueue* queue = clMgr->getCmdQueue();
        queue->enqueueWriteBuffer(*clIp, CL_TRUE, 0, W*H*sizeof(float), ip);

        // Set the arguments to the kernel
        kernel->setArg(0, clOp);
        kernel->setArg(1, clIp);
        kernel->setArg(2, W);
        kernel->setArg(3, H);
        kernel->setArg(4, sin_theta);
        kernel->setArg(5, cos_theta);

        // Run the kernel on a specific NDRange
        cl::NDRange globalws(W, H);
        queue->enqueueNDRangeKernel(*kernel, cl::NullRange, globalws, cl::NullRange);
        queue->enqueueReadBuffer(*clOp, CL_TRUE, 0, W*H*sizeof(float), op);

        storeImage(op, outputFile, H, W, inputFile);
    }
    catch(cl::Error error)
    {
        std::cout << error.what() << "(" << error.err() << ")" << std::endl;
    }
    return 0;
}
I am getting the error at the queue->enqueueNDRangeKernel line.
I have the queue and the kernel stored in a class.
CLManager::CLManager()
    : m_programIDs(-1)
{
    // Initialize the platform
    cl::Platform::get(&m_platforms);

    // Create a context
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)(m_platforms[0])(),
        0
    };
    m_context = cl::Context(CL_DEVICE_TYPE_GPU, cps);

    // Get a list of devices on this platform
    m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();

    cl_int err;
    m_queue = new cl::CommandQueue(m_context, m_devices[0], 0, &err);
}

cl::Kernel* CLManager::makeKernel(unsigned int programID, std::string kernelName)
{
    cl::CommandQueue queue = cl::CommandQueue(m_context, m_devices[0]);
    cl::Kernel* kernel = new cl::Kernel(*(m_programs[programID]), kernelName.c_str());
    m_kernels.push_back(kernel);
    return kernel;
}
I checked your code. I'm on Linux, though. At runtime I get error -38, which means CL_INVALID_MEM_OBJECT, so I went and checked your buffers.
cl::Buffer* clIp = clMgr->createBuffer(CL_MEM_READ_ONLY, W*H*sizeof(float));
cl::Buffer* clOp = clMgr->createBuffer(CL_MEM_READ_WRITE, W*H*sizeof(float));
Then you pass the buffers as pointers:
kernel->setArg(0, clOp);
kernel->setArg(1, clIp);
But setArg expects a value: the wrapper pulls the underlying cl_mem handle out of a cl::Buffer and hands that to clSetKernelArg. Passing a cl::Buffer* instead makes it treat the host pointer itself as the argument, which is why you get CL_INVALID_MEM_OBJECT. So the buffer pointers should be dereferenced:
kernel->setArg(0, *clOp);
kernel->setArg(1, *clIp);
After those changes the cat rotates ;)