PyOpenCL with multiprocessing INVALID_DEVICE - opencl

My use case is a main process that compiles a bunch of OpenCL kernels. Later in the program, several subprocesses are forked, and each of them executes one or more of the kernels. For some reason, the subprocesses fail with device errors.
I have determined the problem has nothing to do with compilation and have reproduced it with the following simple script:
import multiprocessing
import pyopencl as cl
def printme():
    """Print every OpenCL platform reported by pyopencl, framed by separator lines.

    Single-argument print(...) calls are used so the script parses and prints
    the same text under both Python 2 and Python 3.
    """
    platforms = cl.get_platforms()
    for p in platforms:
        print(75*'!')
        print(p)
        print(75*':')
# First enumerate the platforms in the parent process ...
printme()
# ... then repeat the same enumeration in a child process and wait for it.
child = multiprocessing.Process(target=printme)
child.start()
child.join()
Seems to be tied to the NVIDIA OpenCL implementation and not related to PyOpenCL as I initially thought. The same problem occurs with the example below.
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include <CL/cl.h>
/*
 * CHECK(A): evaluate expression A, store its value in the caller's `status`
 * variable, and terminate the program with a diagnostic (value and source
 * line) when that value is non-zero.  A variable named `status` must be in
 * scope at the call site.
 *
 * The body is wrapped in do { } while (0) so that `CHECK(x);` behaves as one
 * statement and can be used safely inside an unbraced if/else; the argument
 * is parenthesized so that any expression can be passed.
 */
#define CHECK(A) do { \
    if ((status = (A)) != 0) { \
        fprintf(stderr, "failed status %d at line %d\n", status, __LINE__); \
        exit(1); \
    } \
} while (0)
/*
 * Query the OpenCL platform list and then ask the first platform how many
 * devices (of any type) it has.  Any failing OpenCL call terminates the
 * process through CHECK(); an empty platform list or a failed allocation
 * also terminates it with exit status 1.
 */
static void
runit(void) {
    cl_int status;
    cl_platform_id *platforms;
    cl_uint num_platforms;
    cl_uint num_devices;

    /* First call only asks for the number of platforms. */
    CHECK(clGetPlatformIDs(0, NULL, &num_platforms));
    if (num_platforms == 0) {
        fprintf(stderr, "no platforms\n");
        exit(1);
    }

    platforms = malloc(sizeof *platforms * num_platforms);
    if (platforms == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }

    /* Second call fills the array with the platform handles. */
    CHECK(clGetPlatformIDs(num_platforms, platforms, NULL));
    /* Count-only device query on the first platform. */
    CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL,
                         &num_devices));
    free(platforms);
}
/*
 * Run the OpenCL query once in the parent, then fork and run it again in the
 * child while the parent waits for the child to finish.
 * Returns 0 on success, 1 if fork() or wait() fails.
 */
int main(void) {
    pid_t child;

    runit();

    child = fork();
    if (child < 0) {
        perror("fork");
        return 1;
    }
    if (child == 0) {
        /* Child: repeat the same query in the forked process. */
        runit();
        return 0;
    }

    /* Parent: reap the child before exiting. */
    if (wait(NULL) < 0) {
        perror("wait");
        return 1;
    }
    return 0;
}
Exits with failed status -33 (CL_INVALID_DEVICE). It seems that there is some stored state inside the implementation. Note that this is running on an NVIDIA driver.

Related

SQLite: How embed the memvfs extension to Amalgamation?

I need to load/save an SQLite database from/to a memory buffer. For this, I want to embed the memvfs extension into the sqlite3 code and compile it all together as sqlite3.dll.
How can I do this?
Update1:
I want to use memvfs as a temporary memory buffer. My program loads data from the network into a buffer, connects to this memory buffer, and restores the data into an empty in-memory db. I thought that including memvfs in the SQLite amalgamation would improve performance.
Update2:
If you want to use the memvfs extension, pay attention to a bug in the readme comment in its source: use "PRAGMA journal_mode=OFF" instead of "journal_mode=NONE".
Update3:
Another bug in memvfs.c: use 'max' instead of 'maxsz' as the name of the maxsz parameter in the URI.
The SQLite developers have carefully laid out rakes to step on :(
Test program to demonstrate using memvfs:
#include <fcntl.h>
#include <sqlite3.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
int main(void) {
sqlite3 *db;
char *err;
// Open an in-memory database to use as a handle for loading the memvfs extension
if (sqlite3_open(":memory:", &db) != SQLITE_OK) {
fprintf(stderr, "open :memory: %s\n", sqlite3_errmsg(db));
return EXIT_FAILURE;
}
sqlite3_enable_load_extension(db, 1);
if (sqlite3_load_extension(db, "./memvfs", NULL, &err) != SQLITE_OK) {
fprintf(stderr, "load extension: %s\n", err);
return EXIT_FAILURE;
}
// Done with this database
sqlite3_close(db);
// Read the real database into memory
int fd = open("foo.db", O_RDONLY);
if (fd < 0) {
perror("open");
return EXIT_FAILURE;
}
struct stat s;
if (fstat(fd, &s) < 0) {
perror("fstat");
return EXIT_FAILURE;
}
void *memdb = sqlite3_malloc64(s.st_size);
if (read(fd, memdb, s.st_size) != s.st_size) {
perror("read");
return EXIT_FAILURE;
}
close(fd);
// And open that memory with memvfs now that it holds a valid database
char *memuri = sqlite3_mprintf("file:whatever?ptr=0x%p&sz=%lld&freeonclose=1",
memdb, (long long)s.st_size);
printf("Trying to open '%s'\n", memuri);
if (sqlite3_open_v2(memuri, &db, SQLITE_OPEN_READWRITE | SQLITE_OPEN_URI,
"memvfs") != SQLITE_OK) {
fprintf(stderr, "open memvfs: %s\n", sqlite3_errmsg(db));
return EXIT_FAILURE;
}
sqlite3_free(memuri);
// Try querying the database to show it works.
sqlite3_stmt *stmt;
if (sqlite3_prepare_v2(db, "SELECT b FROM test", -1, &stmt, NULL) !=
SQLITE_OK) {
fprintf(stderr, "prepare: %s\n", sqlite3_errmsg(db));
sqlite3_close(db);
return EXIT_FAILURE;
}
for (int rc = sqlite3_step(stmt); rc == SQLITE_ROW; rc = sqlite3_step(stmt)) {
printf("%d\n", sqlite3_column_int(stmt, 0));
}
sqlite3_finalize(stmt);
sqlite3_close(db);
return 0;
}
Usage:
# Create a test database to use with memvfs
$ sqlite3 foo.db
sqlite> CREATE TABLE test(b INTEGER);
sqlite> INSERT INTO test VALUES (1), (2);
sqlite> .quit
# Compile the memvfs module and test program
$ gcc -O -fPIC -shared -o memvfs.so memvfs.c
$ gcc -O -Wall -Wextra testmem.c -lsqlite3
# And run it.
$ ./a.out
Trying to open 'file:whatever?ptr=0x56653FE2B940&sz=8192&freeonclose=1'
1
2
Same workflow if you compile it directly into your program instead of using a loadable module; you just have to call sqlite3_memvfs_init() with the right arguments instead of using sqlite3_load_extension().

pthread_cond_timedwait() not working on FreeBSD, returns EPERM

I have a sample program which creates a pthread and waits for the thread to join. The thread invokes pthread_cond_timedwait() to wait for 2 seconds. On Linux platforms, the sample code works fine. On FreeBSD, the call returns immediately with the EPERM error code.
pthread_condition_timedwait.cpp
#define _BSD_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
/*
 * Thread entry point: waits on a local condition variable with an absolute
 * deadline two seconds from now and reports how the wait ended.
 * NOTE: the local mutex is never locked before the wait; this is exactly the
 * situation the question is about.  `ptr` is unused; always returns NULL.
 */
void *thread_handler(void *ptr){
    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    struct timeval now;
    struct timespec deadline;
    int result;

    /* Build the deadline: current time of day plus two seconds. */
    gettimeofday(&now, NULL);
    deadline.tv_nsec = now.tv_usec*1000;
    deadline.tv_sec = now.tv_sec + 2;

    /* Wait until the deadline passes (nothing ever signals cond). */
    result = pthread_cond_timedwait(&cond, &mutex, &deadline);
    switch (result) {
    case ETIMEDOUT:
        printf("Terminated due to time out\n");
        break;
    case EPERM:
        printf("Terminated due to EPERM\n");
        break;
    default:
        printf("Return code is %d\n", result);
        break;
    }
    return NULL;
}
/*
 * Starts one thread running thread_handler() and waits for it to finish.
 * Returns 0 on success, EXIT_FAILURE if the thread cannot be created or
 * joined.  Command-line arguments are ignored.
 */
int main(int argc, char** argv){
    pthread_t thread;
    int rc;

    (void)argc;
    (void)argv;

    // start the thread
    rc = pthread_create(&thread, NULL, &thread_handler, NULL);
    if (rc != 0) {
        // Without a valid thread handle, joining would be undefined.
        fprintf(stderr, "pthread_create failed: %d\n", rc);
        return EXIT_FAILURE;
    }
    // wait for thread to finish
    rc = pthread_join(thread, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_join failed: %d\n", rc);
        return EXIT_FAILURE;
    }
    return 0;
}
EPERM is returned if the thread that calls timedwait does not own the mutex. You must lock the mutex before calling timedwait. Also, move the static initialization of mutex and condvar to file scope.
UPDATE: If you initialize the mutex to be an error-checking mutex, Linux will also terminate with EPERM (since it is UB to call pthread_cond_wait/timedwait without holding the mutex).
Modified code below:
//#define _BSD_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
// Condition variable that thread_handler() waits on; statically initialized.
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
// Mutex handed to pthread_cond_timedwait(). It has no static initializer
// because main() initializes it at run time with the
// PTHREAD_MUTEX_ERRORCHECK type.
pthread_mutex_t mutex;
/*
 * Thread entry point: waits on the file-scope `cond`/`mutex` pair with an
 * absolute deadline two seconds from now, then prints how the wait ended.
 * NOTE: `mutex` is intentionally not locked before the wait, so that the
 * result of waiting without owning the mutex can be observed.
 * `ptr` is unused; always returns NULL.
 */
void *thread_handler(void *ptr){
    struct timeval now;
    struct timespec deadline;
    int result;

    /* Deadline = current time of day + 2 seconds. */
    gettimeofday(&now, NULL);
    deadline.tv_nsec = now.tv_usec*1000;
    deadline.tv_sec = now.tv_sec + 2;

    /* Nothing signals cond, so this can only time out or fail. */
    result = pthread_cond_timedwait(&cond, &mutex, &deadline);
    switch (result) {
    case ETIMEDOUT:
        printf("Terminated due to time out\n");
        break;
    case EPERM:
        printf("Terminated due to EPERM\n");
        break;
    default:
        printf("Return code is %d\n", result);
        break;
    }
    return NULL;
}
/*
 * Initializes the file-scope mutex as an error-checking mutex, runs
 * thread_handler() in a new thread, waits for it, and releases the mutex
 * resources.  Returns 0 on success, EXIT_FAILURE if any pthread call fails.
 * Command-line arguments are ignored.
 */
int main(int argc, char** argv){
    pthread_mutexattr_t mta;
    pthread_t thread;
    int rc;

    (void)argc;
    (void)argv;

    rc = pthread_mutexattr_init(&mta);
    if (rc != 0) {
        fprintf(stderr, "pthread_mutexattr_init failed: %d\n", rc);
        return EXIT_FAILURE;
    }
    rc = pthread_mutexattr_settype(&mta, PTHREAD_MUTEX_ERRORCHECK);
    if (rc == 0)
        rc = pthread_mutex_init(&mutex, &mta);
    // The attribute object is only needed while the mutex is being created.
    pthread_mutexattr_destroy(&mta);
    if (rc != 0) {
        fprintf(stderr, "mutex initialization failed: %d\n", rc);
        return EXIT_FAILURE;
    }

    // start the thread
    rc = pthread_create(&thread, NULL, &thread_handler, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_create failed: %d\n", rc);
        pthread_mutex_destroy(&mutex);
        return EXIT_FAILURE;
    }
    // wait for thread to finish
    rc = pthread_join(thread, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_join failed: %d\n", rc);
        return EXIT_FAILURE;
    }
    pthread_mutex_destroy(&mutex);
    return 0;
}
Tested on kernel SMP Debian 4.9.82-1+deb9u3 (2018-03-02) x86_64 GNU/Linux, distro Debian GNU/Linux buster/sid.

CAN bus port access via socket; non-blocking solution needed

I've got an application where I will be using a standalone C program to read a CAN bus port through a socket. The user interface for this is Qt/QML code. I would like to use a non-blocking approach to call the binary and have it either return nothing or return a string containing the CAN packet.
The application will be low speed (just monitoring key presses, etc) so speed is not an issue. The current approach involves writing data from the socket program to a file, then having ANOTHER C program take the file and echo the string back to QML. UGH! Seems very messy. A simple Go/NoGo call would be easier. Here's the code I've got so far.
Thanks for any comments.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <net/if.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/can.h>
#include <linux/can/raw.h>
// Reads CAN frames from the "can0" interface through a raw CAN socket and
// prints each frame on stdout as "<can_id> <can_dlc> <data bytes...>",
// one frame per line, all values in decimal.
// When run_daemon is 0 a single read() is performed and the program exits;
// when it is 1 the program daemonizes and keeps reading frames forever.
// Returns 0 if no errors, > 0 if errors found
// (1: socket() failed, 2: bind() failed, 3: interface lookup failed).
int main(void) {
    struct ifreq ifr;
    struct can_frame frame;
    struct sockaddr_can addr;
    int s;                          // CAN socket descriptor
    ssize_t nbytes;                 // Number of bytes read from CAN socket
    char run_daemon = 0;            // Set to 1 to run as a daemon process
    char show_errors = 0;           // Set to 1 to print errors
    const char *ifname = "can0";    // Define the CAN driver for use

    if (run_daemon)                 // Skip the daemon call if not enabled
        daemon(1,1);

    if ((s = socket(PF_CAN, SOCK_RAW, CAN_RAW)) < 0) {
        if (show_errors)
            perror("Error while opening RAW socket");
        return 1;
    }

    // Resolve the interface name to its index. The request structure is
    // zeroed first and the name is copied with an explicit size bound.
    memset(&ifr, 0, sizeof ifr);
    snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "%s", ifname);
    if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
        if (show_errors)
            perror("Error while looking up CAN interface");
        close(s);
        return 3;
    }

    // Zero the address so fields that are not set below are not left
    // holding stack garbage when passed to bind().
    memset(&addr, 0, sizeof addr);
    addr.can_family = AF_CAN;
    addr.can_ifindex = ifr.ifr_ifindex;
    if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        if (show_errors)
            perror("Error in socket bind");
        close(s);
        return 2;
    }

    // Loop here for daemon process
    while (1) {
        // Read CAN frame data
        nbytes = read(s, &frame, sizeof(struct can_frame));
        // Only process the buffer when a complete frame was received
        if (nbytes > 0 && (size_t)nbytes >= sizeof(struct can_frame)) {
            // Never print more data bytes than the frame can hold,
            // whatever can_dlc claims.
            size_t max_len = sizeof frame.data / sizeof frame.data[0];
            size_t len = frame.can_dlc;
            if (len > max_len)
                len = max_len;

            // Print all relevant frame data to QML
            printf("%u ", (unsigned)frame.can_id);   // can_id is unsigned
            printf("%d ", frame.can_dlc);
            for (size_t i = 0; i < len; i++)
                printf("%d ", frame.data[i]);
            printf("\n");
            // Make the line visible immediately even when stdout is a pipe.
            fflush(stdout);
        }
        if (!run_daemon) {          // Exit if daemon is not running
            close(s);               // Close the CAN socket
            return 0;
        }
    }
    return 0;                       // Should never get here !!!
}

dlopen/dlsym: error getting function pointer

I am attempting to get a function pointer by using dlopen and dlsym, but I have been unable to get it working correctly. It fails when making the dlsym call. My code follows.
Any help please?
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * The function that main() looks up by its name ("test") at run time.
 * Writes the text "test()" to stdout and returns 123.
 */
int test() {
    fputs("test()", stdout);
    return 123;
}
/*
 * Looks up the function named "test" in the running program itself
 * (dlopen(NULL, ...)), calls it and prints its result.
 * Exits with status 1 if the handle cannot be opened or the symbol cannot be
 * found; the dlerror() text is included in the diagnostic in both cases.
 */
int main() {
    const char *functionname = "test";
    void *handle = dlopen(NULL, RTLD_LAZY | RTLD_GLOBAL);
    if (!handle) {
        fprintf(stderr, "Couldn't open handle: %s\n",
                dlerror());
        exit(1);
    }

    /* Clear any stale error so the dlerror() call below refers to dlsym(). */
    dlerror();
    int (*fun)() = (int (*)())dlsym(handle, functionname);
    if (fun == NULL) {
        const char *reason = dlerror();
        fprintf(stderr, "Couldn't find function: %s (%s)\n", functionname,
                reason ? reason : "symbol resolved to NULL");
        dlclose(handle);
        exit(1);
    }

    int a = fun();
    printf("result: %d \n", a);
    dlclose(handle);
    return 0;
}
Probably you need to specify to the linker to export the symbols as dynamic. With gcc you have to use -rdynamic.
You can check the exported dynamic symbols with objdump -T.

clBuildProgram yields AccessViolationException when building this specific kernel

This is a part of some sort of parallel reduction/extremum kernel. I have reduced it to the minimum code that still gets clBuildProgram crashing (note that it really crashes, and doesn't just return an error code):
EDIT: It seems like this also happens when local_value is declared global instead of local.
EDIT2 / SOLUTION: The problem was that there was an infinite loop. I should have written remaining_items >>= 1 instead of remaining_items >> 1. As has been said in the answers, the nvidia compiler seems not very robust when it comes to compile/optimization errors.
// Minimal reproduction kernel from the question, kept exactly as posted.
// Intended as one work-group's max-reduction over local_value: on each pass
// the surviving work-items compare their element with one further to the
// right and keep the larger value.
// As posted, the shift below discards its result, so remaining_items stays
// at 1024 and the while loop never ends for work-items that do not return.
kernel void testkernel(local float *local_value)
{
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1)
{
// throw away the right half of the threads
// (no effect as written: the shifted value is computed and then dropped)
remaining_items >> 1; // <-- SPOTTED THE BUG
if (thread_id > remaining_items)
{
// this work-item leaves the kernel without reaching the barrier below
return;
}
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
{
local_value[thread_id] = right_value;
}
// work-group barrier at the end of every pass
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Removing the lines return; and/or local_value[thread_id] = right_value; causes clBuildProgram to finish successfully.
I can reproduce this problem on all of my computers (NVIDIA GTX 560, GT 555M, GT 540M, they're all Fermi 2.1 architecture). It's apparent on the NVIDIA CUDA Toolkit SDK versions 4.0, 4.1 and 4.2, when using either x64 or x86 libraries.
Does anyone have an idea what could be the problem?
Is it possible that local (aka shared) memory is automatically assumed to be (WORK_GROUP_SIZE) * sizeof(its_base_type)? That would explain why it works when the lines I mentioned above are removed.
Minimal host code (C99 compatible) for reproduction:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
/*
 * RETURN_THROW(expression): for calls that return their status code.
 * Evaluates the expression once; if the result is non-zero, prints the
 * expression text with the code and exits with status 1.
 */
#define RETURN_THROW(expression) \
    do { \
        cl_int ret = expression; \
        if (ret != 0) { \
            printf(#expression " FAILED: %d\n" , ret); \
            exit(1); \
        } \
    } while (0)
/*
 * REF_THROW(expression): for calls that report their status through an
 * output argument.  Declares a local `ret`, which the expression is expected
 * to fill in (typically by passing &ret); if `ret` is non-zero afterwards,
 * prints the expression text with the code and exits with status 1.
 */
#define REF_THROW(expression) \
    do { \
        cl_int ret; \
        expression; \
        if (ret != 0) { \
            printf(#expression " FAILED: %d\n" , ret); \
            exit(1); \
        } \
    } while (0)
/*
 * Host-side driver for the reproduction: reads testkernel.cl, picks the
 * first GPU device of a platform whose name contains "NVIDIA", creates a
 * context and a program from the source, and builds it.  On a build error
 * the build log is printed.  Exits with status 1 on any failure, returns 0
 * when the build succeeds.  Command-line arguments are ignored.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Load the kernel source code into the array source_str
    FILE *fp;
    fp = fopen("testkernel.cl", "rb");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    long filesize = ftell(fp);
    if (filesize < 0)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        fclose(fp);
        exit(1);
    }
    rewind(fp);
    // One extra byte so the terminating NUL written below always fits, even
    // when fread() returns the full file size.
    char *source_str = (char *)calloc((size_t)filesize + 1, sizeof(char));
    if (!source_str)
    {
        fprintf(stderr, "Out of memory.\n");
        fclose(fp);
        exit(1);
    }
    size_t bytes_read = fread(source_str, 1, (size_t)filesize, fp);
    source_str[bytes_read] = 0;
    fclose(fp);

    // Get platform information
    cl_uint num_platforms;
    RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
    cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
    if (num_platforms > 0 && !platform_ids)
    {
        fprintf(stderr, "Out of memory.\n");
        exit(1);
    }
    RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));

    cl_device_id selected_device_id = NULL;
    printf("available platforms:\n");
    for (cl_uint i = 0; i < num_platforms; i++)
    {
        char platform_name[50];
        RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
        printf("%s\n", platform_name);

        // get devices for this platform
        cl_uint num_devices;
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
        cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
        if (num_devices > 0 && !device_ids)
        {
            fprintf(stderr, "Out of memory.\n");
            exit(1);
        }
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));

        // select first nvidia device
        if (num_devices > 0 && strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
        {
            selected_device_id = device_ids[0];
        }
        // Only the id value is kept; the temporary array is no longer needed.
        free(device_ids);
    }
    if (selected_device_id == NULL)
    {
        printf("No NVIDIA device found\n");
        exit(1);
    }

    // Create an OpenCL context
    cl_context context;
    REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));

    // Create a program from the kernel source
    cl_program program;
    REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));

    // Build the program
    cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
    if (ret)
    {
        printf("BUILD ERROR\n");
        // build error - get build log and display it
        size_t build_log_size = 0;
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        // malloc instead of C++ `new`, so the file really is valid C99;
        // one extra byte guarantees the log is NUL-terminated for printf.
        char *build_log = (char *)malloc(build_log_size + 1);
        if (build_log)
        {
            build_log[0] = 0;
            ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
            build_log[build_log_size] = 0;
            printf("%s\n", build_log);
            free(build_log);
        }
        exit(1);
    }
    printf("build finished successfully\n");

    // Release everything acquired above.
    clReleaseProgram(program);
    clReleaseContext(context);
    free(platform_ids);
    free(source_str);
    return 0;
}
In my experience the nvidia compiler isn't very robust when it comes to handling build errors, so you probably have a compile error somewhere.
I think your problem is indeed the return, or more to the point its combination with barrier. According to the opencl spec about barriers:
All work-items in a work-group executing the kernel on a processor
must execute this function before any are allowed to continue
execution beyond the barrier. This function must be encountered by all
work-items in a work-group executing the kernel.
If barrier is inside a conditional statement, then all work-items must enter the
conditional if any work-item enters the conditional statement and
executes the barrier.
If barrier is inside a loop, all work-items
must execute the barrier for each iteration of the loop before any are
allowed to continue execution beyond the barrier.
So I think your problem is probably that a lot of threads would return before getting to the barrier, making this code invalid. Maybe you should try something like this:
// Work-group max-reduction over local_value: each pass halves the number of
// active work-items, and every active work-item keeps the larger of its own
// element and the element remaining_items positions to its right.  After the
// last pass the largest value is in local_value[0].
// All work-items reach the barrier on every pass (none return early), which
// is what barrier() requires.
// NOTE(review): assumes local_value holds at least 1024 floats — confirm
// against the host code that sets the kernel argument.
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        remaining_items >>= 1; // throw away the right half of the threads
        // Strictly less-than: work-item `remaining_items` itself would read
        // one element past the active range and would write an element that
        // work-item 0 reads in the same pass.
        if (thread_id < (size_t)remaining_items) {
            // look for a greater value in the right half of the memory space
            size_t right_index = thread_id + (size_t)remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id])
                local_value[thread_id] = right_value;
        }
        // The data lives in local memory, so the local fence is the one that
        // makes this pass's writes visible to the next pass.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
Edit: Furthermore as noted in the comments it needs to be remaining_items>>=1 instead of remaining_items>>1 in order to avoid producing an infinite loop.

Resources