Why do cudaMemcpyAsync and kernel launches block even with an asynchronous stream? - asynchronous

Consider the following program for enqueueing some work on a non-blocking GPU stream:
#include <iostream>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main() {
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 1e7;
bool callback_executed = false;
cudaStream_t stream;
auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
The size of the buffers and the length of the kernel sleep in cycles are high enough, that as they execute in parallel with the CPU thread, it should finish the enqueueing well before they've concluded (8ms+8ms for copying and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync() are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does that make sense?
Trace: (numbered lines, time in useconds):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync()'s are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?
850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"

This seemed weird, so I contacted someone from the CUDA driver team, who confirmed the documentation is correct. I was also able to confirm it:
#include <iostream>
#include <memory>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main(int argc, char* argv[]) {
cudaFree(0);
struct timespec start, stop;
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
bool callback_executed = false;
cudaStream_t stream;
void* host_ptr;
if (argc == 1){
host_ptr = malloc(buffer_size);
}
else {
cudaMallocHost(&host_ptr, buffer_size, 0);
}
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
auto snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
This is basically your code, with a few modifications:
Time measurement
A switch to allocate from pageable or pinned memory
A buffer size of 2 GiB to ensure a measurable copy time
cudaFree(0) to force CUDA lazy initialisation.
Here are the results:
$ nvcc -std=c++11 main.cu -lrt
$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpyDtoH pageable -> device, fully async)
Elapsed: 5.20288 # (memcpyHtoD device -> pageable, sync)
$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpyDtoH pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpyDtoH device -> pinned, fully async)
It is slower when copying from pageable to device, but it is really async.
I'm sorry for my mistake. I deleted my previous comments to avoid confusing people.

It so happens that CUDA memory copies are only asynchronous under strict conditions, as #RobinThoni has kindly indicated. For the code in question, the issue is mostly the use of unpinned (that is, paged) host memory.
To quote from a separate section of the Runtime API documentation (emphasis mine):
2. API synchronization behavior
The API provides memcpy/memset functions in both synchronous and
asynchronous forms, the latter having an "Async" suffix. This is a
misnomer as each function may exhibit synchronous or asynchronous
behavior depending on the arguments passed to the function.
...
Asynchronous
For transfers from device memory to pageable host memory, the function will return only once the copy has completed.
and that's just the half of it! It's actually true that
For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.

Related

Problem with CancelIoEx function on some platforms

I work with usb device and use async functions of usbdk driver for I/O operations. I have a problem with canceling of operations on some win platforms. Im using function CancelIoEx for that and it works fine for some part of users but for some it doesn't work. The data is either transmitted completely or not transmitted at all. No intermediate values. CancelIoEx returns 1 in both cases. AbortPipe by usbdk lib doesn't work at all.
HANDLE redirectHandle = usbdk_helper.StartRedirect(&devID);
HANDLE systemHandle = usbdk_helper.GetRedirectorSystemHandle(redirectHandle);
unsigned char* buffer = static_cast<unsigned char*>(std::malloc(0x08 + length));
OVERLAPPED overlapped;
USB_DK_TRANSFER_REQUEST request { 0 };
DWORD overlappedTransfered = 0;
overlapped.hEvent = CreateEvent(NULL, TRUE, FALSE, L"USBEVENT");
...
...
...
request.Buffer = buffer;
request.BufferLength = 0x08 + length;
request.TransferType = ControlTransferType;
auto res = usbdk_helper.WritePipe(redirectHandle, &request, &overlapped); //return TransferSuccessAsync
auto abort_res = CancelIoEx(systemHandle, &overlapped); //Return TRUE
auto event_res = WaitForSingleObject(overlapped.hEvent, INFINITE); //return WAIT_OBJECT_0
std::cout << "count transfered bytes: " << std::dec << request.Result.GenResult.BytesTransferred << std::endl; //0 or full for "bad", or 0 or intermediate for "correct"

pthread is not starting for class instance

NOTE: C++98
Hi, I'm a little new to c++ and I am writing a databaes program and am attempting to start a timer using the boost::asio package using pthread. The aim of the timer is to start after sql queries have been placed inside a buffer, of which will run an execute function if nothing has been received for a period of time. I have managed to get it to compile, but it doesn't look like the pthread instance is starting.
I have called the pthread inside my getInstance method, and the boost::asio alarm has been set up accordingly. What I will show below is that by calling io_run() directly starts the timer falls into a loop within the alarm.
database.h
void *run_io(void *arg);
class Database
{
private:
static Database *dbInstance; //= NULL;
public:
boost::asio::io_service io_service;
boost::posix_time::millisec interval;
boost::asio::deadline_timer timer;
pthread_t timerThread;
public:
static Database &getInstance()
{
if (!dbInstance)
{
dbInstance = new Database();
// pthread_create(&dbInstance->timerThread,NULL,run_io,&dbInstance->io_service);
std::cout << " INSTANCE CREATED " << std::endl;
pthread_create(&dbInstance->timerThread, NULL, run_io, (void *)&dbInstance->io_service);
// pthread_join(&dbInstance->timerThread, NULL);
}
return *dbInstance;
}
};
database.cpp
Database *Database::dbInstance = NULL;
Database::Database()
: interval(2000), timer(io_service, interval) {}
Database::~Database()
{
sqlite3_close(db);
}
void Database::setAlarm(const boost::system::error_code& /* e */)
{
std::cout << "[TEST] WE ARE IN SET ALARM " << std::endl;
DB_WRITE_TIME = 500;
boost::posix_time::milliseconds interval(DB_WRITE_TIME);
// Reschedule the timer for 1 second in the future:
timer.expires_at(timer.expires_at() + interval);
// Posts the timer event
timer.async_wait(boost::bind(&Database::setAlarm, this, _1));
}
int Database::buffer()
{
// DO BUFFER STUFF
timer.async_wait(boost::bind(&Database::setAlarm, this, _1));
// io_service.run() <-- uncommenting this results in the loop
return rc ;
}
void *run_io(void *arg)
{
boost::asio::io_service *io_service = (boost::asio::io_service *)arg;
io_service->run();
}
So I don't feel like the pthread is even starting. I tried putting a print statement in there to see if it came out, and nothing appeared in my terminal.
---- EDIT ----
I have made changes as per Sehe's advice, however it still does not look like I am able to call the alarm handler (setAlarm()). I had to slightly modify it to be compatible with the whole program, but essentially it is this (I gave the interval time a value of 5000 to give it enough time for the tests):
database.h
class Database
{
private:
static boost::shared_ptr<Database> dbInstance;
private:
typedef boost::asio::io_service io_service;
io_service io;
boost::scoped_ptr<io_service::work> work;
boost::posix_time::millisec interval;
boost::asio::deadline_timer timer;
boost::thread timerThread;
void run_io()
{
std::cout << "ENTER IO THREAD" << std::endl;
io.run();
std::cout << "LEAVE IO THREAD" << std::endl;
}
public:
static Database &getInstance()
{
if (!dbInstance)
{
std::cout << " INSTANCE CREATED " << std::endl;
dbInstance.reset(new Database());
dbInstance->timerThread = boost::thread(boost::bind(&Database::run_io,dbInstance));
}
return *dbInstance;
}
Database(); // <-- default constructor (doesn't take any args)
~Database();
database.cpp
boost::shared_ptr<Database> Database::dbInstance;
static const int DB_WRITE_TIME = 5000;
Database::Database()
: work(new io_service::work(io)), interval(5000), timer(io, interval)
{
// std::cout << " CONSTRUCTED " << std::endl;
}
Database::~Database()
{
// std::cout << " DESTROYED " << std::endl;
// sqlite3_close(db);
}
void Database::setAlarm(const boost::system::error_code& ec)
{
std::cout << "[TEST] WE ARE IN SET ALARM - ec message = " << ec.message() << std::endl;
executeSqlInBuffer(); // once timer expire, call the execute function
if(!ec)
{
boost::posix_time::milliseconds interval(DB_WRITE_TIME);
timer.expires_from_now(interval);
timer.async_wait(boost::bind(&Database::setAlarm, this, _1));
}
}
void Database::teardown()
{
// std::cout << " INSTANCE SHUTTING DOWN " << std::endl;
timer.cancel(); // stop timer loop
work.reset(); // allows io.run() to exit
if(timerThread.joinable())
{
std::cout << " JOINED " << std::endl;
timerThread.join(); // releasing bound of shared_ptr
}
else std::cout << " NOT JOINED " << std::endl;
dbInstance.reset(); // releasing instance
}
int Database::buffer()
{
// do buffering
if(buffer.size() == max_size)
{
executeSqlInBuffer();
}
std::cout << timer.expires_from_now(interval) << std::endl;
// std::cout << " ~ BEFORE TIMER ~ " << std::endl;
timer.async_wait(boost::bind(&Database::setAlarm, this, _1));
return 1;
}
main.cpp
int main()
{
pthread_t thread1; // a few pthreads in main that handle other areas of the program.
pthread_create(&thread1,NULL,thread1Arg,NULL);
pthread_t dbThread; // my pthread for the database
pthread_create(&dbThread,NULL,dbThreadArg,NULL);
Database& database = Database::getInstance();
database.teardown();
pthread_join(thread1,NULL);
pthread_join(dbThread,NULL);
return 0;
}
You can also see here that it enters and leaves the IO thread, and creates an instance, plus the debug output for timer.expires_from_now(interval):
INSTANCE CREATED
JOINED
ENTER IO THREAD
LEAVE IO THREAD
...
...
0 ---> first cycle
1 ---> second cycle
...
1 ---> nth cycle
I'm very ccnfused why anyone who uses Boost or C++11 (or both...) would ever use raw pthread threads (see e.g. C++ boost asynchronous timer to run in parallel with program for a good juxtaposition).
The real problem is likely that you have io_service running out of work (see e.g. https://www.boost.org/doc/libs/1_57_0/doc/html/boost_asio/reference/io_service__work.html).
If you have no pending async operations the thread just exits.
Another problem is accuracy issues with
timer.expires_at(timer.expires_at() + interval);
It's possible that some handlers take so much time that by the time you schedule your next alarm, the deadline has already expired. It's probably better to use
timer.expires_from_now(interval);
Note this also matches the comment better. The comment suffers from comment already because it says "1 second" but it is actually some defined constant DB_WRITE_TIME
or separate your timer from the other handlers in some other way to guarantee accurate scheduling.
Finally, you had UB due to the absense of any shutdown. The static instance never gets destroyed, but what's worth the non-detached thread never is joined, creating undefined behaviour at shutdown.
This problem is actually almost identical to the one recently discussed here, where I also explains the way work guards work in more detail: asio::io_service is ending immediately with work
Here's a c++11 rewrite with the necessary fix:
Since I now noticed you're that person stuck in c++03 land for some weird reason, a Boost Thread version:
C++03 DEMO/Boost Thread
Live On Coliru
#include <boost/asio.hpp>
#include <boost/make_shared.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/thread.hpp>
#include <iostream>
static const int DB_WRITE_TIME = 500;
class Database
{
private:
static boost::shared_ptr<Database> dbInstance;
Database()
: work(new io_service::work(io)),
interval(750),
timer(io, interval)
{
std::cout << "INSTANCE CREATED" << std::endl;
}
void on_timer_completed(const boost::system::error_code& ec) {
std::cout << "[on_timer_completed] " << ec.message() << std::endl;
if (!ec) {
boost::posix_time::milliseconds interval(DB_WRITE_TIME);
// Reschedule the timer
timer.expires_from_now(interval);
timer.async_wait(boost::bind(&Database::on_timer_completed, this, _1));
}
}
int buffer()
{
// DO BUFFER STUFF
timer.expires_from_now(interval);
timer.async_wait(boost::bind(&Database::on_timer_completed, this, _1));
// io_service.run() <-- uncommenting this results in the loop
return 1; // rc ;
}
public:
void do_stuff() {
buffer(); // whatever it does
}
void teardown() {
std::cout << "INSTANCE SHUTTING DOWN\n";
timer.cancel(); // stop timer loop
work.reset(); // allows io.run() to exit
if (timerThread.joinable()) {
timerThread.join(); // releasing the bound shared_ptr
}
dbInstance.reset(); // releasing the instance
}
~Database() {
//sqlite3_close(db);
std::cout << "INSTANCE DESTROYED\n";
}
private:
typedef boost::asio::io_service io_service;
io_service io;
boost::scoped_ptr<io_service::work> work;
boost::posix_time::millisec interval;
boost::asio::deadline_timer timer;
boost::thread timerThread;
void run_io() {
std::cout << "ENTER IO THREAD" << std::endl;
io.run();
std::cout << "LEAVE IO THREAD" << std::endl;
}
public:
static Database &getInstance()
{
if (!dbInstance)
{
dbInstance.reset(new Database());
dbInstance->timerThread =
boost::thread(boost::bind(&Database::run_io, dbInstance));
}
return *dbInstance;
}
};
boost::shared_ptr<Database> Database::dbInstance;
int main() {
Database& db = Database::getInstance();
boost::this_thread::sleep_for(boost::chrono::seconds(1));
db.do_stuff();
boost::this_thread::sleep_for(boost::chrono::seconds(3));
// ....
db.teardown();
}
Prints
INSTANCE CREATED
ENTER IO THREAD
[on_timer_completed] Success
[on_timer_completed] Success
[on_timer_completed] Success
[on_timer_completed] Success
[on_timer_completed] Success
INSTANCE SHUTTING DOWN
[on_timer_completed] Operation canceled
LEAVE IO THREAD
INSTANCE DESTROYED

How could you develop a simple DEALER/ROUTER message flow, using ZeroMQ?

I'm fairly new to TCP messaging (and programming in general) and I am trying to develop a simple ROUTER/DEALER message pair with ZeroMQ but am struggling in getting the router to receive a message from the dealer and send one back.
I can do a simple REQ/REP pattern with no problem, where I can send one message from my machine to my VM.
However, when trying to develop a ROUTER/DEALER pair, I can't seem to get the ROUTER-instance to receive the message (ROUTER on VM, DEALER on main box). I have had some success where I could spam 50 messages in a while(){...} loop, but can't send a single message and have the ROUTER send one back.
So from what I have read, a TCP message in a ROUTER/DEALER pair are sent with a delimiter of 0 at the beginning, and this 0 must be sent first to the ROUTER to register an incoming message.
So I just want to send the message "ROUTER_TEST" to my server, and for my server to respond with "RECEIVED".
DEALER
#include <cstdlib>
#include <iostream>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include "zmq.h"
const char connection[] = "tcp://10.0.10.76:5555";
int main(void)
{
int major, minor, patch;
zmq_version(&major, &minor, &patch);
printf("\nInstalled ZeroMQ version: %d.%d.%d\n", major, minor, patch);
printf("Connecting to: %s\n", connection);
void *context = zmq_ctx_new();
void *requester = zmq_socket(context, ZMQ_DEALER);
int zc = zmq_connect(requester, connection);
std::cout << "zmq_connect = " << zc << std::endl;
int sm = zmq_socket_monitor(requester, connection, ZMQ_EVENT_ALL);
std::cout << "zmq_socket_monitor = " << sm << std::endl;
char messageSend[] = "ROUTER_TEST";
int request_nbr;
int n = zmq_send(requester, NULL, 0, ZMQ_DONTWAIT|ZMQ_SNDMORE );
int ii = 0;
if(n==0) {
std::cout << "n = " << n << std::endl;
while (ii < 50)
{
n = zmq_send(requester, messageSend, 31, ZMQ_DONTWAIT);
ii++;
}
}
return 0;
}
ROUTER
// SERVER
#include <cstdlib>
#include <iostream>
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include "zmq.h"
int main(void)
{
void *context = zmq_ctx_new();
void *responder = zmq_socket(context, ZMQ_ROUTER);
printf("THIS IS WORKING - ROUTER\n");
int rc = zmq_bind(responder, "tcp://*:5555");
assert(rc == 0);
zmq_pollitem_t pollItems[] = {
{responder, 0, ZMQ_POLLIN, -1}};
int sm = zmq_socket_monitor(responder, "tcp://*:5555", ZMQ_EVENT_LISTENING);
std::cout << "zmq_socket_monitor = " << sm << std::endl;
uint8_t buffer[15];
while (1)
{
int rc = zmq_recv(responder, buffer, 5, ZMQ_DONTWAIT);
if (rc == 0)
{
std::cout << "zmq_recv = " << rc << std::endl;
zmq_send(responder, "RECIEVED", 9,0);
}
zmq_poll(pollItems, sizeof(pollItems), -1);
}
return 0;
}
Your code calls, on the DEALER-side a series of:
void *requester = zmq_socket( context,
ZMQ_DEALER // <-- .STO <ZMQ_DEALER>, *requester
);
...
int n = zmq_send( requester, // <~~ <~~ <~~ <~~ <~~ <~~ .STO 0, n
NULL, // NULL,sizeof(NULL)== 0
0, // explicitly declared 0
ZMQ_DONTWAIT // _DONTWAIT flag
| ZMQ_SNDMORE //---- 1x ZMQ_SNDMORE flag==
); // 1.Frame in 1st MSG
int ii = 0; // MSG-under-CONSTRUCTION
if ( n == 0 ) // ...until complete, not yet sent
{
std::cout << "PREVIOUS[" << ii << ".] CALL of zmq_send() has returned n = " << n << std::endl;
while ( ii < 50 )
{ ii++;
n = zmq_send( requester, //---------//---- 1x ZMQ_SNDMORE following
messageSend, // // 2.Frame in 1st MSG
31, // // MSG-under-CONSTRUCTION, but
ZMQ_DONTWAIT // // NOW complete & will get sent
); //---------//----49x monoFrame MSGs follow
}
}
...
What happens on the opposite side, the ROUTER-side code ?
...
while (1)
{
int rc = zmq_recv( responder, //----------------- 1st .recv()
buffer,
5,
ZMQ_DONTWAIT
);
if ( rc == 0 )
{
std::cout << "zmq_recv = " << rc << std::endl;
zmq_send( responder, // _____________________ void *socket
"RECEIVED", // _____________________ void *buffer
9, // _____________________________ size_t len
0 // _____________________________ int flags
);
}
zmq_poll( pollItems,
sizeof( pollItems ),
-1 // __________________________________ block ( forever )
);// till ( if ever ) ...?
}
Here, most probably, the rc == 0 but once, if not missed, but never more
Kindly notice, that your code does not detect in any way if a .recv()-call is also being flagged by a ZMQ_RECVMORE - signaling a need to first also .recv()-all-the-rest multi-Frame parts of the first message, before becoming able to .send()-any-answer...
An application that processes multi-part messages must use the ZMQ_RCVMORE zmq_getsockopt(3) option after calling zmq_recv() to determine if there are further parts to receive.
Next, the buffer and messageSend message-"payloads" are a kind of fragile entities and ought be re-composed ( for details best read again all details about how to carefully initialise, work with and safely-touch any zmq_msg_t object(s) ), as after a successful .send()/.recv() the low level API ( since 2.11.x+ ) considers 'em disposed-off, not re-useable. Also note, that messageSend is not (as imperatively put into the code) a 31-char[]-long, was it? Was there any particular intention to do this?
The zmq_send() function shall return number of bytes in the message if successful. Otherwise it shall return -1 and set errno to one of the values defined below. { EAGAIN, ENOTSUP, EINVAL, EFSM, ETERM, ENOTSOCK, EINTR, EHOSTUNREACH }
Not testing error-state means knowing nothing about the actual state ( see EFSM and other potential trouble explainers ) of REQ/REP and DEALER/ROUTER (extended) .send()/.recv()/.send()/.recv()/... mandatory dFSA's order of these steps
"So from what I have read, a TCP message in a ROUTER/DEALER pair are sent with a delimiter of 0 at the beginning, and this 0 must be sent first to the ROUTER to register an incoming message."
This seems to be a misunderstood part. The app-side is free to compose any number of monoframe or multi-frame messages, yet the "trick" of a ROUTER prepended identity-frame is performed without users assistance ( message-labelling is performed automatically, before any ( now, principally all ) multi-frame(d) messages get delivered to the app-side ( using the receiver's side .recv()-method ). Due handling of multi-frame messages was noted above.

Unable to read shared memory data using boost created by Qt

I've created a Qt shared memory program to write a string into shared memory. Now After writing, I need to read it from Boost program. I tried using simple programs, but I couldn't read the string using Boost interprocess.
Here is the Qt code that is writing into the shared memory. And I'm double checking if the string is written by reading from the shared memory from the same program.
void CDialog::loadString()
{
if(sharedMemory.isAttached())
{
if(!sharedMemory.detach())
{
lbl->setText("Unable to detach from Shared Memory");
return;
}
}
lbl->setText("Click on Top Button");
char sString[] = "my string";
QBuffer buffer;
buffer.open(QBuffer::ReadWrite);
QDataStream out(&buffer);
out << sString;
int size = buffer.size();
qDebug() << size;
if(!sharedMemory.create(size))
{
lbl->setText("Unable to create shared memory segment");
qDebug() << lbl->text();
}
sharedMemory.lock();
char *to = (char *) sharedMemory.data();
const char *from = buffer.data();
memcpy(to, from, qMin(sharedMemory.size(), size));
sharedMemory.unlock();
char * str;
QDataStream in(&buffer);
sharedMemory.lock();
buffer.setData((char *)sharedMemory.constData(), sharedMemory.size());
buffer.open(QBuffer::ReadOnly);
in >> str;
sharedMemory.unlock();
qDebug() << str;
}
And I'm reading it from boost using the same key which I've provided in the Qt program.
Below is the Boost program code -
int main()
{
boost::interprocess::shared_memory_object shdmem(boost::interprocess::open_only, "Highscore", boost::interprocess::read_only);
boost::interprocess::offset_t size;
if (shdmem.get_size(size))
std::cout << "Shared Mem Size: " << size << std::endl;
boost::interprocess::mapped_region region2(shdmem, boost::interprocess::read_only);
char *i2 = static_cast<char *>(region2.get_address());
std::cout << i2 << std::endl;
return 0;
}
Kindly help me in reading the shared memory data from Boost program.
Thank you.
From the Qt docs:
Warning: QSharedMemory changes the key in a Qt-specific way. It is therefore currently not possible to use the shared memory of non-Qt applications with QSharedMemory.
You will probably need to use Boost on both sides.

OpenCL 1.2 C++ Wrapper - undefined reference to clReleaseDevice

I am trying to use the OpenCL C++ wrapper API for the following program :
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
const char helloStr [] = "__kernel void "
"hello(void) "
"{ "
" "
"} ";
int
main(void)
{
cl_int err = CL_SUCCESS;
try {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
std::cout << "Platform size 0\n";
return -1;
}
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
cl::Context context(CL_DEVICE_TYPE_CPU, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program::Sources source(1,
std::make_pair(helloStr,strlen(helloStr)));
cl::Program program_ = cl::Program(context, source);
program_.build(devices);
cl::Kernel kernel(program_, "hello", &err);
cl::Event event;
cl::CommandQueue queue(context, devices[0], 0, &err);
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(4,4),
cl::NullRange,
NULL,
&event);
event.wait();
}
catch (cl::Error err) {
std::cerr
<< "ERROR: "
<< err.what()
<< "("
<< err.err()
<< ")"
<< std::endl;
}
return EXIT_SUCCESS;
}
I use the same kernel file from that blog post, anyways that is not the issue since I can't get past compilation.
I'm compiling the program with the following command :
g++ example.cpp -o example -l OpenCL
and I get the following error message :
/tmp/ccbUf7dB.o: In function `cl::detail::ReferenceHandler<_cl_device_id*>::release(_cl_device_id*)':
example.cpp:(.text._ZN2cl6detail16ReferenceHandlerIP13_cl_device_idE7releaseES3_[_ZN2cl6detail16ReferenceHandlerIP13_cl_device_idE7releaseES3_]+0x14): undefined reference to `clReleaseDevice'
collect2: error: ld returned 1 exit status
I've read stuff about clReleaseDevice not working for legacy devices (see for example this question), but my graphics card is pretty recent (NVidia GTX 660 Ti, supports OpenCL 1.2). Where can I go from there?
I am running this on Ubuntu 13.04 x64 with nvidia-opencl-dev and opencl-headers installed from ubuntu repositories.
The root cause
The problem is that the OpenCL library that you are linking against does not support OpenCL 1.2. Speaking generically, until an OpenCL implementation supporting the version you want to use becomes available for a specific platform, you will have this problem when linking against the OpenCL shared library provided with it. There are two solutions:
Download the version of cl.hpp from Khronos that matches the OpenCL version provided by your chosen hardware and continue using the library provided by your device's manufacturer.
Link against an OpenCL shared library that implements the latest OpenCL standard, but write multiple code paths - one for each OpenCL version, ensuring that each code path only uses OpenCL functions that are supported by that version. This route is harder and I have no idea how you can do it with the C++ wrapper. If you try to call an OpenCL 1.2 function on a platform that doesn't support OpenCL 1.2 you will get a segfault, that's why different code paths are needed.
Nvidia specific
Nvidia has been very slow in delivering OpenCL 1.2 support. As a result, their OpenCL library did not provide the OpenCL 1.2 functions the linker was looking for, resulting in the errors.
At the end of May 2015 Nvidia released drivers that support OpenCL 1.2, see the comments by Z Boson below. Updating your drivers should resolve the linker error. GeForce GTX 6xx and later cards (except for rebrands of earlier generations) support OpenCL 1.2. You can check the conformant products list on the Khronos OpenCL site to make sure. The GTX 660 Ti is listed so you're in luck.
Yeah. I have never seen OpenCL 1.2 on Nvidia devices. Compile this on your system, and look at the "OpenCL C Version":
#include <iostream>
#include <vector>
#include <CL/cl.hpp>
int main() {
// Get the platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
// Loop over the number of platforms
for ( size_t i = 0; i < platforms.size(); ++i ) {
// Display the platform information
std::cout << "Platform " << i+1 << ": "
<< platforms[i].getInfo<CL_PLATFORM_NAME>()
<< "\n----------------------------------------------"
<< "\nVendor : " << platforms[i].getInfo<CL_PLATFORM_VENDOR>()
<< "\nVersion : " << platforms[i].getInfo<CL_PLATFORM_VERSION>();
// Get the devices on the current platform
std::vector <cl::Device> devices;
platforms[i].getDevices( CL_DEVICE_TYPE_ALL , & devices);
// Loop over the devices
std::cout << "\n----------------------------------------------\n";
for ( size_t j = 0; j < devices.size(); ++j ) {
// Display the device information
std::cout
<< "\n Device " << j+1 << ": "
<< devices[j].getInfo< CL_DEVICE_NAME >()
<< "\n\t Device Version : "
<< devices[j].getInfo< CL_DEVICE_VERSION >()
<< "\n\t OpenCL C Version : "
<< devices[j].getInfo< CL_DEVICE_OPENCL_C_VERSION >()
<< "\n\t Compute Units : "
<< devices[j].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >()
<< "\n\t Max Work Group Size: "
<< devices[j].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >()
<< "\n\t Clock Frequency : "
<< devices[j].getInfo< CL_DEVICE_MAX_CLOCK_FREQUENCY >()
<< "\n\t Local Memory Size : "
<< devices[j].getInfo< CL_DEVICE_LOCAL_MEM_SIZE >()
<< "\n\t Global Memory Size : "
<< devices[j].getInfo< CL_DEVICE_GLOBAL_MEM_SIZE >();
// Check if the device supports double precision
std::string str = devices[j].getInfo<CL_DEVICE_EXTENSIONS>();
size_t found = str.find("cl_khr_fp64");
std::cout << "\n\t Double Precision : ";
if ( found != std::string::npos ){ std::cout << "yes\n"; }
else { std::cout << "no\n"; }
}
std::cout << "\n----------------------------------------------\n";
}
// std::cin.ignore();
return 0;
}

Resources