So please bear with me if the question is very trivial.
I wanted to install Intel implementation of OpenCL to code in integrated intel GPU.
I installed the OpenCL driver using Beignet. It successfully installed and clinfo shows the number of platforms and other details.
I have the following code,
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL\cl.hpp>
#include<iostream>
int main() {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::cout << "Total platforms including cpu: " << platforms.size() << std::endl;
if (platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
for (int j = 0; j < platforms.size(); j++) {
auto p = platforms[j];//Change platform from 0,1 and 2
std::vector <cl::Device> devices;
p.getDevices(CL_DEVICE_TYPE_ALL, &devices);
for (int i = 0; i < devices.size(); i++) {
auto device = devices[i];
auto vendor = device.getInfo<CL_DEVICE_VENDOR>();
std::cout << vendor << std::endl;
auto version = device.getInfo<CL_DEVICE_VERSION>();
}
std::cout << "----------------------\n";
}
}
When I compile using $g++ -o test test.cpp -lOpenCL it throws the following error
fatal error: CL\cl.hpp: No such file or directory
#include <CL\cl.hpp>
I linked the library as follows,
sudo ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so /usr/local/lib/libOpenCl.so
I'm not sure how to proceed further. Please help
Consider the following program for enqueueing some work on a non-blocking GPU stream:
#include <iostream>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main() {
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 1e7;
bool callback_executed = false;
cudaStream_t stream;
auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
The size of the buffers and the length of the kernel sleep in cycles are high enough, that as they execute in parallel with the CPU thread, it should finish the enqueueing well before they've concluded (8ms+8ms for copying and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync() are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does that make sense?
Trace: (numbered lines, time in useconds):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync()'s are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?
850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"
This seemed weird, so I contacted someone from the CUDA driver team, who confirmed the documentation is correct. I was also able to confirm it:
#include <iostream>
#include <memory>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main(int argc, char* argv[]) {
cudaFree(0);
struct timespec start, stop;
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
bool callback_executed = false;
cudaStream_t stream;
void* host_ptr;
if (argc == 1){
host_ptr = malloc(buffer_size);
}
else {
cudaMallocHost(&host_ptr, buffer_size, 0);
}
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
auto snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
This is basically your code, with a few modifications:
Time measurement
A switch to allocate from pageable or pinned memory
A buffer size of 2 GiB to ensure a measurable copy time
cudaFree(0) to force CUDA lazy initialisation.
Here are the results:
$ nvcc -std=c++11 main.cu -lrt
$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpyDtoH pageable -> device, fully async)
Elapsed: 5.20288 # (memcpyHtoD device -> pageable, sync)
$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpyDtoH pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpyDtoH device -> pinned, fully async)
It is slower when copying from pageable to device, but it is really async.
I'm sorry for my mistake. I deleted my previous comments to avoid confusing people.
It so happens that CUDA memory copies are only asynchronous under strict conditions, as #RobinThoni has kindly indicated. For the code in question, the issue is mostly the use of unpinned (that is, paged) host memory.
To quote from a separate section of the Runtime API documentation (emphasis mine):
2. API synchronization behavior
The API provides memcpy/memset functions in both synchronous and
asynchronous forms, the latter having an "Async" suffix. This is a
misnomer as each function may exhibit synchronous or asynchronous
behavior depending on the arguments passed to the function.
...
Asynchronous
For transfers from device memory to pageable host memory, the function will return only once the copy has completed.
and that's just the half of it! It's actually true that
For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.
i´m new to OpenCl and i´m trying to learn it right know.
I installed Intel® SDK for OpenCL™ Applications and now i´m trying to use it with visual studio 2015.
When i´m trying to run an hello world example it returns an error at the "program.build" -part in the following codepart.
Can somebody tell me what i´m missing?
Thanks :)
The consol returns this:
Using platform: Intel<R> OpenCL
Using device: Intel<R> Core<TM> i7-3770 CPU e 3.40 GHz
-44
Error building:
Examplecode:
//get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
cl::Platform default_platform = all_platforms[0];
std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";
//get default device of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << " No devices found. Check OpenCL installation!\n";
exit(1);
}
cl::Device default_device = all_devices[0];
std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";
cl::Context context({ default_device });
cl::Program::Sources sources;
// kernel calculates for each element C=A+B
std::string kernel_code =
" void kernel simple_add(global const int* A, global const int* B, global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" } ";
sources.push_back({ kernel_code.c_str(),kernel_code.length() });
cl::Program program(context, sources);
if (**program.build({ default_device }) != CL_SUCCESS**) {
std::cout << program.build({ default_device }) <<"\n";
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << "\n";
exit(1);
}
Error code -44 means that "the program object is invalid". However this is inconclusive and hints to some other issue that corrupts memory. Also see this question.
Calling program.build() twice is - although not good practice - not the issue here, but maybe some threading issues? The problem certainly is not in the code snipped you provided.
I've created a Qt shared memory program to write a string into shared memory. Now After writing, I need to read it from Boost program. I tried using simple programs, but I couldn't read the string using Boost interprocess.
Here is the Qt code that is writing into the shared memory. And I'm double checking if the string is written by reading from the shared memory from the same program.
void CDialog::loadString()
{
if(sharedMemory.isAttached())
{
if(!sharedMemory.detach())
{
lbl->setText("Unable to detach from Shared Memory");
return;
}
}
lbl->setText("Click on Top Button");
char sString[] = "my string";
QBuffer buffer;
buffer.open(QBuffer::ReadWrite);
QDataStream out(&buffer);
out << sString;
int size = buffer.size();
qDebug() << size;
if(!sharedMemory.create(size))
{
lbl->setText("Unable to create shared memory segment");
qDebug() << lbl->text();
}
sharedMemory.lock();
char *to = (char *) sharedMemory.data();
const char *from = buffer.data();
memcpy(to, from, qMin(sharedMemory.size(), size));
sharedMemory.unlock();
char * str;
QDataStream in(&buffer);
sharedMemory.lock();
buffer.setData((char *)sharedMemory.constData(), sharedMemory.size());
buffer.open(QBuffer::ReadOnly);
in >> str;
sharedMemory.unlock();
qDebug() << str;
}
And I'm reading it from boost using the same key which I've provided in the Qt program.
Below is the Boost program code -
int main()
{
boost::interprocess::shared_memory_object shdmem(boost::interprocess::open_only, "Highscore", boost::interprocess::read_only);
boost::interprocess::offset_t size;
if (shdmem.get_size(size))
std::cout << "Shared Mem Size: " << size << std::endl;
boost::interprocess::mapped_region region2(shdmem, boost::interprocess::read_only);
char *i2 = static_cast<char *>(region2.get_address());
std::cout << i2 << std::endl;
return 0;
}
Kindly help me in reading the shared memory data from Boost program.
Thank you.
From the Qt docs:
Warning: QSharedMemory changes the key in a Qt-specific way. It is therefore currently not possible to use the shared memory of non-Qt applications with QSharedMemory.
You will probably need to use Boost on both sides.
I'm somewhat new to C++ and QT.
Trying to run a very simple program in QtCreator, which uses console input on WinXP:
#include <QString>
#include <QTextStream>
int main() {
QTextStream streamOut(stdout);
QTextStream streamIn(stdin);
QString s1("This "), s2("is a "), s3("string.");
QString s4 = s1 + s2 + s3;
streamOut << s4 << endl;
streamOut << "The length of that string is " << s4.length() << endl;
streamOut << "Enter a sentence with whitespaces: " << endl;
s4 = streamIn.readLine();
streamOut << "Here is your sentence: \n" << s4 << endl;
streamOut << "The length of your sentence is: " << s4.length() << endl;
return 0;
}
Problem is that native QTCreator's application output, for it's name, doesn't support typing in things. Here is application output:
Starting C:\QProject\test-build-desktop-Qt_4_8_0_for_Desktop_-MinGW_Qt_SDK___>z>>\debug\test.exe...
This is a string.
The length of that string is 17
Enter a sentence with whitespaces:
Qml debugging is enabled. Only use this in a safe environment!
I've tried checking "Run in terminal" in Projects>Desktop>Run as some answers to similar questions here suggested and the terminal shows up, but it doesn't seem to interact with the program anyhow. Terminal's output:
Press RETURN to close this window...
I would say that checking Run in terminal is correct and needed.
What is surprising is that you don't get any compile error, as there is a mistake at line 8 :
cout << "Enter a sentence: "<<;
The last << is wrong.
Correcting your code, I get this :
#include <QString>
#include <QTextStream>
QTextStream cout(stdout);
QTextStream cin(stdin);
int main() {
QString s2;
cout << "Enter a sentence: ";
s2 = cin.readLine();
cout << "Here is your sentence:" << s2 << endl;
cout << "The length of your sentence is: " << s2.length() << endl;
return 0;
}
which works fine on my computer (WinXP, QtCreator 2.2.0).
Are you sure your Qt project is correct and that you are compiling the right file ?