On running the code in Intel Devcloud, it is throwing runtime error - pointers

I was trying to run this code but while compiling in intel devcloud using these commands:
icpx -qopenmp -fopenmp-targets=spir64 openmp_target_offload_clause_ordering.cpp
export OMP_TARGET_OFFLOAD=MANDATORY
it is showing runtime error.
#include <stdio.h>
int main() {
double *V = reinterpret_cast<double*>(0xdeadbeef);
printf("pointer=%p\n", V);
#pragma omp target parallel for simd is_device_ptr(V) if(true)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
#pragma omp target parallel for simd if(true) is_device_ptr(V)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
return 100;
}

The device pointer you are entering is invalid.
Replace (0xdeadbeef) with (omp_target_alloc(size, 0)) in the first line of your main function as follows:
double ptr = reinterpret_cast<double>(omp_target_alloc(size, 0));
Hope this helps!

Related

SYCL kernel cannot call an undefined function without SYCL_EXTERNAL attribute

I am trying to calculate the euclidean distance for KNN but in parallel using dpc++. the training dataset contains 5 features and 1600 rows, while I want to calculate the distance between the current test point and each training point on the grid in parallel, but I keep getting an error regarding sycl kernal.
code for the function:
code
std::vector<double> distance_calculation_FPGA(queue& q,const std::vector<std::vector<double>>& dataset,const std::vector<double>& curr_test) {
range<1> num_items{ dataset.size()};
std::vector<double>res;
res.resize(dataset.size());
buffer dataset_buf(dataset);
buffer curr_test_buf(curr_test);
buffer res_buf(res.data(), num_items);
q.submit([&](handler& h) {
accessor a(dataset_buf, h, read_only);
accessor b(curr_test_buf, h, read_only);
accessor dif(res_buf, h, write_only, no_init);
h.parallel_for(num_items, [=](auto i) {
for (int j = 0; j <(const int) a[i].size(); ++j) {
dif[i] += (a[i][j] - b[j]) * (a[i][j] - b[j]) ;
}
});
});
for (int i = 0; i < res.size(); ++i) {
std::cout << res[i] << std::endl;
}
//old distance calculation (serial)
//for (int i = 0; i < dataset.size(); ++i) {
// double dis = 0;
// for (int j = 0; j < dataset[i].size(); ++j) {
// dis += (curr_test[j] - dataset[i][j]) * (curr_test[j] - dataset[i][j]);
//}
//res.push_back(dis);
//}
return res;
}
the error I am receiving:
SYCL kernel cannot call a variadic function
SYCL kernel cannot call an undefined function without SYCL_EXTERNAL attribute
Would be extremely grateful for any help!
Thanks
We tried running your code by creating dummy 'dataset' and 'curr_test' variables. We were able to run the program successfully. Please refer this thread
Please refer to the complete code attached below.
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
std::vector<double> distance_calculation_FPGA(queue& q,const std::vector<std::vector<double>>& dataset,const std::vector<double>& curr_test)
{
range<1> num_items{ dataset.size()};
std::vector<double>res;
res.resize(dataset.size());
buffer dataset_buf(dataset);
buffer curr_test_buf(curr_test);
buffer res_buf(res.data(), num_items);
q.submit([&](handler& h) {
accessor a(dataset_buf, h, read_only);
accessor b(curr_test_buf, h, read_only);
accessor dif(res_buf, h, write_only, no_init);
h.parallel_for(num_items, [=](auto i) {
for (int j = 0; j <(const int) a[i].size(); ++j) {
// dif[i] += (a[i][j] - b[j]) * (a[i][j] - b[j]) ;
dif[i]+=a[i][j];
}
});
});
q.wait(); //We have added this line of code for synchronization.
for (int i : res) {
std::cout <<i<< std::endl;
}
return res;
}
int main(){
std::vector<std::vector<double>> dataset;
for(int i=0;i<5;i++)
{
std::vector<double> d;
for(int j=0;j<1600;j++)
{
d.push_back((double)j);
}
dataset.push_back(d);
}
std::vector<double> curr_test;
for(int i=0;i<1600;i++)
{
curr_test.push_back((double)i);
}
queue q;
std::cout << "Running on "<<
q.get_device().get_info<sycl::info::device::name>()<< std::endl;
//print the device name as a test to check the parallelisation
distance_calculation_FPGA(q,dataset,curr_test);
return 0;
}

Segmentation fault inside range

#include <iostream>
#include <vector>
#include <algorithm>
#include <queue> // std::priority_queue
using std::vector;
using std::cin;
using std::cout;
struct fj{
int indexI=0;
int freeT=0;
};
struct DereferenceCompareNode : public std::binary_function<fj, fj, bool>
{
bool operator()(const fj lhs, const fj rhs) const
{
return lhs.freeT > rhs.freeT;
}
};
class JobQueue {
private:
int num_workers_;
vector<int> jobs_;
vector<int> assigned_workers_;
vector<long long> start_times_;
void WriteResponse() const {
for (int i = 0; i < jobs_.size(); ++i) {
cout << assigned_workers_[i] << " " << start_times_[i] << "\n";
}
}
void ReadData() {
int m;
cin >> num_workers_ >> m;
jobs_.resize(m);
std::cout<<"Read fault"<<"\n";
for(int i = 0; i < m; i++)
cin >> jobs_[i];
std::cout<<"Read fault ends"<<"\n";
}
void AssignJobs() {
// TODO: replace this code with a faster algorithm.
std::cout<<"Fault point 1"<<"\n";
assigned_workers_.resize(jobs_.size());
start_times_.resize(jobs_.size());
vector<long long> next_free_time(num_workers_, 0);
std::priority_queue<int, vector<int>, std::greater<int> > thread;
std::priority_queue<fj, vector<fj>, DereferenceCompareNode > freeJob;
/*
for (int i = 0; i < jobs_.size(); ++i) {
int duration = jobs_[i];
int next_worker = 0;
for (int j = 0; j < num_workers_; ++j) {
if (next_free_time[j] < next_free_time[next_worker])
next_worker = j;
}
assigned_workers_[i] = next_worker;
start_times_[i] = next_free_time[next_worker];
next_free_time[next_worker] += duration;
}
*/
std::cout<<"dump point 2"<<"\n";
for(int i=0;i<num_workers_;i++){
thread.push(i);
}
std::cout<<"dump point 1"<<"\n";
int counter = 0;
while(jobs_.size()!=0){
std::cout<<"jobs_.size:"<<jobs_.size()<<"\n";
std::cout<<"freeJob.size:"<<freeJob.size()<<"\n";
//check logic
do{
if(freeJob.top().freeT == counter){
std::cout<<"freeJob.top().freeT:"<<freeJob.top().freeT<<"\n";
std::cout<<"counter:"<<counter<<"\n";
thread.push(freeJob.top().indexI);
freeJob.pop();
}else{
break;
}
}
while(freeJob.size()!=0);
std::cout<<"Thread:"<<thread.size()<<"\n";
while(thread.size()!=0){
if(jobs_.size()!=0){
fj currA;
currA.indexI = thread.top();
currA.freeT = jobs_.at(0)+counter;
std::cout<<"currA.indexI:"<<currA.indexI<<"\n";
std::cout<<"currA.freeT:"<<currA.freeT<<"\n";
thread.pop();
jobs_.erase(jobs_.begin());
assigned_workers_.push_back(currA.indexI);
start_times_.push_back(currA.freeT);
}else{
break;
}
}
counter++;
}
}
public:
void Solve() {
ReadData();
AssignJobs();
WriteResponse();
}
};
int main() {
std::ios_base::sync_with_stdio(false);
JobQueue job_queue;
job_queue.Solve();
return 0;
}
I am getting segmentation fault in function ReadData while taking inputs for vector jobs.
I am getting fault even when I am inside bounds of defined size.
Everything was fine when have not written AssignJob function.
Am I doing something wrong with some bounds or taking illegal inputs format or messing with some other stuff?
Am I doing something wrong
Yes, you are: freeJob starts out empty, so this is undefined behavior:
if(freeJob.top().freeT == counter){
In fact, you never push anything into freeJob, you only pop() things from it.

QFuture Memoryleak

I want to parallelize a function and have the problem that after a few hours my memory is overloaded.
The test program calculates something simple, and works so far. Only the memory usage is constantly increasing.
QT Project file:
QT -= gui
QT += concurrent widgets
CONFIG += c++11 console
CONFIG -= app_bundle
DEFINES += QT_DEPRECATED_WARNINGS
SOURCES += main.cpp
QT program file:
#include <QCoreApplication>
#include <qdebug.h>
#include <qtconcurrentrun.h>
double parallel_function(int instance){
return (double)(instance)*10.0;
}
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
int nr_of_threads = 8;
double result_sum,temp_var;
for(qint32 i = 0; i<100000000; i++){
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
}
qDebug()<<"total: "<<result_sum;
return a.exec();
}
As I have observed, QtConcurrent::run(parallel_function,thread) allocates memory, but does not release memory after future[thread].waitForFinished().
What's wrong here?
You have memory leak because future array is not deleted. Add delete[] future at the end of outer for loop.
for(qint32 i = 0; i<100000000; i++)
{
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
delete[] future; // <--
}
Here's how this might look - note how much simpler everything can be! You're dead set on doing manual memory management: why? First of all, QFuture is a value. You can store it very efficiently in any vector container that will manage the memory for you. You can iterate such a container using range-for. Etc.
QT = concurrent # dependencies are automatic, you don't use widgets
CONFIG += c++14 console
CONFIG -= app_bundle
SOURCES = main.cpp
Even though the example is synthetic and the map_function is very simple, it's worth considering how to do things most efficiently and expressively. Your algorithm is a typical map-reduce operation, and blockingMappedReduce has half the overhead of manually doing all of the work.
First of all, let's recast the original problem in C++, instead of some C-with-pluses Frankenstein.
// https://github.com/KubaO/stackoverflown/tree/master/questions/future-ranges-49107082
/* QtConcurrent will include QtCore as well */
#include <QtConcurrent>
#include <algorithm>
#include <iterator>
using result_type = double;
static result_type map_function(int instance){
return instance * result_type(10);
}
static void sum_modifier(result_type &result, result_type value) {
result += value;
}
static result_type sum_function(result_type result, result_type value) {
return result + value;
}
result_type sum_approach1(int const N) {
QVector<QFuture<result_type>> futures(N);
int id = 0;
for (auto &future : futures)
future = QtConcurrent::run(map_function, id++);
return std::accumulate(futures.cbegin(), futures.cend(), result_type{}, sum_function);
}
There is no manual memory management, and no explicit splitting into "threads" - that was pointless, since the concurrent execution platform is aware of how many threads there are. So this is already better!
But this seems quite wasteful: each future internally allocates at least once (!).
Instead of using futures explicitly for each result, we can use the map-reduce framework. To generate the sequence, we can define an iterator that provides the integers we wish to work on. The iterator can be a forward or a bidirectional one, and its implementation is the bare minimum needed by QtConcurrent framework.
#include <iterator>
template <typename tag> class num_iterator : public std::iterator<tag, int, int, const int*, int> {
int num = 0;
using self = num_iterator;
using base = std::iterator<tag, int, int, const int*, int>;
public:
explicit num_iterator(int num = 0) : num(num) {}
self &operator++() { num ++; return *this; }
self &operator--() { num --; return *this; }
self &operator+=(typename base::difference_type d) { num += d; return *this; }
friend typename base::difference_type operator-(self lhs, self rhs) { return lhs.num - rhs.num; }
bool operator==(self o) const { return num == o.num; }
bool operator!=(self o) const { return !(*this == o); }
typename base::reference operator*() const { return num; }
};
using num_f_iterator = num_iterator<std::forward_iterator_tag>;
result_type sum_approach2(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_f_iterator{0}, num_f_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
using num_b_iterator = num_iterator<std::bidirectional_iterator_tag>;
result_type sum_approach3(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_b_iterator{0}, num_b_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
Could we drop the std::accumulate and use blockingMappedReduced instead? Sure:
result_type sum_approach4(int const N) {
return QtConcurrent::blockingMappedReduced(num_b_iterator{0}, num_b_iterator{N},
map_function, sum_modifier);
}
We can also try a random access iterator:
using num_r_iterator = num_iterator<std::random_access_iterator_tag>;
result_type sum_approach5(int const N) {
return QtConcurrent::blockingMappedReduced(num_r_iterator{0}, num_r_iterator{N},
map_function, sum_modifier);
}
Finally, we can switch from using range-generating iterators, to a precomputed range:
#include <numeric>
result_type sum_approach6(int const N) {
QVector<int> sequence(N);
std::iota(sequence.begin(), sequence.end(), 0);
return QtConcurrent::blockingMappedReduced(sequence, map_function, sum_modifier);
}
Of course, our point is to benchmark it all:
template <typename F> void benchmark(F fun, double const N) {
QElapsedTimer timer;
timer.start();
auto result = fun(N);
qDebug() << "sum:" << fixed << result << "took" << timer.elapsed()/N << "ms/item";
}
int main() {
const int N = 1000000;
benchmark(sum_approach1, N);
benchmark(sum_approach2, N);
benchmark(sum_approach3, N);
benchmark(sum_approach4, N);
benchmark(sum_approach5, N);
benchmark(sum_approach6, N);
}
On my system, in release build, the output is:
sum: 4999995000000.000000 took 0.015778 ms/item
sum: 4999995000000.000000 took 0.003631 ms/item
sum: 4999995000000.000000 took 0.003610 ms/item
sum: 4999995000000.000000 took 0.005414 ms/item
sum: 4999995000000.000000 took 0.000011 ms/item
sum: 4999995000000.000000 took 0.000008 ms/item
Note how using map-reduce on a random-iterable sequence has over 3 orders of magnitude lower overhead than using QtConcurrent::run, and is 2 orders of magnitude faster than non-random-iterable solutions.

Why am I not getting I/O-compute overlap with this code?

The following program:
#include <iostream>
#include <array>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
__global__ void dummy(clock_value_t duration_in_cycles)
{
gpu_sleep(duration_in_cycles);
}
int main()
{
const clock_value_t duration_in_clocks = 1e7;
const size_t buffer_size = 5e7;
constexpr const auto num_streams = 2;
std::array<char*, num_streams> host_ptrs;
std::array<char*, num_streams> device_ptrs;
std::array<cudaStream_t, num_streams> streams;
for (auto i=0; i<num_streams; i++) {
cudaMallocHost(&host_ptrs[i], buffer_size);
cudaMalloc(&device_ptrs[i], buffer_size);
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaDeviceSynchronize();
for (auto i=0; i<num_streams; i++) {
cudaMemcpyAsync(device_ptrs[i], host_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
dummy<<<128, 128, 0, streams[i]>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptrs[i], device_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
}
for (auto i=0; i<num_streams; i++) { cudaStreamSynchronize(streams[i]); }
for (auto i=0; i<num_streams; i++) {
cudaFreeHost(host_ptrs[i]);
cudaFree(device_ptrs[i]);
}
}
should result in overlapping I/O and Compute between the work on the first and second streams: When the first stream's Host-to-Device ends, the first stream's kernel can start, but so can the second stream's Host-to-Device transfer. Instead, I get the following timeline, with no overlap:
I think I've covered my bases to ensure overlap. The streams are non-blocking (and indeed the enqueueing of work concludes well before the first HtoD does); the host memory is pinned... so what's missing for me to see overlap?
Using CUDA 8.0.61 on GNU/Linux Mint 18.2 with an NVIDIA GTX 650 Ti Boost. But the driver is v384.59.
Ok, it must be something with my GPU model, because with Fedora 25, and a GTX Titan X, I get:

Examples for reading text files in FreeBSD kernel module

Could anyone give some simple examples (function names are good) for reading text files line by line (binary is OK if text is really hard) in a FreeBSD kernel module, from a given directory?
Really appreciate your kind help.
Here's a sample kernel module that'll cat your /etc/motd on load:
// kernel module motd catter.
// Doug Luce doug#forephypodia.con.com
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
static int catfile(const char *filename) {
struct sbuf *sb;
static char buf[128];
struct nameidata nd;
off_t ofs;
ssize_t resid;
int error, flags, len;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, curthread);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
ofs = 0;
len = sizeof(buf) - 1;
sb = sbuf_new_auto();
while (1) {
error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred,
NOCRED, &resid, curthread);
if (error)
break;
if (resid == len)
break;
buf[len - resid] = 0;
sbuf_printf(sb, "%s", buf);
ofs += len - resid;
}
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, curthread->td_ucred, curthread);
uprintf("%s", sbuf_data(sb));
return 0;
}
static int EventHandler(struct module *inModule, int inEvent, void *inArg) {
switch (inEvent) {
case MOD_LOAD:
uprintf("MOTD module loading.\n");
if (catfile("/etc/motd") != 0)
uprintf("Error reading MOTD.\n");
return 0;
case MOD_UNLOAD:
uprintf("MOTD module unloading.\n");
return 0;
default:
return EOPNOTSUPP;
}
}
static moduledata_t moduleData = {
"motd_kmod",
EventHandler,
NULL
};
DECLARE_MODULE(motd_kmod, moduleData, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
This was cobbled together mostly from bits of https://svnweb.freebsd.org/base/release/10.1.0/sys/kern/vfs_mountroot.c?revision=274417&view=markup
There's no nice scanning/parsing facilities native kernel-side, so
that's usually done the hard way.

Resources