I was trying to run this code but while compiling in intel devcloud using these commands:
icpx -qopenmp -fopenmp-targets=spir64 openmp_target_offload_clause_ordering.cpp
export OMP_TARGET_OFFLOAD=MANDATORY
it is showing runtime error.
#include <stdio.h>
int main() {
double *V = reinterpret_cast<double*>(0xdeadbeef);
printf("pointer=%p\n", V);
#pragma omp target parallel for simd is_device_ptr(V) if(true)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
#pragma omp target parallel for simd if(true) is_device_ptr(V)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
return 100;
}
The device pointer you are entering is invalid.
Replace (0xdeadbeef) with (omp_target_alloc(size, 0)) in the first line of your main function as follows:
double ptr = reinterpret_cast<double>(omp_target_alloc(size, 0));
Hope this helps!
Related
I am trying to calculate the euclidean distance for KNN but in parallel using dpc++. the training dataset contains 5 features and 1600 rows, while I want to calculate the distance between the current test point and each training point on the grid in parallel, but I keep getting an error regarding sycl kernal.
code for the function:
code
std::vector<double> distance_calculation_FPGA(queue& q,const std::vector<std::vector<double>>& dataset,const std::vector<double>& curr_test) {
range<1> num_items{ dataset.size()};
std::vector<double>res;
res.resize(dataset.size());
buffer dataset_buf(dataset);
buffer curr_test_buf(curr_test);
buffer res_buf(res.data(), num_items);
q.submit([&](handler& h) {
accessor a(dataset_buf, h, read_only);
accessor b(curr_test_buf, h, read_only);
accessor dif(res_buf, h, write_only, no_init);
h.parallel_for(num_items, [=](auto i) {
for (int j = 0; j <(const int) a[i].size(); ++j) {
dif[i] += (a[i][j] - b[j]) * (a[i][j] - b[j]) ;
}
});
});
for (int i = 0; i < res.size(); ++i) {
std::cout << res[i] << std::endl;
}
//old distance calculation (serial)
//for (int i = 0; i < dataset.size(); ++i) {
// double dis = 0;
// for (int j = 0; j < dataset[i].size(); ++j) {
// dis += (curr_test[j] - dataset[i][j]) * (curr_test[j] - dataset[i][j]);
//}
//res.push_back(dis);
//}
return res;
}
the error I am receiving:
SYCL kernel cannot call a variadic function
SYCL kernel cannot call an undefined function without SYCL_EXTERNAL attribute
Would be extremely grateful for any help!
Thanks
We tried running your code by creating dummy 'dataset' and 'curr_test' variables. We were able to run the program successfully. Please refer this thread
Please refer to the complete code attached below.
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
std::vector<double> distance_calculation_FPGA(queue& q,const std::vector<std::vector<double>>& dataset,const std::vector<double>& curr_test)
{
range<1> num_items{ dataset.size()};
std::vector<double>res;
res.resize(dataset.size());
buffer dataset_buf(dataset);
buffer curr_test_buf(curr_test);
buffer res_buf(res.data(), num_items);
q.submit([&](handler& h) {
accessor a(dataset_buf, h, read_only);
accessor b(curr_test_buf, h, read_only);
accessor dif(res_buf, h, write_only, no_init);
h.parallel_for(num_items, [=](auto i) {
for (int j = 0; j <(const int) a[i].size(); ++j) {
// dif[i] += (a[i][j] - b[j]) * (a[i][j] - b[j]) ;
dif[i]+=a[i][j];
}
});
});
q.wait(); //We have added this line of code for synchronization.
for (int i : res) {
std::cout <<i<< std::endl;
}
return res;
}
int main(){
std::vector<std::vector<double>> dataset;
for(int i=0;i<5;i++)
{
std::vector<double> d;
for(int j=0;j<1600;j++)
{
d.push_back((double)j);
}
dataset.push_back(d);
}
std::vector<double> curr_test;
for(int i=0;i<1600;i++)
{
curr_test.push_back((double)i);
}
queue q;
std::cout << "Running on "<<
q.get_device().get_info<sycl::info::device::name>()<< std::endl;
//print the device name as a test to check the parallelisation
distance_calculation_FPGA(q,dataset,curr_test);
return 0;
}
#include <iostream>
#include <vector>
#include <algorithm>
#include <queue> // std::priority_queue
using std::vector;
using std::cin;
using std::cout;
struct fj{
int indexI=0;
int freeT=0;
};
struct DereferenceCompareNode : public std::binary_function<fj, fj, bool>
{
bool operator()(const fj lhs, const fj rhs) const
{
return lhs.freeT > rhs.freeT;
}
};
class JobQueue {
private:
int num_workers_;
vector<int> jobs_;
vector<int> assigned_workers_;
vector<long long> start_times_;
void WriteResponse() const {
for (int i = 0; i < jobs_.size(); ++i) {
cout << assigned_workers_[i] << " " << start_times_[i] << "\n";
}
}
void ReadData() {
int m;
cin >> num_workers_ >> m;
jobs_.resize(m);
std::cout<<"Read fault"<<"\n";
for(int i = 0; i < m; i++)
cin >> jobs_[i];
std::cout<<"Read fault ends"<<"\n";
}
void AssignJobs() {
// TODO: replace this code with a faster algorithm.
std::cout<<"Fault point 1"<<"\n";
assigned_workers_.resize(jobs_.size());
start_times_.resize(jobs_.size());
vector<long long> next_free_time(num_workers_, 0);
std::priority_queue<int, vector<int>, std::greater<int> > thread;
std::priority_queue<fj, vector<fj>, DereferenceCompareNode > freeJob;
/*
for (int i = 0; i < jobs_.size(); ++i) {
int duration = jobs_[i];
int next_worker = 0;
for (int j = 0; j < num_workers_; ++j) {
if (next_free_time[j] < next_free_time[next_worker])
next_worker = j;
}
assigned_workers_[i] = next_worker;
start_times_[i] = next_free_time[next_worker];
next_free_time[next_worker] += duration;
}
*/
std::cout<<"dump point 2"<<"\n";
for(int i=0;i<num_workers_;i++){
thread.push(i);
}
std::cout<<"dump point 1"<<"\n";
int counter = 0;
while(jobs_.size()!=0){
std::cout<<"jobs_.size:"<<jobs_.size()<<"\n";
std::cout<<"freeJob.size:"<<freeJob.size()<<"\n";
//check logic
do{
if(freeJob.top().freeT == counter){
std::cout<<"freeJob.top().freeT:"<<freeJob.top().freeT<<"\n";
std::cout<<"counter:"<<counter<<"\n";
thread.push(freeJob.top().indexI);
freeJob.pop();
}else{
break;
}
}
while(freeJob.size()!=0);
std::cout<<"Thread:"<<thread.size()<<"\n";
while(thread.size()!=0){
if(jobs_.size()!=0){
fj currA;
currA.indexI = thread.top();
currA.freeT = jobs_.at(0)+counter;
std::cout<<"currA.indexI:"<<currA.indexI<<"\n";
std::cout<<"currA.freeT:"<<currA.freeT<<"\n";
thread.pop();
jobs_.erase(jobs_.begin());
assigned_workers_.push_back(currA.indexI);
start_times_.push_back(currA.freeT);
}else{
break;
}
}
counter++;
}
}
public:
void Solve() {
ReadData();
AssignJobs();
WriteResponse();
}
};
int main() {
std::ios_base::sync_with_stdio(false);
JobQueue job_queue;
job_queue.Solve();
return 0;
}
I am getting segmentation fault in function ReadData while taking inputs for vector jobs.
I am getting fault even when I am inside bounds of defined size.
Everything was fine when have not written AssignJob function.
Am I doing something wrong with some bounds or taking illegal inputs format or messing with some other stuff?
Am I doing something wrong
Yes, you are: freeJob starts out empty, so this is undefined behavior:
if(freeJob.top().freeT == counter){
In fact, you never push anything into freeJob, you only pop() things from it.
I want to parallelize a function and have the problem that after a few hours my memory is overloaded.
The test program calculates something simple, and works so far. Only the memory usage is constantly increasing.
QT Project file:
QT -= gui
QT += concurrent widgets
CONFIG += c++11 console
CONFIG -= app_bundle
DEFINES += QT_DEPRECATED_WARNINGS
SOURCES += main.cpp
QT program file:
#include <QCoreApplication>
#include <qdebug.h>
#include <qtconcurrentrun.h>
double parallel_function(int instance){
return (double)(instance)*10.0;
}
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
int nr_of_threads = 8;
double result_sum,temp_var;
for(qint32 i = 0; i<100000000; i++){
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
}
qDebug()<<"total: "<<result_sum;
return a.exec();
}
As I have observed, QtConcurrent::run(parallel_function,thread) allocates memory, but does not release memory after future[thread].waitForFinished().
What's wrong here?
You have memory leak because future array is not deleted. Add delete[] future at the end of outer for loop.
for(qint32 i = 0; i<100000000; i++)
{
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
delete[] future; // <--
}
Here's how this might look - note how much simpler everything can be! You're dead set on doing manual memory management: why? First of all, QFuture is a value. You can store it very efficiently in any vector container that will manage the memory for you. You can iterate such a container using range-for. Etc.
QT = concurrent # dependencies are automatic, you don't use widgets
CONFIG += c++14 console
CONFIG -= app_bundle
SOURCES = main.cpp
Even though the example is synthetic and the map_function is very simple, it's worth considering how to do things most efficiently and expressively. Your algorithm is a typical map-reduce operation, and blockingMappedReduce has half the overhead of manually doing all of the work.
First of all, let's recast the original problem in C++, instead of some C-with-pluses Frankenstein.
// https://github.com/KubaO/stackoverflown/tree/master/questions/future-ranges-49107082
/* QtConcurrent will include QtCore as well */
#include <QtConcurrent>
#include <algorithm>
#include <iterator>
using result_type = double;
static result_type map_function(int instance){
return instance * result_type(10);
}
static void sum_modifier(result_type &result, result_type value) {
result += value;
}
static result_type sum_function(result_type result, result_type value) {
return result + value;
}
result_type sum_approach1(int const N) {
QVector<QFuture<result_type>> futures(N);
int id = 0;
for (auto &future : futures)
future = QtConcurrent::run(map_function, id++);
return std::accumulate(futures.cbegin(), futures.cend(), result_type{}, sum_function);
}
There is no manual memory management, and no explicit splitting into "threads" - that was pointless, since the concurrent execution platform is aware of how many threads there are. So this is already better!
But this seems quite wasteful: each future internally allocates at least once (!).
Instead of using futures explicitly for each result, we can use the map-reduce framework. To generate the sequence, we can define an iterator that provides the integers we wish to work on. The iterator can be a forward or a bidirectional one, and its implementation is the bare minimum needed by QtConcurrent framework.
#include <iterator>
template <typename tag> class num_iterator : public std::iterator<tag, int, int, const int*, int> {
int num = 0;
using self = num_iterator;
using base = std::iterator<tag, int, int, const int*, int>;
public:
explicit num_iterator(int num = 0) : num(num) {}
self &operator++() { num ++; return *this; }
self &operator--() { num --; return *this; }
self &operator+=(typename base::difference_type d) { num += d; return *this; }
friend typename base::difference_type operator-(self lhs, self rhs) { return lhs.num - rhs.num; }
bool operator==(self o) const { return num == o.num; }
bool operator!=(self o) const { return !(*this == o); }
typename base::reference operator*() const { return num; }
};
using num_f_iterator = num_iterator<std::forward_iterator_tag>;
result_type sum_approach2(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_f_iterator{0}, num_f_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
using num_b_iterator = num_iterator<std::bidirectional_iterator_tag>;
result_type sum_approach3(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_b_iterator{0}, num_b_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
Could we drop the std::accumulate and use blockingMappedReduced instead? Sure:
result_type sum_approach4(int const N) {
return QtConcurrent::blockingMappedReduced(num_b_iterator{0}, num_b_iterator{N},
map_function, sum_modifier);
}
We can also try a random access iterator:
using num_r_iterator = num_iterator<std::random_access_iterator_tag>;
result_type sum_approach5(int const N) {
return QtConcurrent::blockingMappedReduced(num_r_iterator{0}, num_r_iterator{N},
map_function, sum_modifier);
}
Finally, we can switch from using range-generating iterators, to a precomputed range:
#include <numeric>
result_type sum_approach6(int const N) {
QVector<int> sequence(N);
std::iota(sequence.begin(), sequence.end(), 0);
return QtConcurrent::blockingMappedReduced(sequence, map_function, sum_modifier);
}
Of course, our point is to benchmark it all:
template <typename F> void benchmark(F fun, double const N) {
QElapsedTimer timer;
timer.start();
auto result = fun(N);
qDebug() << "sum:" << fixed << result << "took" << timer.elapsed()/N << "ms/item";
}
int main() {
const int N = 1000000;
benchmark(sum_approach1, N);
benchmark(sum_approach2, N);
benchmark(sum_approach3, N);
benchmark(sum_approach4, N);
benchmark(sum_approach5, N);
benchmark(sum_approach6, N);
}
On my system, in release build, the output is:
sum: 4999995000000.000000 took 0.015778 ms/item
sum: 4999995000000.000000 took 0.003631 ms/item
sum: 4999995000000.000000 took 0.003610 ms/item
sum: 4999995000000.000000 took 0.005414 ms/item
sum: 4999995000000.000000 took 0.000011 ms/item
sum: 4999995000000.000000 took 0.000008 ms/item
Note how using map-reduce on a random-iterable sequence has over 3 orders of magnitude lower overhead than using QtConcurrent::run, and is 2 orders of magnitude faster than non-random-iterable solutions.
The following program:
#include <iostream>
#include <array>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
__global__ void dummy(clock_value_t duration_in_cycles)
{
gpu_sleep(duration_in_cycles);
}
int main()
{
const clock_value_t duration_in_clocks = 1e7;
const size_t buffer_size = 5e7;
constexpr const auto num_streams = 2;
std::array<char*, num_streams> host_ptrs;
std::array<char*, num_streams> device_ptrs;
std::array<cudaStream_t, num_streams> streams;
for (auto i=0; i<num_streams; i++) {
cudaMallocHost(&host_ptrs[i], buffer_size);
cudaMalloc(&device_ptrs[i], buffer_size);
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaDeviceSynchronize();
for (auto i=0; i<num_streams; i++) {
cudaMemcpyAsync(device_ptrs[i], host_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
dummy<<<128, 128, 0, streams[i]>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptrs[i], device_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
}
for (auto i=0; i<num_streams; i++) { cudaStreamSynchronize(streams[i]); }
for (auto i=0; i<num_streams; i++) {
cudaFreeHost(host_ptrs[i]);
cudaFree(device_ptrs[i]);
}
}
should result in overlapping I/O and Compute between the work on the first and second streams: When the first stream's Host-to-Device ends, the first stream's kernel can start, but so can the second stream's Host-to-Device transfer. Instead, I get the following timeline, with no overlap:
I think I've covered my bases to ensure overlap. The streams are non-blocking (and indeed the enqueueing of work concludes well before the first HtoD does); the host memory is pinned... so what's missing for me to see overlap?
Using CUDA 8.0.61 on GNU/Linux Mint 18.2 with an NVIDIA GTX 650 Ti Boost. But the driver is v384.59.
Ok, it must be something with my GPU model, because with Fedora 25, and a GTX Titan X, I get:
Could anyone give some simple examples (function names are good) for reading text files line by line (binary is OK if text is really hard) in a FreeBSD kernel module, from a given directory?
Really appreciate your kind help.
Here's a sample kernel module that'll cat your /etc/motd on load:
// kernel module motd catter.
// Doug Luce doug#forephypodia.con.com
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
static int catfile(const char *filename) {
struct sbuf *sb;
static char buf[128];
struct nameidata nd;
off_t ofs;
ssize_t resid;
int error, flags, len;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, curthread);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
ofs = 0;
len = sizeof(buf) - 1;
sb = sbuf_new_auto();
while (1) {
error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred,
NOCRED, &resid, curthread);
if (error)
break;
if (resid == len)
break;
buf[len - resid] = 0;
sbuf_printf(sb, "%s", buf);
ofs += len - resid;
}
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, curthread->td_ucred, curthread);
uprintf("%s", sbuf_data(sb));
return 0;
}
static int EventHandler(struct module *inModule, int inEvent, void *inArg) {
switch (inEvent) {
case MOD_LOAD:
uprintf("MOTD module loading.\n");
if (catfile("/etc/motd") != 0)
uprintf("Error reading MOTD.\n");
return 0;
case MOD_UNLOAD:
uprintf("MOTD module unloading.\n");
return 0;
default:
return EOPNOTSUPP;
}
}
static moduledata_t moduleData = {
"motd_kmod",
EventHandler,
NULL
};
DECLARE_MODULE(motd_kmod, moduleData, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
This was cobbled together mostly from bits of https://svnweb.freebsd.org/base/release/10.1.0/sys/kern/vfs_mountroot.c?revision=274417&view=markup
There's no nice scanning/parsing facilities native kernel-side, so
that's usually done the hard way.