I want to store a 3-dimensional matrix in a way that all the data is contiguous. This is because I need to send the matrix to a master node using MPI, and then concatenate all the smaller matrices into one big matrix. This becomes cumbersome with a nested vector, since you can only send more than one int, double, etc. if they are contiguous in memory, so you have to send each of the innermost vectors separately, using something like
for (int i = 0; i < nx; i++) {
for (int j = 0; j < ny; j++) {
int id_of_data = i*ny*nz + j*nz;
MPI_Send(&local_matrix[i][j][0], nz, MPI_DOUBLE, 0, id_of_data, MPI_COMM_WORLD);
}
}
and then receive the data on the master node, using id_of_data
Right now I'm using std vectors for storing the matrix,
std::vector<std::vector<std::vector<double> > > matrix;
so I'm trying to stick to std::vector for now. I realize it's easy to solve this using arrays/pointers, but I want to try this using std::vectors before resorting to that. The dimensions of the matrix are constant, so I don't have to worry about dynamic allocation.
I can access the elements of the vector using pointers like this
double* a = &matrix[x][y][z];
but I don't know how to do "the opposite", ie. something like
double* vec_ptr = new double[nz]; // array of nz doubles
&matrix[x][y][0] = vec_ptr;
What I would like to do is
double* linear_matrix = new double[nx*ny*nz]; // one contiguous array of nx*ny*nz doubles
and then somehow make the nested vectors point to this data.
I ended up creating my own classes for this kind of situation. It is very typical for matrix manipulation libraries to take a matrix as a contiguous block of data, but at the same time I would like to be able to fill and read it using the standard m[i][j] accessors. So, here you go. Below is the code to create containers in arbitrary dimensions as a contiguous block of memory. I am using the Boost preprocessor library here to be able to do any dimension. If you only need it up to 3d, it is quite obvious how to de-boostify it :)
memblock.h
#ifndef BOOST_PP_IS_ITERATING
#ifndef MEMBLOCK_INCLUDED
#define MEMBLOCK_INCLUDED
#include <algorithm>
#include <boost/preprocessor.hpp>
# ifndef MBLOCK_MAX_DIM
# define MBLOCK_MAX_DIM 4
# endif
// Owns one heap-allocated, contiguous array of T.
//
// The block does not remember its length: callers must keep track of the size
// they passed to resize(). Copying is disabled, because a member-wise copy
// would leave two objects calling delete[] on the same array.
template <typename T> class MemoryBlock1d
{
protected:
    T* m_data; // start of the owned array, or 0 when nothing is allocated
public:
    MemoryBlock1d() : m_data(0) {}
    // Allocates `size` default-initialised elements.
    MemoryBlock1d(int size) : m_data(0) { resize(size); }
    // Allocates `size` elements, each set to `val`.
    MemoryBlock1d(int size, T val) : m_data(0) { resize(size, val); }
    virtual ~MemoryBlock1d();
    // Reference to the internal pointer; derived classes use it to reset or read the block.
    inline T*& data() { return m_data; }
    // Discards the current contents and allocates `size` new elements.
    inline void resize(int size);
    // Same as resize(size), then fills every element with `val`.
    inline void resize(int size, T val);
    // Releases the array and leaves the block empty (data() == 0).
    inline void clear();
    // Unchecked element access.
    inline T& operator[](int i) { return m_data[i]; }
private:
    // Declared but intentionally not defined: the block is non-copyable.
    MemoryBlock1d(const MemoryBlock1d&);
    MemoryBlock1d& operator=(const MemoryBlock1d&);
};
// Implementation 1d:
template <typename T> inline MemoryBlock1d<T>::~MemoryBlock1d()
{
    delete[] m_data; // delete[] on a null pointer is a no-op
}
template <typename T> inline void MemoryBlock1d<T>::clear()
{
    delete[] m_data;
    m_data = 0;
}
template <typename T> inline void MemoryBlock1d<T>::resize(int size)
{
    delete[] m_data;
    // Reset first so the destructor does not free the old array a second time
    // if the allocation (or an element constructor) below throws.
    m_data = 0;
    m_data = new T[size];
}
template <typename T> void MemoryBlock1d<T>::resize(int size, T val)
{
    resize(size);
    std::fill(m_data, m_data + size, val);
}
// generate multi_dimensional blocks:
# define BOOST_PP_ITERATION_LIMITS (2, MBLOCK_MAX_DIM)
# define BOOST_PP_FILENAME_1 "memblock.h" // this file
# include BOOST_PP_ITERATE()
#endif // MEMBLOCK_INCLUDED
#else // BOOST_PP_IS_ITERATING
// Helper macros for one pass of the self-inclusion; all of them are
// #undef'd again at the end of this branch.
// n: value of the current iteration (the iteration limits are 2..MBLOCK_MAX_DIM).
# define n BOOST_PP_ITERATION()
// nprev: n - 1.
# define nprev BOOST_PP_SUB(n, 1)
// Class name for the current dimension, e.g. MemoryBlock3d when n is 3.
# define MemoryBlockNd BOOST_PP_CAT(MemoryBlock, BOOST_PP_CAT(n, d))
// Class name for the previous dimension (n - 1).
# define MemoryBlockPREVd BOOST_PP_CAT(MemoryBlock, BOOST_PP_CAT(nprev, d))
// Repetition callback that emits `data` unchanged.
# define MB_print(z, m, data) data
// Repetition callback that emits "* <data><m>", e.g. "* size1".
# define MB_timesDATAM(z, m, data) * BOOST_PP_CAT(data, m)
// n-dimensional block. The elements live in the single array owned by the
// MemoryBlock1d<T> base; m_pointers is an (n-1)-dimensional block of T* whose
// entries are set by resize() to point into that array, which is what makes
// chained [] indexing work.
// NOTE(review): clear() is inherited from the base and is not overridden here,
// so it frees the element array without touching m_pointers.
template <typename T> class MemoryBlockNd : public MemoryBlock1d<T>
{
private:
// Pointer table, one dimension lower than this block.
MemoryBlockPREVd<T*> m_pointers;
public:
// Empty block: the base data pointer is set to 0.
MemoryBlockNd() { MemoryBlock1d<T>::data() = 0; }
// Takes n int extents (size0 .. size(n-1)) and allocates via resize().
MemoryBlockNd(BOOST_PP_ENUM_PARAMS(n, int size)) { MemoryBlock1d<T>::data() = 0; resize(BOOST_PP_ENUM_PARAMS(n, size)); }
// Same, then fills every element with val.
MemoryBlockNd(BOOST_PP_ENUM_PARAMS(n, int size), T val) { MemoryBlock1d<T>::data() = 0; resize(BOOST_PP_ENUM_PARAMS(n, size), val); }
// These overloads hide the one-dimensional resize() of the base class.
inline void resize(BOOST_PP_ENUM_PARAMS(n, int size));
inline void resize(BOOST_PP_ENUM_PARAMS(n, int size), T val);
// Returns a reference to a T followed by (n-1) '*', taken from the pointer table.
inline T BOOST_PP_REPEAT(nprev, MB_print,*) & operator[](int i) { return m_pointers[i]; }
};
// Implementation n-dim:
// Reallocates the block for the given n extents and rebuilds the pointer table.
// The element array is replaced by a fresh allocation, so old contents are not kept.
template <typename T> void MemoryBlockNd<T>::resize(BOOST_PP_ENUM_PARAMS(n, int size))
{
// Resize the pointer block using the first n-1 extents.
m_pointers.resize(BOOST_PP_ENUM_PARAMS(nprev, size));
// Product of the first n-1 extents: the number of innermost rows.
int sizePointers = size0 BOOST_PP_REPEAT_FROM_TO(1, nprev, MB_timesDATAM, size);
// Total element count: rows times the last extent.
int sizeData = sizePointers * BOOST_PP_CAT(size, nprev);
MemoryBlock1d<T>::resize(sizeData);
T *p = MemoryBlock1d<T>::data();
T **ptr = m_pointers.data();
// Entry i of the flat pointer array addresses the start of row i; p advances by the last extent each step.
for (int i=0; i < sizePointers; i++, p += BOOST_PP_CAT(size, nprev)) ptr[i] = p;
}
// Reallocates the block for the given n extents, then sets every element to val.
template <typename T> void MemoryBlockNd<T>::resize(BOOST_PP_ENUM_PARAMS(n, int size), T val)
{
resize(BOOST_PP_ENUM_PARAMS(n, size));
// Product of all n extents: the number of elements to fill.
int sizeData = size0 BOOST_PP_REPEAT_FROM_TO(1, n, MB_timesDATAM, size);
std::fill(MemoryBlock1d<T>::data(), MemoryBlock1d<T>::data() + sizeData, val);
}
# undef MB_timesDATAM
# undef MB_print
# undef MemoryBlockPREVd
# undef MemoryBlockNd
# undef nprev
# undef n
#endif // BOOST_PP_IS_ITERATING
memblock.cpp
// test the class here
#include <stdio.h>
#include <memblock.h>
int main() {
    // 2 x 2 x 2 cube of doubles.
    MemoryBlock3d<double> cube(2, 2, 2);

    // Write all 8 values through a flat pointer to the first element ...
    double *flat = &cube[0][0][0];
    int idx = 0;
    while (idx < 8) {
        flat[idx] = 0.5 + idx;
        ++idx;
    }

    // ... and read them back through the nested accessors, last index fastest.
    for (int cell = 0; cell < 8; ++cell) {
        const int i = cell / 4;
        const int j = (cell / 2) % 2;
        const int k = cell % 2;
        printf("%f\n", cube[i][j][k]);
    }
    return 0;
}
Related
I got a homework assignment where I needed to write a program to find the MST of a graph. I tried running it on the school server but I get a run-time error. On big servers like HackerEarth or HackerRank, however, I got correct answers on all test cases. The upper bound for the number of edges and vertices is 100000, and for the weight of an edge it is 10000. Vertices are labeled from 0 to n.
Here is my code:
#include <stdio.h>
#include <algorithm>
using namespace std;
// One weighted edge between vertices x and y; edges are ordered by weight.
struct Edge
{
    int x;
    int y;
    long long w;
    bool operator<(const Edge& next) const;
};

// True when this edge is strictly lighter than `next`; endpoints are ignored.
bool Edge::operator<(const Edge& next) const
{
    return next.w > w;
}
// One node of the disjoint-set forest: index of its parent node and its rank.
struct DisjointSet
{
    int parent;
    int rank;
};
// Disjoint-set forest, indexed by vertex number.
// NOTE(review): with `using namespace std;` in effect, the name `set` can
// clash with std::set if <set> is ever included — confirm on the target compiler.
DisjointSet set[100100];
// Edge list filled from the input and sorted by weight in main().
Edge edges[100100];
// Returns the representative (root) of x's tree and re-parents every node on
// the path from x directly to that root.
int findWithPathCompression(int x)
{
    // Pass 1: walk up to the root.
    int root = x;
    while (set[root].parent != root)
        root = set[root].parent;

    // Pass 2: point each node on the path straight at the root.
    while (set[x].parent != x)
    {
        const int up = set[x].parent;
        set[x].parent = root;
        x = up;
    }
    return root;
}
// Merges the trees containing x1 and y1.
// Returns false when both are already in the same tree, true after a merge.
bool unionByRank(int x1, int y1)
{
    const int rootA = findWithPathCompression(x1);
    const int rootB = findWithPathCompression(y1);
    if (rootA == rootB)
        return false;

    // The root with the strictly larger rank becomes the parent.
    if (set[rootB].rank > set[rootA].rank)
    {
        set[rootA].parent = rootB;
        return true;
    }

    // rootA wins; on a tie its rank grows by one.
    if (set[rootA].rank == set[rootB].rank)
        set[rootA].rank++;
    set[rootB].parent = rootA;
    return true;
}
// Reads "n m" followed by m edges "x y w", runs Kruskal's algorithm and prints
// the total weight of the edges that were accepted.
// Returns 1 on malformed or out-of-range input, 0 otherwise.
int main()
{
    const int maxVertices = sizeof(set) / sizeof(set[0]);
    const int maxEdges = sizeof(edges) / sizeof(edges[0]);

    int n, m, e = 0, c = 0;
    long long r = 0;
    if (scanf("%d %d", &n, &m) != 2)
        return 1;
    // Vertices are numbered 0..n, so index n must fit into set[].
    if (n < 0 || n >= maxVertices || m < 0 || m > maxEdges)
        return 1;

    for (int i = 0; i <= n; i++)
    {
        set[i].parent = i;
        set[i].rank = 1;
    }
    for (int i = 0; i < m; i++)
    {
        if (scanf("%d %d %lld", &edges[i].x, &edges[i].y, &edges[i].w) != 3)
            return 1;
        // Endpoints index set[], so reject anything outside 0..n.
        if (edges[i].x < 0 || edges[i].x > n || edges[i].y < 0 || edges[i].y > n)
            return 1;
    }

    std::sort(edges, edges + m);

    // Stop once n - 1 edges were accepted, or when the edge list is exhausted.
    // Without the second condition a disconnected graph walks past the end of edges[].
    while (e != n - 1 && c < m)
    {
        if (unionByRank(edges[c].x, edges[c].y))
        {
            r += edges[c].w;
            e++;
        }
        c++;
    }
    printf("%lld\n", r);
    return 0;
}
I am using RcppParallel to speed up some calculations. However, I am running out of memory in the process, so I would like to save only those results within the parallel loop that pass some relevance threshold. Below is a toy example to illustrate my point:
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace Rcpp;
// [[Rcpp::depends(RcppParallel)]]
// [[Rcpp::plugins(cpp11)]]
// Worker that computes y = x * (x - 1) for each index of its range and stores
// x and y at the same index of the two output vectors.
struct Example : public RcppParallel::Worker {
    RcppParallel::RVector<double> xvals, xvals_output, yvals;

    Example(const NumericVector & xvals, NumericVector & yvals, NumericVector & xvals_output)
        : xvals(xvals), xvals_output(xvals_output), yvals(yvals) {}

    // Processes the half-open index range [begin, end).
    void operator()(std::size_t begin, size_t end) {
        std::size_t i = begin;
        while (i < end) {
            const double x = xvals[i];
            double y = x * (x - 1);
            // if(y < 0) {
            // xvals_output.push_back(xvals[i]);
            // yvals.push_back(y);
            // }
            xvals_output[i] = x;
            yvals[i] = y;
            ++i;
        }
    }
};
// [[Rcpp::export]]
// Runs the Example worker over all of xvals and returns list(x values, y values).
List find_values(NumericVector xvals) {
    NumericVector xvals_output(xvals.size());
    NumericVector yvals(xvals.size());

    Example worker(xvals, yvals, xvals_output);
    parallelFor(0, xvals.size(), worker);

    return List::create(xvals_output, yvals);
}
The R code would be:
find_values(seq(-10,10, by=0.5))
The commented out code is what I would like to do.
That is, I would like to initialize an empty vector, and append only the y-values that pass a certain threshold and also the associated x-values.
In my real usage, I am calculating a MxN matrix, so memory is an issue.
What is the correct way to approach this issue?
If anyone ever comes across a similar problem, here's a solution using "concurrent_vector" from TBB (which RcppParallel uses under the hood and is available as a header).
#include <Rcpp.h>
#include <RcppParallel.h>
#include <tbb/concurrent_vector.h>
using namespace Rcpp;
// [[Rcpp::depends(RcppParallel)]]
// [[Rcpp::plugins(cpp11)]]
// Worker that appends (x, y) to a shared concurrent vector for every x in its
// range whose y = x * (x - 1) is negative.
struct Example : public RcppParallel::Worker {
    RcppParallel::RVector<double> xvals;
    tbb::concurrent_vector< std::pair<double, double> > &output;

    Example(const NumericVector & xvals, tbb::concurrent_vector< std::pair<double, double> > &output)
        : xvals(xvals), output(output) {}

    // Processes the half-open index range [begin, end).
    void operator()(std::size_t begin, size_t end) {
        for (std::size_t idx = begin; idx < end; ++idx) {
            const double x = xvals[idx];
            const double y = x * (x - 1);
            if (!(y < 0))
                continue;
            output.push_back(std::make_pair(x, y));
        }
    }
};
// [[Rcpp::export]]
// Collects every (x, y) pair kept by the Example worker and returns them as
// list(x values, y values). The order of the pairs follows the order in which
// the worker appended them to the concurrent vector.
List find_values(NumericVector xvals) {
    tbb::concurrent_vector< std::pair<double, double> > output;
    Example ex(xvals, output);
    parallelFor(0, xvals.size(), ex);

    // Use R's own vector index type instead of int, so the loop index is not
    // compared against an unsigned size and cannot overflow for long vectors.
    const R_xlen_t count = static_cast<R_xlen_t>(output.size());
    NumericVector xout(count);
    NumericVector yout(count);
    for (R_xlen_t i = 0; i < count; i++) {
        xout[i] = output[i].first;
        yout[i] = output[i].second;
    }
    List L = List::create(xout, yout);
    return L;
}
Output:
> find_values(seq(-10,10, by=0.5))
[[1]]
[1] 0.5
[[2]]
[1] -0.25
This is my code for MICEMAZE. Write a program that, given a description of the maze and the time limit, predicts the number of mice that will exit the maze. Assume that there are no bottlenecks in the maze, i.e. that all cells have room for an arbitrary number of mice.
#include <iostream>
#include <cstring>
#include <climits>
using namespace std;
// Shared state: main() reads n, e, t and m; djk() uses n and e.
int n, e, t, m;
// d[a][b] is the weight of the edge a -> b; 0 means there is no such edge.
int d[101][101];
// dis[v]: shortest distance from e to v as computed by djk().
int dis[101];
// vis[v]: set once v has been finalised by djk().
bool vis[101];

// Array-based Dijkstra starting at vertex e over vertices 0..n-1.
// Vertices that cannot be reached keep the initial distance 300000000.
void djk()
{
    const int kUnreached = 300000000;

    memset(vis, false, sizeof vis);
    for (int v = 0; v < n; ++v)
        dis[v] = kUnreached;
    dis[e] = 0;

    for (int round = 0; round < n; ++round)
    {
        // Pick the unvisited vertex with the smallest distance; on ties the
        // highest index wins because the comparison below is not strict.
        int best = INT_MAX;
        int closest = 0;
        for (int v = 0; v < n; ++v)
        {
            if (vis[v] || dis[v] > best)
                continue;
            best = dis[v];
            closest = v;
        }
        vis[closest] = true;

        // Relax every outgoing edge of the chosen vertex.
        for (int to = 0; to < n; ++to)
        {
            const int weight = d[closest][to];
            if (weight == 0 || dis[to] == INT_MAX || vis[to])
                continue;
            const int candidate = dis[closest] + weight;
            if (candidate < dis[to])
                dis[to] = candidate;
        }
    }
}
int main()
{
cin>>n>>e>>t>>m;
memset(d,0,sizeof d);
e--;
for(int i=0;i<m;i++)
{
int x,y,s;
cin>>x>>y>>s;
d[x-1][y-1]=s;
}
int cnt=0;
djk();
for(int i=0;i<n;i++)
{
//cout<<dis[i]<<endl;
if(dis[i]<=t) cnt++;
}
cout<<cnt<<endl;
return 0;
}
I'm a beginner in OpenCL. I'm trying to implement an OpenCL application, and I am unsure how to write the OpenCL kernel code. I have included the original C code below.
Question: how do I convert the given C code into OpenCL kernel code?
ORIGINAL C CODE:
int i, j;
// Start from the identity permutation.
for (i = 0; i < n; i++)
{
    Index[i] = i;
}
// Sort ascending by I: whenever a later element is smaller than element i,
// exchange the two and apply the same exchange to f and Index.
for (i = 0; i < n - 1; i++)
{
    for (j = i + 1; j < n; j++)
    {
        if (!(I[i] > I[j]))
            continue;

        double heldI = I[i]; // exchange attractiveness
        I[i] = I[j];
        I[j] = heldI;

        double heldF = f[i]; // exchange fitness
        f[i] = f[j];
        f[j] = heldF;

        int heldIndex = Index[i]; // exchange indexes
        Index[i] = Index[j];
        Index[j] = heldIndex;
    }
}
Example for 4096 element arrays(alternate bubble1 and bubble2 at least 2048 times--->4096(N) kernel executions ):
index init on host side since its just assignment.
Auxiliary functions:
// Exchanges elements i and j of an int array in private memory.
void swap2p(__private int * I,int i,int j)
{
    const int first = I[i];
    const int second = I[j];
    I[i] = second;
    I[j] = first;
}
// Exchanges elements i and j of an int array in global memory.
void swap2g(__global int * I,int i,int j)
{
    const int first = I[i];
    const int second = I[j];
    I[i] = second;
    I[j] = first;
}
Alternating kernel-1:
// Even phase of the alternating sort: work-item k compares the pair
// (2k, 2k+1) of I and, if it is out of order, swaps it in I, f and Index.
__kernel void bubble1(__global int * I, __global int * f, __global int * Index){
    int threadId=get_global_id(0);
    int left=threadId*2;
    int right=left+1;
    __private int vals[2];
    if(right<4096)
    {
        vals[0]=I[left];
        vals[1]=I[right];
        if(vals[0]>vals[1])
        {
            // vals holds only the two loaded values, so it is indexed 0 and 1,
            // not with the global positions.
            swap2p(vals,0,1);
            swap2g(f,left,right);
            swap2g(Index,left,right);
            I[left]=vals[0];
            I[right]=vals[1];
        }
    }
}
alternating kernel-2:
// Odd phase of the alternating sort: work-item k compares the pair
// (2k+1, 2k+2) of I and, if it is out of order, swaps it in I, f and Index.
// f and Index are kernel arguments here (same order as in bubble1) because
// the body swaps them and they are not visible otherwise.
__kernel void bubble2(__global int * I, __global int * f, __global int * Index){
    int threadId=get_global_id(0);
    int left=threadId*2+1;
    int right=left+1;
    __private int vals[2];
    if(right<4096)
    {
        vals[0]=I[left];
        vals[1]=I[right];
        if(vals[0]>vals[1])
        {
            // vals holds only the two loaded values, so it is indexed 0 and 1,
            // not with the global positions.
            swap2p(vals,0,1);
            swap2g(f,left,right);
            swap2g(Index,left,right);
            I[left]=vals[0];
            I[right]=vals[1];
        }
    }
}
Global thread number: N/2 (2048)
I made a dynamic vector class.
But a problem shows up when the main function loops over and over:
my2dArr's row size keeps increasing while the function is looping.
When new data comes in on each loop iteration, I want to copy the new data in.
// Copies data[i] into the first 15 columns of row i of a 36 x 100 array.
int main()
{
    // NOTE: data is never filled in this snippet; it must be given values
    // before the loop below reads it.
    int data[450];
    DynamicArray<int> my2dArr(36, 100);
    for(int i = 0;i < 36;++i)
    {
        for(int j = 1;j < 16;++j)
        {
            my2dArr[i][j-1] = data[i];
        }
    }
    return 0;
}
// vector class
// Resizable two-dimensional array stored as a vector of row vectors.
template <typename T>
class DynamicArray
{
public:
    // Empty array: no rows.
    DynamicArray() {}

    // rows x cols array of value-initialised elements.
    DynamicArray(int rows, int cols) : dArray(rows, std::vector<T>(cols)) {}

    // Unchecked access to row i; chain a second [] to reach an element.
    std::vector<T>& operator[](int i)
    {
        return dArray[i];
    }
    const std::vector<T>& operator[](int i) const
    {
        return dArray[i];
    }

    // Resizes the two-dimensional array to rows x cols.
    void resize(int rows, int cols)
    {
        dArray.resize(rows);
        for (int i = 0; i < rows; ++i)
            dArray[i].resize(cols);
    }

    // Removes every column but keeps the rows: each row ends up empty.
    void clearCOL()
    {
        for (std::size_t i = 0; i < dArray.size(); ++i)
            dArray[i].clear();
    }

private:
    std::vector<std::vector<T> > dArray;
};
The nested for loop should be fine at Initializing your array, but you'd need to put values into the data array to use it in initializing.
If you're only initializing the data once you might consider a third constructor overload that takes in an int[], like so:
// Builds a rows x cols array from a flat buffer laid out row by row.
// `array` must hold at least rows * cols elements.
DynamicArray( int rows, int cols, T array[] ): dArray( rows, vector< T >( cols ) )
{
    for( int i = 0; i < rows; i++ )
    {
        for( int j = 0; j < cols; j++ )
        {
            // Row i starts at offset i * cols (the row stride is the column count).
            dArray[i][j] = array[i * cols + j];
        }
    }
}
You'd need to make sure the array was the size you specified. In your example you pass a 450 int array in to initialize a 3,600 int DynamicArray. In your example you're actually reading illegal data because you go to the 16th column of each of the 36 rows, so you're actually reading 576 elements from a 450 int array. I suppose the array is uninitialized anyway though, so it's all garbage.