I have developed a dual chain markov monte carlo model designed to forecast loan portfolios in the excellent package Rcpp but have run into an issue trying to implement a parallelised version of these functions with RcppParallel.
I have based my attempts this far on this vignette (https://gallery.rcpp.org/articles/parallel-distance-matrix/) and this stackoverflow thread (How to call user-defined function in RcppParallel?).
All of the UDFs underlying this logic are implemented using Armadillo type objects, which I understand are threadsafe, and the writing of data between functions and pre-allocated outputs should be working smoothly as I have this same logic implemented successfully in serial functions. It's also true that the function portfolio_simulation_rating_model_rs_ts works well with the inputs used outside of the RcppParallel wrapper and there are no compilation errors or warnings when I source this code and the underlying functions. However, once I get to running the dcmcmc_portfolio_rating_model_parallel function in R, my session crashes only saying that there has been a fatal error.
Clearly I am missing something in the parallelisation, so any help/suggestions would be greatly appreciated.
// [[Rcpp::depends(RcppArmadillo, RcppParallel)]]
#include <string>
#include <algorithm>
#include <vector>
#include <math.h>
#include <RcppArmadillo.h>
#include <RcppParallel.h>
using namespace arma;
using namespace RcppParallel;
using namespace Rcpp;
using namespace std;
// RcppParallel worker. For every index i in the [begin, end) range handed to
// operator(), it selects a slice of the regime vectors, runs
// portfolio_simulation_rating_model_rs_ts() on it and copies the resulting
// matrix into output_mx. allwhich_ts() and
// portfolio_simulation_rating_model_rs_ts() are not defined in this snippet.
struct dcmcmc_portfolio_rating_model_worker : public Worker {
// Element counts used when wrapping the R vectors below as Armadillo objects
const int n_loans ;
const int n_regime ;
const int n_matrix ;
const int n_amort ;
// Inputs wrapped with length n_loans in operator()
const RVector<double> loan_ids ;
const RVector<double> starting_balances ;
const RVector<double> starting_positions ;
const RVector<double> cprs ;
// Inputs wrapped with length n_regime in operator()
const RVector<double> sim_regime_indices ;
const RVector<double> loan_regime_indices ;
const RVector<double> starting_periods ;
const RVector<double> regime_matrix_indices ;
// Inputs wrapped with length n_matrix in operator()
const RVector<double> matrix_indices ;
const RVector<double> matrix_elements ;
// Forwarded unchanged to the simulation function
const int nrow ;
const int ncol ;
// Wrapped as an n_amort x 3 matrix in operator()
const RMatrix<double> amortisation_schedules ;
const int periods ;
// Stored but not read anywhere inside this struct
const int iterations ;
// Output matrix written by every task
RMatrix<double> output_mx ;
// Stores the scalar arguments and wraps each Rcpp vector/matrix in the
// corresponding RVector/RMatrix accessor.
dcmcmc_portfolio_rating_model_worker(
const int& n_loans,
const int& n_regime,
const int& n_matrix,
const int& n_amort,
const NumericVector& loan_ids,
const NumericVector& starting_balances,
const NumericVector& starting_positions,
const NumericVector& cprs,
const NumericVector& sim_regime_indices,
const NumericVector& loan_regime_indices,
const NumericVector& starting_periods,
const NumericVector& regime_matrix_indices,
const NumericVector& matrix_indices,
const NumericVector& matrix_elements,
const int& nrow,
const int& ncol,
const NumericMatrix& amortisation_schedules,
const int& periods,
const int& iterations,
NumericMatrix& output_mx)
: n_loans(n_loans),
n_regime(n_regime),
n_matrix(n_matrix),
n_amort(n_amort),
loan_ids(loan_ids),
starting_balances(starting_balances),
starting_positions(starting_positions),
cprs(cprs),
sim_regime_indices(sim_regime_indices),
loan_regime_indices(loan_regime_indices),
starting_periods(starting_periods),
regime_matrix_indices(regime_matrix_indices),
matrix_indices(matrix_indices),
matrix_elements(matrix_elements),
nrow(nrow),
ncol(ncol),
amortisation_schedules(amortisation_schedules),
periods(periods),
iterations(iterations),
output_mx(output_mx) {}
// Helpers converting the RVector/RMatrix inputs to Armadillo objects.
// Builds an arma::vec of `length` elements on top of the RVector's buffer; the
// trailing `false` is Armadillo's copy_aux_mem flag (use the memory in place).
// NOTE(review): the vector is returned by value, so whether the caller ends up
// with a view or a copy depends on copy elision - confirm.
arma::vec convert_input_vector(RVector<double> input_vector, int length)
{RVector<double> tmp_input_vector = input_vector ;
arma::vec input_vector_ts(tmp_input_vector.begin(), length, false) ;
return input_vector_ts ;}
// Same as above for a rows x cols matrix.
arma::mat convert_input_matrix(RMatrix<double> input_matrix, int rows, int cols)
{RMatrix<double> tmp_input_matrix = input_matrix ;
arma::mat input_matrix_ts(tmp_input_matrix.begin(), rows, cols, false) ;
return input_matrix_ts ;}
// Work function called by parallelFor with a sub-range [begin, end) of tasks.
void operator()(std::size_t begin, std::size_t end){
// Wrap every input once per call, before looping over the tasks
arma::vec loan_ids_ts = convert_input_vector(loan_ids, n_loans) ;
arma::vec starting_balances_ts = convert_input_vector(starting_balances, n_loans) ;
arma::vec starting_positions_ts = convert_input_vector(starting_positions, n_loans) ;
arma::vec cprs_ts = convert_input_vector(cprs, n_loans) ;
arma::vec sim_regime_indices_ts = convert_input_vector(sim_regime_indices, n_regime);
arma::vec loan_regime_indices_ts = convert_input_vector(loan_regime_indices, n_regime) ;
arma::vec starting_periods_ts = convert_input_vector(starting_periods, n_regime) ;
arma::vec regime_matrix_indices_ts = convert_input_vector(regime_matrix_indices, n_regime);
arma::vec matrix_indices_ts = convert_input_vector(matrix_indices, n_matrix) ;
arma::vec matrix_elements_ts = convert_input_vector(matrix_elements, n_matrix) ;
// The schedule matrix is always wrapped with exactly 3 columns
arma::mat amortisation_schedules_ts = convert_input_matrix(amortisation_schedules, n_amort, 3) ;
for(unsigned int i = begin; i < end; i++){
// NOTE(review): the raw task number i is passed as the value to look up in
// sim_regime_indices; if that vector holds 1-based labels, i == 0 would
// match nothing and head(1)/tail(1) below would fail - confirm.
arma::vec i_sim_regime_indices = allwhich_ts(sim_regime_indices_ts,
i) ;
// First and last element of the lookup result delimit the slice (inclusive)
int sim_begin = as_scalar(i_sim_regime_indices.head(1)) ;
int sim_end = as_scalar(i_sim_regime_indices.tail(1)) ;
arma::vec i_loan_regime_indices = loan_regime_indices_ts.subvec(sim_begin, sim_end) ;
arma::vec i_starting_periods = starting_periods_ts.subvec(sim_begin, sim_end) ;
arma::vec i_regime_matrix_indices = regime_matrix_indices_ts.subvec(sim_begin, sim_end) ;
arma::mat pf_simulation = portfolio_simulation_rating_model_rs_ts(
loan_ids_ts,
starting_balances_ts,
starting_positions_ts,
cprs_ts,
i_loan_regime_indices,
i_starting_periods,
i_regime_matrix_indices,
matrix_indices_ts,
matrix_elements_ts,
nrow,
ncol,
amortisation_schedules_ts,
periods
) ;
int sim_rows = pf_simulation.n_rows ;
int sim_cols = pf_simulation.n_cols ;
// Copy the simulation into the rows starting at n_loans*periods*i.
// NOTE(review): no bounds check against output_mx's dimensions here.
for(int c = 0; c < sim_cols; c++){
for(int r = 0; r < sim_rows; r++){
output_mx((n_loans*periods*i + r), c) = pf_simulation(r, c) ;
}
}
// Column index 7 of the same rows receives the 1-based task number
for(int r = 0; r < sim_rows; r++){
output_mx((n_loans*periods*i + r), 7) = (i + 1) ;
}
}
}
};
//[[Rcpp::export]]
// Entry point called from R: derives the element counts from the inputs,
// allocates the result matrix (n_loans * periods * iterations rows, 8 columns),
// builds the worker and runs it over the task range [0, iterations).
NumericMatrix dcmcmc_portfolio_rating_model_parallel(
    const NumericVector& loan_ids,
    const NumericVector& starting_balances,
    const NumericVector& starting_positions,
    const NumericVector& cprs,
    const NumericVector& sim_regime_indices,
    const NumericVector& loan_regime_indices,
    const NumericVector& starting_periods,
    const NumericVector& regime_matrix_indices,
    const NumericVector& matrix_indices,
    const NumericVector& matrix_elements,
    int nrow,
    int ncol,
    const NumericMatrix& amortisation_schedules,
    int periods,
    int iterations
){
    // Lengths the worker needs to wrap the inputs as Armadillo objects
    int loan_count = loan_ids.size() ;
    int regime_count = sim_regime_indices.size() ;
    int matrix_count = matrix_indices.size() ;
    int amort_rows = amortisation_schedules.nrow() ;

    // One block of loan_count * periods rows per iteration
    int total_rows = loan_count*periods*iterations ;
    NumericMatrix results(total_rows, 8) ;

    // Worker holding thread-safe accessors to all inputs and to the output
    dcmcmc_portfolio_rating_model_worker simulation_worker(
        loan_count, regime_count, matrix_count, amort_rows,
        loan_ids, starting_balances, starting_positions, cprs,
        sim_regime_indices, loan_regime_indices, starting_periods,
        regime_matrix_indices, matrix_indices, matrix_elements,
        nrow, ncol, amortisation_schedules, periods, iterations,
        results
    ) ;

    // Run the parallelised worker over every iteration
    parallelFor(0, iterations, simulation_worker) ;
    return results ;
}
EDIT:
I have produced a minimum reproducible example, trying to incorporate the helpful comments received on this post so far. The example sets up trivial functions designed to mimic the structure of my modelling functions. The final function causing a crash takes three vectors, vec1, vec2, and vec_ind. It applies a worker which attempts to take chunks of equal size (identified by indices stored in vec_ind) of vec1 and vec2, add these subvector chunks, and store the results in the relevant portions of an output vector.
I have reproduced the example below using both arma::vec and std::vector types and experience the crashing behaviour in both. I present the std::vector code below, further to Dirk's suggestion that the RcppArmadillo types may be relying on R memory, and I have removed all namespace inclusions other than RcppParallel to avoid conflicts, as per onyambu's remark.
Here is the Rcpp
// [[Rcpp::depends(RcppArmadillo, RcppParallel)]]
#include <string>
#include <algorithm>
#include <vector>
#include <math.h>
#include <RcppArmadillo.h>
#include <RcppParallel.h>
using namespace RcppParallel;
//[[Rcpp::export]]
// Returns the zero-based positions (stored as doubles) of every element of
// `vector` that compares equal to `value`, in ascending order. The result is
// empty when nothing matches.
std::vector<double> allwhich_ts(std::vector<double> vector, double value){
    std::vector<double> positions ;
    const int count = vector.size() ;
    int pos = 0 ;
    while (pos < count) {
        if (vector[pos] == value) {
            positions.push_back(pos) ;
        }
        ++pos ;
    }
    return positions ;
}
//[[Rcpp::export]]
// Element-wise sum of two vectors. The result has vector1.size() elements;
// vector2 is read at the same positions, so it must be at least as long.
std::vector<double> vector_addition(std::vector<double> vector1, std::vector<double> vector2){
    std::vector<double> sums(vector1.size()) ;
    std::vector<double>::const_iterator rhs = vector2.begin() ;
    std::vector<double>::iterator dst = sums.begin() ;
    for (std::vector<double>::const_iterator lhs = vector1.begin();
         lhs != vector1.end(); ++lhs, ++rhs, ++dst) {
        *dst = *lhs + *rhs ;
    }
    return sums ;
}
struct vector_addition_worker : public Worker {
const RVector<double> vector1 ;
const RVector<double> vector2 ;
const RVector<double> vector_indices ;
const int vector_length ;
RVector<double> output_vec ;
vector_addition_worker(
const Rcpp::NumericVector& vector1,
const Rcpp::NumericVector& vector2,
const Rcpp::NumericVector& vector_indices,
const int& vector_length,
Rcpp::NumericVector& output_vec
) : vector1(vector1),
vector2(vector2),
vector_indices(vector_indices),
vector_length(vector_length),
output_vec(output_vec) {}
std::vector<double> convert_input_vec(RVector<double> input_vector, int vec_length){
RVector<double> tmp_vector = input_vector ;
std::vector<double> input_vector_ts(tmp_vector.begin(), tmp_vector.end()) ;
return(input_vector_ts) ;
}
void operator()(std::size_t begin, std::size_t end){
std::vector<double> vector1_ts = convert_input_vec(vector1, vector_length) ;
std::vector<double> vector2_ts = convert_input_vec(vector2, vector_length) ;
std::vector<double> vector_indices_ts = convert_input_vec(vector_indices, vector_length) ;
for(unsigned int i = begin; i < end; i++){
std::vector<double> indices = allwhich_ts(vector_indices_ts, i) ;
int values_begin = indices.at(1) ;
int values_end = indices.at(std::distance(indices.begin(), indices.end())) ;
std::vector<double> values1(vector1_ts.begin() + values_begin, vector1_ts.begin() + values_end) ;
std::vector<double> values2(vector2_ts.begin() + values_begin, vector2_ts.begin() + values_end) ;
std::vector<double> interim_op = vector_addition(values1, values2) ;
int op_size = interim_op.size() ;
for(int n = 0; n < op_size; n++){
output_vec[i*op_size + n] = interim_op[n] ;
}
}
}
};
//[[Rcpp::export]]
// Adds vec1 and vec2 group by group in parallel and returns a vector with
// vec1.size() elements. The number of parallel tasks is the largest value
// found in vec_ind. An empty vec_ind, or one whose maximum is below 1 or NaN,
// yields the zero-initialised output without starting any task.
Rcpp::NumericVector vector_addition_parallel(Rcpp::NumericVector vec1,
                                             Rcpp::NumericVector vec2,
                                             Rcpp::NumericVector vec_ind){
    int vec_length = vec1.size() ;
    Rcpp::NumericVector op_vec(vec_length) ;

    // max_element on an empty range returns end(); dereferencing it is undefined
    if (vec_ind.size() == 0) {
        return op_vec ;
    }
    const double n_indices = *std::max_element(vec_ind.begin(), vec_ind.end()) ;
    // Written as a negated >= so that NaN is rejected as well
    if (!(n_indices >= 1)) {
        return op_vec ;
    }

    vector_addition_worker vec_add_worker(
        vec1,
        vec2,
        vec_ind,
        vec_length,
        op_vec
    ) ;
    // Explicit conversion: parallelFor takes the range end as std::size_t
    parallelFor(0, static_cast<std::size_t>(n_indices), vec_add_worker) ;
    return op_vec ;
}
Here is the R code which tests for expected behaviour
library(Rcpp)
library(RcppParallel)
library(RcppArmadillo)
# Setting up dummy data
vec1 = rep(1, 500)
vec2 = rep(1, 500)
vec_inds = sort(rep(1:20, 25))
length(vec1);length(vec2);length(vec_inds)
## Checking that allwhich_ts is working as expected
allwhich_ts(vec_inds, 1)
# Checking that vector_addition is working as expected
vector_addition(vec1, vec2)
# Checking that the same logic can be applied serially (mainly to verify data handling method)
# Serial reference implementation: for every distinct label in vec_inds, add the
# matching elements of vec1 and vec2 and store them in the block of the output
# that belongs to that label (block number = label - 1).
r_repro_function <- function(vec1, vec2, vec_inds){
  result <- numeric(length(vec1))
  for (label in unique(vec_inds)) {
    in_group <- vec_inds == label
    lhs <- vec1[in_group]
    group_sum <- lhs + vec2[in_group]
    block_len <- length(lhs)
    for (k in 1:block_len) {
      result[(label - 1) * block_len + k] <- group_sum[k]
    }
  }
  result
}
r_repro_function(vec1, vec2, vec_inds)
vector_addition_parallel(vec1 = vec1,
vec2 = vec2,
vec_ind = vec_inds)
So following Dirk's suggestion I am posting an answer with a pared back example to illustrate the problem I had and the solution I arrived at with his help.
The mistake I made was actually in how I treated the begin and end variables within my worker. In contrast to the articles in the RcppParallel gallery, I was not using begin/end to guide iterators to the relevant portions of the calculation, but rather trying to use them to index the relevant part of my input dataset for each portion.
This caused dimension errors, which on my machine simply crashed the R session.
The solution to this mistake would be to either (1) ensure any UDFs you are applying deal in iterators rather than vector values or (2) to bridge the begin/end variables correctly to the vectors you are trying to index.
Given that all of my modelling functions are already in the business of taking vector indices, I have applied the second approach and create a unique_indices vector within my function which the begin/end values can simply select values from. The current solution makes some assumptions about how the input indices will work (i.e. simply integer values from smallest to largest in the argument vector).
Apologies if this is still considered verbose, but I thought it worth keeping the data-handling logic as it was in the problem statement because that is where the problem arose. That is where a submatrix is identified by an index and used as the arguments to some calculation. The key differences to the example above are on lines 48-52 and 62-65
Where (1) each i between begin and end is used to select an index as so int index_value = unique_indices[i] ; which then identifies the relevant input data and (2) the unique_indices vector is defined by the characteristics of the vector of indices vec_ind
:
// [[Rcpp::depends(RcppArmadillo, RcppParallel)]]
#include <string>
#include <algorithm>
#include <vector>
#include <math.h>
#include <RcppArmadillo.h>
#include <RcppParallel.h>
using namespace RcppParallel;
//[[Rcpp::export]]
// Returns the zero-based positions (stored as doubles) of every element of
// `vector` equal to `value`. A buffer as long as the input is filled from the
// front and only the used prefix is returned.
std::vector<double> allwhich_ts(std::vector<double> vector, double value){
    const int count = vector.size() ;
    std::vector<double> buffer(count) ;
    std::vector<double>::iterator next_free = buffer.begin() ;
    for (int pos = 0; pos < count; ++pos) {
        if (vector[pos] == value) {
            *next_free = pos ;
            ++next_free ;
        }
    }
    return std::vector<double>(buffer.begin(), next_free) ;
}
struct vector_double_worker : public Worker {
// Defining worker arguments
const RVector<double> vector1 ;
const RVector<double> vector_indices ;
const RVector<double> unique_indices ;
const int vector_length ;
RVector<double> output_vec ;
// Initialising function argument values
vector_double_worker(
const Rcpp::NumericVector& vector1, const Rcpp::NumericVector& vector_indices,
const Rcpp::NumericVector& unique_indices, const int& vector_length, Rcpp::NumericVector& output_vec
) : vector1(vector1),vector_indices(vector_indices),unique_indices(unique_indices),
vector_length(vector_length),output_vec(output_vec) {}
// Setting up conversion function so that UDFs can deal in std:: types
std::vector<double> convert_input_vec(RVector<double> input_vector, int vec_length){
std::vector<double> input_vector_ts(input_vector.begin(), input_vector.end()) ;
return(input_vector_ts) ;}
// Defining operator ranges which will breakdown the task into partitions
void operator()(std::size_t begin, std::size_t end){
// Converting input vectors to std types
std::vector<double> vector1_ts = convert_input_vec(vector1, vector_length) ;
std::vector<double> vector_indices_ts = convert_input_vec(vector_indices, vector_length) ;
// For loop to perform calculations for each element in a given partition
for(unsigned int i = begin; i < end; i++){
int index_value = unique_indices[i] ; // begin and end now used to index the vector of input indices defined outside of the function
std::vector<double> indices = allwhich_ts(vector_indices_ts, index_value) ; // identifying sub-vector indices
int values_begin = indices.at(0) ;
int values_end = indices.at(std::distance(indices.begin(), indices.end()) - 1) ; // - 1 was added to avoid dimension error
std::vector<double> values1(vector1_ts.begin() + values_begin, vector1_ts.begin() + values_end + 1) ; // + 1 was added to avoid dimension error
int op_size = values1.size() ;
for(int n = 0; n < op_size; n++){output_vec[i*op_size + n] = values1[n] * 2 ;} // Trivial example calculation
}}};
//[[Rcpp::export]]
// Doubles vec1 group by group in parallel and returns a vector of vec1.size()
// elements. The labels in vec_ind are assumed to be consecutive integers from
// min(vec_ind) to max(vec_ind); one task is run per label.
//
// The label table holds max - min + 1 entries, so it covers every label even
// when the smallest one is not 1. An empty vec_ind (or one containing NaN)
// returns the zero-initialised output without starting any task.
// NOTE(review): a gap in the labels (e.g. 1, 3) still produces a task for the
// missing label - confirm the worker copes with a label that has no match.
Rcpp::NumericVector vector_double_parallel(Rcpp::NumericVector vec1, Rcpp::NumericVector vec_ind){
    int vec_length = vec1.size() ;
    Rcpp::NumericVector op_vec(vec_length) ;

    // min/max_element on an empty range return end(); never dereference that
    if (vec_ind.size() == 0) {
        return op_vec ;
    }
    const double max_indices = *std::max_element(vec_ind.begin(), vec_ind.end()) ;
    const double min_indices = *std::min_element(vec_ind.begin(), vec_ind.end()) ;

    // Number of consecutive labels in [min, max]; negated >= also rejects NaN
    const double n_indices = max_indices - min_indices + 1 ;
    if (!(n_indices >= 1)) {
        return op_vec ;
    }

    // Label table: min, min + 1, ..., max
    Rcpp::NumericVector unique_indices(n_indices) ;
    const R_xlen_t n_labels = unique_indices.size() ;
    for (R_xlen_t k = 0; k < n_labels; ++k) {
        unique_indices[k] = min_indices + static_cast<double>(k) ;
    }

    vector_double_worker vec_2_worker(vec1, vec_ind, unique_indices, vec_length, op_vec) ;
    parallelFor(0, static_cast<std::size_t>(n_labels), vec_2_worker) ;
    return op_vec ;
}
Related
I am trying to return a bunch of matrices using RCPP. My code below is extremely inefficient. I would like to know if the following code can be efficient.
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
// For every t in [0, n) builds a p x n matrix whose column i (for i < t) is
//   sum_{u=i..t} exp(g . z_i) * (z_i - S[:,u]) * dl[u]
// where z_i is row i of zc; all other columns are zero. The n matrices are
// returned as a list wrapped in a one-element outer list (the original return
// structure is kept).
//
// Row i of zc and the weight exp(g . z_i) do not depend on u, so they are
// computed once per (t, i) instead of once per term, and columns i >= t are
// skipped because they stay zero.
Rcpp::List hello(
    const arma::rowvec& g,
    const int& n,
    const int& p,
    const arma::mat& S,
    const arma::mat& zc,
    const arma::rowvec& dl){
  Rcpp::List ht(n);
  for(int t = 0; t < n; ++t){
    arma::mat hhat(p, n, arma::fill::zeros);
    for(int i = 0; i < t; ++i){
      // Loop-invariant pieces of the inner sum
      const arma::vec zi = zc.row(i).t();
      const double weight = std::exp(arma::as_scalar(g * zi));
      arma::vec h(p, arma::fill::zeros);
      for(int u = i; u <= t; ++u){
        h += weight * (zi - S.col(u)) * dl(u);
      }
      hhat.col(i) = h;
    }
    ht[t] = hhat;
  }
  // Outer list of length one holding the list of matrices
  Rcpp::List res(1);
  res[0] = ht;
  return res;
}
Here is the example.
g=c(1,2.1,3.1)
n=1600
p=3
S = matrix(rnorm(4800),nrow=3,ncol=1600)
dl=runif(1600)
z=matrix(runif(4800),nrow=1600,ncol=3)
ptm=proc.time();kkk= hello(g=g,n=n,p=p,S = S,zc=z,dl = dl);proc.time()-ptm;
user system elapsed
31.25 0.00 31.30
Any help would be appreciated.
Following the updated code. Initially I was returning list of a list. Now it returns a list. This reduces the computing time by 10 seconds. I hope this code can be improved further.
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
// Returns a list of n matrices (p x n). In matrix t, column i holds
//   sum_{u=i..t} exp(g . z_i) * (z_i - S[:,u]) * dl[u]   when t > i,
// and zeros otherwise, where z_i is row i of zc.
Rcpp::List hello(
    const arma::rowvec& g,
    const int& n,
    const int& p,
    const arma::mat& S,
    const arma::mat& zc,
    const arma::rowvec& dl){
  Rcpp::List ht(n);
  int t = 0;
  while (t < n) {
    arma::mat hhat(p, n);
    hhat.zeros();
    for (int i = 0; i < n; ++i) {
      arma::mat h(p, 1);
      h.zeros();
      // Only columns strictly left of t accumulate terms
      const bool has_terms = i < t;
      for (int u = i; has_terms && u <= t; ++u) {
        h += exp(arma::as_scalar(g*zc.row(i).t())) * (zc.row(i).t() - S.col(u))*dl(u);
      }
      hhat.col(i) = h;
    }
    ht[t] = hhat;
    ++t;
  }
  return ht;
}
The formula that I am trying to implement is given below.
In my other answer I looked at the efficiency of returning data and at simple optimizations. Here I want to look at something different: Optimization of the algorithm.
You want to compute hhat(i, t) for 0 <= i, t < n and i < t. Looking at your formula we see that the dependency of hhat on i and t is very different. In particular, hhat(i, t + 1) can be written as hhat(i, t) + something. Right now your outer loop is over t and you are recomputing all these intermediate values. By switching the loop order, it is easy to do each such computation only once, bringing the algorithm down to two nested loops. This means you have to generate the resulting matrices separately. And since you cannot store an arma::mat inside a Rcpp::List, I need an additional std::vector for storage:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
// [[Rcpp::export]]
// Returns a list of n matrices (p x n). In matrix t, column i (for i < t) holds
//   sum_{u=i..t} exp(g . z_i) * (z_i - S[:,u]) * dl[u]
// where z_i is row i of zc; all other columns are zero.
//
// The running sum for a fixed i is extended by one term per t, so every term
// is computed exactly once. Row i of zc and the weight exp(g . z_i) depend only
// on i and are therefore evaluated once per i rather than once per term.
Rcpp::List hello(
    const arma::rowvec& g,
    const int& n,
    const int& p,
    const arma::mat& S,
    const arma::mat& zc,
    const arma::rowvec& dl){
  // One zero-filled p x n matrix per t, created directly in the vector
  std::vector<arma::mat> foo(n, arma::mat(p, n, arma::fill::zeros));

  for(int i = 0; i < n; ++i){
    const arma::vec zi = zc.row(i).t();
    const double weight = std::exp(arma::as_scalar(g * zi));
    // Term u = i; it is only ever stored together with later terms
    arma::vec h = weight * (zi - S.col(i)) * dl(i);
    for(int t = i + 1; t < n; ++t){
      h += weight * (zi - S.col(t)) * dl(t);
      foo[t].col(i) = h;
    }
  }

  // Convert to the R list that is returned
  Rcpp::List ht(n);
  for(int t = 0; t < n; ++t){
    ht[t] = foo[t];
  }
  return ht;
}
// [[Rcpp::export]]
// Reference (unoptimised) implementation kept as the baseline for the benchmark.
// Returns a list of n matrices (p x n); in matrix t, column i holds
//   sum_{u=i..t} exp(g . z_i) * (z_i - S[:,u]) * dl[u]   when t > i,
// and zeros otherwise, where z_i is row i of zc.
Rcpp::List hello_orig(
const arma::rowvec& g,
const int& n,
const int& p,
const arma::mat& S,
const arma::mat& zc,
const arma::rowvec& dl){
Rcpp::List ht(n);
// One output matrix per t
for(int t=0; t < n;++t){
arma::mat hhat(p,n);
hhat.zeros();
for(int i = 0;i < n; ++i){
// Accumulator for column i; stays zero unless t > i
arma::mat h(p,1);
h.zeros();
if (t > i){
// The whole sum over u is recomputed from scratch for every (t, i) pair
for(int u=i;u <= t; ++u){
h += exp(arma::as_scalar(g*zc.row(i).t())) * (zc.row(i).t() - S.col(u))*dl(u);
}
}
hhat.col(i) = h;
}
ht[t] = hhat;
}
return(ht);
}
/***R
g=c(1,2.1,3.1)
n=1600
p=3
S = matrix(rnorm(p*n),nrow=p,ncol=n)
dl=runif(n)
z=matrix(runif(p*n),nrow=n,ncol=p)
bench::mark(hello_orig(g=g,n=n,p=p,S = S,zc=z,dl = dl),
hello(g=g,n=n,p=p,S = S,zc=z,dl = dl))
*/
Result:
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch:> <bch:> <dbl> <bch:byt>
1 hello_orig(g = g, n = n, p = p, S = S, zc = z, dl = dl) 14.2s 14.2s 0.0703 58.7MB
2 hello(g = g, n = n, p = p, S = S, zc = z, dl = dl) 53.9ms 85.9ms 11.1 58.7MB
# … with 8 more variables: `gc/sec` <dbl>, n_itr <int>, n_gc <dbl>, total_time <bch:tm>,
# result <list>, memory <list>, time <list>, gc <list>
More than a factor 100 faster!
You can get cleaner (and maybe even a bit faster) code by following @coatless' suggestion in the comments to use an arma::cube. The most compact form will give you a different return structure, though. Instead of a list of p x n matrices you will get a p x n x n array:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
// [[Rcpp::export]]
// Cube variant: returns a p x n x n array in which slice t, column i (for
// i < t) holds
//   sum_{u=i..t} exp(g . z_i) * (z_i - S[:,u]) * dl[u]
// where z_i is row i of zc; every other entry is zero.
//
// Row i of zc and the weight exp(g . z_i) depend only on i, so they are
// evaluated once per i instead of once per accumulated term.
arma::cube coatless(
    const arma::rowvec& g,
    const int& n,
    const int& p,
    const arma::mat& S,
    const arma::mat& zc,
    const arma::rowvec& dl){
  arma::cube ht(p, n, n, arma::fill::zeros);
  for(int i = 0; i < n; ++i){
    const arma::vec zi = zc.row(i).t();
    const double weight = std::exp(arma::as_scalar(g * zi));
    // Term u = i; stored only together with later terms
    arma::vec h = weight * (zi - S.col(i)) * dl(i);
    for(int t = i + 1; t < n; ++t){
      h += weight * (zi - S.col(t)) * dl(t);
      ht.slice(t).col(i) = h;
    }
  }
  return ht;
}
Your question title makes one think you see the problem in returning the data to R. Rest assured that this is not an issue. You can easily check this by calling a function that returns matrices of zeros in the required size:
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
// Timing probe: returns a list of n zero-filled p x n matrices and ignores
// g, S, zc and dl. It only measures the cost of creating and returning the
// result structure.
Rcpp::List minimal(
    const arma::rowvec& g,
    const int& n,
    const int& p,
    const arma::mat& S,
    const arma::mat& zc,
    const arma::rowvec& dl){
  Rcpp::List ht(n);
  int t = 0;
  while (t < n) {
    arma::mat zero_block(p, n);
    zero_block.fill(0.0);
    ht[t] = zero_block;
    ++t;
  }
  return ht;
}
On my system this function takes about 0.01 s with your input data. In other words, your real function spends most of its time on computing the actual results.
As for optimizing that part, it would be helpful if you could provide an idea of what you are trying to implement, e.g. with the help of mathematical formulas. As it stands, I can only do some simple changes:
In the i loop you only do something for t > i. Therefore it is sufficient to let the loop run till i < t.
The u loop can be formulated as a matrix-vector product, for which efficient implementations exist.
With changes like this I end up with
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
// Returns a list of n matrices (p x n). For each t only the columns i < t are
// filled; the inner sum over u = i..t is expressed as one matrix-vector product.
Rcpp::List hello(
const arma::rowvec& g,
const int& n,
const int& p,
const arma::mat& S,
const arma::mat& zc,
const arma::rowvec& dl){
Rcpp::List ht(n);
for(int t=0; t < n;++t){
arma::mat hhat(p,n);
hhat.zeros();
// Columns i >= t are left at zero
for(int i = 0;i < t; ++i){
// Columns i..t of S (inclusive)
arma::mat Sit = S.cols(i,t);
// (Sit - z_i) has the opposite sign of (z_i - S), hence the leading minus;
// multiplying by dl[i..t] transposed sums the weighted columns.
hhat.col(i) = - exp(arma::as_scalar(g*zc.row(i).t())) *
(Sit.each_col() - zc.row(i).t()) * dl.subvec(i,t).t();
}
ht[t] = hhat;
}
return(ht);
}
On my system this is about a factor of two faster than your code. It might well be possible to get even faster, though.
This is a rather simple question, but I haven't been able to quite find the answer on the web yet.
With my latest attempt, here is the latest compiler output:
note: candidate function not viable: no known conversion from 'double (unsigned int, const double *, void *, void *)' to 'nlopt_func' (aka 'double (*)(unsigned int, const double *, double *, void *)') for 2nd argument
From this error I surmise that I am now wrapping or 'type casting' the data argument and the parameter vector correctly. The discrepancy in the third argument, the gradient, confuses me, as I am calling a gradient-free optimization routine.
Here is a simple linear regression with a constant and a variable:
#include "RcppArmadillo.h"
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(nloptr)]]
//#include <vector>
#include <nloptrAPI.h>
using namespace arma;
using namespace Rcpp;
// `my_func_data` names a POINTER to an anonymous struct whose only member is
// an Armadillo matrix (note the `*` before the alias name).
typedef struct {
arma::mat data_in;
} *my_func_data;
// `my_theta` is a plain struct of two doubles, a and b.
typedef struct {
double a, b;
} my_theta;
// Objective function intended for nlopt: returns sum(y - a*x1 - b*x2), where
// y, x1, x2 are columns 0, 1, 2 of the matrix reached through `data` and
// (a, b) are the first two entries of `theta`.
// NOTE(review): nlopt's objective callback type takes `double *grad` as its
// third parameter; with `void *grad` this function does not convert to
// nlopt_func.
// NOTE(review): the value returned is the plain sum of residuals, not the sum
// of squares - confirm this is the intended objective.
double myfunc(unsigned n, const double *theta, void *grad, void *data){
// Reinterprets the void* itself as a pointer to the struct wrapping the matrix
my_func_data &temp = (my_func_data &) data;
// Copies the whole matrix on every evaluation
arma::mat data_in = temp->data_in;
// Views the parameter array as a struct {a, b}; the cast drops const
my_theta *theta_temp = (my_theta *) theta;
double a = theta_temp->a, b = theta_temp->b;
// Number of rows
int Len = arma::size(data_in)[0];
arma::vec Y1 = data_in(span(0, Len-1), 1);
arma::vec Y2 = data_in(span(0, Len-1), 2);
arma::vec res = data_in(span(0, Len-1), 0) - a*Y1 - b*Y2 ;
return sum(res);
}
// [[Rcpp::export]]
void test_nlopt_c() {
arma::mat data_in(10,3);
data_in(span(0,9),0) = arma::regspace(40, 49);
data_in(span(0,9),1) = arma::ones(10);
data_in(span(0,9),2) = arma::regspace(10, 19);
my_func_data &temp = (my_func_data &) data_in;
double lb[2] = { 0, 0,}; /* lower bounds */
nlopt_opt opt;
opt = nlopt_create(NLOPT_LN_NELDERMEAD, 2); /* algorithm and dimensionality */
nlopt_set_lower_bounds(opt, lb);
nlopt_set_min_objective(opt, myfunc, &data_in );
nlopt_set_xtol_rel(opt, 1e-4);
double minf; /* the minimum objective value, upon return */
double x[2] = {0.5, 0.5}; /* some initial guess */
nlopt_result result = nlopt_optimize(opt, x, &minf);
Rcpp::Rcout << "result:" << result;
return;
}
Got it figured out, stupid answer turns out to be correct, just change 'void' to 'double', no clue why. Anyway, the example code needs some improving but it works.
#include "RcppArmadillo.h"
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(nloptr)]]
//#include <vector>
#include <nloptrAPI.h>
using namespace arma;
using namespace Rcpp;
// `my_func_data` names a POINTER to an anonymous struct whose only member is
// an Armadillo matrix (note the `*` before the alias name).
typedef struct {
arma::mat data_in;
} *my_func_data;
// `my_theta` is a plain struct of two doubles, a and b.
typedef struct {
double a, b;
} my_theta;
double myfunc(unsigned n, const double *theta, double *grad, void *data){
my_func_data &temp = (my_func_data &) data;
arma::mat data_in = temp->data_in;
my_theta *theta_temp = (my_theta *) theta;
double a = theta_temp->a, b = theta_temp->b;
int Len = arma::size(data_in)[0];
arma::vec Y1 = data_in(span(0, Len-1), 1);
arma::vec Y2 = data_in(span(0, Len-1), 2);
arma::vec res = data_in(span(0, Len-1), 0) - a*Y1 - b*Y2 ;
return sum(res);
}
// [[Rcpp::export]]
void test_nlopt_c() {
arma::mat data_in(10,3);
data_in(span(0,9),0) = arma::regspace(40, 49);
data_in(span(0,9),1) = arma::ones(10);
data_in(span(0,9),2) = arma::regspace(10, 19);
my_func_data &temp = (my_func_data &) data_in;
double lb[2] = { 0, 0,}; /* lower bounds */
nlopt_opt opt;
opt = nlopt_create(NLOPT_LN_NELDERMEAD, 2); /* algorithm and dimensionality */
nlopt_set_lower_bounds(opt, lb);
nlopt_set_min_objective(opt, myfunc, &data_in );
nlopt_set_xtol_rel(opt, 1e-4);
double minf; /* the minimum objective value, upon return */
double x[2] = {0.5, 0.5}; /* some initial guess */
nlopt_result result = nlopt_optimize(opt, x, &minf);
Rcpp::Rcout << "result:" << result;
return;
}
I was trying to use rcpp/armadillo with openmp to speed up a loop in R. The loop takes a matrix with each row containing indices of a location vector(or matrix if it's 2D locations) as input(and other matrix/vec to be used). Inside the loop, I extracted each row of input indices matrix and find the corresponding locations, calculate distance matrix, and covariance matrix, do cholesky and backsolve, save the backsolve results to a new matrix. Here is the rcpp code:
`#include <iostream>
#include <RcppArmadillo.h>
#include <omp.h>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;
using namespace arma;
using namespace std;
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
// For each row k of revNNarray: picks the last n0 = min(k, m) + 1 entries of
// the row as location indices, builds the pairwise-distance and covariance
// matrices for those locations, solves chol(covmat) * M = e_last and stores M
// in the first n0 entries of row k of the nnp x (m+1) result matrix.
// The loop over k is an OpenMP parallel for; calcPWD2 and MaternFun are
// defined elsewhere in the file.
mat NZentries_new2 (int m, int nnp, const mat& locs, const umat& revNNarray, const mat& revCondOnLatent, const vec& nuggets, const vec covparms){
// output matrix, zero-initialised
mat Lentries=zeros(nnp,m+1);
// per-iteration scratch objects; all of them are listed in the private clause below
int n0; //number of !is_na elements
uvec inds;// row k of revNNarray
vec revCon_row;// row k of revCondOnLatent
uvec inds00;// selected indices minus one
vec nug;// diagonal added to the covariance matrix
mat covmat;// covariance matrix for the selected locations
vec onevec;// unit vector with a 1 in the last position
vec M;// solution of the triangular system
mat dist;// pairwise distances between the selected locations
int k;// loop counter
omp_set_num_threads(2);// selects the number of cores to use.
// NOTE(review): covparms is used inside the region but is not named in any
// data-sharing clause although default(none) is set - confirm this compiles
// with the target compiler.
#pragma omp parallel for shared(locs,revNNarray,revCondOnLatent,nuggets,nnp,m,Lentries) private(k,M,dist,onevec,covmat,nug,n0,inds,revCon_row,inds00) default(none) schedule(static)
for (k = 0; k < nnp; k++) {
// extract a row to work with
inds=revNNarray.row(k).t();
revCon_row=revCondOnLatent.row(k).t();
// the first m rows use k+1 entries, later rows use m+1
if (k < m){
n0=k+1;
} else {
n0=m+1;
}
// extract locations: last n0 entries of the row, shifted down by one
// (presumably converting 1-based R indices to 0-based - TODO confirm)
inds00=inds(span(m+1-n0,m))-ones<uvec>(n0);
nug=nuggets.elem(inds00) % (ones(n0)-revCon_row(span(m+1-n0,m))); // vec is vec, cannot convert to mat
dist=calcPWD2(locs.rows(inds00));
// only one thread at a time may execute the MaternFun call
#pragma omp critical
{
//calculate covariance matrix
covmat= MaternFun(dist,covparms) + diagmat(nug) ; // summation from arma
}
// get last row of inverse Cholesky
onevec = zeros(n0);
onevec[n0-1] = 1;
M=solve(chol(covmat,"upper"),onevec);
// save the entries to matrix; each iteration writes only its own row k
Lentries(k,span(0,n0-1)) = M.t();
}
return Lentries;
}`
The current version works fine but it is slow (almost the same speed as the non-parallel version). If I take the line inside the omp critical block out, it causes a segmentation fault and R crashes. MaternFun is a function I defined as below, along with several other small functions. So my question is: why does MaternFun have to stay in the critical section?
// [[Rcpp::export]]
// Maps a distance matrix to a covariance matrix of the same shape.
// Entries with distance exactly 0 receive covparms(0); every other entry is
//   normcon * s^covparms(2) * Rf_bessel_k(s, covparms(2), 1.0),  s = distance / covparms(1),
// with normcon = covparms(0) / (2^(covparms(2)-1) * Rf_gammafn(covparms(2))).
mat MaternFun( mat distmat, vec covparms ){
    const int n_rows = distmat.n_rows;
    const int n_cols = distmat.n_cols;
    mat covmat(n_rows, n_cols);

    // Normalising constant shared by all non-zero distances
    double normcon = covparms(0)/(pow(2.0,covparms(2)-1)*Rf_gammafn(covparms(2)));

    for (int r = 0; r < n_rows; r++) {
        for (int c = 0; c < n_cols; c++) {
            if (distmat(r,c) != 0) {
                const double scaledist = distmat(r,c)/covparms(1);
                covmat(r,c) = normcon*pow( scaledist, covparms(2) )*
                    Rf_bessel_k(scaledist,covparms(2),1.0);
            } else {
                covmat(r,c) = covparms(0);
            }
        }
    }
    return covmat;
}
// [[Rcpp::export]]
// Straight-line (Euclidean) distance between the points (lat1, long1) and
// (lat2, long2).
double dist2(double lat1,double long1,double lat2,double long2) {
    const double d_lat = lat1 - lat2 ;
    const double d_long = long1 - long2 ;
    const double squared = pow(d_lat, 2) + pow(d_long, 2) ;
    return sqrt(squared) ;
}
// [[Rcpp::export]]
// Pairwise distance matrix: entry (a, b) is dist2 between rows a and b of x,
// using columns 0 and 1 of x as the two coordinates. The result is square with
// one row and one column per row of x.
mat calcPWD2( mat x) {
    const int n_points = x.n_rows ;
    mat out(n_points, n_points) ;
    for (int a = 0 ; a < n_points ; ++a) {
        for (int b = 0 ; b < n_points ; ++b) {
            const double a_first = x(a, 0) ;
            const double a_second = x(a, 1) ;
            const double b_first = x(b, 0) ;
            const double b_second = x(b, 1) ;
            out(a, b) = dist2(a_first, a_second, b_first, b_second) ;
        }
    }
    return out ;
}
Here are some sample inputs for testing MaternFun in R:
library(fields)
distmat=rdist(1:5) # distance matrix
covparms=c(1,0.2,1.5)
The issue is there are two calls to R math functions (Rf_bessel_k and Rf_gammafn) that require the access to be single threaded instead of parallel.
To get around this, let's add a dependency on boost via BH to obtain the cyl_bessel_k and tgamma functions. Alternatively, there is always the option of reimplementing R's besselK and gamma in C++ so it doesn't use the single-threaded R variant.
This gives:
#include <Rcpp.h>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/gamma.hpp>
// [[Rcpp::depends(BH)]]
// [[Rcpp::export]]
double besselK_boost(double x, double v) {
return boost::math::cyl_bessel_k(v, x);
}
// Thin wrapper that evaluates boost::math::tgamma at x and returns the result.
// [[Rcpp::export]]
double gamma_fn_boost(double x) {
  const double value = boost::math::tgamma(x);
  return value;
}
Test Code
# Compare the Boost-backed wrapper with R's besselK at a small argument and a
# negative order.
x0 = 9.536743e-07
nu = -10
all.equal(besselK(x0, nu), besselK_boost(x0, nu))
# [1] TRUE
# Compare the Boost-backed wrapper with R's gamma.
x = 2
all.equal(gamma(x), gamma_fn_boost(x))
# [1] TRUE
Note: The order of parameters for boost's variant differs from R's:
cyl_bessel_k(v, x)
Rf_bessel_k(x, v, expon.scaled = FALSE)
From here, we can modify the MaternFun. Unfortunately, because calcPWD2 is missing, the furthest we can go is switching to use boost and incorporating in OpenMP protections.
#include <RcppArmadillo.h>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/gamma.hpp>
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(BH)]]
// [[Rcpp::plugins(openmp)]]
// Matern covariance evaluated element-wise on a matrix of distances, using
// Boost's tgamma and cyl_bessel_k instead of R's math library.
//
// distmat  : matrix of distances; every entry is mapped independently.
// covparms : parameter vector read at indices 0, 1 and 2. Entry 0 is the value
//            stored where the distance is exactly zero, entry 1 divides the
//            distance, and entry 2 is both the exponent and the Bessel order.
// Returns a matrix with the same dimensions as distmat.
// [[Rcpp::export]]
arma::mat MaternFun(arma::mat distmat, arma::vec covparms) {
  // Read the three parameters once instead of on every loop iteration.
  const double amplitude = covparms(0);
  const double length_scale = covparms(1);
  const double order = covparms(2);

  const int n_rows = distmat.n_rows;
  const int n_cols = distmat.n_cols;
  arma::mat covmat(n_rows, n_cols);

  // Factor shared by every entry with a non-zero distance.
  const double normcon = amplitude /
    (pow(2.0, order - 1) * boost::math::tgamma(order));

  for (int row = 0; row < n_rows; ++row) {
    for (int col = 0; col < n_cols; ++col) {
      const double dist = distmat(row, col);
      if (dist == 0) {
        // Zero distance: use the first parameter directly.
        covmat(row, col) = amplitude;
        continue;
      }
      const double scaled = dist / length_scale;
      // Boost takes (order, argument), the reverse of Rf_bessel_k.
      covmat(row, col) = normcon * pow(scaled, order) *
        boost::math::cyl_bessel_k(order, scaled);
    }
  }
  return covmat;
}
I have written parallel implementation of sums in groups using RcppParallel.
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace Rcpp;
using namespace RcppParallel;
// RcppParallel worker that accumulates v[i] into sg[gi[i]] over a range of
// indices, for use with parallelReduce.
//
// NOTE(review): the splitting constructor initialises sg from p.sg. If copying
// an RVector shares the underlying buffer rather than duplicating it (it looks
// like a non-owning view - confirm against the RcppParallel headers), then
// every split worker adds into the same memory without synchronisation and
// join() adds that memory to itself. That would match results that are
// multiples of the true sums.
struct SumsG: public Worker
{
// Values to be summed.
const RVector<double> v;
// Group id of each value; used directly as an index into sg.
const RVector<int> gi;
// Running per-group totals.
RVector<double> sg;
// Primary constructor: wraps the three R vectors supplied by the caller.
SumsG(const NumericVector v, const IntegerVector gi, NumericVector sg): v(v), gi(gi), sg(sg) {}
// Splitting constructor: copies all three members, including sg, from p.
SumsG(const SumsG& p, Split): v(p.v), gi(p.gi), sg(p.sg) {}
// Adds v[i] to the total of group gi[i] for every i in [begin, end).
void operator()(std::size_t begin, std::size_t end) {
for (std::size_t i = begin; i < end; i++) {
sg[gi[i]] += v[i];
}
}
// Adds the totals held by p to the totals held by this worker, element-wise.
void join(const SumsG& p) {
for(std::size_t i = 0; i < sg.length(); i++) {
sg[i] += p.sg[i];
}
}
};
// Sums the values of v within the groups given by gi.
//
// v  : values to sum.
// gi : group id of each value, used as an index into the output.
// ni : length of the output vector (one slot per group).
// Returns a list with a single element "sg" holding the per-group totals
// produced by running a SumsG worker over [0, length(v)) with parallelReduce.
// [[Rcpp::export]]
List sumsingroups(NumericVector v, IntegerVector gi, int ni) {
  NumericVector totals(ni);
  SumsG worker(v, gi, totals);
  parallelReduce(0, v.length(), worker);
  return List::create(Named("sg") = worker.sg);
}
It compiles using Rcpp::sourceCpp. When I call it from R with sumsingroups(1:10, rep(0:1, each = 5), 2), I get the right answer (15 40) several times and then something different (usually some multiple of the right answer). Running
# Reference result from a first call.
res <- sumsingroups(1:10, rep(0:1, each = 5), 2)
# Repeat the identical call up to 1000 times and stop at the first run whose
# first group total differs from the reference.
for(i in 1:1000) {
tmp <- sumsingroups(1:10, rep(0:1, each = 5), 2)
if(res[[1]][1] != tmp[[1]][1]) break
Sys.sleep(0.1)
}
breaks at random iteration returning
$sg
[1] 60 160
or
$sg
[1] 30 80
I am new to Rcpp and RcppParallel and do not know what could cause such behavior.
Update. Things that did not help:
Added for (std::size_t i = 0; i < sg.length(); i++) sg[i] = 0; to
both of constructors.
Changed names so that they are different in
Worker definition and in function implementation.
Try this.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
using namespace RcppParallel;
struct SumsInGroups5: public Worker
{
const RVector<double> v;
const RVector<int> g;
std::vector<double> s;
SumsInGroups5(const NumericVector v, const IntegerVector g): v(v), g(g), s(*std::max_element(g.begin(), g.end()) + 1, 0.0){ }
SumsInGroups5(const SumsInGroups5& p, Split): v(p.v), g(p.g), s(*std::max_element(g.begin(), g.end()) + 1, 0.0) {}
void operator()(std::size_t begin, std::size_t end) {
for (std::size_t i = begin; i < end; ++i) {
s[g[i]]+=v[i];
}
}
void join(const SumsInGroups5& rhs) {
for(std::size_t i = 0; i < s.size(); i++) {
s[i] += rhs.s[i];
}
}
};
// Sums the values of v within the groups given by g.
//
// v : values to sum.
// g : group id of each value (0-based, non-negative); must have the same
//     length as v.
// Returns a numeric vector of per-group totals, of length max(g) + 1, or a
// zero-length vector when the input is empty.
// Raises an R error when v and g differ in length, because the worker reads
// g[i] for every index of v.
// [[Rcpp::export]]
NumericVector sg5(NumericVector v, IntegerVector g) {
  if (v.length() != g.length()) {
    stop("v and g must have the same length");
  }
  // Nothing to reduce; also keeps empty input away from the worker, which
  // sizes its accumulator from the largest element of g.
  if (v.length() == 0) {
    return NumericVector(0);
  }
  SumsInGroups5 p(v, g);
  parallelReduce(0, v.length(), p);
  return wrap(p.s);
}
/*** R
# Ten values in two groups of five (ids 0 and 1).
a <- 1:10
g <- c(rep(0,5),rep(1,5))
# Call sg5() 10000 times on the same input and stack the results row-wise.
bb <- lapply(1:10000,function(x)sg5(a,g))
cc<-do.call("rbind",bb)
# Distinct result rows; a single row means every call returned the same sums.
unique(cc)
*/
Compared to my other tries this code did not produce weird result in the same cases other code did. Not very assuring.
I have written a function that is meant to sum values in groups. It takes two vectors of the same length, v and g, and should return a vector whose length equals the number of unique elements in g. Groups are encoded as integers starting from zero. The code compiles with Rcpp::sourceCpp, but when called from R (sg(runif(6), rep(0:1, each = 3)), for example) it returns numeric(0).
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace Rcpp;
using namespace RcppParallel;
// RcppParallel worker that accumulates v[i] into s[g[i]] over a range of
// indices, for use with parallelReduce.
//
// NOTE(review): the splitting constructor initialises s from p.s. If copying
// an RVector shares the underlying buffer (it looks like a non-owning view -
// confirm against the RcppParallel headers), all split workers write to the
// same memory and join() adds that memory to itself.
struct SumsInGroups: public Worker
{
// Values to be summed.
const RVector<double> v;
// Group id of each value; used directly as an index into s.
const RVector<int> g;
// Running per-group totals.
RVector<double> s;
// Primary constructor: wraps the three R vectors supplied by the caller.
SumsInGroups(const NumericVector v, const IntegerVector g, NumericVector s): v(v), g(g), s(s) {}
// Splitting constructor: copies all three members, including s, from p.
SumsInGroups(const SumsInGroups& p, Split): v(p.v), g(p.g), s(p.s) {}
// Processes every i in [begin, end).
void operator()(std::size_t begin, std::size_t end) {
for (std::size_t i = begin; i < end; ++i) {
// A value compares unequal to itself only when it is NaN, so this branch
// overwrites a NaN slot with v[i]; any other slot is incremented.
if (s[g[i]] != s[g[i]]) s[g[i]] = v[i];
else s[g[i]] += v[i];
}
}
// Adds the totals held by rhs to the totals held by this worker, element-wise.
void join(const SumsInGroups& rhs) {
for(std::size_t i = 0; i < s.length(); i++) {
s[i] += rhs.s[i];
}
}
};
// Runs a SumsInGroups worker over [0, length(v)) with parallelReduce and
// returns the worker's s member.
//
// NOTE(review): s is default-constructed and never sized before it is handed
// to the worker. A default-constructed NumericVector presumably has length
// zero (confirm), in which case the worker has no slot for any group id.
// [[Rcpp::export]]
RVector<double> sg(NumericVector v, IntegerVector g) {
NumericVector s;
SumsInGroups p(v, g, s);
parallelReduce(0, v.length(), p);
return p.s;
}
I am very new to RcppParallel so any comments and suggestions are welcomed.
You need to initialize s. I suggest initializing with zeroes. Here is the code which worked for me. Note that since I initialize with zeroes, I do not need the checking you do in operator ().
#include <algorithm>
#include <vector>
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
using namespace Rcpp;
using namespace RcppParallel;
// RcppParallel reduction worker that sums v[i] into a per-group total.
//
// The worker built with the three-argument constructor (the "root") adds
// directly into the caller's output vector `s`. Workers created through the
// splitting constructor add into a private, zero-initialised buffer instead,
// and join() folds that buffer into the worker on its left. Without the
// private buffer every split would copy the RVector view of `s`, so all
// threads would race on the same memory and join() would add the output to
// itself, giving totals that are multiples of the true sums.
//
// join() relies on parallelReduce only ever passing a split-constructed
// worker as `rhs`; the root is always the final target of the reduction.
//
// Group ids are used directly as indices, so they must lie in [0, length(s)).
struct SumsInGroups: public Worker
{
  // Values to be summed.
  const RVector<double> v;
  // Group id of each value.
  const RVector<int> g;
  // Caller-provided output; only the root worker writes to it.
  RVector<double> s;

  // Root constructor: `s` must already have one zero-initialised slot per group.
  SumsInGroups(const NumericVector v, const IntegerVector g, NumericVector s)
    : v(v), g(g), s(s), partial(), is_split(false) {}

  // Splitting constructor: same inputs, fresh private accumulator.
  SumsInGroups(const SumsInGroups& p, Split)
    : v(p.v), g(p.g), s(p.s), partial(p.s.length(), 0.0), is_split(true) {}

  // Accumulates the values with indices in [begin, end).
  void operator()(std::size_t begin, std::size_t end) {
    double* acc = accumulator();
    for (std::size_t i = begin; i < end; ++i) {
      acc[g[i]] += v[i];
    }
  }

  // Merges the private totals of a split worker into this worker.
  void join(const SumsInGroups& rhs) {
    double* acc = accumulator();
    for (std::size_t i = 0; i < rhs.partial.size(); i++) {
      acc[i] += rhs.partial[i];
    }
  }

private:
  // Private totals of a split worker; stays empty in the root.
  std::vector<double> partial;
  // True for workers created by the splitting constructor.
  const bool is_split;

  // Where this worker accumulates: its own buffer, or the shared output for
  // the root.
  double* accumulator() {
    return is_split ? partial.data() : s.begin();
  }
};
// [[Rcpp::export]]
RVector<double> sg(NumericVector v, IntegerVector g) {
NumericVector s(*std::max_element(g.begin(), g.end()) + 1);
SumsInGroups p(v, g, s);
parallelReduce(0, v.length(), p);
return p.s;
}
/*** R
# Fixed seed so the random inputs below are reproducible.
set.seed(101)
# Fifteen random values, each assigned a group id drawn from 0..3.
o <- runif(15)
i <-sample(0:3,15, rep = TRUE)
# Parallel grouped sums, followed by the same sums computed with tapply()
# for comparison.
sg(o, i)
tapply(o, i, sum)
*/