Passing an Armadillo sparse matrix by reference with Rcpp - r

This question is related to this and this. The difference here is that I'm not passing an Rcpp type like NumericVector or NumericMatrix, but an arma::sp_mat.
Is there any way to pass an sp_mat to C++, modify its values, and have the changes show up in the original object in R?
This can be done with a NumericMatrix, for example:
# Compile a C++ function from R. It takes a NumericMatrix by reference and
# adds 5, in place, to every element that is not equal to zero.
cppFunction("void frob(NumericMatrix& x)
{
for(NumericMatrix::iterator it = x.begin(); it != x.end(); ++it)
{
if(*it != 0) *it = *it + 5;
}
}")
# 5 x 1 sparse Matrix with a single non-zero entry.
M <- Matrix(0, 5, 1, sparse=TRUE)
M[1] <- 1.2345
# frob() is applied to a dense copy of M, and the copy is printed afterwards.
m <- as.matrix(M)
frob(m)
m
#[,1]
#[1,] 6.2345
#[2,] 0.0000
#[3,] 0.0000
#[4,] 0.0000
#[5,] 0.0000
The same technique works for an arma::mat dense matrix. But for a sparse matrix, it doesn't work:
# Variant whose argument is an arma::sp_mat taken by reference; 5 is added to
# every element visited by the sp_mat iterator.
cppFunction("void frob2(arma::sp_mat& x)
{
for(arma::sp_mat::iterator it = x.begin(); it != x.end(); ++it)
{
*it = *it + 5;
}
}", depends="RcppArmadillo")
# Call it on the sparse Matrix object and print M afterwards.
frob2(M)
M
#5 x 1 sparse Matrix of class "dgCMatrix"
#[1,] 1.2345
#[2,] .
#[3,] .
#[4,] .
#[5,] .

Unfortunately there is no auxiliary memory constructor for sparse matrices in Armadillo.
However, you can construct a sparse-matrix-like structure in C++ using pointers to R objects. Here is an example:
/// Non-owning view of a matrix stored in compressed-sparse-column (CSC) form.
///
/// The object only records the dimensions, the number of stored values and
/// three raw pointers handed in by the caller; it never allocates or frees
/// anything. The index arrays are exposed as pointers to const, while `values`
/// points to non-const T, so stored entries can be modified through the view.
template <typename T>
class MappedCSC {
public:
  // Declared only; no definition is provided alongside this class.
  MappedCSC();

  // Stores the given sizes and pointers as-is.
  MappedCSC(std::uint32_t n_rows,
            std::uint32_t n_cols,
            size_t nnz,
            std::uint32_t* row_indices,
            std::uint32_t* col_ptrs,
            T* values)
      : n_rows(n_rows),
        n_cols(n_cols),
        nnz(nnz),
        row_indices(row_indices),
        col_ptrs(col_ptrs),
        values(values) {}

  const std::uint32_t n_rows;         // number of rows
  const std::uint32_t n_cols;         // number of columns
  const size_t nnz;                   // number of stored values
  const std::uint32_t* row_indices;   // row index of each stored value
  const std::uint32_t* col_ptrs;      // per-column offsets into row_indices / values
  T* values;                          // stored values (writable)
};

using dMappedCSC = MappedCSC<double>;
Here is how you can extract it:
// Builds a dMappedCSC view from an S4 object by reading its "Dim", "i", "p"
// and "x" slots. The returned view holds raw pointers obtained from the Rcpp
// vectors' begin(); nothing is copied here.
// NOTE(review): the pointers presumably stay valid only while `input` is alive
// and the slots already have integer / double storage — confirm with callers.
dMappedCSC extract_mapped_csc(Rcpp::S4 input) {
  Rcpp::IntegerVector dims = input.slot("Dim");
  Rcpp::IntegerVector row_idx = input.slot("i");
  Rcpp::IntegerVector col_ptr = input.slot("p");
  Rcpp::NumericVector vals = input.slot("x");

  const uint32_t nrows = dims[0];
  const uint32_t ncols = dims[1];

  // The slots are read as IntegerVector; the view wants uint32_t pointers.
  uint32_t* row_begin = reinterpret_cast<uint32_t*>(row_idx.begin());
  uint32_t* col_begin = reinterpret_cast<uint32_t*>(col_ptr.begin());

  return dMappedCSC(nrows, ncols, vals.length(), row_begin, col_begin, vals.begin());
}
And here is an example of how to iterate column by column:
// Multiplies the dense matrix x_r by the CSC matrix held in y_csc_r, one
// column of the sparse operand at a time, and returns the dense product.
Rcpp::NumericMatrix dense_csc_prod(const Rcpp::NumericMatrix &x_r, const Rcpp::S4 &y_csc_r) {
  // Wrap the R matrix memory in an Armadillo matrix without copying it.
  const arma::dmat x((double *)&x_r[0], x_r.nrow(), x_r.ncol(), false, false);
  const dMappedCSC y_csc = extract_mapped_csc(y_csc_r);

  // The result is allocated on the R side; res_view writes straight into it.
  Rcpp::NumericMatrix res(x.n_rows, y_csc.n_cols);
  arma::dmat res_view(res.begin(), res.nrow(), res.ncol(), false, false);

  for (uint32_t col = 0; col < y_csc.n_cols; ++col) {
    const uint32_t first = y_csc.col_ptrs[col];
    const uint32_t last = y_csc.col_ptrs[col + 1];
    const uint32_t count = last - first;

    // The mapped row indices are uint32_t, while Armadillo indexes with uvec,
    // so the indices of this column are copied and converted to uword.
    const arma::Col<uint32_t> rows32(&y_csc.row_indices[first], count);
    const arma::uvec rows = arma::conv_to<arma::uvec>::from(rows32);

    // Stored values of this column, viewed in place.
    const arma::colvec col_values(&y_csc.values[first], count, false, false);

    res_view.col(col) = x.cols(rows) * col_values;
  }
  return res;
}

Related

Results for calculating nearest positive definite matrix are different in R function and Rcpp function

In R, we can use Matrix::nearPD() to calculate nearest positive definite matrix.
I have written a Rcpp-version, nearPD_c, myself as follows (c++ file),
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
using namespace arma;
using namespace Rcpp;
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
vec rep_each(const vec& x, const int each) {
  // Returns a vector in which every element of x appears `each` times in a
  // row, in the order of x. No check is made that `each` is non-negative.
  const std::size_t n_in = x.n_elem;
  vec out(n_in * each);
  auto dest = out.begin();
  for (std::size_t k = 0; k < n_in; ++k) {
    // fill_n returns the position just past the run it wrote.
    dest = std::fill_n(dest, each, x[k]);
  }
  return out;
}
// Multiplies each element of mt1, walked column by column, by the element of
// v1 at the same running position, and returns the scaled copy.
// The sizes of the two inputs are not checked against each other.
mat mat_vec_same_len(mat mt1, vec v1){
  const uword total = mt1.n_elem;
  for (uword pos = 0; pos < total; ++pos) {
    mt1(pos) *= v1(pos);
  }
  return mt1;
}
// [[Rcpp::export]]
vec pmax_c(double a, vec b){
  // Element-wise maximum of the scalar a and the vector b; b is taken by
  // value, so the caller's vector is left untouched.
  b.transform([a](double value) { return std::max(a, value); });
  return b;
}
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
mat nearPD_c(mat x,
             bool corr = false,         // if true, force a unit diagonal
             bool keepDiag = false,     // if true, keep the diagonal of x
             bool do2eigen = true,      // if true, run the final posdefify eigen step
             bool doSym = false,        // symmetrize X after each projection
             bool doDykstra = true,     // use Dykstra's correction
             bool only_values = false,  // if true, return only the eigenvalues
             double eig_tol = 1e-6,     // relative positiveness of eigenvalues compared to largest
             double conv_tol = 1e-7,    // convergence tolerance for the iteration
             double posd_tol = 1e-8,    // tolerance for enforcing positive definiteness
             int maxit = 100,           // maximum number of iterations allowed
             bool trace = false         // if true, print one line per iteration
) {
  // C++ port of the nearPD() iteration: alternately project onto the matrices
  // with only "relatively positive" eigenvalues and onto the matrices with the
  // requested diagonal, then (optionally) run a final eigenvalue clean-up.
  //
  // Returns the adjusted matrix, or, when only_values is true, the eigenvalues
  // as a one-column matrix in ascending order.
  //
  // Armadillo's eig_sym() returns eigenvalues in ASCENDING order, so the
  // largest eigenvalue is the last element and the smallest is the first.
  if (x.is_empty()) {
    return x;
  }

  vec diagX0;
  if (keepDiag) {
    diagX0 = x.diag();
  }

  // Dykstra's correction term, same shape as x, starting at zero.
  mat D_S;
  if (doDykstra) {
    D_S.zeros(x.n_rows, x.n_cols);
  }

  mat X = x;
  mat Y;
  mat R;
  vec d;
  mat Q;
  int iter = 0;
  bool converged = false;
  double conv = R_PosInf;

  while (iter < maxit && !converged) {
    Y = X;
    if (doDykstra) {
      R = Y - D_S;
    }
    const mat& B = doDykstra ? R : Y;
    if (!eig_sym(d, Q, B)) {
      Rcpp::stop("eigen decomposition failed");
    }

    // Keep the eigenvalues that are positive relative to the largest one.
    const uvec keep = find(d > eig_tol * d(d.n_elem - 1));
    if (keep.is_empty()) {
      // The R original stops here ("Matrix seems negative semi-definite");
      // this port gives up and returns the current iterate instead.
      Rcpp::Rcout << "did not converge! " << std::endl;
      return X;
    }

    // Rebuild X from the kept eigenpairs: (Q scaled column-wise by d) * Q'.
    Q = Q.cols(keep);
    X = mat_vec_same_len(Q, rep_each(d.elem(keep), Q.n_rows)) * Q.t();

    // Update Dykstra's correction.
    if (doDykstra) {
      D_S = X - R;
    }

    // Project onto symmetric and possibly 'given diag' matrices.
    if (doSym) {
      X = (X + X.t()) / 2;
    }
    if (corr) {
      X.diag().ones();
    } else if (keepDiag) {
      X.diag() = diagX0;
    }

    conv = norm(Y - X, "inf") / norm(Y, "inf");
    ++iter;
    if (trace) {
      Rcpp::Rcout << "iter " << iter << " : #{p}= " << keep.n_elem << std::endl;
    }
    converged = (conv <= conv_tol);
  }

  if (!converged) {
    Rcpp::Rcout << "did not converge! " << std::endl;
  }

  // Final clean-up, run once after the iteration: lift eigenvalues below Eps
  // up to Eps and rescale so the diagonal is preserved.
  if (do2eigen || only_values) {
    if (!eig_sym(d, Q, X)) {
      Rcpp::stop("eigen decomposition failed");
    }
    const double Eps = posd_tol * std::abs(d(d.n_elem - 1));  // based on the largest eigenvalue
    if (d(0) < Eps) {                                         // smallest eigenvalue too small
      d.elem(find(d < Eps)).fill(Eps);
      if (!only_values) {
        const vec o_diag = X.diag();
        // X = Q * diag(d) * Q': scale row i of Q' by d(i), then multiply.
        mat Qt = Q.t();
        Qt.each_col() %= d;
        X = Q * Qt;
        // Element-wise rescale: X(i, j) * D(i) * D(j).
        const vec D = sqrt(pmax_c(Eps, o_diag) / X.diag());
        X.each_col() %= D;
        X.each_row() %= D.t();
      }
    }
    if (only_values) {
      return d;
    }
    if (corr) {
      X.diag().ones();
    } else if (keepDiag) {
      X.diag() = diagX0;
    }
  }

  return X;
}
However, although nearPD and nearPD_c give similar results, they are not identical. For example (in R):
> mt0=matrix(c(0.5416, -0.0668 , -0.1538, -0.2435,
+ -0.0668 , 0.9836 , -0.0135 , -0.0195,
+ -0.1538 , -0.0135 , 0.0226 , 0.0334,
+ -0.2435, -0.0195 , 0.0334 , 0.0487),4,byrow = T)
> nearPD(mt0)$mat
4 x 4 Matrix of class "dpoMatrix"
[,1] [,2] [,3] [,4]
[1,] 0.55417390 -0.06540967 -0.14059121 -0.22075966
[2,] -0.06540967 0.98375373 -0.01203943 -0.01698557
[3,] -0.14059121 -0.01203943 0.03650733 0.05726836
[4,] -0.22075966 -0.01698557 0.05726836 0.08983952
> nearPD_c(mt0)
[,1] [,2] [,3] [,4]
[1,] 0.55417390 -0.06540967 -0.14059123 -0.22075967
[2,] -0.06540967 0.98375373 -0.01203944 -0.01698557
[3,] -0.14059123 -0.01203944 0.03650733 0.05726837
[4,] -0.22075967 -0.01698557 0.05726837 0.08983952
There are some differences in the 7th or 8th decimal place, which make nearPD(mt0) positive definite while nearPD_c(mt0) is not.
> chol(nearPD(mt0)$mat)
4 x 4 Matrix of class "Cholesky"
[,1] [,2] [,3] [,4]
[1,] 7.444286e-01 -8.786561e-02 -1.888579e-01 -2.965491e-01
[2,] . 9.879440e-01 -2.898297e-02 -4.356729e-02
[3,] . . 1.029821e-04 1.014128e-05
[4,] . . . 1.071201e-04
> chol(nearPD_c(mt0))
Error in chol.default(nearPD_c(mt0)) :
the leading minor of order 3 is not positive definite
I sense that there might be some rounding issue in Rcpp. But I couldn't identify it. Any insights of what goes wrong?
There is at least one logic error in your post-processing. In R we have:
e <- eigen(X, symmetric = TRUE)
d <- e$values
Eps <- posd.tol * abs(d[1])
if (d[n] < Eps) {
d[d < Eps] <- Eps
[...]
While you have:
eig_sym(d, Q, X);
double Eps = posd_tol * std::abs(d[1]);
// if (d[n] < Eps) { //should be n-1?
if (d(n-1) < Eps) {
uvec d_comp = d < Eps;
for(int i=0;i<sum(d_comp);i++){
if(d_comp(i)){
d(i)=Eps;
}
}
According to the Armadillo docs, eigen values are in ascending order, while they are in decreasing order in R. So R builds Eps based on the largest eigen value, while you use the second(!) smallest. Then R compares with the smallest eigen value, while you compare with the largest. Something like this should give the same results as R (untested):
eig_sym(d, Q, X);
double Eps = posd_tol * std::abs(d[n-1]);
if (d(0) < Eps) {
uvec d_comp = d < Eps;
for(int i=0;i<sum(d_comp);i++){
if(d_comp(i)){
d(i)=Eps;
}
}
BTW, you only need // [[Rcpp::export]] for functions that you want to call from R.

Interleaving results from many objects in Rcpp

I need to write to a file row by row of matrices and sparse matrices that appears in a list and I am doing something like this:
#include <RcppArmadillo.h>
// [[Rcpp::export]]
bool write_rows (Rcpp::List data, Rcpp::CharacterVector clss, int n) {
  // Walks rows 0..n-1 and, for each row, visits every list element in turn,
  // dispatching on the class name given in clss. Always returns true.
  const int n_items = data.length();
  for (int row_idx = 0; row_idx < n; ++row_idx) {
    for (int item = 0; item < n_items; ++item) {
      if (clss[item] == "matrix") {
        Rcpp::NumericMatrix dense = data[item];
        auto row = dense.row(row_idx);
        // do something with row i
        continue;
      }
      if (clss[item] == "dgCMatrix") {
        // The list element is converted to arma::sp_mat here, i.e. once for
        // every (row, element) pair.
        arma::sp_mat sparse = data[item];
        auto row = sparse.row(row_idx);
        // do something different with row i
      }
    }
  }
  return true;
}
This function can be called in R with:
# Example input: one sparse and one dense matrix, both with 1000 rows.
data <- list(
x = Matrix::rsparsematrix(nrow = 1000, ncol = 1000, density = 0.3),
y = matrix(1:10000, nrow = 1000, ncol = 10)
)
# Class name of each list element, in the same order as `data`.
clss <- c("dgCMatrix", "matrix")
write_rows(data, clss, 1000)
The function receives a list of matrices or sparse matrices with the same number of rows and writes those matrices row by row, ie. first writes first rows of all elements in data then the second row of all elements and etc.
My problem is that the line arma::sp_mat x = data[j]; seems to have a huge impact on performance, since I am implicitly casting the list element data[j] to an Armadillo sparse matrix n times.
My question is: is there any way I could avoid this? Is there a more efficient solution? I tried to find a solution by looking into readr's source code, since they also write list elements row by row, but they also do a cast for each row (in this line, for example), though maybe this doesn't impact the performance because they deal with SEXPs?
With the clarification, it seems that the result should interleave the rows from each matrix. You can still do this while avoiding multiple conversions.
This is the original code, modified to generate some actual output:
// [[Rcpp::export]]
arma::mat write_rows(Rcpp::List data, Rcpp::CharacterVector clss, int nrows, int ncols) {
  // Interleaves the rows of all list elements: output row k holds row i of
  // element j, with j cycling fastest. Each list element is converted again
  // for every row that is read from it.
  const int n_items = data.length();
  arma::mat result(nrows*n_items, ncols);
  int out_row = 0;
  for (int i = 0; i < nrows; ++i) {
    for (int j = 0; j < n_items; ++j) {
      arma::rowvec r;
      if (clss[j] == "matrix") {
        Rcpp::NumericMatrix dense = data[j];
        r = dense.row(i);
      } else {
        arma::sp_mat sparse = data[j];
        r = sparse.row(i);
      }
      result.row(out_row) = r;
      ++out_row;
    }
  }
  return result;
}
The following code creates a vector of converted objects, and then extracts the rows from each object as required. The conversion is only done once per matrix. I use a struct containing a dense and sparse mat because it's a lot simpler than dealing with unions; and I don't want to drag in boost::variant or require C++17. Since there's only 2 classes we want to deal with, the overhead is minimal.
// Holder with one dense and one sparse matrix member; a user fills whichever
// of the two matches the input and leaves the other default-constructed.
struct Matrix_types {
  arma::mat m;      // dense payload
  arma::sp_mat M;   // sparse payload
};
// [[Rcpp::export]]
arma::mat write_rows2(Rcpp::List data, Rcpp::CharacterVector clss, int nrows, int ncols) {
  // Same interleaved output as write_rows, but every list element is
  // converted to an Armadillo matrix exactly once, before any row is read.
  const int n_items = data.length();
  std::vector<Matrix_types> converted(n_items);
  std::vector<bool> is_dense(n_items);
  arma::mat result(nrows*n_items, ncols);

  // Pass 1: convert each element into the matching member of its holder.
  for (int j = 0; j < n_items; ++j) {
    const bool dense = (clss[j] == "matrix");
    is_dense[j] = dense;
    if (dense) {
      converted[j].m = Rcpp::as<arma::mat>(data[j]);
    } else {
      converted[j].M = Rcpp::as<arma::sp_mat>(data[j]);
    }
  }

  // Pass 2: copy row i of every element into consecutive output rows.
  int out_row = 0;
  for (int i = 0; i < nrows; ++i) {
    for (int j = 0; j < n_items; ++j) {
      if (is_dense[j]) {
        result.row(out_row) = converted[j].m.row(i);
      } else {
        // Materialise the sparse row as a dense row vector first.
        arma::rowvec dense_row(converted[j].M.row(i));
        result.row(out_row) = dense_row;
      }
      ++out_row;
    }
  }
  return result;
}
Running on some test data:
# Test data: two sparse and two dense 1000 x 1000 matrices.
data <- list(
a=Matrix(1.0, 1000, 1000, sparse=TRUE),
b=matrix(2.0, 1000, 1000),
c=Matrix(3.0, 1000, 1000, sparse=TRUE),
d=matrix(4.0, 1000, 1000)
)
# Time the per-row-conversion version ...
system.time(z <- write_rows(data, sapply(data, class), 1000, 1000))
# user system elapsed
# 185.75 35.04 221.38
# ... and the convert-once version.
system.time(z2 <- write_rows2(data, sapply(data, class), 1000, 1000))
# user system elapsed
# 4.21 0.05 4.25
# Both versions must produce the same matrix.
identical(z, z2)
# [1] TRUE

Unexpected behaviour in Rcpp

Please note that this error was taken from a bigger context, which I cannot obviously report here entirely.
I have the following functions in the file fun.cpp
#include <RcppArmadilloExtensions/sample.h>
using namespace Rcpp;
// [[Rcpp::depends(RcppArmadillo)]]
// Column means of `data`: every entry is divided by the row count and added
// to the accumulator of its column.
// NOTE: xbar is created with a size only and is never set to zero before the
// += accumulation below, so the result depends on whatever values the vector
// happens to start with.
arma::vec colMeans(arma::mat data){
int n_0 = data.n_rows;
arma::vec xbar(data.n_cols);
for(int i = 0; i < data.n_rows; i++){
for(int j = 0; j < data.n_cols; j++){
xbar[j] += data(i,j) /n_0;
}
}
return xbar;
}
// [[Rcpp::export]]
List PosteriorNIW(arma::mat data, arma::vec mu0, double lambda0,
                  double df0, arma::mat V){
  // Computes posterior quantities from the data rows: the updated mean
  // (mun), the updated scale matrix (Vn) and the updated lambda (lambdan).
  // df0 is accepted but not used in this function.
  const int n = data.n_rows;
  const arma::vec xbar = colMeans(data);
  const double lambdan = lambda0 + n;
  const arma::vec mun = (lambda0 * mu0 + n * xbar) / lambdan;

  // Scatter matrix: sum over rows of (row - xbar)(row - xbar)'.
  arma::mat S(data.n_cols, data.n_cols, arma::fill::zeros);
  for (int i = 0; i < n; ++i) {
    const arma::vec dev = arma::conv_to<arma::vec>::from(data.row(i)) - xbar;
    S += dev * arma::trans(dev);
  }

  const arma::vec shift = xbar - mu0;
  arma::mat Vn = V + S + ((lambda0*n)/(lambda0 + n)) * shift * arma::trans(shift);

  return List::create(_["mun"] = mun,
                      _["Vn"] = Vn,
                      _["lambdan"] = lambdan);
}
Calling now:
library(Rcpp); library(RcppArmadillo)
# Prior mean and prior scale matrix.
mu0 <- c(3,3)
V0 <- matrix(c(2.5,0.0,0.0,2.5), nrow = 2)
sourceCpp("fun.cpp")
# 15 identical observations: first column all 5, second column all 0.
data <- cbind(rep(5,15),rep(0,15))
PosteriorNIW(data, mu0, 1, 1, V0)
gives the expected result.
$mun
[,1]
[1,] 4.8750
[2,] 0.1875
$Vn
[,1] [,2]
[1,] 6.250 -5.6250
[2,] -5.625 10.9375
$lambdan
[1] 16
Now if I add to the file fun.cpp the following functions (again, these are taken from a bigger context so don't bother trying to understand but just paste them) strange things happens:
// [[Rcpp::export]]
NumericMatrix myFun(arma::mat t_dish, arma::cube data){
  // Returns a matrix with one row per row of `data` and two columns.
  // t_dish is accepted but not used.
  const int n_out = static_cast<int>(data.n_rows);
  return NumericMatrix(n_out, 2);
}
// [[Rcpp::export]]
int myFun2(arma::cube n_cust){
  // Takes the first row-slab of the cube as a matrix, reads one row of it per
  // cube column, and returns the final loop counter plus one.
  arma::mat slab = n_cust.subcube(arma::span(0), arma::span(), arma::span());
  int i = 0;
  while (i < n_cust.n_cols) {
    arma::rowvec current = slab.row(i);
    ++i;
  }
  return i + 1;
}
// [[Rcpp::export]]
arma::vec myFun3(arma::mat k_tables){
// Returns a sorted vector of length 1 when k_tables(0,0) is not NA, and of
// length 0 otherwise.
arma::vec temp(k_tables.n_cols * k_tables.n_rows);
int l = 0;
if(!R_IsNA(k_tables(0,0))){
l++;
}
// NOTE: temp2 is given a size but no values before it is sorted.
arma::vec temp2(l);
arma::vec tmp3 = sort(temp2);
return tmp3;
}
// Evaluates num / den, where num is a power of (1 + q / nu) with
// q = (x - mu)' * inv(Sigma) * (x - mu), and den combines pi * nu with the
// square root of det(Sigma).
double myFun4(arma::vec x, double nu, arma::vec mu, arma::mat Sigma){
  arma::vec quad = (arma::trans(x - mu) * arma::inv(Sigma) * (x - mu));
  const double exponent = - ( nu + 2 ) / 2;
  const double num = pow(1 + (1 / nu) * quad[0], exponent);
  const double den = pow(sqrt(M_PI * nu),2) * sqrt(arma::det(Sigma));
  return num / den;
}
// Returns true when some element of X compares equal to z.
bool myFun5(NumericVector X, double z) {
  for (NumericVector::iterator it = X.begin(); it != X.end(); ++it) {
    if (*it == z) {
      return true;
    }
  }
  return false;
}
calling PosteriorNIW(data, mu0, 1, 1, V0) repeatedly starts giving different results every time. Note that there is no randomness in the functions and that obviously those functions have got no impact as they are not called in the original function.
I have tried on a different machine to make sure it was not a problem of my compiler but the error keeps happening.
I know that removing those function (even just one of them) fixes the problem but clearly this is not a feasible solution when I am working with more functions.
I would like to know if other users are able to replicate this behavior and if yes if there is a fix for it.
Thank you in advance
EDIT:
The version of R is 3.3.2 and Rtools is 3.4. Both Rcpp and RcppArmadillo are up-to-date
You're not zeroing xbar in your colMeans function. If I do that:
// Column means of `data`. The accumulator is explicitly zero-filled before
// any value is added to it.
arma::vec colMeans(arma::mat data){
  const int n_0 = data.n_rows;
  arma::vec xbar(data.n_cols, arma::fill::zeros);
  // Walk column by column; within a column the rows are still added in
  // increasing order.
  for (arma::uword j = 0; j < data.n_cols; ++j) {
    for (arma::uword i = 0; i < data.n_rows; ++i) {
      xbar[j] += data(i,j) / n_0;
    }
  }
  return xbar;
}
I get this everytime:
> PosteriorNIW(data, mu0, 1, 1.1, V0)
$mun
[,1]
[1,] 4.8750
[2,] 0.1875
$Vn
[,1] [,2]
[1,] 6.250 -5.6250
[2,] -5.625 10.9375
$lambdan
[1] 16
Even when I do add your extra block of code.
I don't know whether these vectors are documented to be initialised to zero by their constructor (in which case this might be a bug there) or not, in which case it's your bug!

Rcpp returns large negative number when 2 large positives are multiplied

I am creating a function that calculates area under the curve and when I take the 2 partials and multiply them for the numerator I exceed 2^31 and then a value like -2013386137 is used in the calculation.
Here are the cpp chunks
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
  // Returns the values of x in ascending order as a new vector; x itself is
  // not modified.
  NumericVector sorted(x.begin(), x.end());
  std::sort(sorted.begin(), sorted.end());
  return sorted;
}
// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
  // For every element of x, the position at which match() finds it in the
  // sorted copy of x.
  const NumericVector sorted = sort_rcpp(x);
  return match(x, sorted);
}
// [[Rcpp::export]]
double auc_(NumericVector actual, NumericVector predicted) {
// Area under the curve from the ranks of `predicted`: the rank sum of the
// entries where actual == 1, minus NPos*(NPos+1)/2, divided by NPos*NNeg.
double n = actual.size();
IntegerVector Ranks = rank(predicted);
int NPos = sum(actual == 1);
int NNeg = (actual.size() - NPos);
// NOTE: sumranks and the products below are computed in int, so for large
// inputs they can exceed the range of int before being stored in p1 / p2.
int sumranks = 0;
for(int i = 0; i < n; ++i) {
if (actual[i] == 1){
sumranks = sumranks + Ranks[i];
}
}
double p1 = (sumranks - NPos*( NPos + 1 ) / 2);
long double p2 = NPos*NNeg;
double auc = p1 / p2;
return auc ;
}
and then the test example that has the issue
# Random test case: 100000 binary labels and 100000 uniform scores.
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
auc_(Actual, Predicted)
I am also putting this in an R package
# Same test case, run through the packaged version of the function.
devtools::install_github("JackStat/ModelMetrics")
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
ModelMetrics::auc(Actual, Predicted)
You use int internally in your function which leads to overflow. Use a double and things look sunnier:
R> sourceCpp("/tmp/jackstat.cpp")
R> N <- 100000
R> Actual <- as.numeric(runif(N) > .65)
R> Predicted <- as.numeric(runif(N))
R> auc1(Actual, Predicted) # your function
[1] -0.558932
R> auc2(Actual, Predicted) # my variant using double
[1] 0.499922
R>
The complete corrected file is below:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
  // Copies the values of x into a fresh vector and sorts that copy in
  // ascending order; the input is left as it was.
  NumericVector result(x.begin(), x.end());
  std::sort(result.begin(), result.end());
  return result;
}
// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
  // Looks up each element of x in the sorted copy of x and returns the
  // positions reported by match().
  const NumericVector ordered = sort_rcpp(x);
  return match(x, ordered);
}
// [[Rcpp::export]]
double auc1(NumericVector actual, NumericVector predicted) {
// Baseline variant kept for comparison: all intermediate counts and sums are
// held in int.
double n = actual.size();
IntegerVector Ranks = rank(predicted);
int NPos = sum(actual == 1);
int NNeg = (actual.size() - NPos);
// NOTE: int arithmetic here and in p1 / p2 below can exceed the range of int
// for large inputs.
int sumranks = 0;
for(int i = 0; i < n; ++i) {
if (actual[i] == 1){
sumranks = sumranks + Ranks[i];
}
}
double p1 = (sumranks - NPos*( NPos + 1 ) / 2);
long double p2 = NPos*NNeg;
double auc = p1 / p2;
return auc ;
}
// [[Rcpp::export]]
double auc2(NumericVector actual, NumericVector predicted) {
  // Area under the curve from the ranks of `predicted`, with every count and
  // sum held in floating point so the products cannot overflow int.
  const R_xlen_t n = actual.size();
  const IntegerVector Ranks = rank(predicted);
  const double NPos = sum(actual == 1);
  const double NNeg = n - NPos;

  // Sum of the ranks of the entries labelled 1.
  double sumranks = 0;
  for (R_xlen_t i = 0; i < n; ++i) {
    if (actual[i] != 1) {
      continue;
    }
    sumranks += Ranks[i];
  }

  const double p1 = sumranks - NPos * (NPos + 1) / 2;
  const long double p2 = NPos * NNeg;
  const double auc = p1 / p2;
  return auc;
}
/*** R
# Run both variants on the same random input.
N <- 100000
Actual <- as.numeric(runif(N) > .65)
Predicted <- as.numeric(runif(N))
auc1(Actual, Predicted)
auc2(Actual, Predicted)
*/

Finding unique rows in arma::mat

In R we can use unique method to find unique rows
> data <- matrix(c(1,1,0,1,1,1,0,1),ncol = 2)
> data
[,1] [,2]
[1,] 1 1
[2,] 1 1
[3,] 0 0
[4,] 1 1
> unique(data)
[,1] [,2]
[1,] 1 1
[2,] 0 0
How can we do it for arma::mat in Rcpp?
Here unique function returns unique elements not unique rows.
I don't think there is a built-in way to do this in the Armadillo library, but here is a simple approach:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
// True when lhs and rhs are equal element by element within an absolute
// difference of tol.
template <typename T>
inline bool rows_equal(const T& lhs, const T& rhs, double tol = 0.00000001) {
  const bool same = arma::approx_equal(lhs, rhs, "absdiff", tol);
  return same;
}
// [[Rcpp::export]]
arma::mat unique_rows(const arma::mat& x) {
  // Returns the distinct rows of x in order of first appearance. Rows are
  // compared with rows_equal(), i.e. within its absolute tolerance.
  // An input with no rows is returned unchanged.
  const arma::uword nr = x.n_rows;
  if (nr == 0) {
    return x;
  }

  arma::mat result(nr, x.n_cols);
  arma::uword count = 0;  // number of distinct rows stored in result so far

  for (arma::uword i = 0; i < nr; ++i) {
    // Compare against the rows already kept, so that the FIRST occurrence of
    // each distinct row is the one that survives.
    bool seen = false;
    for (arma::uword k = 0; k < count && !seen; ++k) {
      seen = rows_equal(x.row(i), result.row(k));
    }
    if (!seen) {
      result.row(count++) = x.row(i);
    }
  }

  // count >= 1 here because nr >= 1.
  return result.rows(0, count - 1);
}
/*** R
# Compare unique_rows() with base R's unique() on four small matrices.
data <- matrix(c(1,1,0,1,1,1,0,1), ncol = 2)
all.equal(unique(data), unique_rows(data))
#[1] TRUE
data2 <- matrix(1:9, nrow = 3)
all.equal(unique(data2), unique_rows(data2))
#[1] TRUE
data3 <- matrix(0, nrow = 3, ncol = 3)
all.equal(unique(data3), unique_rows(data3))
#[1] TRUE
data4 <- matrix(c(0, 0, 0, 1, 1, 0, 1, 1), ncol = 2)
all.equal(unique(data4), unique_rows(data4))
#[1] TRUE
*/
As suggested by mtall in the comments, rows_equal is using arma::approx_equal to test for equality, rather than operator==, to avoid some of the comparison issues inherent to floating point numbers. The options used in this function were chosen somewhat arbitrarily and can of course be changed as needed; but the value of tol is roughly equal to the default tolerance used by R's all.equal, which is .Machine$double.eps^0.5 (~0.00000001490116 on my machine).
The same approach, inspired by @nrussell, but slightly shorter:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
// True when lhs and rhs agree element by element within an absolute
// difference of tol.
template <typename T>
inline bool approx_equal_cpp(const T& lhs, const T& rhs, double tol = 0.00000001) {
  const bool within_tol = arma::approx_equal(lhs, rhs, "absdiff", tol);
  return within_tol;
}
// [[Rcpp::export]]
arma::mat unique_rows(const arma::mat& m) {
  // Flags, for every row, the next later row that matches it, then returns
  // the rows that were never flagged.
  const arma::uword n = m.n_rows;
  arma::uvec is_dup(n, arma::fill::zeros);
  for (arma::uword i = 0; i < n; ++i) {
    arma::uword j = i + 1;
    while (j < n) {
      if (approx_equal_cpp(m.row(i), m.row(j))) {
        is_dup(j) = 1;
        break;
      }
      ++j;
    }
  }
  return m.rows(find(is_dup == 0));
}
// [[Rcpp::export]]
arma::mat unique_cols(const arma::mat& m) {
  // Flags, for every column, the next later column that matches it, then
  // returns the columns that were never flagged.
  const arma::uword n = m.n_cols;
  arma::uvec is_dup(n, arma::fill::zeros);
  for (arma::uword i = 0; i < n; ++i) {
    arma::uword j = i + 1;
    while (j < n) {
      if (approx_equal_cpp(m.col(i), m.col(j))) {
        is_dup(j) = 1;
        break;
      }
      ++j;
    }
  }
  return m.cols(find(is_dup == 0));
}
/*** R
# Compare unique_rows() with base R's unique() on four small matrices.
data <- matrix(c(1,1,0,1,1,1,0,1), ncol = 2)
all.equal(unique(data), unique_rows(data))
#[1] TRUE
data2 <- matrix(1:9, nrow = 3)
all.equal(unique(data2), unique_rows(data2))
#[1] TRUE
data3 <- matrix(0, nrow = 3, ncol = 3)
all.equal(unique(data3), unique_rows(data3))
#[1] TRUE
data4 <- matrix(c(0, 0, 0, 1, 1, 0, 1, 1), ncol = 2)
all.equal(unique(data4), unique_rows(data4))
#[1] TRUE
*/

Resources