Ordering Permutation in Rcpp i.e. base::order() - r

I have a ton of code using the base::order() command and I am really too lazy to code around that in rcpp. Since Rcpp only supports sort, but not order, I spent 2 minutes creating this function:
// [[Rcpp::export]]
// Ordering permutation of 'invec', 1-based like base::order.
// NOTE(review): O(n^2) -- each iteration does a full sugar scan via
// sum(invec < invec[i]), which explains the benchmark results below.
// NOTE(review): with duplicated values, several i's compute the same target
// slot and overwrite each other, while other slots keep the cloned original
// data -- output for ties may be wrong, not merely a different valid order.
Rcpp::NumericVector order_cpp(Rcpp::NumericVector invec){
int leng = invec.size();
// clone so the writes below cannot touch the caller's R vector
NumericVector y = clone(invec);
for(int i=0; i<leng; ++i){
// rank of invec[i] = number of strictly smaller elements
y[sum(invec<invec[i])] = i+1;
}
return(y);
}
It somehow works. If the vectors are containing unique numbers, I get the same result as order(). If they are not unique, results are different, but not wrong (no unique solution really).
Using it:
c=sample(1:1000,500)
all.equal(order(c),order_cpp(c))
microbenchmark(order(c),order_cpp(c))
Unit: microseconds
expr min lq median uq max neval
order(c) 33.507 36.223 38.035 41.356 78.785 100
order_cpp(c) 2372.889 2427.071 2466.312 2501.932 2746.586 100
Ouch!
I need an efficient algorithm.
Ok, so I dug up a bubblesort implementation and adapted it:
// [[Rcpp::export]]
// Ordering permutation (1-based, like base::order for unique values),
// computed by bubble-sorting the keys while carrying their positions along.
//
// FIX: the original bubble-sorted 'vec' itself. Rcpp vectors share memory
// with the R object passed in, so the caller's vector was silently sorted as
// a side effect of every call. We now sort a clone and leave the input alone.
Rcpp::NumericVector bubble_order_cpp2(Rcpp::NumericVector vec){
    Rcpp::NumericVector keys = Rcpp::clone(vec);   // private, mutable copy of the values
    int n = keys.size();
    Rcpp::NumericVector outvec(n);
    for (int i = 0; i < n; ++i){
        outvec[i] = static_cast<double>(i) + 1.0;  // start from the identity permutation
    }
    int passes = 0;
    while (true) {
        int no_swaps = 0;
        // each pass bubbles the largest remaining key to the end,
        // so the unsorted tail shrinks by one per pass
        for (int i = 0; i < n - 1 - passes; ++i) {
            if (keys[i] > keys[i+1]) {
                no_swaps++;
                double tmp = keys[i];
                keys[i] = keys[i+1];
                keys[i+1] = tmp;
                tmp = outvec[i];
                outvec[i] = outvec[i+1];
                outvec[i+1] = tmp;
            }
        }
        if (no_swaps == 0) break;   // a clean pass means we are done
        passes++;
    }
    return outvec;
}
Well, it's better - but not great:
microbenchmark(order(c),order_cpp(c),bubble_order_cpp2(c),sort(c),c[order(c)])
Unit: microseconds
expr min lq median uq max neval
order(c) 33.809 38.034 40.1475 43.3170 72.144 100
order_cpp(c) 2339.080 2435.675 2478.5385 2526.8350 3535.637 100
bubble_order_cpp2(c) 219.752 231.977 234.5430 241.1840 322.383 100
sort(c) 59.467 64.749 68.2205 75.4645 148.815 100
c[order(c)] 38.336 41.204 44.3735 48.1460 93.878 100
Another finding: It's faster to order than to sort.
Well, then for shorter vectors:
c=sample(1:100)
microbenchmark(order(c),order_cpp(c),bubble_order_cpp2(c),sort(c),c[order(c)])
Unit: microseconds
expr min lq median uq max neval
order(c) 10.566 11.4710 12.8300 14.1880 63.089 100
order_cpp(c) 95.689 100.8200 102.7825 107.3105 198.018 100
bubble_order_cpp2(c) 9.962 11.1700 12.0750 13.2830 64.598 100
sort(c) 39.242 41.5065 42.5620 46.3355 155.758 100
c[order(c)] 11.773 12.6790 13.5840 15.9990 82.710 100
Oh well, I have overlooked an RcppArmadillo function:
// [[Rcpp::export]]
// Ordering permutation via RcppArmadillo: sort_index() returns 0-based
// positions of the ascending sort, +1 converts to R's 1-based convention.
// Tie order depends on arma's sort and may differ from base::order -- confirm
// if tie-stability matters to callers.
Rcpp::NumericVector ordera(arma::vec x) {
return(Rcpp::as<Rcpp::NumericVector>(Rcpp::wrap(arma::sort_index( x )+1)) );
}
microbenchmark(order(c),order_(c),ordera(c))
Unit: microseconds
expr min lq median uq max neval
order(c) 9.660 11.169 11.773 12.377 46.185 100
order_(c) 4.529 5.133 5.736 6.038 34.413 100
ordera(c) 4.227 4.830 5.434 6.038 60.976 100

Here's a simple version leveraging Rcpp sugar to implement an order function. We put in a check for duplicates so that we guarantee that things work 'as expected'. (There is also a bug with Rcpp's sort method when there are NAs, so that may want to be checked as well -- this will be fixed by the next release).
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
// Ordering permutation via Rcpp sugar: sort a copy of the input, then find
// each sorted value's position in the original with match(). For tied values
// match() always reports the first occurrence, hence the warning.
IntegerVector order_(NumericVector x) {
    bool has_ties = is_true(any(duplicated(x)));
    if (has_ties) {
        Rf_warning("There are duplicates in 'x'; order not guaranteed to match that of R's base::order");
    }
    NumericVector ascending = clone(x).sort();
    return match(ascending, x);
}
/*** R
library(microbenchmark)
x <- runif(1E5)
identical( order(x), order_(x) )
microbenchmark(
order(x),
order_(x)
)
*/
gives me
> Rcpp::sourceCpp('~/test-order.cpp')
> set.seed(456)
> library(microbenchmark)
> x <- runif(1E5)
> identical( order(x), order_(x) )
[1] TRUE
> microbenchmark(
+ order(x),
+ order_(x)
+ )
Unit: milliseconds
expr min lq median uq max neval
order(x) 15.48007 15.69709 15.86823 16.21142 17.22293 100
order_(x) 10.81169 11.07167 11.40678 11.87135 48.66372 100
>
Of course, if you're comfortable with the output not matching R, you can remove the duplicated check -- x[order_(x)] will still be properly sorted; more specifically, all(x[order(x)] == x[order_(x)]) should return TRUE.

Another solution based on the C++11:
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
using namespace Rcpp;
// Generic ordering permutation (1-based) over INTSXP/REALSXP/STRSXP vectors.
// desc = true sorts descending; stable_sort keeps tied elements in input order.
template <int RTYPE>
IntegerVector order_impl(const Vector<RTYPE>& x, bool desc) {
auto n = x.size();
// indices are kept 1-based so they can be returned to R unchanged;
// the comparators subtract 1 when indexing back into x
IntegerVector idx = no_init(n);
std::iota(idx.begin(), idx.end(), static_cast<size_t>(1));
if (desc) {
auto comparator = [&x](size_t a, size_t b){ return x[a - 1] > x[b - 1]; };
std::stable_sort(idx.begin(), idx.end(), comparator);
// NOTE(review): no NA handling in this branch -- NA placement for desc=true
// may differ from base::order(decreasing = TRUE); confirm if it matters.
} else {
auto comparator = [&x](size_t a, size_t b){ return x[a - 1] < x[b - 1]; };
std::stable_sort(idx.begin(), idx.end(), comparator);
// simulate na.last
// assumes NAs end up at the FRONT after the stable sort, counts them,
// then rotates them to the back -- TODO confirm this holds for all three
// supported element types
size_t nas = 0;
for (size_t i = 0; i < n; ++i, ++nas)
if (!Vector<RTYPE>::is_na(x[idx[i] - 1])) break;
std::rotate(idx.begin(), idx.begin() + nas, idx.end());
}
return idx;
}
// [[Rcpp::export]]
// Type-dispatching front end: routes integer, double and character vectors
// to the templated order_impl; any other SEXP type is an error.
IntegerVector order2(SEXP x, bool desc = false) {
switch(TYPEOF(x)) {
case INTSXP: return order_impl<INTSXP>(x, desc);
case REALSXP: return order_impl<REALSXP>(x, desc);
case STRSXP: return order_impl<STRSXP>(x, desc);
default: stop("Unsupported type.");
}
}
/***R
int <- sample.int(1000, 1E5, replace = TRUE)
dbl <- runif(1E5)
chr <- sample(letters, 1E5, replace = TRUE)
library(benchr)
benchmark(order(int), order2(int))
benchmark(order(dbl), order2(dbl))
benchmark(order(chr), order2(chr))
*/
Compare performance:
R> int <- sample.int(1000, 1E5, replace = TRUE)
R> dbl <- runif(1E5)
R> chr <- sample(letters, 1E5, replace = TRUE)
R> library(benchr)
R> benchmark(order(int), order2(int))
Benchmark summary:
Time units : microseconds
expr n.eval min lw.qu median mean up.qu max total relative
order(int) 100 442 452 464 482 486 1530 48200 1.0
order2(int) 100 5150 5170 5220 5260 5270 6490 526000 11.2
R> benchmark(order(dbl), order2(dbl))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
order(dbl) 100 13.90 14.00 14.20 14.80 15.8 17.4 1480 1.98
order2(dbl) 100 7.11 7.13 7.15 7.26 7.3 8.8 726 1.00
R> benchmark(order(chr), order2(chr))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
order(chr) 100 128.0 131.0 133.0 133.0 134.0 148.0 13300 7.34
order2(chr) 100 17.7 17.9 18.1 18.2 18.3 22.2 1820 1.00
Note that the radix method from base R's order is much faster.

Here is another approach using std::sort.
// Pairs a vector element's original position with its value.
typedef std::pair<int, double> paired;

// Strict weak ordering on the stored value.
// FIX: templated so that call sites spelling an explicit type argument
// (std::sort(..., cmp_second<paired>) below) actually compile -- the original
// non-template function cannot be named as cmp_second<paired>.
template <typename T>
bool cmp_second(const T & left, const T & right) {
    return left.second < right.second;
}
// Ordering permutation via std::sort on (position, value) pairs.
//
// FIXES vs. the original:
//  * the result is now 1-based, matching base::order -- the original stored
//    0-based positions, so x[order(x)] in R dropped an element and misindexed;
//  * the comparator is a local lambda; the original passed cmp_second<paired>,
//    which does not compile because cmp_second was not declared as a template.
Rcpp::IntegerVector order(const Rcpp::NumericVector & x) {
    const size_t n = x.size();
    std::vector<paired> pairs;
    pairs.reserve(n);                       // single allocation up front
    for (size_t i = 0; i < n; i++)
        pairs.push_back(std::make_pair(static_cast<int>(i) + 1, x(i)));
    std::sort(pairs.begin(), pairs.end(),
              [](const paired & a, const paired & b) { return a.second < b.second; });
    Rcpp::IntegerVector result = Rcpp::no_init(n);
    for (size_t i = 0; i < n; i++)
        result(i) = pairs[i].first;
    return result;
}

Related

R: Efficient iterative subsetting and filtering of large vector

I'd like to perform the following operation more quickly.
Logic: I have a vector big of 4 elements 1, 2, 3, 4. I also have a same-length vector of thresholds 1.1, 3.1, 4.1, 5.1. I want for each element to find the index of the first next element to be above the corresponding threshold. In this case my expected output is
2, 3, NA, NA:
the first element after the first one (included) which is above the threshold of 1.1 is at index 2 (value of 2).
The first element above the second threshold of 3.1 is of value 4, and is the third element after the current one at index 2 (included).
Base implementation
start <- Sys.time()
bigg <- rnorm(25000)
thresh <- bigg+0.5
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
end <- Sys.time()
end-start
head(result)
Basically, taking the first element of the vector x after the current one that satisfies a threshold condition.
I tried using Rcpp
// [[Rcpp::export]]
// Returns the 0-based index of the first element of 'x' beyond 'thresh' --
// strictly above it when is_up, strictly below it otherwise.
// Returns -1 when no element qualifies.
int cppnextup_(NumericVector x, double thresh, bool is_up = true) {
    const int n = x.size();
    for (int pos = 0; pos < n; ++pos) {
        const bool beyond = is_up ? (x[pos] > thresh) : (x[pos] < thresh);
        if (beyond) {
            return pos;
        }
    }
    return -1;
}
Benchmarking:
# base --------------------------------------------------------------------
# Benchmark wrapper for the pure-R scan.
# NOTE(review): 'result' is a global; the assignment below modifies a
# function-local copy (R's copy-on-modify), so computed values are discarded.
# Fine for timing only; wrong if the output is needed.
base_ <- function() {
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
}
# cpp ----------------------------------------------------------------
result_cpp <- rep(NA, length(bigg))
# Benchmark wrapper calling the scalar C++ scan per position.
# NOTE(review): like base_, this writes into a function-local copy of
# 'result_cpp', so the results are discarded -- timing only.
cpp_ <- function() {
for(i in 1:length(bigg)) {
result_cpp[i] <- cppnextup_(bigg[(i+1):length(bigg)], thresh[i]) # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
}
#result_cpp <- ifelse(result_cpp==-1, NA, result_cpp)
#result_cpp <- result_cpp+1
#all.equal(result, result_cpp)
#[1] TRUE
# benchmark ---------------------------------------------------------------
microbenchmark::microbenchmark(base_(),
cpp_(), times=3)
Unit: milliseconds
expr min lq mean median uq max neval
base_() 2023.510 2030.3154 2078.7867 2037.1211 2106.4252 2175.7293 3
cpp_() 661.277 665.3456 718.8851 669.4141 747.6891 825.9641 3
My Rcpp implementation reduces base time by 65%, is there a better (vectorized) way? Looking for any backend, be it Rcpp, data.table, dtplyr etc.
My dtplyr attempt yields all NA's:
library(dtplyr)
nx <- length(bigg)
df <- tibble(bigg, thresh)
bigg %>% lazy_dt() %>% mutate(res = which(bigg[row_number():nx]>thresh)[1])
Warning message:
In seq_len(.N):..nx :
numerical expression has 25000 elements: only the first used
Cheers
Btw, my real vector has 8,406,600 elements.
EDIT: vectorized Rcpp
I also have another, faster Rcpp function which relies on the first one:
// [[Rcpp::export]]
// For each position idx, stores the 1-based offset of the first element at or
// after idx that crosses x[idx] +/- threshup, delegating the scan to cppnextup_.
// NOTE(review): 'thisvect' below copies the whole remaining tail of x on every
// iteration, making this O(n^2) -- presumably why the 1e6-element call appears
// to freeze (it is slow, not stuck).
// NOTE(review): 'is_up = is_up' in the cppnextup_ call is NOT a named argument
// (C++ has none); it assigns the variable to itself and passes positionally.
// NOTE(review): cppnextup_ returns -1 for "not found", but the check below only
// special-cases 0, so "not found" stores -1+1 == 0 -- the same value stored
// when the match is at offset 0. Ambiguous; confirm intended semantics.
// NOTE(review): the scanned range starts AT idx (not idx+1), unlike the base-R
// version which scans from i+1 -- verify equivalence for the is_up=false case.
NumericVector cppnextup(NumericVector x, double threshup, bool is_up = true) {
int n = x.size();
NumericVector up(n);
if(is_up == true) {
up = x + threshup;
} else {
up = x - threshup;
}
// Rcout << "The value of up : " << up[0] <<" "<< up[1] <<"\n";
NumericVector result(n);
int idx = 0;
for(int i = 0; i < n; ++i) {
double thisup = up[idx];
NumericVector thisvect = x[Rcpp::Range((idx), (n-1))];
//Rcout <<idx<< " " << "thisvect : " << thisvect[0] <<" thisup: "<< thisup <<" buy " << buy << "\n";
int resi = cppnextup_(thisvect, thisup, is_up = is_up);
if(resi != 0) {
result[idx] = resi+1;
} else {
result[idx] = resi;
}
//Rcout << "RESI: " << resi <<" "<< up[1] <<"\n";
idx = idx + 1;
}
return result;
}
As you can see it is faster than the previous two:
# cpp_vectorized ----------------------------------------------------------
# Single call into the vectorized C++ routine with a fixed 0.5 offset.
cpp_vect <- function(bigg) {
res_cppvect <- cppnextup(bigg, 0.5)
}
# benchmark ---------------------------------------------------------------
microbenchmark::microbenchmark(base_(),
cpp_(),
cpp_vect(),
times=3)
expr min lq mean median uq max neval
base_() 2014.7211 2016.8679 2068.9869 2019.0146 2096.1198 2173.2250 3
cpp_() 663.0874 666.1540 718.5863 669.2207 746.3357 823.4507 3
cpp_vect() 214.1745 221.2103 223.9532 228.2460 228.8426 229.4392 3
BUT when I pass a larger vector in argument, it freezes and never returns a result.
res <- cpp_vect(bigg=rnorm(1000000)) # freezes
Any help welcome.
A data.table non-equi join with mult = "first" works well. It won't be as fast as an optimized Rcpp function, though.
library(data.table)
bigg <- rnorm(25000)
thresh <- bigg+0.5
# Baseline: for each i, linear scan of the tail for the first value above
# thresh[i]; returns offsets relative to i (NA when none found).
# NOTE(review): at i == length(bigg) the subset (i+1):length(bigg) counts DOWN,
# so the last element is compared against its own threshold -- confirm this
# edge case is intended.
f1 <- function(bigg, thresh) {
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
}
result
}
# data.table non-equi self-join: for each threshold row (prefix i.), find the
# FIRST later row (r > r) whose value exceeds it (val > val) via mult="first",
# and return the distance x.r - i.r (NA when no such row exists).
f2 <- function(bigg, thresh) {
data.table(
val = bigg,
r = seq_along(bigg)
)[
data.table(
val = thresh,
r = seq_along(thresh)
),
on = .(val > val, r > r),
.(result = x.r - i.r),
mult = "first"
]$result
}
microbenchmark::microbenchmark(f1 = f1(bigg, thresh),
f2 = f2(bigg, thresh),
times = 10,
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 2167.139 2199.801 2217.6945 2222.4937 2233.254 2250.1693 10
#> f2 605.999 610.576 612.0431 611.1439 614.195 618.6248 10
bigg <- rnorm(1e6)
thresh <- bigg+0.5
system.time(f2(bigg, thresh))
#> user system elapsed
#> 375.71 0.15 375.81
Although this Rcpp code isn't optimized, it performs quite well on a 1e7 vector (less than 1 second, probably due to the normal distribution of test data):
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
// For every position i, the distance (j - i) to the first LATER element that
// crosses thresh[i] -- strictly above it when is_up, strictly below it
// otherwise. Positions with no qualifying later element stay NA.
IntegerVector cppnextup_(NumericVector x, NumericVector thresh, bool is_up = true) {
    const int n = x.size();
    IntegerVector res(n, NA_INTEGER);      // default answer: not found
    for (int i = 0; i < n; i++) {
        for (int j = i + 1; j < n; j++) {
            const bool crossed = is_up ? (x[j] > thresh[i]) : (x[j] < thresh[i]);
            if (crossed) {
                res[i] = j - i;            // store the offset, then move on
                break;
            }
        }
    }
    return res;
}
Speed comparison on a 1e5 vector (a longer vector would take too much time for comparison):
bigg <- rnorm(1e5)
thresh <- bigg+0.5
# Baseline scan, identical to the f1 defined with the data.table comparison:
# offset of the first later value above thresh[i], NA when none.
# NOTE(review): at i == length(bigg) the subset (i+1):length(bigg) counts DOWN,
# comparing the last element against its own threshold -- confirm intended.
f1 <- function(bigg, thresh) {
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
}
result
}
# Thin wrapper so the benchmark rows compare like-for-like.
f_cpp <- function(bigg, thresh){
cppnextup_(bigg, thresh)
}
microbenchmark::microbenchmark(f1 = f1(bigg, thresh),
f_cpp = f_cpp(bigg, thresh),
times = 1,
check="identical")
Unit: milliseconds
expr min lq mean median uq max neval
f1 59614.42 59614.42 59614.42 59614.42 59614.42 59614.42 1
f_cpp 5.56 5.56 5.56 5.56 5.56 5.56 1
In a worst-case scenario where the threshold is reached only in the last values of the vector, the data.table approach shows its effectiveness:
bigg <- rep(0,1e5)
thresh <- bigg+0.5
bigg[(1e5-2):1e5] <- 1
microbenchmark::microbenchmark( f1 = f1(bigg, thresh),
f2 = f2(bigg, thresh),
f_cpp = f_cpp(bigg, thresh),
times = 1)
Unit: milliseconds
expr min lq mean median uq max neval
f1 48546.2250 48546.2250 48546.2250 48546.2250 48546.2250 48546.2250 1
f2 40.0642 40.0642 40.0642 40.0642 40.0642 40.0642 1
f_cpp 4521.9461 4521.9461 4521.9461 4521.9461 4521.9461 4521.9461 1

Faster alternative to file.exists()

I maintain an R package that needs to check the existence of lots of little files individually. Repeated calls to file.exists() produce noticeable slowness (benchmarking results here). Unfortunately, situational constraints prevent me from calling file.exists() once on the entire batch of files in vectorized fashion, which I believe would be a lot faster. Is there a faster way to check for the existence of a single file? Maybe in C? This way does not seem to be any faster on my system (the same one that produced these benchmarks):
library(inline)
library(microbenchmark)
# C body for the inline wrapper: the file "exists" if fopen() succeeds.
body <- "
FILE *fp = fopen(CHAR(STRING_ELT(r_path, 0)), \"r\");
SEXP result = PROTECT(allocVector(INTSXP, 1));
INTEGER(result)[0] = fp == NULL? 0 : 1;
if (fp != NULL) fclose(fp); /* FIX: the original leaked one open file handle per successful call */
UNPROTECT(1);
return result;
"
file_exists_c <- cfunction(sig = signature(r_path = "character"), body = body)
tmp <- tempfile()
microbenchmark(
c = file_exists_c(tmp),
r = file.exists(tmp)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> c 4.912 5.0230 5.42443 5.0605 5.1240 25.264 100
#> r 3.972 4.0525 4.32615 4.1835 4.2675 11.750 100
file.create(tmp)
#> [1] TRUE
microbenchmark(
c = file_exists_c(tmp),
r = file.exists(tmp)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> c 16.212 16.6245 17.04727 16.7645 16.9860 32.207 100
#> r 6.242 6.4175 7.16057 7.2830 7.4605 26.781 100
Created on 2019-12-06 by the reprex package (v0.3.0)
Edit: access()
access() does appear to be faster, but not by very much.
library(inline)
library(microbenchmark)
# C body using access(path, 0): mode 0 (F_OK) tests existence only,
# so no stat buffer needs filling -- return 1 when access() reports 0.
body <- "
SEXP result = PROTECT(allocVector(INTSXP, 1));
INTEGER(result)[0] = access(CHAR(STRING_ELT(r_path, 0)), 0)? 0 : 1;
UNPROTECT(1);
return result;
"
# <unistd.h> declares access()
file_exists_c <- cfunction(
sig = signature(r_path = "character"),
body = body,
includes = "#include <unistd.h>"
)
tmp <- tempfile()
microbenchmark(
c = file_exists_c(tmp),
r = file.exists(tmp)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> c 1.033 1.048 1.21334 1.0745 1.0910 13.793 100
#> r 1.051 1.068 1.19280 1.0930 1.1175 10.048 100
file.create(tmp)
#> [1] TRUE
microbenchmark(
c = file_exists_c(tmp),
r = file.exists(tmp)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> c 1.073 1.0910 1.33543 1.1285 1.1500 16.676 100
#> r 1.172 1.1965 1.32934 1.2335 1.2695 9.916 100
Created on 2019-12-07 by the reprex package (v0.3.0)
Here is the entirety of file.exists source code (as of this writing):
https://github.com/wch/r-source/blob/bfe73ecd848198cb9b68427cec7e70c40f96bd72/src/main/platform.c#L1375-L1404
/* R's implementation of file.exists(), quoted from src/main/platform.c:
 * vectorised over 'file', one existence check per element; NA names map
 * to FALSE. */
SEXP attribute_hidden do_fileexists(SEXP call, SEXP op, SEXP args, SEXP rho)
{
SEXP file, ans;
int i, nfile;
checkArity(op, args);
if (!isString(file = CAR(args)))
error(_("invalid '%s' argument"), "file");
nfile = LENGTH(file);
ans = PROTECT(allocVector(LGLSXP, nfile));
for (i = 0; i < nfile; i++) {
LOGICAL(ans)[i] = 0;
if (STRING_ELT(file, i) != NA_STRING) {
#ifdef Win32
/* Package XML sends arbitrarily long strings to file.exists! */
size_t len = strlen(CHAR(STRING_ELT(file, i)));
if (len > MAX_PATH)
LOGICAL(ans)[i] = FALSE;
else
LOGICAL(ans)[i] =
R_WFileExists(filenameToWchar(STRING_ELT(file, i), TRUE));
#else
// returns NULL if not translatable
const char *p = translateCharFP2(STRING_ELT(file, i));
LOGICAL(ans)[i] = p && R_FileExists(p);
#endif
} else LOGICAL(ans)[i] = FALSE;
}
UNPROTECT(1); /* ans */
return ans;
}
As for R_FileExists, it's here:
https://github.com/wch/r-source/blob/bfe73ecd848198cb9b68427cec7e70c40f96bd72/src/main/sysutils.c#L60-L79
#ifdef Win32
Rboolean R_FileExists(const char *path)
{
struct _stati64 sb;
return _stati64(R_ExpandFileName(path), &sb) == 0;
}
#else
Rboolean R_FileExists(const char *path)
{
struct stat sb;
return stat(R_ExpandFileName(path), &sb) == 0;
}
(R_ExpandFileName is just doing path.expand). It's relying on the stat system utility:
https://en.wikipedia.org/wiki/Stat_(system_call)
https://pubs.opengroup.org/onlinepubs/007908799/xsh/sysstat.h.html
It's built for vectorized inputs, so as mentioned it's much preferable to do file.exists(vector_of_files) than to repeatedly run file.exists(single_file).
From what I can tell (admittedly I'm no expert on the system utilities here), any efficiency gains come at the cost of robustness.
A simple solution in C would be to use access( name of file , 0); if the function returns 0 then the file exists. The second parameter 0 specifies check only if it exists.
Example: I check for the file test.txt in /test directory
#include "io.h"
#include "stdio.h"
/* access() returns 0 when the check succeeds; mode 0 (F_OK) tests
 * existence only, so !access(...) means "file exists". */
int main()
{
if(!access("/test/test.txt",0)) printf("file exists");
}

How can I do logical operations on Rcpp::NumericMatrix using a "sugar" manner?

I have to compare a matrix entry-wisely with a number, so I try to define a Cxx function such as
# NOTE(review): this is the failing attempt from the question -- Rcpp sugar
# does not define a NumericMatrix >= double comparison, so cppFunction()
# throws at compile time.
src <- '
LogicalMatrix f(NumericMatrix fmat, double t){
LogicalMatrix result = fmat >= t;
return result;
}
'
cppFunction(src)
But some exceptions are thrown out. What is the reason? So how can I do it in a tidy way?
The answer by #duckmayr is really spot-on, and shows an important detail: we may as well hide implementation detail behind a function because after all that is all Rcpp Sugar et al do for us anyway.
But we can rely on Sugar operation as desired by #zengchao if we first convert the matrix to a vector, operate on that vector and then restore a matrix. That works because internally a matrix is just a vector with added dimensions (of order two; arrays generalise to more than two).
But it turns out ... that that version is (marginally) more expensive than just looping (and marginally cheaper than working on columns). See below for complete details but a function f3() could be:
// [[Rcpp::export]]
// Sugar on a matrix via its vector guts: an R matrix is a vector carrying a
// 'dim' attribute, so compare it as a NumericVector and rebuild a
// LogicalMatrix with the original dimensions.
LogicalMatrix f3(NumericMatrix fmat, double t) {
IntegerVector dims = fmat.attr("dim");
NumericVector v(fmat);
LogicalVector lv = v >= t;
return LogicalMatrix(dims[0], dims[1], lv.begin());
}
But the non-obvious element-wise f2() remains the fastest:
R> microbenchmark(f(mat, 1.0), f2(mat, 1.0), f3(mat, 1.0), times = 5e4)
Unit: nanoseconds
expr min lq mean median uq max neval
f(mat, 1) 873 992 1322.10 1042 1118.0 1493236 50000
f2(mat, 1) 823 925 1195.49 975 1049.5 2068214 50000
f3(mat, 1) 864 977 1288.68 1031 1114.0 1909361 50000
R>
Moral: The simple looping solution does the least copying of temporary objects and is fastest. Overall, the speed difference between all three hardly matters.
And for larger matrices the advantage of not copying temporaries gets more important:
R> mat <- matrix(sqrt(1:1000), 1000)
R> microbenchmark(f(mat, 1.0), f2(mat, 1.0), f3(mat, 1.0), times = 1e3)
Unit: microseconds
expr min lq mean median uq max neval
f(mat, 1) 3.720 3.895 3.99972 3.9555 4.0425 16.758 1000
f2(mat, 1) 1.999 2.122 2.23261 2.1760 2.2545 17.325 1000
f3(mat, 1) 3.921 4.156 4.31034 4.2220 4.3270 19.982 1000
R>
Full code below.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
// Element-wise fmat >= t, computed one whole column at a time with sugar;
// result(_, j) assigns an entire column, so a single explicit loop suffices.
LogicalMatrix f(NumericMatrix fmat, double t){
    const int rows = fmat.nrow();
    const int cols = fmat.ncol();
    LogicalMatrix out(rows, cols);
    for (int col = 0; col < cols; ++col) {
        out(_, col) = fmat(_, col) >= t;
    }
    return out;
}
// [[Rcpp::export]]
// Element-wise fmat >= t with a fully explicit double loop; columns form the
// outer loop so the inner loop walks down a column, matching R's
// column-major storage.
LogicalMatrix f2(NumericMatrix fmat, double t){
    const int rows = fmat.nrow();
    const int cols = fmat.ncol();
    LogicalMatrix out(rows, cols);
    for (int col = 0; col < cols; ++col) {
        for (int row = 0; row < rows; ++row) {
            out(row, col) = fmat(row, col) >= t;
        }
    }
    return out;
}
// [[Rcpp::export]]
// Vector-view variant: compare the matrix's underlying vector with sugar,
// then rebuild a LogicalMatrix with the saved dimensions.
LogicalMatrix f3(NumericMatrix fmat, double t) {
int dims[2] = { fmat.nrow(), fmat.ncol() };
NumericVector v(fmat);
LogicalVector lv = v >= t;
return LogicalMatrix(dims[0], dims[1], lv.begin());
}
/*** R
mat <- matrix(c(1,2,3,4), 2, 2)
library(microbenchmark)
microbenchmark(f(mat, 1.0), f2(mat, 1.0), f3(mat, 1.0), times = 1e5)
mat <- matrix(sqrt(1:1000), 1000)
microbenchmark(f(mat, 1.0), f2(mat, 1.0), f3(mat, 1.0), times = 1e3)
*/
Edit: And we can remove one more line relative to f3() but it makes little difference on run-time:
// [[Rcpp::export]]
// Like f3 but without the named NumericVector temporary: compare the
// matrix's underlying vector inline and rebuild with the 'dim' attribute.
LogicalMatrix f4(NumericMatrix fmat, double t) {
IntegerVector dims = fmat.attr("dim");
LogicalVector lv = NumericVector(fmat) >= t;
return LogicalMatrix(dims[0], dims[1], lv.begin());
}
I assume by "tidy way" you mean avoiding loops in favor of using syntactic sugar provided in Rcpp. Since sugar provides a comparator with one value for vectors but not for matrices (see here and here), I think the most "tidy way" you can do for now is to loop (only) over columns (or rows), i.e., without having to loop over columns and rows:
// [[Rcpp::export]]
// Element-wise fmat >= t using sugar one column at a time; result(_, j)
// assigns a whole column, so only one explicit loop is needed.
LogicalMatrix f(NumericMatrix fmat, double t){
int n = fmat.nrow(), m = fmat.ncol();
LogicalMatrix result(n, m);
for ( int j = 0; j < m; ++j ) {
result(_, j) = fmat(_, j) >= t;
}
return result;
}
> f(fmat, 1.0)
[,1] [,2]
[1,] TRUE FALSE
[2,] FALSE TRUE
> f(fmat, -1.0)
[,1] [,2]
[1,] TRUE TRUE
[2,] TRUE TRUE
> f(fmat, 2.0)
[,1] [,2]
[1,] FALSE FALSE
[2,] FALSE FALSE
However, I would suggest that avoiding the extra loop doesn't really buy you anything in terms of readability (and in fact may harm readability for some readers of your code); consider the function where you loop over rows and columns:
// [[Rcpp::export]]
// Fully explicit double loop over rows then columns. Note the outer loop is
// over rows while R matrices are column-major, so the inner accesses stride
// across columns rather than walking contiguous memory.
LogicalMatrix f2(NumericMatrix fmat, double t){
int n = fmat.nrow(), m = fmat.ncol();
LogicalMatrix result(n, m);
for ( int i = 0; i < n; ++i ) {
for ( int j = 0; j < m; ++j ) {
result(i, j) = fmat(i, j) >= t;
}
}
return result;
}
I don't really see how this is significantly harder to type, it seems to be essentially performance equivalent (the mean execution time is slightly lower though the median is slightly higher -- see benchmarks below), and at least for some readers I bet it would be more readily apparent precisely what you're doing.
That said, if skipping a loop helps you out, I think this is the best you can do for now.
library(microbenchmark)
> microbenchmark(loop = f(fmat, 1.0), nonloop = f2(fmat, 1.0), times = 1e4)
Unit: microseconds
expr min lq mean median uq max neval cld
loop 6.564 7.402 9.77116 7.612 8.031 9173.952 10000 a
nonloop 6.425 7.123 10.01659 7.333 7.682 4377.448 10000 a
> microbenchmark(nonloop = f2(fmat, 1.0), loop = f(fmat, 1.0), times = 1e4)
Unit: microseconds
expr min lq mean median uq max neval cld
nonloop 6.356 7.124 10.179950 7.333 7.544 4822.066 10000 a
loop 6.775 7.404 9.588326 7.613 7.892 4278.971 10000 a

Fast checking of missing values in Rcpp

This question is linked to NA values in Rcpp conditional.
I basically have some Rcpp code that loop over multiple (double) elements. And I need to check if there are missing values, for each element (and I can't use vectorization). Let's count the number of missing values in a vector, just as minimal reproducible example:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
// Count the missing values in 'x' by testing every element with R_IsNA.
int nb_na(const NumericVector& x) {
    int missing = 0;
    const int len = x.size();
    for (int i = 0; i != len; ++i) {
        if (R_IsNA(x[i])) ++missing;
    }
    return missing;
}
// [[Rcpp::export]]
// Benchmark baseline: count elements equal to 3, standing in for "compare
// against a plain numeric value" when timing the NA checks.
int nb_na3(const NumericVector& x) {
    int hits = 0;
    const int len = x.size();
    for (int i = 0; i != len; ++i) {
        if (x[i] == 3) ++hits;
    }
    return hits;
}
// [[Rcpp::export]]
// Demonstrates the pitfall from the linked question: comparing against
// NA_REAL with == yields NA for every element (as the example output shows),
// so this does NOT detect missing values.
LogicalVector na_real(NumericVector x) {
return x == NA_REAL;
}
Then, in R, we get:
> x <- rep(c(1, 2, NA), 1e4)
> x2 <- replace(x, is.na(x), 3)
> microbenchmark::microbenchmark(
+ nb_na(x),
+ nb_na3(x2)
+ )
Unit: microseconds
expr min lq mean median uq max neval
nb_na(x) 135.633 135.982 153.08586 139.753 140.3115 1294.928 100
nb_na3(x2) 22.490 22.908 30.14005 23.188 23.5025 684.026 100
> all.equal(nb_na(x), nb_na3(x2))
[1] TRUE
> na_real(x[1:3])
[1] NA NA NA
As noted in the linked question, you can't just check x[i] == NA_REAL because it always returns a missing value. Yet, using R_IsNA(x[i]) is much slower than checking equality with a numeric value (e.g. 3).
Basically, I want a solution where I can check that a single value is a missing value. This solution should be as fast as checking equality with a numeric value.
Checking for missing value or any NaN specific variant is always going to be more expensive than checking for a specific value. That's just floating point arithmetic.
However there's still room for improvement in your code. I would encourage you to use NumericVector::is_na instead of R_IsNA but this is mostly cosmetic.
Then branching can be expensive, i.e. I'd replace if (R_IsNA(x[i])) c++; by c += NumericVector::is_na(x[i]). This gives this version:
// [[Rcpp::export]]
// Branch-free NA count: is_na yields 0/1, which is accumulated directly
// instead of being tested in a conditional.
int nb_na4(const NumericVector& x) {
    int total = 0;
    const int len = x.size();
    for (int i = 0; i != len; ++i) {
        total += NumericVector::is_na(x[i]);
    }
    return total;
}
Then iterating on an int and accessing x[i] can be replaced by using the std::count_if algorithm. This is its raison d'ĂȘtre. Leading to this version:
// [[Rcpp::export]]
// Same NA count expressed with std::count_if over the vector's iterators.
int nb_na5(const NumericVector& x) {
return std::count_if(x.begin(), x.end(), NumericVector::is_na ) ;
}
Now if the performance is still not good enough, you might want to try parallelization, for this I typically use the tbb library from the RcppParallel package.
// [[Rcpp::export]]
// Parallel NA count: TBB splits the element range into chunks, counts NAs
// per chunk with std::count_if, and sums the partial counts in the final
// reduction lambda.
int nb_na6(const NumericVector& x) {
return tbb::parallel_reduce(
tbb::blocked_range<const double*>(x.begin(), x.end()),
0,
[](const tbb::blocked_range<const double*>& r, int init) -> int {
return init + std::count_if( r.begin(), r.end(), NumericVector::is_na );
},
[]( int x, int y){ return x+y; }
) ;
}
Benchmarking with this function:
library(microbenchmark)
# Benchmark the NA-counting variants on a vector of 3n elements, one third NA.
bench <- function(n){
x <- rep(c(1, 2, NA), n)
microbenchmark(
nb_na = nb_na(x),
nb_na4 = nb_na4(x),
nb_na5 = nb_na5(x),
nb_na6 = nb_na6(x)
)
}
bench(1e5)
On my machine I get:
> bench(1e4)
Unit: microseconds
expr min lq mean median uq max neval cld
nb_na 84.358 94.6500 107.41957 110.482 118.9580 137.393 100 d
nb_na4 59.984 69.4925 79.42195 82.442 85.9175 106.567 100 b
nb_na5 65.047 75.2625 85.17134 87.501 93.0315 116.993 100 c
nb_na6 39.205 51.0785 59.20582 54.457 68.9625 97.225 100 a
> bench(1e5)
Unit: microseconds
expr min lq mean median uq max neval cld
nb_na 730.416 732.2660 829.8440 797.4350 872.3335 1410.467 100 d
nb_na4 520.800 521.6215 598.8783 562.7200 657.1755 1059.991 100 b
nb_na5 578.527 579.3805 664.8795 626.5530 710.5925 1166.365 100 c
nb_na6 294.486 345.2050 368.6664 353.6945 372.6205 897.552 100 a
Another way is to totally circumvent floating point arithmetic and pretend the vector is a vector of long long, aka 64 bit integers and compare the values to the bit pattern of NA_REAL:
> devtools::install_github( "ThinkR-open/seven31" )
> seven31::reveal(NA, NaN, +Inf, -Inf )
0 11111111111 ( NaN ) 0000000000000000000000000000000000000000011110100010 : NA
0 11111111111 ( NaN ) 1000000000000000000000000000000000000000000000000000 : NaN
0 11111111111 ( NaN ) 0000000000000000000000000000000000000000000000000000 : +Inf
1 11111111111 ( NaN ) 0000000000000000000000000000000000000000000000000000 : -Inf
A serial solution using this hack:
// [[Rcpp::export]]
// Count NAs by comparing raw 64-bit patterns against NA_REAL's pattern,
// sidestepping floating-point NaN semantics. Assumes every NA in 'x' carries
// exactly the NA_REAL bit pattern (stated assumption of this approach).
int nb_na7( const NumericVector& x){
    const long long* p = reinterpret_cast<const long long*>(x.begin()) ;
    // FIX: read NA_REAL's bits with memcpy. The original dereferenced a
    // reinterpret_cast'ed double* as long long*, which violates the
    // strict-aliasing rule (undefined behaviour); memcpy is the defined way
    // to reinterpret object representations pre-C++20.
    long long na ;
    std::memcpy(&na, &NA_REAL, sizeof(na)) ;
    return std::count(p, p + x.size(), na ) ;
}
And then a parallel version:
// [[Rcpp::export]]
// Parallel variant of the bit-pattern NA count: TBB chunks the 64-bit view
// of the data, counts matches of NA_REAL's pattern per chunk, then sums.
int nb_na8( const NumericVector& x){
    const long long* p = reinterpret_cast<const long long*>(x.begin()) ;
    // FIX: obtain NA_REAL's bit pattern via memcpy instead of dereferencing a
    // reinterpret_cast'ed pointer, which violates strict aliasing (UB).
    long long na ;
    std::memcpy(&na, &NA_REAL, sizeof(na)) ;
    auto count_chunk = [=](const tbb::blocked_range<const long long*>& r, int init) -> int {
        return init + std::count( r.begin(), r.end(), na);
    } ;
    return tbb::parallel_reduce(
        tbb::blocked_range<const long long*>(p, p + x.size()),
        0,
        count_chunk,
        []( int x, int y){ return x+y; }
    ) ;
}
> bench(1e5)
Unit: microseconds
expr min lq mean median uq max neval cld
nb_na 730.346 762.5720 839.9479 857.5865 881.8635 1045.048 100 f
nb_na4 520.946 521.6850 589.0911 578.2825 653.4950 832.449 100 d
nb_na5 578.621 579.3245 640.9772 616.8645 701.8125 890.736 100 e
nb_na6 291.115 307.4300 340.1626 344.7955 360.7030 484.261 100 c
nb_na7 122.156 123.4990 141.1954 132.6385 149.7895 253.988 100 b
nb_na8 69.356 86.9980 109.6427 115.2865 126.2775 182.184 100 a
> bench(1e6)
Unit: microseconds
expr min lq mean median uq max neval cld
nb_na 7342.984 7956.3375 10261.583 9227.7450 10869.605 79757.09 100 d
nb_na4 5286.970 5721.9150 7659.009 6660.2390 9234.646 31141.47 100 c
nb_na5 5840.946 6272.7050 7307.055 6883.2430 8205.117 10420.48 100 c
nb_na6 2833.378 2895.7160 3891.745 3049.4160 4054.022 18242.26 100 b
nb_na7 1661.421 1791.1085 2708.992 1916.6055 2232.720 60827.63 100 ab
nb_na8 650.639 869.6685 1289.373 939.0045 1291.025 10223.29 100 a
This assumes there's only one bit pattern to represent NA.
Here's my entire file for reference:
#include <Rcpp.h>
#include <RcppParallel.h>
#include <cstring>
// [[Rcpp::depends(RcppParallel)]]
// [[Rcpp::plugins(cpp11)]]
using namespace Rcpp;
// [[Rcpp::export]]
// Count missing values by testing each element with R_IsNA.
int nb_na(const NumericVector& x) {
int n = x.size();
int c = 0;
for (int i = 0; i < n; i++) if (R_IsNA(x[i])) c++;
return c;
}
// [[Rcpp::export]]
// Branch-free NA count: is_na yields 0/1 and is accumulated directly.
int nb_na4(const NumericVector& x) {
int n = x.size();
int c = 0;
for (int i = 0; i < n; i++) c += NumericVector::is_na(x[i]) ;
return c;
}
// [[Rcpp::export]]
// Same NA count expressed with std::count_if over the vector's iterators.
int nb_na5(const NumericVector& x) {
return std::count_if(x.begin(), x.end(), NumericVector::is_na ) ;
}
// [[Rcpp::export]]
// Parallel NA count: TBB chunks the range, counts per chunk with
// std::count_if, and sums partial counts in the reduction lambda.
int nb_na6(const NumericVector& x) {
return tbb::parallel_reduce(
tbb::blocked_range<const double*>(x.begin(), x.end()),
0,
[](const tbb::blocked_range<const double*>& r, int init) -> int {
return init + std::count_if( r.begin(), r.end(), NumericVector::is_na );
},
[]( int x, int y){ return x+y; }
) ;
}
// [[Rcpp::export]]
int nb_na7( const NumericVector& x){
  // NA_real_ is one specific NaN bit pattern, so NAs can be counted with a
  // plain 64-bit integer equality scan (an ordinary double comparison with
  // NA/NaN is always false, so this trick is required).
  //
  // Fix: the original read NA_REAL through *reinterpret_cast<long long*>,
  // which is a strict-aliasing violation (undefined behaviour). Extract the
  // bit pattern with std::memcpy instead, which compilers optimize to a
  // single load.
  long long na;
  const double na_value = NA_REAL;
  std::memcpy(&na, &na_value, sizeof na);
  // NOTE(review): scanning the double buffer through a long long* is still
  // technically an aliasing violation; it works on mainstream compilers, but
  // a word-wise std::memcpy loop would be the fully conforming form.
  const long long* p = reinterpret_cast<const long long*>(x.begin()) ;
  return std::count(p, p + x.size(), na ) ;
}
// [[Rcpp::export]]
int nb_na8( const NumericVector& x){
  // Parallel variant of nb_na7: compare 64-bit words against the NA_real_
  // bit pattern, with TBB splitting the buffer into chunks and summing the
  // per-chunk std::count results.
  //
  // Fix: extract the NA bit pattern with std::memcpy instead of the
  // original *reinterpret_cast<long long*>(&NA_REAL), which is a
  // strict-aliasing violation (undefined behaviour).
  long long na;
  const double na_value = NA_REAL;
  std::memcpy(&na, &na_value, sizeof na);
  // NOTE(review): the long long* view of the double buffer is still an
  // aliasing grey area, kept because it is the point of this benchmark entry.
  const long long* p = reinterpret_cast<const long long*>(x.begin()) ;
  auto count_chunk = [=](const tbb::blocked_range<const long long*>& r, int init) -> int {
    return init + std::count( r.begin(), r.end(), na);
  } ;
  return tbb::parallel_reduce(
    tbb::blocked_range<const long long*>(p, p + x.size()),
    0,
    count_chunk,
    []( int x, int y){ return x+y; }
  ) ;
}
/*** R
# Benchmark driver: compares every NA-counting implementation above on a
# vector of length 3*n in which every third element is NA.
library(microbenchmark)
bench <- function(n){
x <- rep(c(1, 2, NA), n)  # one third of the elements are missing
microbenchmark(
nb_na = nb_na(x),
nb_na4 = nb_na4(x),
nb_na5 = nb_na5(x),
nb_na6 = nb_na6(x),
nb_na7 = nb_na7(x),
nb_na8 = nb_na8(x)
)
}
# Run at two sizes (300k and 3M elements) to see how the variants scale.
bench(1e5)
bench(1e6)
*/
Checking for (IEEE) missing floating-point values is an expensive operation and there is no way around it. This is unrelated to R.
This is one reason why we're excited about the upcoming ALTREP in R - there we can for instance keep track of whether a double/real vector contains missing values or not - if it doesn't, then we don't have to waste time looking for them. Although not updated to mention ALTREP, you can get the gist from https://github.com/HenrikBengtsson/Wishlist-for-R/issues/12

Rcpp version of tabulate is slower; where is this from, how to understand

In the process of creating some sampling functions for already aggregated data I found that table was rather slow on the size data I am working with. I tried two improvements, first an Rcpp function as follows
// [[Rcpp::export]]
IntegerVector getcts(NumericVector x, int m) {
  // Tabulate the 1-based values of x into m bins, ignoring anything that
  // falls outside [1, m] after the shift to a 0-based bin index.
  //
  // Fixes relative to the original:
  //  * x.length() was evaluated on every loop iteration; the size is now
  //    hoisted into a local (and typed R_xlen_t so long vectors work).
  //  * NA/NaN input previously flowed into a double->int conversion, which
  //    is undefined behaviour; such elements are now skipped explicitly.
  // NOTE(review): taking a NumericVector forces R to coerce an integer
  // input (e.g. from sample()) to double on each call, and every element
  // then needs a double->int conversion -- both contribute to this being
  // slower than base::tabulate(), which operates on integers directly.
  IntegerVector cts(m);
  const R_xlen_t n = x.length();  // hoisted: one call instead of n calls
  for (R_xlen_t i = 0; i < n; i++) {
    const double v = x[i];
    if (ISNAN(v))  // covers NA_real_ and NaN; conversion of NaN is UB
      continue;
    const int t = static_cast<int>(v - 1.0);  // same truncation as original
    if (0 <= t && t < m)
      cts[t]++;
  }
  return cts;
}
And then, while trying to understand why table was rather slow, I found that it is based on tabulate. Tabulate works well for me, and is faster than the Rcpp version. The code for tabulate is at:
https://github.com/wch/r-source/blob/545d365bd0485e5f0913a7d609c2c21d1f43145a/src/main/util.c#L2204
With the key line being:
for(R_xlen_t i = 0 ; i < n ; i++)
if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
Now the key parts of tabulate and my Rcpp version seem pretty close (I have not bothered dealing with NA).
Q1: why is my Rcpp version 3 times slower?
Q2: how can I find out where this time goes?
I would very much appreciate knowing where the time went, but even better would be a good way to profile the code. My C++ skills are only so so, but this seems simple enough that I should (cross my fingers) have been able to avoid any silly stuff that would triple my time.
My timing code:
max_x <- 100
xs <- sample(seq(max_x), size = 50000000, replace = TRUE)
system.time(getcts(xs, max_x))
system.time(tabulate(xs))
This gives 0.318 for getcts and 0.126 for tabulate.
Your function calls the length method in each loop iteration, and it seems the compiler doesn't cache the result. To fix this, store the size of the vector in a separate variable, or use a range-based loop. Also note that we don't really need an explicit missing-value check, because in C++ every ordered comparison involving a NaN returns false.
Let's compare performance:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
IntegerVector tabulate1(const IntegerVector& x, const unsigned max) {
  // Baseline variant: the loop condition re-evaluates x.size() on every
  // iteration. That per-iteration call is deliberate -- it is exactly what
  // this benchmark entry measures.
  IntegerVector counts(max);
  for (std::size_t pos = 0; pos < x.size(); pos++) {
    const int value = x[pos];
    // NA_INTEGER is INT_MIN, so it fails the value > 0 test.
    if (value > 0 && value <= max)
      ++counts[value - 1];
  }
  return counts;
}
// [[Rcpp::export]]
IntegerVector tabulate2(const IntegerVector& x, const unsigned max) {
  // Hoisted-length variant: x.size() is read exactly once, so the loop
  // condition is a plain integer comparison.
  IntegerVector counts(max);
  const std::size_t total = x.size();
  for (std::size_t pos = 0; pos < total; pos++) {
    const int value = x[pos];
    // NA_INTEGER is INT_MIN, so it fails the value > 0 test.
    if (value > 0 && value <= max)
      ++counts[value - 1];
  }
  return counts;
}
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate3(const IntegerVector& x, const unsigned max) {
  // Range-based-for variant; benchmarks on par with tabulate2 (the range
  // loop caches begin()/end() once, just like the hoisted-length loop).
  IntegerVector counts(max);
  for (const auto& value : x) {
    // NA_INTEGER is INT_MIN, so it fails the value > 0 test.
    if (value > 0 && value <= max)
      ++counts[value - 1];
  }
  return counts;
}
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate4(const IntegerVector& x, const unsigned max) {
  // Explicit-iterator variant: x.end() is re-evaluated in the loop
  // condition on every pass. That call is deliberately left in place --
  // it is what this benchmark entry measures.
  IntegerVector counts(max);
  for (auto cursor = x.begin(); cursor != x.end(); ++cursor) {
    const int value = *cursor;
    // NA_INTEGER is INT_MIN, so it fails the value > 0 test.
    if (value > 0 && value <= max)
      ++counts[value - 1];
  }
  return counts;
}
/***R
library(microbenchmark)
x <- sample(10, 1e5, rep = TRUE)
microbenchmark(
tabulate(x, 10), tabulate1(x, 10),
tabulate2(x, 10), tabulate3(x, 10), tabulate4(x, 10)
)
x[sample(10e5, 10e3)] <- NA
microbenchmark(
tabulate(x, 10), tabulate1(x, 10),
tabulate2(x, 10), tabulate3(x, 10), tabulate4(x, 10)
)
*/
tabulate1 is the original version.
Benchmark results:
Without NA:
Unit: microseconds
expr min lq mean median uq max neval
tabulate(x, 10) 143.557 146.8355 169.2820 156.1970 177.327 286.370 100
tabulate1(x, 10) 390.706 392.6045 437.7357 416.5655 443.065 748.767 100
tabulate2(x, 10) 108.149 111.4345 139.7579 118.2735 153.118 337.647 100
tabulate3(x, 10) 107.879 111.7305 138.2711 118.8650 139.598 300.023 100
tabulate4(x, 10) 391.003 393.4530 436.3063 420.1915 444.048 777.862 100
With NA:
Unit: microseconds
expr min lq mean median uq max neval
tabulate(x, 10) 943.555 1089.5200 1614.804 1333.806 2042.320 3986.836 100
tabulate1(x, 10) 4523.076 4787.3745 5258.490 4929.586 5624.098 7233.029 100
tabulate2(x, 10) 765.102 931.9935 1361.747 1113.550 1679.024 3436.356 100
tabulate3(x, 10) 773.358 914.4980 1350.164 1140.018 1642.354 3633.429 100
tabulate4(x, 10) 4241.025 4466.8735 4933.672 4717.016 5148.842 8603.838 100
The tabulate4 function, which uses an iterator, is also slower than tabulate. We can improve it:
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate4(const IntegerVector& x, const unsigned max) {
  // Hoisted-iterator variant: begin() and end() are fetched once before
  // the loop, so the termination test is a plain pointer comparison.
  IntegerVector counts(max);
  const auto first = x.begin();
  const auto last = x.end();
  for (auto cursor = first; cursor != last; ++cursor) {
    const int value = *cursor;
    // NA_INTEGER is INT_MIN, so it fails the value > 0 test.
    if (value > 0 && value <= max)
      ++counts[value - 1];
  }
  return counts;
}

Resources