Fastest way to position a double in an existing vector - r

Let's say I have a double like 3.5, and I would like to find out where it would sort into an existing sorted vector, say seq(1, 10); put differently, which index the new number would take in that vector. Of course it sits somewhere between 3 and 4, and hence between the third and fourth index, but what would be the fastest way to arrive at this result?

As mentioned in the comments, findInterval is fastest for this purpose. Even a very simple loop in C++ that does the same thing is a little slower on average.
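For the example from the question, findInterval() returns the index of the last element of the sorted vector that is <= the new value, so the insertion index is one more than that:
findInterval(3.5, seq(1, 10))
# [1] 3
findInterval(3.5, seq(1, 10)) + 1 # the index the new number would take
# [1] 4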
library(Rcpp)
cppFunction("int find_index(double x, NumericVector v) {
  int len = v.size();
  // Return the 1-based index of the first element >= x
  for(int i = 0; i < len; ++i) {
    if(x <= v[i]) return i + 1;
  }
  return NA_INTEGER; // x is larger than every element
}")
microbenchmark::microbenchmark(
  findInterval = findInterval(453993.5, 1:1000000),
  find_index = find_index(453993.5, 1:1000000)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> findInterval 1.9646 2.1739 2.996931 2.32375 2.4846 37.4218 100
#> find_index 2.2151 2.4502 11.319199 2.60925 2.9800 337.9229 100

Something like this?
First define dbl and my_seq, then concatenate both with c(dbl, my_seq) and wrap the result in sort(), and finally get the index with which(my_vec == dbl):
dbl <- 3.5
my_seq <- seq(1,10)
my_vec <- sort(c(dbl, my_seq))
index <- which(my_vec == dbl)
index
output:
[1] 4
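One caveat with this approach (not noted in the original answer): if dbl exactly equals a value already in the sequence, which() returns every matching position, while match() gives just the first:
my_vec2 <- sort(c(3, seq(1, 10)))
which(my_vec2 == 3)
# [1] 3 4
match(3, my_vec2)
# [1] 3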


Split string fixed width [duplicate]

I have an object containing a text string:
x <- "xxyyxyxy"
and I want to split that into a vector with each element containing two letters:
[1] "xx" "yy" "xy" "xy"
It seems like strsplit should be my ticket, but since I have no regular expression foo, I can't figure out how to make this function chop the string up into chunks the way I want it. How should I do this?
Using substring is the best approach:
substring(x, seq(1, nchar(x), 2), seq(2, nchar(x), 2))
But here's a solution with plyr:
library("plyr")
laply(seq(1, nchar(x), 2), function(i) substr(x, i, i+1))
Here is a fast solution that splits the string into single characters, then uses recycled logical indexing to paste together the odd-position and even-position elements.
x <- "xxyyxyxy"
sst <- strsplit(x, "")[[1]]
paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
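The recycled logical indexes pick out the odd- and even-position characters, respectively:
sst
# [1] "x" "x" "y" "y" "x" "y" "x" "y"
sst[c(TRUE, FALSE)] # odd positions
# [1] "x" "y" "x" "x"
sst[c(FALSE, TRUE)] # even positions
# [1] "x" "y" "y" "y"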
Benchmark Setup:
library(microbenchmark)
GSee <- function(x) {
  sst <- strsplit(x, "")[[1]]
  paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
}
Shane1 <- function(x) {
  substring(x, seq(1, nchar(x), 2), seq(2, nchar(x), 2))
}
library("plyr")
Shane2 <- function(x) {
  laply(seq(1, nchar(x), 2), function(i) substr(x, i, i + 1))
}
seth <- function(x) {
  strsplit(gsub("([[:alnum:]]{2})", "\\1 ", x), " ")[[1]]
}
geoffjentry <- function(x) {
  idx <- 1:nchar(x)
  odds <- idx[(idx %% 2) == 1]
  evens <- idx[(idx %% 2) == 0]
  substring(x, odds, evens)
}
drewconway <- function(x) {
  c <- strsplit(x, "")[[1]]
  sapply(seq(2, nchar(x), by = 2), function(y) paste(c[y - 1], c[y], sep = ""))
}
KenWilliams <- function(x) {
  n <- 2
  sapply(seq(1, nchar(x), by = n), function(xx) substr(x, xx, xx + n - 1))
}
RichardScriven <- function(x) {
  regmatches(x, gregexpr("(.{2})", x))[[1]]
}
Benchmark 1:
x <- "xxyyxyxy"
microbenchmark(
  GSee(x),
  Shane1(x),
  Shane2(x),
  seth(x),
  geoffjentry(x),
  drewconway(x),
  KenWilliams(x),
  RichardScriven(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# GSee(x) 8.032 12.7460 13.4800 14.1430 17.600 100
# Shane1(x) 74.520 80.0025 84.8210 88.1385 102.246 100
# Shane2(x) 1271.156 1288.7185 1316.6205 1358.5220 3839.300 100
# seth(x) 36.318 43.3710 45.3270 47.5960 67.536 100
# geoffjentry(x) 9.150 13.5500 15.3655 16.3080 41.066 100
# drewconway(x) 92.329 98.1255 102.2115 105.6335 115.027 100
# KenWilliams(x) 77.802 83.0395 87.4400 92.1540 163.705 100
# RichardScriven(x) 55.034 63.1360 65.7545 68.4785 108.043 100
Benchmark 2:
Now, with bigger data.
x <- paste(sample(c("xx", "yy", "xy"), 1e5, replace=TRUE), collapse="")
microbenchmark(
  GSee(x),
  Shane1(x),
  Shane2(x),
  seth(x),
  geoffjentry(x),
  drewconway(x),
  KenWilliams(x),
  RichardScriven(x),
  times = 3
)
# Unit: milliseconds
# expr min lq median uq max neval
# GSee(x) 29.029226 31.3162690 33.603312 35.7046155 37.805919 3
# Shane1(x) 11754.522290 11866.0042600 11977.486230 12065.3277955 12153.169361 3
# Shane2(x) 13246.723591 13279.2927180 13311.861845 13371.2202695 13430.578694 3
# seth(x) 86.668439 89.6322615 92.596084 92.8162885 93.036493 3
# geoffjentry(x) 11670.845728 11681.3830375 11691.920347 11965.3890110 12238.857675 3
# drewconway(x) 384.863713 438.7293075 492.594902 515.5538020 538.512702 3
# KenWilliams(x) 12213.514508 12277.5285215 12341.542535 12403.2315015 12464.920468 3
# RichardScriven(x) 11549.934241 11730.5723030 11911.210365 11989.4930080 12067.775651 3
How about
strsplit(gsub("([[:alnum:]]{2})", "\\1 ", x), " ")[[1]]
Basically, it adds a separator (here " ") after every pair of characters and then splits on it with strsplit.
strsplit alone is going to be problematic; look at a regexp like this:
strsplit(z, '[[:alnum:]]{2}')
It will split at the right points, but nothing is left, because the matched characters themselves are discarded.
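For example, the pieces between the matches are all empty:
z <- 'xxyyxyxy'
strsplit(z, '[[:alnum:]]{2}')[[1]]
# [1] "" "" "" ""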
You could use substring & friends
z <- 'xxyyxyxy'
idx <- 1:nchar(z)
odds <- idx[(idx %% 2) == 1]
evens <- idx[(idx %% 2) == 0]
substring(z, odds, evens)
Here's one way, but not using regexen:
a <- "xxyyxyxy"
n <- 2
sapply(seq(1,nchar(a),by=n), function(x) substr(a, x, x+n-1))
ATTENTION: with substring, if the string length is not a multiple of your requested chunk length, you will need a +(n-1) in the second sequence:
substring(x,seq(1,nchar(x),n),seq(n,nchar(x)+n-1,n))
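For example, with a 9-character string the adjusted end sequence keeps the trailing short chunk:
x <- "xxyyxyxyz"
n <- 2
substring(x, seq(1, nchar(x), n), seq(n, nchar(x) + n - 1, n))
# [1] "xx" "yy" "xy" "xy" "z"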
Total hack, JD, but it gets it done
x <- "xxyyxyxy"
c<-strsplit(x,"")[[1]]
sapply(seq(2,nchar(x),by=2),function(y) paste(c[y-1],c[y],sep=""))
[1] "xx" "yy" "xy" "xy"
A helper function:
fixed_split <- function(text, n) {
  strsplit(text, paste0("(?<=.{", n, "})"), perl = TRUE)
}
fixed_split(x, 2)
[[1]]
[1] "xx" "yy" "xy" "xy"
Using C++ one can be even faster. Comparing with GSee's version:
GSee <- function(x) {
  sst <- strsplit(x, "")[[1]]
  paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
}
rstub <- Rcpp::cppFunction(code = '
CharacterVector strsplit2(const std::string& hex) {
  unsigned int length = hex.length() / 2;
  CharacterVector res(length);
  for (unsigned int i = 0; i < length; ++i) {
    res(i) = hex.substr(2 * i, 2);
  }
  return res;
}')
x <- "xxyyxyxy"
all.equal(GSee(x), rstub(x))
#> [1] TRUE
microbenchmark::microbenchmark(GSee(x), rstub(x))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> GSee(x) 4.272 4.4575 41.74284 4.5855 4.7105 3702.289 100
#> rstub(x) 1.710 1.8990 139.40519 2.0665 2.1250 13722.075 100
set.seed(42)
x <- paste(sample(c("xx", "yy", "xy"), 1e5, replace = TRUE), collapse = "")
all.equal(GSee(x), rstub(x))
#> [1] TRUE
microbenchmark::microbenchmark(GSee(x), rstub(x))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> GSee(x) 17.931801 18.431504 19.282877 18.738836 19.47943 27.191390 100
#> rstub(x) 3.197587 3.261109 3.404973 3.341099 3.45852 4.872195 100
Well, I used the following pseudo-code to fulfill this task:
Insert a special separator sequence after each chunk of length n.
Split the string by said sequence.
In code, I did
chopS <- function(text, chunk_len = 2, seqn) {
  # Specify the select and replace patterns
  insert <- paste("(.{", chunk_len, "})", sep = "")
  replace <- paste("\\1", seqn, sep = "")
  # Insert the sequence after each chunk, then split by that sequence
  interp_text <- gsub(insert, replace, text)
  strsplit(interp_text, seqn)
}
This returns a list with the split vector inside, though, not a vector.
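To get a plain character vector instead, unlist() the result. Note that strsplit() treats seqn as a regular expression, so it should be a regex-safe separator that cannot occur in the text (a space here):
unlist(chopS("xxyyxyxy", 2, " "))
# [1] "xx" "yy" "xy" "xy"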
From my testing, the code below is faster than the methods benchmarked above. stri_sub is pretty fast, and seq.int is better than seq. It's also easy to change the chunk size by changing all the 2Ls to something else.
library(stringi)
split_line <- function(x) {
  row_length <- stri_length(x)
  stri_sub(x, seq.int(1L, row_length, 2L), seq.int(2L, row_length, 2L))
}
I didn't notice a difference when string chunks were 2 characters long, but for bigger chunks this is slightly better.
split_line <- function(x) {
  stri_sub(x, seq.int(1L, stri_length(x), 109L), length = 109L)
}
I set out looking for a vectorised solution to this, in order to avoid lapply()ing one of the single-string solutions across long vectors. Failing to find an existing solution, I somehow fell down a rabbit hole of painstakingly writing one in C. It ended up hilariously complicated compared to the many one-line R solutions shown here (no thanks to me deciding to also want to handle Unicode strings to match the R versions), but I thought I'd share the result, in case it somehow someday helps somebody. Here's what eventually became of that:
#define R_NO_REMAP
#include <R.h>
#include <Rinternals.h>

// Find the width (in bytes) of a UTF-8 character, given its first byte
size_t utf8charw(char b) {
  if (b == 0x00) return 0;
  if ((b & 0x80) == 0x00) return 1;
  if ((b & 0xe0) == 0xc0) return 2;
  if ((b & 0xf0) == 0xe0) return 3;
  if ((b & 0xf8) == 0xf0) return 4;
  return 1; // Really an invalid character, but move on
}

// Find the number of UTF-8 characters in a string
size_t utf8nchar(const char* str) {
  size_t nchar = 0;
  while (*str != '\0') {
    str += utf8charw(*str);
    nchar++;
  }
  return nchar;
}

SEXP C_str_chunk(SEXP x, SEXP size_) {
  // Allocate a list to store the result
  R_xlen_t n = Rf_xlength(x);
  SEXP result = PROTECT(Rf_allocVector(VECSXP, n));
  int size = Rf_asInteger(size_);
  for (R_xlen_t i = 0; i < n; i++) {
    const char* str = Rf_translateCharUTF8(STRING_ELT(x, i));
    // Figure out the number of chunks
    size_t nchar = utf8nchar(str);
    size_t nchnk = nchar / size + (nchar % size != 0);
    SEXP chunks = PROTECT(Rf_allocVector(STRSXP, nchnk));
    for (size_t j = 0, nbytes = 0; j < nchnk; j++, str += nbytes) {
      // Find the size of the next chunk in bytes
      nbytes = 0;
      for (int cp = 0; cp < size; cp++) {
        nbytes += utf8charw(str[nbytes]);
      }
      // Assign to the chunks vector as an R string
      SET_STRING_ELT(chunks, j, Rf_mkCharLenCE(str, nbytes, CE_UTF8));
    }
    SET_VECTOR_ELT(result, i, chunks);
  }
  // Clean up
  UNPROTECT(n);
  UNPROTECT(1);
  return result;
}
I then put this monstrosity into a file called str_chunk.c, and compiled with R CMD SHLIB str_chunk.c.
To try it out, we need some set-up on the R side:
str_chunk <- function(x, n) {
  .Call("C_str_chunk", x, as.integer(n))
}
# The (currently) accepted answer
str_chunk_one <- function(x, n) {
  substring(x, seq(1, nchar(x), n), seq(n, nchar(x), n))
}
dyn.load("str_chunk.dll") # on Linux/macOS, R CMD SHLIB produces str_chunk.so
So what we've achieved with the C version is to take a vector of inputs and return a list:
str_chunk(rep("0123456789AB", 2), 2)
#> [[1]]
#> [1] "01" "23" "45" "67" "89" "AB"
#>
#> [[2]]
#> [1] "01" "23" "45" "67" "89" "AB"
Now off we go with benchmarking. We start off strong with a 200x improvement for a long(ish) vector of short strings:
x <- rep("0123456789AB", 1000)
microbenchmark::microbenchmark(
  accepted = lapply(x, str_chunk_one, 2),
  str_chunk(x, 2)
) |> print(unit = "relative")
#> Unit: relative
#> expr min lq mean median uq max neval
#> accepted 229.5826 216.8246 182.5449 203.785 182.3662 25.88823 100
#> str_chunk(x, 2) 1.0000 1.0000 1.0000 1.000 1.0000 1.00000 100
… which then shrinks to a distinctly less impressive 3x improvement for large strings.
x <- rep(strrep("0123456789AB", 1000), 10)
microbenchmark::microbenchmark(
  accepted = lapply(x, str_chunk_one, 2),
  str_chunk(x, 2)
) |> print(unit = "relative")
#> Unit: relative
#> expr min lq mean median uq max neval
#> accepted 2.77981 2.802641 3.304573 2.787173 2.846268 13.62319 100
#> str_chunk(x, 2) 1.00000 1.000000 1.000000 1.000000 1.000000 1.00000 100
dyn.unload("str_chunk.dll")
So, was it worth it? Well, absolutely not, considering how long it took to actually get working properly. But if this were in a package, it would have saved quite a lot of time in my use case (short strings, long vectors).
Here is one option using stringi::stri_sub(). Try:
x <- "xxyyxyxy"
stringi::stri_sub(x, seq(1, stringi::stri_length(x), by = 2), length = 2)
# [1] "xx" "yy" "xy" "xy"

What is the fastest way to perform multiple logical comparisons in R?

What is the fastest way to perform multiple logical comparisons in R?
Consider for example the vector x
set.seed(14)
x = sample(LETTERS[1:4], size=10, replace=TRUE)
I want to test if each entry of x is either an "A" or a "B" (and not anything else). The following works:
x == "A" | x == "B"
[1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
The above code makes three passes over the whole vector (one for each comparison and one for the |). Is there a way in R to loop only once and test whether each item satisfies one condition or the other?
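(For reference, an option not covered in the answer below: base R's %in%, which is built on match(), effectively makes a single pass over x and is worth benchmarking too:)
x %in% c("A", "B")
# [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE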
If your objective is just to make a single pass, that is pretty straightforward to write in Rcpp, even if you don't have much experience with C++:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::LogicalVector single_pass(Rcpp::CharacterVector x, Rcpp::String a, Rcpp::String b) {
  R_xlen_t i = 0, n = x.size();
  Rcpp::LogicalVector result(n);
  for ( ; i < n; i++) {
    result[i] = (x[i] == a || x[i] == b);
  }
  return result;
}
For such a small object as the one used in your example, the slight overhead of .Call (presumably) masks the speed of the Rcpp version,
r_fun <- function(X) X == "A" | X == "B"
##
cpp_fun <- function(X) single_pass(X, "A", "B")
##
all.equal(r_fun(x), cpp_fun(x))
#[1] TRUE
microbenchmark::microbenchmark(
r_fun(x), cpp_fun(x), times = 1000L)
#Unit: microseconds
#expr min lq mean median uq max neval
#r_fun(x) 1.499 1.584 1.974156 1.6795 1.8535 37.903 1000
#cpp_fun(x) 1.860 2.334 3.042671 2.7450 3.1140 51.870 1000
But for larger vectors (I'm assuming this is your real intention), it is considerably faster:
x2 <- sample(LETTERS, 10E5, replace = TRUE)
##
all.equal(r_fun(x2), cpp_fun(x2))
# [1] TRUE
microbenchmark::microbenchmark(
r_fun(x2), cpp_fun(x2), times = 200L)
#Unit: milliseconds
#expr min lq mean median uq max neval
#r_fun(x2) 78.044518 79.344465 83.741901 80.999538 86.368627 149.5106 200
#cpp_fun(x2) 7.104929 7.201296 7.797983 7.605039 8.184628 10.7250 200
Here's a quick attempt at generalizing the above, if you have any use for it.
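(The generalized code itself isn't reproduced here; a minimal sketch of one way it might look — multi_pass is a hypothetical name, and the sketch relies on R caching its strings so that comparing the underlying CHARSXP pointers is enough — is:)
Rcpp::cppFunction('
Rcpp::LogicalVector multi_pass(Rcpp::CharacterVector x, Rcpp::CharacterVector targets) {
  R_xlen_t n = x.size(), m = targets.size();
  Rcpp::LogicalVector result(n);
  for (R_xlen_t i = 0; i < n; i++) {
    bool hit = false;
    // R interns its strings, so identical strings share storage and
    // pointer comparison suffices (non-ASCII strings in mixed encodings
    // would need a real character comparison)
    for (R_xlen_t j = 0; j < m && !hit; j++)
      hit = (STRING_ELT(x, i) == STRING_ELT(targets, j));
    result[i] = hit;
  }
  return result;
}')
multi_pass(x, c("A", "B"))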

Vectorization while using which() function in R

I have 3 vectors, and I want to apply the which() function to each of them separately.
I'm trying to find the largest index of the values that are less than some given number.
How can I perform this task using vectorization?
My 3 vectors (they may have various lengths):
vec1 <- c(1,2,3,4,5)
vec2 <- c(11,12,13)
vec3 <- c(1,2,3,4,5,6,7,8)
How can I vectorize the following?
max(which(vec1<3))
max(which(vec2<12.3))
max(which(vec3<5.7))
The expected result is:
2
2
5
One way to get a speedup would be to use Rcpp to search for elements smaller than your cutoff, starting from the right side of the vector and moving left. You can return as soon as you find the element that meets your criteria, which means that if your target is near the right side of the vector you might avoid looking at most of the vector's elements (meanwhile, which looks at all vector elements and max looks at all values returned by which). The speedup would be largest for long vectors where the target element is close to the end.
library(Rcpp)
rightmost.small <- cppFunction(
'double rightmostSmall(NumericVector x, const double cutoff) {
for (int i=x.size()-1; i >= 0; --i) {
if (x[i] < cutoff) return i+1; // 1-index
}
return 0; // None found
}')
rightmost.small(vec1, 3)
# [1] 2
rightmost.small(vec2, 12.3)
# [1] 2
rightmost.small(vec3, 5.7)
# [1] 5
Let's look at the performance for a vector where we expect this to give us a big speedup:
set.seed(144)
vec.large <- rnorm(1000000)
all.equal(max(which(vec.large < -1)), rightmost.small(vec.large, -1))
# [1] TRUE
library(microbenchmark)
microbenchmark(max(which(vec.large < -1)), rightmost.small(vec.large, -1))
# Unit: microseconds
# expr min lq mean median uq max neval
# max(which(vec.large < -1)) 4912.016 8097.290 12816.36406 9189.0685 9883.9775 60405.585 100
# rightmost.small(vec.large, -1) 1.643 2.476 8.54274 8.8915 12.8375 58.152 100
For this vector of length 1 million, we see a speedup of about 1000x using the Rcpp code.
This speedup should carry over directly to the case where you have many vectors stored in a list; you can use @JoshO'Brien's mapply code and observe a speedup when you switch from max(which(...)) to the Rcpp code:
f <- function(v,m) max(which(v < m))
l <- list(vec.large)[rep(1, 100)]
m <- rep(-1, 100)
microbenchmark(mapply(f, l, m), mapply(rightmost.small, l, m))
# Unit: microseconds
# expr min lq mean median uq max neval
# mapply(f, l, m) 865287.828 907893.8505 931448.1555 918637.343 935632.0505 1133909.950 100
# mapply(rightmost.small, l, m) 253.573 281.6855 344.5437 303.094 335.1675 712.897 100
We see a 3000x speedup by using the Rcpp code here.
For reference, here is the plain mapply() approach on the original small vectors:
l <- list(vec1, vec2, vec3)
m <- c(3, 12.3, 5.7)
f <- function(v,m) max(which(v < m))
mapply(f,l,m)
# [1] 2 2 5
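(An aside not in the original answers: because the example vectors are sorted, findInterval() with left.open = TRUE returns the count of elements strictly below each cutoff, which is the same result without any which()/max() pass:)
mapply(function(v, m) findInterval(m, v, left.open = TRUE), l, m)
# [1] 2 2 5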

Optimize r code

I want to optimize my r function for calculating gini mean difference:
gini.md <- function(x) {
  n <- length(x)
  nm <- n + 1
  x <- sort(x)
  return(2 / n^2 * sum((2 * (1:n) - nm) * x))
}
Do you have any idea how to make it faster? Generating sequences with seq was slow, and bitwShiftL((1:n), 1) is slower than 2 * (1:n). How is that possible?
Moreover, I found out that mean(x) is slower than sum(x)/length(x). Again, why? mean is an internal function; shouldn't it be faster?
Ignoring my own advice, I guessed that the most likely source of any speed problem is unnecessary creation of long vectors. The following C implementation avoids creating four vectors (1:n, 2 * (1:n), 2 * (1:n) - nm, and finally (2*(1:n)-nm)*x).
library(inline)
gini <- cfunction(signature(x = "REALSXP"), "
  double n = Rf_length(x), nm = n + 1, ans = 0;
  const double *xp = REAL(x);
  for (int i = 0; i < n; ++i)
    ans += (2 * (i + 1) - nm) * xp[i];
  return ScalarReal(2 * ans / (n * n));
")
But this doesn't seem to help much; I realized after the fact that evaluation time is dominated by sort().
> library(microbenchmark)
> x <- rnorm(100000)
> all.equal(gini.md(x), gini(sort(x)))
[1] TRUE
> microbenchmark(gini.md(x), gini(sort(x)), sort(x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
gini.md(x) 10.668591 10.98063 11.09274 11.03377 11.20588 11.62714 10
gini(sort(x)) 10.439458 10.64972 10.78242 10.70099 10.93015 11.36177 10
sort(x) 9.995886 10.18180 10.31508 10.27024 10.46160 10.66006 10
Maybe there's more speed to be had, but it will be similarly marginal.
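(An aside on the mean() question from the post, which the answer above doesn't address: mean() is slower than sum(x)/length(x) because mean() dispatches as an S3 generic and its internal C code makes a second pass over the data to improve numerical accuracy, whereas sum()/length() does a single pass plus one scalar division. A quick check:)
x <- rnorm(100000)
microbenchmark(mean(x), sum(x) / length(x))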

More Efficient Way To Do A Conditional Running Total In R

As this is my first time asking a question on SO, I apologize in advance for any improper formatting.
I am very new to R and am trying to create a function that will return the row value of a data frame column once a running total in another column has met or exceeded a given value (the row that the running sum begins in is also an argument).
For example, given the following data frame, with a starting parameter of x=3 and a stop parameter of y=17, the function should return 5 (the X value of the row where the running sum of Y meets or exceeds 17).
X Y
1 5
2 10
3 5
4 10
5 5
6 10
7 5
8 10
The function as I've currently written it returns the correct answer, but I have to believe there is a much more 'R-ish' way to accomplish this, instead of using loops and incrementing temporary variables, and would like to learn the right way, rather than form bad habits that I will have to correct later.
A very simplified version of the function:
myFunction <- function(DataFrame, StartRow, Total) {
  df <- DataFrame[DataFrame[[1]] >= StartRow, ]
  i <- 0
  j <- 0
  while (j < Total) {
    i <- i + 1
    j <- sum(df[[2]][1:i])
  }
  x <- df[[1]][i]
  return(x)
}
All the solutions posted so far compute the cumulative sum of the entire Y variable, which can be inefficient in cases where the data frame is really large but the index is near the beginning. In this case, a solution with Rcpp could be more efficient:
library(Rcpp)
get_min_cum2 = cppFunction("
int gmc2(NumericVector X, NumericVector Y, int start, int total) {
double running = 0.0;
for (int idx=0; idx < Y.size(); ++idx) {
if (X[idx] >= start) {
running += Y[idx];
if (running >= total) {
return X[idx];
}
}
}
return -1; // Running total never exceeds limit
}")
Comparison with microbenchmark:
get_min_cum <- function(start, total)
  with(dat[dat$X >= start, ], X[min(which(cumsum(Y) > total))])
get_min_dt <- function(start, total)
  dt[X >= start, X[cumsum(Y) >= total][1]]
library(data.table)
set.seed(144)
dat = data.frame(X=1:1000000, Y=abs(rnorm(1000000)))
dt = data.table(dat)
get_min_cum(3, 17)
# [1] 29
get_min_dt(3, 17)
# [1] 29
get_min_cum2(dat$X, dat$Y, 3, 17)
# [1] 29
library(microbenchmark)
microbenchmark(get_min_cum(3, 17), get_min_dt(3, 17),
get_min_cum2(dat$X, dat$Y, 3, 17))
# Unit: milliseconds
# expr min lq median uq max neval
# get_min_cum(3, 17) 125.324976 170.052885 180.72279 193.986953 418.9554 100
# get_min_dt(3, 17) 100.990098 149.593250 162.24523 176.661079 399.7531 100
# get_min_cum2(dat$X, dat$Y, 3, 17) 1.157059 1.646184 2.30323 4.628371 256.2487 100
In this case, it's about 100x faster to use the Rcpp solution than other approaches.
Try this, for example; I am using cumsum and vectorized logical subsetting:
get_min_cum <- function(start, total)
  with(dat[dat$X >= start, ], X[min(which(cumsum(Y) > total))])
get_min_cum(3, 17)
# [1] 5
Here you go (using data.table because of ease of syntax):
library(data.table)
dt = data.table(df)
dt[X >= 3, X[cumsum(Y) >= 17][1]]
#[1] 5
Well, here's one way:
i <- 3
j <- 17
min(df[i:nrow(df),]$X[cumsum(df$Y[i:nrow(df)])>j])
# [1] 5
This takes df$X for rows i:nrow(df) and indexes that based on cumsum(df$Y) > j, starting also at row i. This returns all df$X for which the cumsum > j. min(...) then returns the smallest value.
with(df, which(cumsum((X >= 3) * Y) >= 17)[1])
# [1] 5
Multiplying Y by the logical vector (X >= 3) zeroes out the rows before the start row, so the cumulative sum only accumulates from there; which(...)[1] then gives the first row where it reaches 17.
