Most efficient way for rolling sum [duplicate]

I have the following vector:
x = c(1, 2, 3, 10, 20, 30)
At each index, 3 consecutive elements are summed, resulting in the following vector:
c(6, 15, 33, 60)
Thus, the first element is 1 + 2 + 3 = 6, the second element is 2 + 3 + 10 = 15, etc.

What you have is a vector, not an array. You can use the rollapply function from the zoo package to get what you need.
> x <- c(1, 2, 3, 10, 20, 30)
> library(zoo)
> rollapply(x, 3, sum)
[1] 6 15 33 60
Take a look at ?rollapply for further details on what rollapply does and how to use it.
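As a side note, zoo also ships a dedicated rollsum, which avoids rollapply's per-window function call; a minimal sketch, assuming a current version of zoo:
> rollsum(x, 3)
[1]  6 15 33 60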

I put together a package for handling these kinds of rolling functions; it offers functionality similar to zoo's rollapply, but with Rcpp on the backend. Check out RcppRoll on CRAN.
library(microbenchmark)
library(zoo)
library(RcppRoll)
x <- rnorm(1E5)
all.equal( m1 <- rollapply(x, 3, sum), m2 <- roll_sum(x, 3) )
## from flodel
rsum.cumsum <- function(x, n = 3L) {
  tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
}
microbenchmark(
  unit = "ms",
  times = 10,
  rollapply(x, 3, sum),
  roll_sum(x, 3),
  rsum.cumsum(x, 3)
)
gives me
Unit: milliseconds
expr min lq median uq max neval
rollapply(x, 3, sum) 1056.646058 1068.867550 1076.550463 1113.71012 1131.230825 10
roll_sum(x, 3) 0.405992 0.442928 0.457642 0.51770 0.574455 10
rsum.cumsum(x, 3) 2.610119 2.821823 6.469593 11.33624 53.798711 10
You might find it useful if speed is a concern.

If speed is a concern, you could use a convolution filter and chop off the ends:
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]  # drops the NA at each end (valid for the default n = 3)
Or even faster, write it as the difference between two cumulative sums:
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
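To see why the cumsum version works (a quick hand-check on the question's vector): the running total from n positions back is subtracted from the current running total, so each difference is exactly one window sum.
x <- c(1, 2, 3, 10, 20, 30)
n <- 3
cumsum(x)                          # 1  3  6 16 36 66
cumsum(c(rep(0, n), head(x, -n)))  # 0  0  0  1  3  6
# elementwise difference: 1 3 6 15 33 60; dropping the first n - 1 entries leaves
tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
# [1]  6 15 33 60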
Both use base functions only. Some benchmarks:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L)
  sapply(1:(length(x) - n + 1), function(i) sum(x[i:(i + n - 1)]))
library(microbenchmark)
microbenchmark(
  rsum.rollapply(x),
  rsum.sapply(x),
  rsum.filter(x),
  rsum.cumsum(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 12891.315 13267.103 14635.002 17081.5860 28059.998 100
# rsum.sapply(x) 4287.533 4433.180 4547.126 5148.0205 12967.866 100
# rsum.filter(x) 170.165 208.661 269.648 290.2465 427.250 100
# rsum.cumsum(x) 97.539 130.289 142.889 159.3055 449.237 100
Also, I imagine all methods would be faster if x and the applied weights were integers instead of numerics.

Using just base R you could do:
v <- c(1, 2, 3, 10, 20, 30)
grp <- 3
res <- sapply(1:(length(v)-grp+1),function(x){sum(v[x:(x+grp-1)])})
> res
[1] 6 15 33 60
Another way, faster than sapply (comparable to @flodel's rsum.cumsum), is the following:
res <- rowSums(outer(1:(length(v) - grp + 1), 1:grp, FUN = function(i, j) v[(j - 1) + i]))
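To see what the outer call builds (a quick illustration): row i of the resulting matrix holds the i-th window, so rowSums collapses each row to its window sum.
v <- c(1, 2, 3, 10, 20, 30); grp <- 3
outer(1:(length(v) - grp + 1), 1:grp, FUN = function(i, j) v[(j - 1) + i])
#      [,1] [,2] [,3]
# [1,]    1    2    3
# [2,]    2    3   10
# [3,]    3   10   20
# [4,]   10   20   30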
Here's flodel's benchmark updated:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
library(microbenchmark)
microbenchmark(
  rsum.rollapply(x),
  rsum.sapply(x),
  rsum.filter(x),
  rsum.cumsum(x),
  rsum.outer(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 9464.495 9929.4480 10223.2040 10752.7960 11808.779 100
# rsum.sapply(x) 3013.394 3251.1510 3466.9875 4031.6195 7029.333 100
# rsum.filter(x) 161.278 178.7185 229.7575 242.2375 359.676 100
# rsum.cumsum(x) 65.280 70.0800 88.1600 95.1995 181.758 100
# rsum.outer(x) 66.880 73.7600 82.8795 87.0400 131.519 100

If you need real speed, try
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0,cs[1:(length(x)-n)])
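Unpacking it (a quick hand-check with n = 3): with cs <- cumsum(x), the sum of the window ending at position i is cs[i] - cs[i - n], which is exactly what the two pieces compute.
x <- c(1, 2, 3, 10, 20, 30)
cs <- cumsum(x)              # 1 3 6 16 36 66
cs[-(1:2)]                   # 6 16 36 66  (running totals of complete windows)
c(0, cs[1:3])                # 0  1  3  6  (running totals 3 positions earlier)
cs[-(1:2)] - c(0, cs[1:3])   # 6 15 33 60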
It's all in base R, and updating flodel's microbenchmark speaks for itself:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0, cs[1:(length(x)-n)])
all.equal(rsum.rollapply(x), rsum.sapply(x))
# [1] TRUE
all.equal(rsum.sapply(x), rsum.filter(x))
# [1] TRUE
all.equal(rsum.filter(x), rsum.outer(x))
# [1] TRUE
all.equal(rsum.outer(x), rsum.cumsum(x))
# [1] TRUE
all.equal(rsum.cumsum(x), rsum.cumdiff(x))
# [1] TRUE
library(microbenchmark)
microbenchmark(
  rsum.rollapply(x),
  rsum.sapply(x),
  rsum.filter(x),
  rsum.cumsum(x),
  rsum.outer(x),
  rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3369.211 4104.2415 4630.89799 4391.7560 4767.2710 12002.904 100
# rsum.sapply(x) 850.425 999.2730 1355.56383 1086.0610 1246.5450 6915.877 100
# rsum.filter(x) 48.970 67.1525 97.28568 96.2430 113.6975 248.728 100
# rsum.cumsum(x) 47.515 62.7885 89.12085 82.1825 106.6675 230.303 100
# rsum.outer(x) 69.819 85.3340 160.30133 92.6070 109.0920 5740.119 100
# rsum.cumdiff(x) 9.698 12.6070 70.01785 14.3040 17.4555 5346.423 100
## R version 3.5.1 "Feather Spray"
## zoo and microbenchmark compiled under R 3.5.3
Oddly enough, everything is faster the second time through microbenchmark:
microbenchmark(
  rsum.rollapply(x),
  rsum.sapply(x),
  rsum.filter(x),
  rsum.cumsum(x),
  rsum.outer(x),
  rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3127.272 3477.5750 3869.38566 3593.4540 3858.9080 7836.603 100
# rsum.sapply(x) 844.122 914.4245 1059.89841 965.3335 1032.2425 5184.968 100
# rsum.filter(x) 47.031 60.8490 80.53420 74.1830 90.9100 260.365 100
# rsum.cumsum(x) 45.092 55.2740 69.90630 64.4855 81.4555 122.668 100
# rsum.outer(x) 68.850 76.6070 88.49533 82.1825 91.8800 166.304 100
# rsum.cumdiff(x) 9.213 11.1520 13.18387 12.1225 13.5770 49.456 100

The runner library may also be used:
x <- c(1, 2, 3, 10, 20, 30)
runner::sum_run(x, k = 3, na_pad = TRUE)
#> [1] NA NA 6 15 33 60
or slider is also useful:
x <- c(1, 2, 3, 10, 20, 30)
slider::slide_sum(x, before = 2, complete = TRUE)
#> [1] NA NA 6 15 33 60
Created on 2021-06-14 by the reprex package (v2.0.0)
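If the exact four-element result from the question is needed, the leading NA padding can simply be dropped (a small sketch; with this window there are k - 1 = 2 leading NAs):
x <- c(1, 2, 3, 10, 20, 30)
slider::slide_sum(x, before = 2, complete = TRUE)[-(1:2)]
#> [1]  6 15 33 60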

Related

Rolling min without replacement

Given a vector (vec) and a window size of 5,
winsz <- 5L
vec <- c(9, 3, 10, 5, 6, 2, 4, 8, 7, 1)
Is there a faster way to calculate rolling minimum without replacement?
Rolling min without replacement, using vec as an example with a window size of 5:
In the first pass, the min is extracted from first 5 elements (9, 3, 10, 5, 6). Min is 3 in the first pass.
In the 2nd pass, the min is extracted from 4 elements left over from the first pass (9, 10, 5, 6) and the new windowed element (2). Min is 2 in the second pass.
In the 3rd pass, the min is extracted from elements left over from the previous pass (9, 10, 5, 6) and the new windowed element (4). Min is 4 in the third pass. So on and so forth.
The result from the example is
[1] 3 2 4 5 6 1 7 8 9 10
Please exclude Rcpp implementations for now.
Current implementation and timing code:
#rolling min without replacement
set.seed(0L)
N <- 10e5
winsz <- 5L
vec <- sample(N)
mtd0 <- compiler::cmpfun(function(x) {
  subx <- x[seq_len(winsz)]                  # current window of candidates
  n <- 1L
  drawn <- rep(NA_integer_, length(x))
  while (n <= length(x) - winsz) {
    idx <- which.min(subx)
    drawn[n] <- subx[idx]                    # draw the window minimum ...
    subx[idx] <- x[n + winsz]                # ... and slide in the next element
    n <- n + 1
  }
  drawn[tail(seq_along(drawn), winsz)] <- sort(subx)  # drain the final window
  drawn
})
library(microbenchmark)
microbenchmark(mtd0(vec), times=3L)
Around 8s for a window size of 5 and vector of length 1e6.
Not sure how this is going to clock in, but here is another option:
f <- function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (length(x) > 0) {
    idx.min <- which.min(x[1:window])  # min of the current window (trailing NAs are ignored)
    ret[i] <- x[idx.min]
    x <- x[-idx.min]                   # remove it; the next element slides into the window
    i <- i + 1
  }
  return(ret)
}
f(vec)
# [1] 3 2 4 5 6 1 7 8 9 10
Or
f2 <- function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (i <= length(x)) {
    idx.min <- which.min(x[1:(window + i - 1)])  # drawn slots are NA, so the window keeps sliding
    ret[i] <- x[idx.min]
    x[idx.min] <- NA                             # mark as drawn instead of removing
    i <- i + 1
  }
  return(ret)
}
On a side note...
Kudos for the numeric(length = length(x)) part go to @RonakShah; it's interesting that numeric(length = length(x)) is much faster than rep(0, length(x)) (which is what I wrote originally ;-)
res <- microbenchmark(
rep = rep(0, 10^6),
numeric = numeric(length = 10^6)
)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# rep 1392.582 2549.219 3682.897 2694.137 3098.073 14726.81 100 a
# numeric 424.257 1592.110 2902.232 1727.431 2174.159 11747.87 100 a
Timings so far:
#rolling min without replacement
set.seed(0L)
N <- 10e4
winsz <- 5L
vec <- sample(N)
f <- compiler::cmpfun(function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (length(x) > 0) {
    idx.min <- which.min(x[1:window])
    ret[i] <- x[idx.min]
    x <- x[-idx.min]
    i <- i + 1
  }
  return(ret)
})
mtd0 <- compiler::cmpfun(function(x) {
  subx <- x[seq_len(winsz)]
  n <- 1L
  drawn <- rep(NA_integer_, length(x))
  while (n <= length(x) - winsz) {
    idx <- which.min(subx)
    drawn[n] <- subx[idx]
    subx[idx] <- x[n + winsz]
    n <- n + 1
  }
  drawn[tail(seq_along(drawn), winsz)] <- sort(subx)
  drawn
})
mtd1 <- compiler::cmpfun(function(x) {
  res <- Reduce(function(ans, s) {
    v <- ans$students
    idx <- which.min(v)
    list(students = c(v[-idx], s), drawn = v[idx])
  },
  x = as.list(x[seq_along(x)[-seq_len(winsz)]]),
  init = list(students = x[seq_len(winsz)], drawn = NULL),
  accumulate = TRUE)
  c(unlist(lapply(res, `[[`, "drawn")), sort(res[[length(res)]]$students))
})
#all.equal(f(vec), mtd0(vec))
# [1] TRUE
#all.equal(mtd0(vec), mtd1(vec))
# [1] TRUE
library(microbenchmark)
microbenchmark(f(vec), mtd0(vec), mtd1(vec), times=3L)
Timings:
Unit: milliseconds
expr min lq mean median uq max neval cld
f(vec) 16234.97047 16272.00705 16457.05138 16309.04363 16568.092 16827.1400 3 b
mtd0(vec) 75.18676 83.34443 96.03222 91.50209 106.455 121.4078 3 a
mtd1(vec) 301.56747 342.36437 427.33052 383.16127 490.212 597.2628 3 a

in R, if matrix select first elements row-wise, if vector select first elements

Is there elegant R syntax to select, depending on the type of object, either the first n elements of a matrix row-wise, or the first n elements of a vector?
I can obviously do this with conditional statements, but I wonder if there is a simple solution. I also want to avoid calling t() on the whole matrix due to efficiency concerns.
M = matrix(1:12,3,4)
x = 1:12
slct = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
slct(M,5); slct(x,5)
So avoiding calling t() on the whole matrix is the key. I think the other solutions are more interesting and pedagogical, but the fastest one I see is the following.
The efficiency is probably just because these rely on C subroutines to do the same vectorization that others suggest. If you only need a specific subset of the elements 1:n, there are probably cases where it would be faster to modify the other methods.
I still wonder if there is some builtin that does this?
Here are my two solutions (thanks to some ideas from the other posts):
funOPmod2 = function(obj, n){
  if(is.matrix(obj)){
    nc = ncol(obj)
    nr = (n %/% nc) + 1            # rows needed to cover the first n elements
    subM = obj[1:nr, ]
    res = matrix(subM, ncol = nr,  # reshape by row instead of transposing
                 byrow = TRUE)[1:n]
  }
  if(is.vector(obj)) res = obj[1:n]
  res
}
funOPmod = function(obj, n){
  if(is.matrix(obj)){
    nc = ncol(obj)
    nr = (n %/% nc) + 1            # transpose only the rows actually needed
    res = t(obj[1:nr, ])[1:n]
  }
  if(is.vector(obj)) res = obj[1:n]
  res
}
funOP = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
funRyan <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n / ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}
funEmil <- function(obj, n) {
  myDim <- dim(obj)
  vec <- 1:n
  if (is.null(myDim))
    return(obj[vec])
  nr <- myDim[1]
  nc <- myDim[2]
  vec1 <- vec - 1L
  rem <- vec1 %% nc
  quot <- vec1 %/% nc
  obj[quot + (rem * nr + 1L)]
}
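To see why funOPmod can get away with transposing only the first nr rows (a quick hand-check with the small M from the question, taking n = 5): with nc = 4 columns, nr = 5 %/% 4 + 1 = 2 rows already contain the first 5 row-wise elements.
M <- matrix(1:12, 3, 4)
t(M[1:2, ])[1:5]   # transposes only 2 rows
# [1]  1  4  7 10  2
c(t(M))[1:5]       # same result, but transposes the whole matrix
# [1]  1  4  7 10  2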
n <- 25000
set.seed(42)
MBig <- matrix(sample(10^7, 10^6, replace = TRUE), nrow = 10^4)
## Returns same results
all.equal(funOPmod2(MBig, n), funOP(MBig, n))
all.equal(funOPmod(MBig, n), funOP(MBig, n))
all.equal(funOP(MBig, n), funEmil(MBig, n))
all.equal(funRyan(MBig, n), funEmil(MBig, n))
library(microbenchmark)
microbenchmark(funOP(MBig, n), funOPmod(MBig, n), funOPmod2(MBig, n), funRyan(MBig, n), funEmil(MBig, n), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funOP(MBig, n) 13.788456 13.343185 15.776079 13.104634 15.064036 13.1959488 100
funOPmod(MBig, n) 1.052210 1.089507 1.071219 1.118461 1.025714 0.4533697 100
funOPmod2(MBig, n) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
funRyan(MBig, n) 2.689417 2.694442 2.464471 2.637720 2.351565 0.9274931 100
funEmil(MBig, n) 2.760368 2.681478 2.434167 2.591716 2.308087 0.8921837 100
What about this?
slct = function(obj, n){
  if(is.matrix(obj)) res = as.vector(matrix(obj, ncol(obj), nrow(obj),
                                            byrow = TRUE))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
> slct(M, 5); slct(x, 5)
[1]  1  4  7 10  2
[1] 1 2 3 4 5
Seems to be twice as fast according to benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
t() 7.654 8.420 9.077494 8.675 8.675 10440.259 1e+05 b
matrix 3.316 3.827 4.411272 4.082 4.083 9502.881 1e+05 a
Note: You should specify is.vector rather than is.numeric in the second line, since is.numeric(M) yields TRUE.
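A quick check of that pitfall:
M <- matrix(1:12, 3, 4)
is.numeric(M)   # TRUE -- a numeric matrix still passes is.numeric
is.vector(M)    # FALSE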
You can take advantage of array indexing in [.
# new function
slct2 <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n / ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}
# old function
slct = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
Benchmark
m <- 1e4
mat <- matrix(runif(m^2), m)
n <- floor(m*2.3)
all.equal(slct(mat, n), slct2(mat, n))
# [1] TRUE
microbenchmark(slct(mat, n), slct2(mat, n), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# slct(mat, n) 2471.438599 2606.071460 3466.046729 3137.255011 4420.69364 4985.20781 10
# slct2(mat, n) 2.358151 4.748712 6.627644 4.973533 11.05927 13.73906 10
Can't you just use head?...
head(c(t(M)),5)
[1] 1 4 7 10 2
head(c(t(x)),5)
[1] 1 2 3 4 5
Here is a base R solution:
funEmil <- function(obj, n) {
  myDim <- dim(obj)
  vec <- 1:n
  if (is.null(myDim))
    return(obj[vec])           # plain vector: just take the head
  nr <- myDim[1]
  nc <- myDim[2]
  vec1 <- vec - 1L             # 0-based row-major positions
  rem <- vec1 %% nc            # 0-based column of each position
  quot <- vec1 %/% nc          # 0-based row of each position
  obj[quot + (rem * nr + 1L)]  # corresponding column-major indices
}
It relies on basic vectorized modular arithmetic (%%) and integer division (%/%) to map row-major positions to column-major indices.
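A quick hand-check of the arithmetic (using the small M from the question): for the 0-based row-major position vec1, rem = vec1 %% nc is its column and quot = vec1 %/% nc its row, so quot + rem * nr + 1 is the matching column-major index.
M <- matrix(1:12, 3, 4)
funEmil(M, 5)
# [1]  1  4  7 10  2   -- matches c(t(M))[1:5]
It is also very fast: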
set.seed(42)
MBig <- matrix(sample(10^7, 10^6, replace = TRUE), nrow = 10^4)
funOP = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
funRyan <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n / ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}
n <- 25000
## Returns same results
all.equal(funRyan(MBig, n), funEmil(MBig, n))
[1] TRUE
all.equal(funOP(MBig, n), funEmil(MBig, n))
[1] TRUE
library(microbenchmark)
microbenchmark(funOP(MBig, n), funRyan(MBig, n), funEmil(MBig, n), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funOP(MBig, n) 6.154284 5.915182 5.659250 5.880826 9.140565 1.0344393 100
funRyan(MBig, n) 1.015332 1.030278 1.028644 1.018446 1.032610 0.8330967 100
funEmil(MBig, n) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
Here are the benchmarks using the example by @Ryan and the OP's modified solution:
n <- 1e4
mat <- matrix(runif(n^2), n)
s <- floor(n*2.3)
microbenchmark(funOP(mat, s), funRyan(mat, s),
               funEmil(mat, s), funOPmod(mat, s), unit = "relative", times = 10)
Unit: relative
expr min lq mean median uq max neval
funOP(mat, s) 6189.449838 5558.293891 3871.425974 5139.192594 2443.203331 2222.778805 10
funRyan(mat, s) 2.633685 3.032467 2.155205 2.863710 1.445421 1.537473 10
funEmil(mat, s) 2.654739 2.714287 1.969482 2.642673 1.277088 1.326510 10
funOPmod(mat, s) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
The new modified version is much faster and still gives correct results, very impressive!
identical(funOPmod(mat, s), funRyan(mat, s))
[1] TRUE

Compare each row with other rows of matrix

I am looking for an efficient solution for the following problem:
b <- matrix(c(0, 0, 0, 1, 1, 0), nrow = 2, byrow = TRUE)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify=FALSE))
weight <- rep.int(weight,times)
sum1 <- as.numeric(rep.int(NA,nrow(abc)))
##Rprof()
for(j in 1:nrow(abc)){
  a <- abc[j,]
  sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
##Rprof(NULL)
##summaryRprof()
Is there a faster way to do this? Rprof shows that rowSums(), t(), == and + are quite slow. If nrow is 20,000, it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight whose length equals nrow(abc); the first value of weight corresponds to the first row of abc, and so on. I would like to determine which rows of abc are equal, and then, for each row, sum the weights of all rows equal to it, storing that sum for the row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
  rs <- apply(mat, 1, paste, collapse = ";")  # one character key per row
  # sum the weights within each unique key, then map the sums back to rows
  unlist(lapply(unique(rs),
                function(x)
                  sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
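For the record, base ave can express the same group-and-sum in one line (a sketch using the same row keys as ff, not benchmarked):
rs <- apply(abc, 1, paste, collapse = ";")
ave(weight, rs, FUN = sum)
# [1] 5 5 5 5 5 5 5 5 5 5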
And comparing with your function:
ffOP <- function(mat, weights)
{
  sum1 <- as.numeric(rep.int(NA, nrow(mat)))
  for(j in 1:nrow(mat)) {
    a <- mat[j,]
    sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
  }
  sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
For the record, I also implemented your approach in C; here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10

Find most recent non-missing value in a vector

I'm trying to return, at each position, the most recent non-missing value in a vector. For instance, given
x <- c(1,2,NA,NA,3,NA,4)
Then the function would output a vector like:
c(1,2,2,2,3,3,4)
Very simple question, but running it with loops or brute force on multiple columns takes forever.
You can use zoo::na.locf for that:
require(zoo)
x <- c(1, 2, NA, NA, 3, NA, 4)
na.locf(x)
## [1] 1 2 2 2 3 3 4
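na.locf also covers the common edge cases (two quick examples; the argument names assume a current zoo): na.rm = FALSE keeps leading NAs instead of dropping them, and fromLast = TRUE carries the next observation backward.
na.locf(c(NA, 1, NA, 2), na.rm = FALSE)
## [1] NA  1  1  2
na.locf(c(NA, 1, NA, 2), fromLast = TRUE)
## [1] 1 1 2 2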
You can do this using the Reduce function:
> x <- c(1,2,NA,NA,3,NA,4)
> locf <- function(x,y) if(is.na(y)) x else y
> Reduce( locf, x, accumulate=TRUE )
[1] 1 2 2 2 3 3 4
This way you do not need to load an extra package (and it could be customized to different types of objects if needed).
The Reduce option is quicker than zoo::na.locf for the sample vector on my computer:
> library(zoo)
> library(microbenchmark)
>
> microbenchmark(
+ Reduce( locf, x, accumulate=TRUE ),
+ na.locf(x)
+ )
Unit: microseconds
                               expr     min       lq  median       uq     max neval
 Reduce(locf, x, accumulate = TRUE)  22.169  24.0160  27.506  29.3530 112.073   100
                         na.locf(x) 149.841 151.8945 154.357 169.5465 377.271   100
Though there may be other situations where na.locf will be faster. I was actually surprised at the amount of difference.
Benchmarking on bigger data clearly shows the difference between na.locf from the zoo package and Reduce:
x <- sample(c(1:5, NA), 1e6, TRUE)
require(zoo)
require(microbenchmark)
locf <- function(x,y) if(is.na(y)) x else y
microbenchmark(Reduce( locf, x, accumulate=TRUE ), na.locf(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
Reduce(locf, x, accumulate = TRUE) 5480.4796 5958.0905 6605.3547 7458.404 7915.046 10
na.locf(x) 661.2886 911.1734 950.2542 1026.348 1095.642 10

which.max ties method in R

which.max and which.min will return the smallest index of the max or min value if there are ties.
Is there a way around this so that the largest index is returned without affecting the efficiency of the function?
max.col has this exact functionality, but I am dealing with a vector not a matrix.
You could do it like this:
x <- c(1, 2, 1, 4, 3, 4)
# identical to which.max, except it returns all indices with the max
which(x == max(x))
[1] 4 6
z <- which(x == max(x))
z[length(z)]
[1] 6
# or with tail
tail(which(x == max(x)), 1)
[1] 6
Edit: Or you could also use the max.col function for vectors like this:
max.col(t(x), "last")
[1] 6
# or
max.col(matrix(x, nrow = 1), "last")
[1] 6
Edit: Some benchmarking:
x <- sample(1:1000, size = 10000, replace = TRUE)
library(microbenchmark)
microbenchmark(which.max(x),
               {z <- which(x == max(x)); z[length(z)]},
               tail(which(x == max(x)), 1),
               max.col(matrix(x, nrow = 1), "last"),
               max.col(t(x), "last"),
               which.max(rev(x)),
               times = 1000)
Unit: microseconds
expr min lq median uq max neval
which.max(x) 29.390 30.323 30.323 31.256 17550.276 1000
{ z <- which(x == max(x)) z[length(z)] } 40.586 42.452 42.919 44.318 631.178 1000
tail(which(x == max(x)), 1) 57.380 60.646 61.579 64.844 596.657 1000
max.col(matrix(x, nrow = 1), "last") 134.353 138.085 139.485 144.383 710.949 1000
max.col(t(x), "last") 116.159 119.425 121.291 125.956 729.610 1000
which.max(rev(x)) 89.569 91.435 92.368 96.566 746.404 1000
So all of these methods seem to be slower than the original (which gives the wrong result), but z <- which(x == max(x)); z[length(z)] seems to be the fastest of them.
You could reverse x:
which.max(rev(x))
which.min(rev(x))
Note that these return positions in the reversed vector; to recover the index in the original vector, use length(x) + 1 - which.max(rev(x)).
The which function has an arr.ind parameter, normally set to FALSE, that can be set to TRUE in this case:
x <- sample(1:20, 50, repl=TRUE)
> which(x==max(x), arr.ind=TRUE)
[1] 11 23
> tail(which(x==max(x), arr.ind=TRUE) , 1)
[1] 23
Using the arr.ind argument is particularly useful with matrix or array structures, but it does work with atomic vectors as well.
To expand on Jouni's answer, you could instead use max on the result of which:
x <- c(1, 2, 1, 4, 3, 4)
which(x == max(x))
[1] 4 6
max(which(x == max(x)))
[1] 6
Benchmarking:
x <- sample(1:1000, size = 10000, replace = TRUE)
library(microbenchmark)
microbenchmark(which.max(x), {z <- which(x == max(x)); z[length(z)]},
               tail(which(x == max(x)), 1), max.col(matrix(x, nrow = 1), "last"),
               max.col(t(x), "last"), which.max(rev(x)),
               max(which(x == max(x))), times = 1000)
Unit: microseconds
expr min lq mean median uq max neval
which.max(x) 6.322 6.717 7.171838 7.112 7.112 40.297 1000
{ z <- which(x == max(x)) z[length(z)] } 27.260 28.445 37.126964 28.840 29.630 2276.346 1000
tail(which(x == max(x)), 1) 35.952 37.927 45.198484 38.718 40.298 1005.038 1000
max.col(matrix(x, nrow = 1), "last") 160.791 162.766 181.698171 163.557 169.087 1688.494 1000
max.col(t(x), "last") 84.149 86.124 100.249921 86.915 89.680 1230.618 1000
which.max(rev(x)) 53.729 55.310 69.442985 56.100 57.680 1076.149 1000
max(which(x == max(x))) 26.865 27.655 35.552256 28.050 28.841 1029.137 1000
