which.max ties method in R

which.max and which.min will return the smallest index of the max or min value if there are ties.
Is there a way around this so that the largest index is returned without affecting the efficiency of the function?
max.col has this exact functionality, but I am dealing with a vector not a matrix.

You could do it like this:
x<-c(1,2,1,4,3,4)
#identical to which.max, except returns all indices with max
which(x==max(x))
[1] 4 6
z<-which(x==max(x))
z[length(z)]
[1] 6
#or with tail
tail(which(x==max(x)),1)
[1] 6
Edit:
Or you could use the max.col function on a vector like this:
max.col(t(x),"last")
[1] 6
#or
max.col(matrix(x,nrow=1),"last")
[1] 6
Edit: some benchmarking:
x<-sample(1:1000,size=10000,replace=TRUE)
library(microbenchmark)
microbenchmark(which.max(x),{z<-which(x==max(x));z[length(z)]},
tail(which(x==max(x)),1),max.col(matrix(x,nrow=1),"last"),
max.col(t(x),"last"),which.max(rev(x)),times=1000)
Unit: microseconds
expr min lq median uq max neval
which.max(x) 29.390 30.323 30.323 31.256 17550.276 1000
{ z <- which(x == max(x)) z[length(z)] } 40.586 42.452 42.919 44.318 631.178 1000
tail(which(x == max(x)), 1) 57.380 60.646 61.579 64.844 596.657 1000
max.col(matrix(x, nrow = 1), "last") 134.353 138.085 139.485 144.383 710.949 1000
max.col(t(x), "last") 116.159 119.425 121.291 125.956 729.610 1000
which.max(rev(x)) 89.569 91.435 92.368 96.566 746.404 1000
So all of these methods are slower than which.max itself (which gives the wrong index for ties), but z <- which(x == max(x)); z[length(z)] seems to be the fastest of the alternatives.
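If this comes up often, the fastest variant can be wrapped in a small helper (a sketch, not part of the original answer; the name which.max.last is made up):
which.max.last <- function(x) {
  z <- which(x == max(x))  # all indices attaining the maximum
  z[length(z)]             # keep the last one
}
which.max.last(c(1, 2, 1, 4, 3, 4))
[1] 6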

You could reverse x. Note that which.max(rev(x)) gives a position in the reversed vector, so convert it back to an index into x:
length(x) + 1L - which.max(rev(x))
length(x) + 1L - which.min(rev(x))

The which function has an 'arr.ind' argument, normally set to FALSE:
x <- sample(1:20, 50, repl=TRUE)
> which(x==max(x), arr.ind=TRUE)
[1] 11 23
> tail(which(x==max(x), arr.ind=TRUE) , 1)
[1] 23
The arr.ind argument only changes the output for matrix or array inputs; for an atomic vector it is ignored, so here the call is equivalent to tail(which(x == max(x)), 1).

To expand on Jouni's answer, you could instead use max on the result of which:
x <- c(1, 2, 1, 4, 3, 4)
which(x == max(x))
[1] 4 6
max(which(x == max(x)))
[1] 6
Benchmarking:
x <- sample(1:1000, size = 10000, replace = TRUE)
library(microbenchmark)
microbenchmark(which.max(x), {z <- which(x == max(x)); z[length(z)]},
tail(which(x == max(x)), 1), max.col(matrix(x, nrow = 1), "last"),
max.col(t(x), "last"), which.max(rev(x)), max(which(x == max(x))), times = 1000)
Unit: microseconds
expr min lq mean median uq max neval
which.max(x) 6.322 6.717 7.171838 7.112 7.112 40.297 1000
{ z <- which(x == max(x)) z[length(z)] } 27.260 28.445 37.126964 28.840 29.630 2276.346 1000
tail(which(x == max(x)), 1) 35.952 37.927 45.198484 38.718 40.298 1005.038 1000
max.col(matrix(x, nrow = 1), "last") 160.791 162.766 181.698171 163.557 169.087 1688.494 1000
max.col(t(x), "last") 84.149 86.124 100.249921 86.915 89.680 1230.618 1000
which.max(rev(x)) 53.729 55.310 69.442985 56.100 57.680 1076.149 1000
max(which(x == max(x))) 26.865 27.655 35.552256 28.050 28.841 1029.137 1000

Related

Count number of 1's from right to left, stopping at the first 0

I want to count the number of 1's that occur from right to left across multiple columns, stopping at the first 0.
Example DF:
df<-data.frame(replicate(7,sample(0:1,30,rep=T)))
colnames(df)<-seq(1950,2010,10)
I've manually entered the desired result here under a new column "condition" as an example:
Thanks in advance for your help,
Cai
Here's a fully vectorized attempt
indx <- rowSums(df) == ncol(df)  # rows that are all ones (per Jaap's comment)
df$condition <- ncol(df) - max.col(-df, ties = "last")
df$condition[indx] <- ncol(df) - 1  # df now contains the extra "condition" column, so this equals the original column count
This basically finds the last zero in each row (the first zero from the right) and counts how many columns come after it, which is the number of trailing 1s in the binary data.
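To see the idea on a single row (a small illustration, not part of the original answer):
row <- c(1, 0, 1, 1)                    # the last zero sits in column 2
max.col(t(-row), ties.method = "last")  # position of the rightmost zero
[1] 2
length(row) - 2                         # two 1s to the right of it
[1] 2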
EDIT
Had to add handling for the special case when all the rows are ones
df$condition <- apply(df, 1, function(x) {
y <- rev(x)
sum(cumprod(y))
})
[Edit: now works]
Try this
df$condition <- apply(df,1,function(x){x<- rev(x);m <- match(0,x)[1]; if (is.na(m)) sum(x) else sum(x[1:m])})
We reverse the row and match the first 0, then sum up to that element. If there's no zero, we sum the full row.
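Stepping through a single row by hand (illustration only; the row is made up):
x <- c(1, 1, 0, 1, 1, 1, 1)
y <- rev(x)       # 1 1 1 1 0 1 1
m <- match(0, y)  # first zero of the reversed row: position 5
sum(y[1:m])       # the four trailing 1s (the 0 itself adds nothing)
[1] 4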
Here's a benchmark of all solutions:
library(stringr)
microbenchmark(
Moody_Mudskipper = apply(df,1,function(x){x<- rev(x);m <- match(0,x)[1]; if (is.na(m)) sum(x) else sum(x[1:m])}),
akrun = apply(df, 1, function(x) {x1 <- rle(x)
x2 <- tail(x1$lengths, 1)[tail(x1$values, 1)==1]
if(length(x2)==0) 0 else x2}),
akrun2 = str_count(do.call(paste0, df), "[1]+$"),
roland = apply(df, 1, function(x) {y <- rev(x);sum(y * cumprod(y != 0L))}),
David_Arenburg = ncol(df) - max.col(-df, ties = "last"),
times = 10)
# Unit: microseconds
# expr min lq mean median uq max neval
# Moody_Mudskipper 1437.948 1480.417 1677.1929 1536.159 1597.209 3009.320 10
# akrun 6985.174 7121.078 7718.2696 7691.053 7856.862 9289.146 10
# akrun2 1101.731 1188.793 1290.8971 1226.486 1343.099 1790.091 10
# akrun3 693.315 791.703 830.3507 820.371 884.782 1030.240 10
# roland 1197.995 1270.901 1708.5143 1332.305 1727.802 4568.660 10
# David_Arenburg 2845.459 3060.638 3406.3747 3167.519 3495.950 5408.494 10
# David_Arenburg_corrected 3243.964 3341.644 3757.6330 3384.645 4195.635 4943.099 10
For a bigger example David's solution is indeed the fastest, as said in the chosen solution's comments:
df<-data.frame(replicate(7,sample(0:1,1000,rep=T)))
# Unit: milliseconds
# expr min lq mean median uq max neval
# Moody_Mudskipper 31.324456 32.155089 34.168533 32.827345 33.848560 44.952570 10
# akrun 225.592061 229.055097 238.307506 234.761584 241.266853 271.000470 10
# akrun2 28.779824 29.261499 33.316700 30.118144 38.026145 46.711869 10
# akrun3 14.184466 14.334879 15.528201 14.633227 17.237317 18.763742 10
# roland 27.946005 28.341680 29.328530 28.497224 29.760516 33.692485 10
# David_Arenburg 3.149823 3.282187 3.630118 3.455427 3.727762 5.240031 10
# David_Arenburg_corrected 3.464098 3.534527 4.103335 3.833937 4.187141 6.165159 10
We can loop through the rows and use rle:
df$condition <- apply(df, 1, function(x) {x1 <- rle(x)
x2 <- tail(x1$lengths, 1)[tail(x1$values, 1)==1]
if(length(x2)==0) 0 else x2})
Another option is str_extract:
library(stringr)
v1 <- str_extract(do.call(paste0, df), "1+$")
df$condition <- ifelse(is.na(v1), 0, nchar(v1))
Or, slightly more efficiently, with stringi:
library(stringi)
v1 <- stri_count(stri_extract(do.call(paste0, df), regex = "1+$"), regex = ".")
v1[is.na(v1)] <- 0
df$condition <- v1
Or with a more compact option
stri_count(do.call(paste0, df), regex = '(?=1+$)')
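The zero-width lookahead (?=1+$) matches once at every position inside the trailing run of 1s, so the match count equals the run length. A quick check (illustration only):
stri_count("0101011", regex = "(?=1+$)")  # trailing "11"
[1] 2
stri_count("0101010", regex = "(?=1+$)")  # no trailing 1s
[1] 0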

Optimizing speed of nearest search function in R [closed]

Closed. This question needs to be more focused. It is not currently accepting answers. (Closed 6 years ago.)
I'm trying to make this function faster (ideally with RcppArmadillo or some other alternative). myfun takes a matrix, mat, that can get quite large but always has two columns. For each row, myfun finds the other rows whose absolute differences sum to exactly 1 (i.e. a single coordinate differs by 1).
In the example below, the first row of mat is (3, 3), so myfun lists rows 2 and 3 as closest to row 1, but not row 5, which is 2 away.
library(microbenchmark)
dim(mat)
[1] 1000 2
head(mat)
x y
[1,] 3 3
[2,] 3 4
[3,] 3 2
[4,] 7 3
[5,] 4 4
[6,] 10 1
output
[[1]]
[1] 2 3
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
integer(0)
[[5]]
integer(0)
[[6]]
integer(0)
microbenchmark( myfun(mat), times = 100) #mat of 1000 rows
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(mat) 89.30126 90.28618 95.50418 90.91281 91.50875 180.1505 100
microbenchmark( myfun(mat), times = 100) #mat of 10,000 rows
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(layout.old) 5.912633 5.912633 5.912633 5.912633 5.912633 5.912633 1
This is what myfun looks like
myfun <- function(x) {
  doo <- function(j) {
    # replicate row j to the full matrix shape and take absolute differences
    j.mat <- matrix(rep(j, length = length(x)), ncol = ncol(x), byrow = TRUE)
    j.abs <- abs(j.mat - x)
    # rows whose absolute differences sum to exactly 1
    which(rowSums(j.abs) == 1)
  }
  apply(x, 1, doo)
}
Below is a base R solution that is much faster than the myfun provided by the OP.
myDistOne <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  rs <- rowSums(m)
  lapply(seq_along(rs), function(x) {
    t1 <- which(abs(rs[x] - rs) == 1)
    t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
    t2[which(abs(v2[x] - v2[t2]) <= 1)]
  })
}
Here are some benchmarks:
library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2) ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
expr min lq mean median uq max neval cld
myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225 2 b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707 2 a
identical(myfun(m1), myDistOne(m1))
[1] TRUE
m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2) ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
expr min lq mean median uq max neval cld
myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701 100 b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530 100 a
identical(myfun(m2), myDistOne(m2))
[1] TRUE
Here is a very large example:
m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
user system elapsed
26.963 10.988 37.973
system.time(testUser <- myfun(m3))
user system elapsed
148.444 33.297 182.639
identical(testJoe, testUser)
[1] TRUE
I'm sure there is a faster solution. Sorting the rowSums upfront and working from there might yield a further improvement (it could also get very messy).
Update
As I predicted, working from a sorted rowSums is much faster (and uglier!)
myDistOneFast <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  origrs <- rowSums(m)
  mySort <- order(origrs)
  rs <- origrs[mySort]
  myDiff <- c(0L, diff(rs))
  brks <- which(myDiff > 0L)
  lenB <- length(brks)
  n <- nrow(m)
  myL <- vector("list", length = n)
  findRows <- function(v, s, r, u1, u2) {
    lapply(v, function(x) {
      sx <- s[x]
      tv1 <- s[r]
      tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
      tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
    })
  }
  t1 <- brks[1L]; t2 <- brks[2L]
  ## setting first index in myL
  myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
  k <- t0 <- 1L
  while (k < (lenB-1L)) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
    vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    if (myDiff[t2] > 1L) {
      if (myDiff[t3] > 1L) {
        k <- k+2L; t0 <- t2
      } else {
        k <- k+1L; t0 <- t1
      }
    } else {
      k <- k+1L; t0 <- t1
    }
  }
  ## setting second to last index in myL
  if (k == lenB-1L) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    k <- k+1L; t0 <- t1
  }
  t1 <- brks[k]; vec <- t1:n
  if (myDiff[t1] == 1L) {
    myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
  }
  myL
}
The results are not even close. myDistOneFast is over 100x faster than the OP's original myfun on very large matrices and also scales well. Below are some benchmarks:
microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
expr min lq mean median uq max neval
OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386 100
Joe 22.00127 23.11457 24.35363 23.87073 24.87484 58.98532 100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253 15.35676 100
microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
expr min lq mean median uq max neval
OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244 100
Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521 100
JoeFast 128.4243 134.0409 138.7518 136.3929 141.3046 172.2499 100
system.time(testJoeFast <- myDistOneFast(m3))
user system elapsed
0.68 0.00 0.69 ### myfun took over 100s!!!
To test equality, we have to sort each vector of indices. We also can't use identical for comparison as myL is initialized as an empty list, thus some of the indices contain NULL values (these correspond to integer(0) in the result from myfun and myDistOne).
testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE
unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)
Here is an example with 500,000 rows:
set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
user system elapsed
10.84 0.06 10.94
Here is an overview of how the algorithm works:
1. Calculate rowSums.
2. Order the rowSums (i.e. order() returns the indices of the original vector that give the sorted vector).
3. Call diff on the sorted rowSums.
4. Mark each non-zero instance.
5. Determine which indices in that small candidate range satisfy the OP's request.
6. Use the ordered vector calculated in step 2 to map back to the original indices.
This is much faster than comparing each rowSum to all of the rowSums every time.
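A minimal sketch of steps 1-4 on a toy matrix (illustration only, not part of myDistOneFast):
m <- matrix(c(3, 3, 3, 4, 3, 2, 7, 3), ncol = 2, byrow = TRUE)
rs   <- rowSums(m)            # 6 7 5 10
ord  <- order(rs)             # 3 1 2 4  (indices that sort rs)
d    <- c(0L, diff(rs[ord]))  # 0 1 1 3  (gaps between consecutive sorted sums)
brks <- which(d > 0L)         # 2 3 4    (where the sorted sum changes)
Only rows whose sorted sums sit in adjacent groups differing by exactly 1 need to be compared element-wise.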

Most efficient way for rolling sum [duplicate]

I have the following vector:
x = c(1, 2, 3, 10, 20, 30)
At each index, 3 consecutive elements are summed, resulting in the following vector:
c(6, 15, 33, 60)
Thus, the first element is 1 + 2 + 3 = 6, the second element is 2 + 3 + 10 = 15, etc.
What you have is a vector, not an array. You can use the rollapply function from the zoo package to get what you need.
> x <- c(1, 2, 3, 10, 20, 30)
> #library(zoo)
> rollapply(x, 3, sum)
[1] 6 15 33 60
Take a look at ?rollapply for further details on what rollapply does and how to use it.
I put together a package for handling these kinds of 'roll'ing functions that offers functionality similar to zoo's rollapply, but with Rcpp on the backend. Check out RcppRoll on CRAN.
library(microbenchmark)
library(zoo)
library(RcppRoll)
x <- rnorm(1E5)
all.equal( m1 <- rollapply(x, 3, sum), m2 <- roll_sum(x, 3) )
## from flodel
rsum.cumsum <- function(x, n = 3L) {
tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
}
microbenchmark(
unit="ms",
times=10,
rollapply(x, 3, sum),
roll_sum(x, 3),
rsum.cumsum(x, 3)
)
gives me
Unit: milliseconds
expr min lq median uq max neval
rollapply(x, 3, sum) 1056.646058 1068.867550 1076.550463 1113.71012 1131.230825 10
roll_sum(x, 3) 0.405992 0.442928 0.457642 0.51770 0.574455 10
rsum.cumsum(x, 3) 2.610119 2.821823 6.469593 11.33624 53.798711 10
You might find it useful if speed is a concern.
If speed is a concern, you could use a convolution filter and chop off the ends:
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
Or even faster, write it as the difference between two cumulative sums:
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
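To see why the difference of two cumulative sums gives the rolling sum, here is the arithmetic on the original vector (illustration only):
x <- c(1, 2, 3, 10, 20, 30)
cs <- cumsum(x)          # 1 3 6 16 36 66
cs[3:6] - c(0, cs[1:3])  # each window sum is cs[i + 2] - cs[i - 1]
[1]  6 15 33 60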
Both use base functions only. Some benchmarks:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){
sum(x[i:(i+n-1)])})
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 12891.315 13267.103 14635.002 17081.5860 28059.998 100
# rsum.sapply(x) 4287.533 4433.180 4547.126 5148.0205 12967.866 100
# rsum.filter(x) 170.165 208.661 269.648 290.2465 427.250 100
# rsum.cumsum(x) 97.539 130.289 142.889 159.3055 449.237 100
Also, I imagine all methods would be faster if x and the applied weights were integers instead of numerics.
Using just base R you could do:
v <- c(1, 2, 3, 10, 20, 30)
grp <- 3
res <- sapply(1:(length(v)-grp+1),function(x){sum(v[x:(x+grp-1)])})
> res
[1] 6 15 33 60
Another way, faster than sapply (comparable to #flodel's rsum.cumsum), is the following:
res <- rowSums(outer(1:(length(v)-grp+1),1:grp,FUN=function(i,j){v[(j - 1) + i]}))
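Here outer() builds a matrix with one rolling window per row, so rowSums gives the window sums (illustration only):
outer(1:(length(v)-grp+1), 1:grp, FUN = function(i, j){v[(j - 1) + i]})
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    3   10
[3,]    3   10   20
[4,]   10   20   30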
Here's flodel's benchmark updated:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 9464.495 9929.4480 10223.2040 10752.7960 11808.779 100
# rsum.sapply(x) 3013.394 3251.1510 3466.9875 4031.6195 7029.333 100
# rsum.filter(x) 161.278 178.7185 229.7575 242.2375 359.676 100
# rsum.cumsum(x) 65.280 70.0800 88.1600 95.1995 181.758 100
# rsum.outer(x) 66.880 73.7600 82.8795 87.0400 131.519 100
If you need real speed, try
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0,cs[1:(length(x)-n)])
It's all in base R, and updating flodel's microbenchmark speaks for itself
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0, cs[1:(length(x)-n)])
all.equal(rsum.rollapply(x), rsum.sapply(x))
# [1] TRUE
all.equal(rsum.sapply(x), rsum.filter(x))
# [1] TRUE
all.equal(rsum.filter(x), rsum.outer(x))
# [1] TRUE
all.equal(rsum.outer(x), rsum.cumsum(x))
# [1] TRUE
all.equal(rsum.cumsum(x), rsum.cumdiff(x))
# [1] TRUE
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x),
rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3369.211 4104.2415 4630.89799 4391.7560 4767.2710 12002.904 100
# rsum.sapply(x) 850.425 999.2730 1355.56383 1086.0610 1246.5450 6915.877 100
# rsum.filter(x) 48.970 67.1525 97.28568 96.2430 113.6975 248.728 100
# rsum.cumsum(x) 47.515 62.7885 89.12085 82.1825 106.6675 230.303 100
# rsum.outer(x) 69.819 85.3340 160.30133 92.6070 109.0920 5740.119 100
# rsum.cumdiff(x) 9.698 12.6070 70.01785 14.3040 17.4555 5346.423 100
## R version 3.5.1 "Feather Spray"
## zoo and microbenchmark compiled under R 3.5.3
Oddly enough, everything is faster the second time through microbenchmark:
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x),
rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3127.272 3477.5750 3869.38566 3593.4540 3858.9080 7836.603 100
# rsum.sapply(x) 844.122 914.4245 1059.89841 965.3335 1032.2425 5184.968 100
# rsum.filter(x) 47.031 60.8490 80.53420 74.1830 90.9100 260.365 100
# rsum.cumsum(x) 45.092 55.2740 69.90630 64.4855 81.4555 122.668 100
# rsum.outer(x) 68.850 76.6070 88.49533 82.1825 91.8800 166.304 100
# rsum.cumdiff(x) 9.213 11.1520 13.18387 12.1225 13.5770 49.456 100
The runner package may also be used:
x <- c(1, 2, 3, 10, 20, 30)
runner::sum_run(x, k=3, na_pad = T)
#> [1] NA NA 6 15 33 60
or the slider package is also useful:
x <- c(1, 2, 3, 10, 20, 30)
slider::slide_sum(x, before = 2, complete = T)
#> [1] NA NA 6 15 33 60
Created on 2021-06-14 by the reprex package (v2.0.0)

Compare each row with other rows of matrix

I am looking for an efficient solution for the following problem:
b <- matrix(c(0,0,0,1,1,0), nrow = 2, byrow = T)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify=FALSE))
weight <- rep.int(weight,times)
sum1 <- as.numeric(rep.int(NA,nrow(abc)))
##Rprof()
for(j in 1:nrow(abc)){
a <- abc[j,]
sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
##Rprof(NULL)
##summaryRprof()
Is there a faster way to do this? Rprof shows that rowSums(), t(), == and + are quite slow. If nrow is 20,000, it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight whose length equals nrow(abc). The first value of weight corresponds to the first row of abc, and so on. Now I would like to determine which rows of abc are equal, remember the positions of those rows, and sum up the corresponding weights. I want to store the resulting sum for each row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
  rs <- apply(mat, 1, paste, collapse = ";")
  unlist(lapply(unique(rs),
                function(x)
                  sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
And comparing with your function:
ffOP <- function(mat, weights)
{
  sum1 <- as.numeric(rep.int(NA, nrow(mat)))
  for (j in 1:nrow(mat)) {
    a <- mat[j,]
    sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
  }
  sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
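The same row-key idea can also be written more compactly with ave(), though it is not benchmarked here (a hypothetical variant, using the abc and weight objects from the question):
key  <- do.call(paste, c(as.data.frame(abc), sep = ";"))  # one string key per row
sum1 <- ave(weight, key, FUN = sum)                       # weight sums over identical rows
sum1
# [1] 5 5 5 5 5 5 5 5 5 5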
For the record, I also implemented your approach in C; here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10

Find most recent non-missing value in a vector

I'm trying to return, at each position of a vector, the most recent non-missing value. For instance, given
x <- c(1,2,NA,NA,3,NA,4)
the function would output a vector like:
c(1,2,2,2,3,3,4)
Very simple question, but running it with loops or brute force on multiple columns takes forever.
You can use zoo::na.locf for that
require(zoo)
x <- c(1, 2, NA, NA, 3, NA, 4)
na.locf(x)
## [1] 1 2 2 2 3 3 4
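Note that na.locf drops leading NAs by default; pass na.rm = FALSE to keep the output the same length as the input (a small aside about the zoo default):
na.locf(c(NA, 1, NA, 2))
## [1] 1 1 2
na.locf(c(NA, 1, NA, 2), na.rm = FALSE)
## [1] NA  1  1  2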
You can do this using the Reduce function:
> x <- c(1,2,NA,NA,3,NA,4)
> locf <- function(x,y) if(is.na(y)) x else y
> Reduce( locf, x, accumulate=TRUE )
[1] 1 2 2 2 3 3 4
This way you do not need to load an extra package (and it could be customized to different types of objects if needed).
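For example, flipping the test and folding from the right gives "next observation carried backward" (a small sketch of one such customization; nocb is a made-up name):
> nocb <- function(x, y) if (is.na(x)) y else x
> Reduce( nocb, x, accumulate=TRUE, right=TRUE )
[1] 1 2 3 3 3 4 4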
The Reduce option is quicker than zoo::na.locf for the sample vector on my computer:
> library(zoo)
> library(microbenchmark)
>
> microbenchmark(
+ Reduce( locf, x, accumulate=TRUE ),
+ na.locf(x)
+ )
Unit: microseconds
expr min lq median uq max neval
Reduce(locf, x, accumulate = TRUE) 22.169 24.0160 27.506 29.3530 112.073 100
na.locf(x) 149.841 151.8945 154.357 169.5465 377.271 100
Though there may be other situations where na.locf will be faster. I was actually surprised at the amount of difference.
Benchmarking on bigger data clearly shows the difference between na.locf from the zoo package and Reduce:
x <- sample(c(1:5, NA), 1e6, TRUE)
require(zoo)
require(microbenchmark)
locf <- function(x,y) if(is.na(y)) x else y
microbenchmark(Reduce( locf, x, accumulate=TRUE ), na.locf(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
Reduce(locf, x, accumulate = TRUE) 5480.4796 5958.0905 6605.3547 7458.404 7915.046 10
na.locf(x) 661.2886 911.1734 950.2542 1026.348 1095.642 10
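A fully vectorized base R alternative is also possible by carrying forward the indices of the non-missing values with cummax (a hypothetical helper, not benchmarked in this thread):
locf_vec <- function(x) {
  idx <- cummax(seq_along(x) * !is.na(x))  # index of the most recent non-NA position
  x[replace(idx, idx == 0L, NA_integer_)]  # leading NAs (idx == 0) stay NA
}
locf_vec(c(1, 2, NA, NA, 3, NA, 4))
[1] 1 2 2 2 3 3 4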
