I have a matrix X:
     one two three four
[1,]   1   3     2    4
[2,]   2   0     1    5
[3,]   3   2     1    4
[4,]   4   9    11   19
[5,]   4   3     2    1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
     one two three four
[1,]   1   3     2    4
[3,]   3   2     1    4
[5,]   4   3     2    1
What function or command should I use?
mat <- rbind(
c(1, 3, 2, 4),
c(2, 0, 1, 5),
c(3, 2, 1, 4)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]
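For reference, here is a quick sketch applying the same idea to the full five-row matrix from the question (the object name X is assumed):
## apply + setequal on the full example matrix (a sketch)
X <- rbind(c(1, 3, 2, 4),
           c(2, 0, 1, 5),
           c(3, 2, 1, 4),
           c(4, 9, 11, 19),
           c(4, 3, 2, 1))
keep <- apply(X, 1L, function(x) setequal(x, 1:4))
X[keep, ]   # rows 1, 3 and 5, as requested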
Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
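If you'd rather avoid the matrixStats dependency, the same tabulation idea can be sketched in base R (slower, since apply loops over rows; X and v as defined above):
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- t(apply(rk, 1L, tabulate, nbins = length(v)))  # row-wise counts of 1..length(v)
X[rowSums(ct == 0L) == 0L, ]  # every value of 'v' appears; with ncol(X) == length(v), that means a permutation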
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear and asymmetric weight function w(x), the following weighted sum:
1 x w(1) + 2 x w(2) + 3 x w(3) + 4 x w(4)
is a unique score or identifier that is invariant to permutations. So, for example, the following gives the same value:
2 x w(2) + 1 x w(1) + 3 x w(3) + 4 x w(4)
But anything else will give different values, like:
1 x w(1) + 3 x w(3) + 3 x w(3) + 4 x w(4)
0 x w(0) + 3 x w(3) + 0 x w(0) + 4 x w(4)
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulating
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
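As a quick numeric sanity check of the weighted-sum idea, here is a sketch using v = 1:4 directly (so the match step inside FindPerm is not needed):
w <- pi / (length(v) + 1)
score <- function(x) sum(x * cos(w * x))
score(1:4)            # reference score, about -2.74
score(c(1, 3, 2, 4))  # a permutation: same score (up to floating-point rounding)
score(c(1, 3, 3, 4))  # a repeated value: about -4.28, clearly different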
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
  X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
  v <- 1:nc
  microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
                                 "cos" = FindPerm(X, v, "cos"),
                                 check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775
Update: since there's so much interest in this question, here's a method that uses indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index into a length(v)-dimensional logical array for small v, or, for large v, to look up precomputed v*sin(w*v) values via the result of match instead of calculating X*sin(w*X):
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
  n <- length(v)
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  w <- pi/(n + 1)
  rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}

f2 <- function(X, v) {
  n <- length(v)
  if (n < 6) {
    # index an n-dimensional array
    m <- array(FALSE, rep(n + 1L, n))
    m[permuteGeneral(n)] <- TRUE
    X[] <- match(X, v, nomatch = length(v) + 1L)
    m[X]
  } else {
    nn <- 1:n
    u <- c(nn*sin(pi*nn/(n + 1L)), 0)
    X[] <- u[match(X, v, nomatch = n + 1L)]
    rowSums(X) == sum(u)
  }
}
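To see what the small-v branch of f2 is doing, here is a tiny sketch of the array-indexing trick with n = 2 (so v = 1:2 and the lookup array is 3 x 3), using RcppAlgos's permuteGeneral as above:
m <- array(FALSE, c(3, 3))
m[permuteGeneral(2)] <- TRUE            # mark the index pairs (1,2) and (2,1)
Xs <- rbind(c(1, 2), c(2, 2), c(2, 1))  # already-matched index rows
m[Xs]                                   # TRUE FALSE TRUE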
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10
This question is HOT. I am still learning R, so I take this as a good opportunity to learn. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark for all of them.
I would like to convert each answer to a function that returns a TRUE/FALSE vector for flagging rows. I also want this function to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaptation.
Mohamed Desouky's answer is also easy to adapt: take the function applied in Filter() and apply() it over matrix rows instead.
jblood94's answer is challenging. It was commented that for other matrices and vectors, a conversion using match is needed. I don't know what the appropriate way is, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult): none of its snippets seems to work. I don't see any comparison between matrix rows and the vector in the base R solution, and for the tidyverse code I don't know what df_matrix is. I have asked TarJae to please revise the answer.
harre's answer uses the tidyverse and does not return TRUE/FALSE, so I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulating
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}

Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)

M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))

jblood94 <- function (X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:length(v)
  rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For the benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know how time is transformed for plotting, but the values are clearly not on the usual scale. The methods to the left are far faster than the plot suggests!
Conclusion: Zheyuan Li's "cos" method is the winner.
We can try this
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1
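Broken into steps, that one-liner reads roughly like this (a sketch, with mat assumed to be the five-row example matrix from the question):
chk <- mapply(`%in%`, list(1:4), asplit(mat, 1))  # chk[, i] is 1:4 %in% mat[i, ]
colSums(chk)                                      # how many of 1..4 occur in each row
mat[colSums(chk) == ncol(mat), ]                  # rows containing all of 1..4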
Another option is using the Filter function:
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4),
         data.frame(t(X))))
Just for fun
Who can give me 4 distinct digits whose sum equals 6? There is just {0,1,2,3}, so we can work with the values modulo 4 using %%.
X[apply(X , 1 , \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4) , ]
OR, using a pure for loop:
ans <- data.frame(matrix(NA , ncol = ncol(X)))
r <- 1
for (i in 1:nrow(X)) {
  if (all((X[i,] %in% 1:4) & length(unique(X[i,])) == 4)) {
    ans[r,] <- X[i,]
    r <- r + 1
  }
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)
For the fun of tidyverse solutions, even though I think we'd rather work on the matrices directly, we could use rowwise() and c_across():
With set-operations (inspired by #Stéphane Laurent):
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(setequal(c_across(), c(1, 2, 3, 4))) |>
ungroup() |>
as.matrix()
Or without set-operations:
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(1 %in% c_across(everything()) &
2 %in% c_across(everything()) &
3 %in% c_across(everything()) &
4 %in% c_across(everything())
) |>
ungroup() |>
as.matrix()
Or inspired by #Mohamed Desouky:
mat %>%
as_tibble() |>
rowwise() |>
filter(all(c_across() %in% 1:4) & n_distinct(c_across()) == 4) |>
ungroup() |>
as.matrix()
And so on.
The algorithm library in C++ offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
#include <algorithm>  // std::is_permutation
#include <vector>

// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
    const int nRows = mat.nrow();
    const int nCols = mat.ncol();
    std::vector<int> test(nCols);
    Rcpp::LogicalVector res(nRows);

    for (int i = 0; i < nRows; ++i) {
        // copy the i-th row into a scratch vector
        for (int j = 0; j < nCols; ++j) {
            test[j] = mat(i, j);
        }
        res[i] = std::is_permutation(
            test.begin(), test.end(), v.begin()
        );
    }

    return res;
}
And calling it in R we have (N.B. We use match to get integer indices as in #Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
  perm_idx_cpp(Xi, seq_along(v))
}
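A usage sketch on the matrix from the question (this assumes the C++ snippet above has been compiled, e.g. with Rcpp::sourceCpp()):
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
               dim = c(5L, 4L))
X[get_perm_idx(X, 1:4), ]   # keeps rows 1, 3 and 5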
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.
Is there elegant R syntax to select, depending on the type of object, either the first n elements of a matrix read row-wise, or the first n elements of a vector?
I can obviously do this with conditional statements, but I wonder if there is a simple solution. I also want to avoid calling t() on the whole matrix due to efficiency concerns.
M = matrix(1:12,3,4)
x = 1:12
slct = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
slct(M,5); slct(x,5)
So avoiding a call to t() on the whole matrix is the key. I think the other solutions are more interesting and pedagogical, but the fastest ones I see are the following.
Their efficiency probably comes down to relying on C subroutines for the same vectorization the other answers suggest. If you need only a specific subset of the elements 1:n, there are probably cases where modifying the other methods would be faster.
I still wonder if there is some built-in that does this?
Here are my two solutions (thanks to some ideas from the other posts):
funOPmod2 = function(obj, n){
  if(is.matrix(obj)){
    nc = ncol(obj)
    nr = (n %/% nc) + 1
    subM = obj[1:nr,]
    res = matrix(subM, ncol = nr, byrow = TRUE)[1:n]
  }
  if(is.vector(obj)) res = obj[1:n]
  res
}

funOPmod = function(obj, n){
  if(is.matrix(obj)){
    nc = ncol(obj)
    nr = (n %/% nc) + 1
    res = t(obj[1:nr,])[1:n]
  }
  if(is.vector(obj)) res = obj[1:n]
  res
}

funOP = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}

funRyan <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n/ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}

funEmil <- function(obj, n) {
  myDim <- dim(obj)
  vec <- 1:n
  if (is.null(myDim))
    return(obj[vec])
  nr <- myDim[1]
  nc <- myDim[2]
  vec1 <- vec - 1L
  rem <- vec1 %% nc
  quot <- vec1 %/% nc
  obj[quot + (rem * nr + 1L)]
}
n <- 25000
set.seed(42)
MBig <- matrix(sample(10^7, 10^6, replace = TRUE), nrow = 10^4)
## Returns same results
all.equal(funOPmod2(MBig, n), funOP(MBig, n))
all.equal(funOPmod(MBig, n), funOP(MBig, n))
all.equal(funOP(MBig, n), funEmil(MBig, n))
all.equal(funRyan(MBig, n), funEmil(MBig, n))
library(microbenchmark)
microbenchmark(funOP(MBig, n), funOPmod(MBig, n), funOPmod2(MBig, n), funRyan(MBig, n), funEmil(MBig, n), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funOP(MBig, n) 13.788456 13.343185 15.776079 13.104634 15.064036 13.1959488 100
funOPmod(MBig, n) 1.052210 1.089507 1.071219 1.118461 1.025714 0.4533697 100
funOPmod2(MBig, n) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
funRyan(MBig, n) 2.689417 2.694442 2.464471 2.637720 2.351565 0.9274931 100
funEmil(MBig, n) 2.760368 2.681478 2.434167 2.591716 2.308087 0.8921837 100
What about this?
slct = function(obj, n){
  if(is.matrix(obj)) res = as.vector(matrix(obj, ncol = nrow(obj),
                                            byrow = TRUE))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
> slct(M,5); slct(x,5)
[1]  1  4  7 10  2
[1] 1 2 3 4 5
Seems to be twice as fast according to benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
t() 7.654 8.420 9.077494 8.675 8.675 10440.259 1e+05 b
matrix 3.316 3.827 4.411272 4.082 4.083 9502.881 1e+05 a
Note: You should specify is.vector rather than is.numeric in second line, since is.numeric(M) yields TRUE.
You can take advantage of array-indexes in [.
# new function
slct2 <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n/ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}

# old function
slct = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
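The cbind() call builds a two-column row/column index matrix, which [ then treats as matrix indexing. A tiny sketch of what it produces, on the small M from the question:
M <- matrix(1:12, 3, 4)
i <- cbind(ceiling(1:5 / ncol(M)), rep_len(seq(ncol(M)), 5))
i      # (row, column) pairs that walk M row by row
M[i]   # 1 4 7 10 2 -- the first 5 elements read row-wise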
Benchmark
m <- 1e4
mat <- matrix(runif(m^2), m)
n <- floor(m*2.3)
all.equal(slct(mat, n), slct2(mat, n))
# [1] TRUE
microbenchmark(slct(mat, n), slct2(mat, n), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# slct(mat, n) 2471.438599 2606.071460 3466.046729 3137.255011 4420.69364 4985.20781 10
# slct2(mat, n) 2.358151 4.748712 6.627644 4.973533 11.05927 13.73906 10
Can't you just use head?...
head(c(t(M)),5)
[1] 1 4 7 10 2
head(c(t(x)),5)
[1] 1 2 3 4 5
Here is a base R solution:
funEmil <- function(obj, n) {
  myDim <- dim(obj)
  vec <- 1:n
  if (is.null(myDim))
    return(obj[vec])
  nr <- myDim[1]
  nc <- myDim[2]
  vec1 <- vec - 1L
  rem <- vec1 %% nc
  quot <- vec1 %/% nc
  obj[quot + (rem * nr + 1L)]
}
It relies on basic vectorized modular arithmetic %% and integer division %/%. It is also very fast:
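In other words, the k-th element (0-based) of the row-wise traversal sits in row k %/% nc + 1 and column k %% nc + 1, i.e. at linear position (k %% nc) * nr + k %/% nc + 1 of the column-major storage. A quick sketch on the small matrix from the earlier question:
M <- matrix(1:12, 3, 4)           # nr = 3, nc = 4
k <- 0:4
M[(k %% 4) * 3 + k %/% 4 + 1]     # 1 4 7 10 2 -- first 5 elements read row-wise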
set.seed(42)
MBig <- matrix(sample(10^7, 10^6, replace = TRUE), nrow = 10^4)
funOP = function(obj, n){
  if(is.matrix(obj)) res = c(t(obj))[1:n]
  if(is.vector(obj)) res = obj[1:n]
  res
}
funRyan <- function(x, n){
  if(is.vector(x)) i <- 1:n
  if(is.matrix(x))
    i <- cbind(ceiling(1:n/ncol(x)), rep_len(seq(ncol(x)), n))
  x[i]
}
n <- 25000
## Returns same results
all.equal(funRyan(MBig, n), funEmil(MBig, n))
[1] TRUE
all.equal(funOP(MBig, n), funEmil(MBig, n))
[1] TRUE
library(microbenchmark)
microbenchmark(funOP(MBig, n), funRyan(MBig, n), funEmil(MBig, n), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funOP(MBig, n) 6.154284 5.915182 5.659250 5.880826 9.140565 1.0344393 100
funRyan(MBig, n) 1.015332 1.030278 1.028644 1.018446 1.032610 0.8330967 100
funEmil(MBig, n) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
Here are the benchmarks using the example by #Ryan and the OP's modified solution:
n <- 1e4
mat <- matrix(runif(n^2), n)
s <- floor(n*2.3)
microbenchmark(funOP(mat, s), funRyan(mat, s),
               funEmil(mat, s), funOPmod(mat, s), unit = "relative", times = 10)
Unit: relative
expr min lq mean median uq max neval
funOP(mat, s) 6189.449838 5558.293891 3871.425974 5139.192594 2443.203331 2222.778805 10
funRyan(mat, s) 2.633685 3.032467 2.155205 2.863710 1.445421 1.537473 10
funEmil(mat, s) 2.654739 2.714287 1.969482 2.642673 1.277088 1.326510 10
funOPmod(mat, s) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
The new modified solution is much faster and still gives correct results... very impressive!
identical(funOPmod(mat, s), funRyan(mat, s))
[1] TRUE
I'm trying to make this function faster (ideally with RcppArmadillo or some other alternative). myfun takes a matrix, mat, that can get quite large but always has two columns. For each row, myfun finds the other rows that are +1 or -1 away in absolute value, i.e. whose element-wise absolute differences sum to 1.
In the example below, the first row of mat is (3, 3). Therefore, myfun outputs a list in which rows 2 and 3 are closest to row 1, but not row 5, which is 2 away.
library(microbenchmark)
dim(mat)
[1] 1000 2
head(mat)
x y
[1,] 3 3
[2,] 3 4
[3,] 3 2
[4,] 7 3
[5,] 4 4
[6,] 10 1
output
[[1]]
[1] 2 3
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
integer(0)
[[5]]
integer(0)
[[6]]
integer(0)
microbenchmark( myfun(mat), times = 100) #mat of 1000 rows
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(mat) 89.30126 90.28618 95.50418 90.91281 91.50875 180.1505 100
microbenchmark( myfun(mat), times = 100) #mat of 10,000 rows
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(layout.old) 5.912633 5.912633 5.912633 5.912633 5.912633 5.912633 1
This is what myfun looks like
myfun = function(x){
  doo <- function(j) {
    j.mat <- matrix(rep(j, length = length(x)), ncol = ncol(x), byrow = TRUE)
    j.abs <- abs(j.mat - x)
    return(which(rowSums(j.abs) == 1))
  }
  return(apply(x, 1, doo))
}
Below, I have a base R solution that is much faster than myfun provided by the OP.
myDistOne <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  rs <- rowSums(m)
  lapply(seq_along(rs), function(x) {
    t1 <- which(abs(rs[x] - rs) == 1)
    t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
    t2[which(abs(v2[x] - v2[t2]) <= 1)]
  })
}
Here are some benchmarks:
library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2) ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
expr min lq mean median uq max neval cld
myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225 2 b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707 2 a
identical(myfun(m1), myDistOne(m1))
[1] TRUE
m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2) ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
expr min lq mean median uq max neval cld
myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701 100 b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530 100 a
identical(myfun(m2), myDistOne(m2))
[1] TRUE
Here is a very large example:
m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
user system elapsed
26.963 10.988 37.973
system.time(testUser <- myfun(m3))
user system elapsed
148.444 33.297 182.639
identical(testJoe, testUser)
[1] TRUE
I'm sure there is a faster solution. Sorting the rowSums upfront and working from there might yield an improvement (though it could also get very messy).
Update
As I predicted, working from sorted rowSums is much faster (and uglier!)
myDistOneFast <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  origrs <- rowSums(m)
  mySort <- order(origrs)
  rs <- origrs[mySort]
  myDiff <- c(0L, diff(rs))
  brks <- which(myDiff > 0L)
  lenB <- length(brks)
  n <- nrow(m)
  myL <- vector("list", length = n)

  findRows <- function(v, s, r, u1, u2) {
    lapply(v, function(x) {
      sx <- s[x]
      tv1 <- s[r]
      tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
      tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
    })
  }

  t1 <- brks[1L]; t2 <- brks[2L]
  ## setting first index in myL
  myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
  k <- t0 <- 1L

  while (k < (lenB-1L)) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
    vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    if (myDiff[t2] > 1L) {
      if (myDiff[t3] > 1L) {
        k <- k+2L; t0 <- t2
      } else {
        k <- k+1L; t0 <- t1
      }
    } else {k <- k+1L; t0 <- t1}
  }

  ## setting second to last index in myL
  if (k == lenB-1L) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    k <- k+1L; t0 <- t1
  }

  t1 <- brks[k]; vec <- t1:n
  if (myDiff[t1] == 1L) {
    myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
  }

  myL
}
The results are not even close. myDistOneFast is over 100x faster than the OP's original myfun on very large matrices and also scales well. Below are some benchmarks:
microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
expr min lq mean median uq max neval
OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386 100
Joe 22.00127 23.11457 24.35363 23.87073 24.87484 58.98532 100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253 15.35676 100
microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
expr min lq mean median uq max neval
OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244 100
Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521 100
JoeFast 128.4243 134.0409 138.7518 136.3929 141.3046 172.2499 100
system.time(testJoeFast <- myDistOneFast(m3))
user system elapsed
0.68 0.00 0.69 ### myfun took over 100s!!!
To test equality, we have to sort each vector of indices. We also can't use identical for comparison as myL is initialized as an empty list, thus some of the indices contain NULL values (these correspond to integer(0) in the result from myfun and myDistOne).
testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE
unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)
Here is an example with 500,000 rows:
set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
user system elapsed
10.84 0.06 10.94
Here is an overview of how the algorithm works:
Calculate rowSums
Order the rowSums (i.e. obtain the indices of the original vector that produce the sorted vector)
Call diff
Mark each non-zero instance
Determine which indices in a small range satisfy the OP's request
Use the ordered vector calculated in 2 to map back to the original indices
This is much faster than comparing one rowSum to all of the other rowSums every time.
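A minimal sketch of the pruning idea in step 5, using the first four example rows from the question:
m <- matrix(c(3, 3, 3, 7,
              3, 4, 2, 3), ncol = 2)
rs <- rowSums(m)              # 6 7 5 10
which(abs(rs[1] - rs) == 1)   # 2 3: the only candidate distance-1 neighbours of row 1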
I have the following vector:
x = c(1, 2, 3, 10, 20, 30)
At each index, 3 consecutive elements are summed, resulting in the following vector:
c(6, 15, 33, 60)
Thus, the first element is 1 + 2 + 3 = 6, the second element is 2 + 3 + 10 = 15, etc.
What you have is a vector, not an array. You can use the rollapply function from the zoo package to get what you need.
> x <- c(1, 2, 3, 10, 20, 30)
> #library(zoo)
> rollapply(x, 3, sum)
[1] 6 15 33 60
Take a look at ?rollapply for further details on what rollapply does and how to use it.
I put together a package for handling these kinds of 'roll'ing functions that offers functionality similar to zoo's rollapply, but with Rcpp on the backend. Check out RcppRoll on CRAN.
library(microbenchmark)
library(zoo)
library(RcppRoll)
x <- rnorm(1E5)
all.equal( m1 <- rollapply(x, 3, sum), m2 <- roll_sum(x, 3) )
## from flodel
rsum.cumsum <- function(x, n = 3L) {
tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
}
microbenchmark(
unit="ms",
times=10,
rollapply(x, 3, sum),
roll_sum(x, 3),
rsum.cumsum(x, 3)
)
gives me
Unit: milliseconds
expr min lq median uq max neval
rollapply(x, 3, sum) 1056.646058 1068.867550 1076.550463 1113.71012 1131.230825 10
roll_sum(x, 3) 0.405992 0.442928 0.457642 0.51770 0.574455 10
rsum.cumsum(x, 3) 2.610119 2.821823 6.469593 11.33624 53.798711 10
You might find it useful if speed is a concern.
If speed is a concern, you could use a convolution filter and chop off the ends:
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
Or even faster, write it as the difference between two cumulative sums:
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
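To see why the difference of the two cumulative sums gives the rolling sum, here is a quick worked sketch on the question's vector with n = 3:
x <- c(1, 2, 3, 10, 20, 30)
cs <- cumsum(x)           # 1 3 6 16 36 66
cs[3:6] - c(0, cs[1:3])   # 6 15 33 60 -- each term is the sum of 3 consecutive elements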
Both use base functions only. Some benchmarks:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){
sum(x[i:(i+n-1)])})
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 12891.315 13267.103 14635.002 17081.5860 28059.998 100
# rsum.sapply(x) 4287.533 4433.180 4547.126 5148.0205 12967.866 100
# rsum.filter(x) 170.165 208.661 269.648 290.2465 427.250 100
# rsum.cumsum(x) 97.539 130.289 142.889 159.3055 449.237 100
Also, I imagine all methods would be faster if x and the applied weights were integers instead of numerics.
Using just base R you could do:
v <- c(1, 2, 3, 10, 20, 30)
grp <- 3
res <- sapply(1:(length(v)-grp+1),function(x){sum(v[x:(x+grp-1)])})
> res
[1] 6 15 33 60
Another way, faster than sapply (comparable to #flodel's rsum.cumsum), is the following:
res <- rowSums(outer(1:(length(v)-grp+1),1:grp,FUN=function(i,j){v[(j - 1) + i]}))
Here's flodel's benchmark updated:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# rsum.rollapply(x) 9464.495 9929.4480 10223.2040 10752.7960 11808.779 100
# rsum.sapply(x) 3013.394 3251.1510 3466.9875 4031.6195 7029.333 100
# rsum.filter(x) 161.278 178.7185 229.7575 242.2375 359.676 100
# rsum.cumsum(x) 65.280 70.0800 88.1600 95.1995 181.758 100
# rsum.outer(x) 66.880 73.7600 82.8795 87.0400 131.519 100
If you need real speed, try
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0,cs[1:(length(x)-n)])
It's all in base R, and updating flodel's microbenchmark speaks for itself:
x <- sample(1:1000)
rsum.rollapply <- function(x, n = 3L) rollapply(x, n, sum)
rsum.sapply <- function(x, n = 3L) sapply(1:(length(x)-n+1),function(i){sum(x[i:(i+n-1)])})
rsum.filter <- function(x, n = 3L) filter(x, rep(1, n))[-c(1, length(x))]
rsum.cumsum <- function(x, n = 3L) tail(cumsum(x) - cumsum(c(rep(0, n), head(x, -n))), -n + 1)
rsum.outer <- function(x, n = 3L) rowSums(outer(1:(length(x)-n+1),1:n,FUN=function(i,j){x[(j - 1) + i]}))
rsum.cumdiff <- function(x, n = 3L) (cs <- cumsum(x))[-(1:(n-1))] - c(0, cs[1:(length(x)-n)])
all.equal(rsum.rollapply(x), rsum.sapply(x))
# [1] TRUE
all.equal(rsum.sapply(x), rsum.filter(x))
# [1] TRUE
all.equal(rsum.filter(x), rsum.outer(x))
# [1] TRUE
all.equal(rsum.outer(x), rsum.cumsum(x))
# [1] TRUE
all.equal(rsum.cumsum(x), rsum.cumdiff(x))
# [1] TRUE
library(microbenchmark)
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x),
rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3369.211 4104.2415 4630.89799 4391.7560 4767.2710 12002.904 100
# rsum.sapply(x) 850.425 999.2730 1355.56383 1086.0610 1246.5450 6915.877 100
# rsum.filter(x) 48.970 67.1525 97.28568 96.2430 113.6975 248.728 100
# rsum.cumsum(x) 47.515 62.7885 89.12085 82.1825 106.6675 230.303 100
# rsum.outer(x) 69.819 85.3340 160.30133 92.6070 109.0920 5740.119 100
# rsum.cumdiff(x) 9.698 12.6070 70.01785 14.3040 17.4555 5346.423 100
## R version 3.5.1 "Feather Spray"
## zoo and microbenchmark compiled under R 3.5.3
Oddly enough, everything is faster the second time through microbenchmark:
microbenchmark(
rsum.rollapply(x),
rsum.sapply(x),
rsum.filter(x),
rsum.cumsum(x),
rsum.outer(x),
rsum.cumdiff(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rsum.rollapply(x) 3127.272 3477.5750 3869.38566 3593.4540 3858.9080 7836.603 100
# rsum.sapply(x) 844.122 914.4245 1059.89841 965.3335 1032.2425 5184.968 100
# rsum.filter(x) 47.031 60.8490 80.53420 74.1830 90.9100 260.365 100
# rsum.cumsum(x) 45.092 55.2740 69.90630 64.4855 81.4555 122.668 100
# rsum.outer(x) 68.850 76.6070 88.49533 82.1825 91.8800 166.304 100
# rsum.cumdiff(x) 9.213 11.1520 13.18387 12.1225 13.5770 49.456 100
The runner library may also be used:
x <- c(1, 2, 3, 10, 20, 30)
runner::sum_run(x, k=3, na_pad = T)
#> [1] NA NA 6 15 33 60
Or the slider package is also useful:
x <- c(1, 2, 3, 10, 20, 30)
slider::slide_sum(x, before = 2, complete = T)
#> [1] NA NA 6 15 33 60
Created on 2021-06-14 by the reprex package (v2.0.0)
I am looking for an efficient solution for the following problem:
b <- matrix(c(0,0,0,1,1,0), nrow = 2, byrow = T)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify=FALSE))
weight <- rep.int(weight,times)
sum1 <- as.numeric(rep.int(NA,nrow(abc)))
##Rprof()
for(j in 1:nrow(abc)){
  a <- abc[j,]
  sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
##Rprof(NULL)
##summaryRprof()
Is there a faster way to do this? Rprof shows that rowSums(), t(), == and + are quite slow. If nrow is 20,000 it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight with length equal to nrow(abc). The first value of weight corresponds to the first row of matrix abc, and so on... Now, I would like to determine which rows of matrix abc are equal. Then, I want to remember the positions of those rows in order to sum up the corresponding weights at those positions, and store the appropriate sum for each row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
  rs <- apply(mat, 1, paste, collapse = ";")
  unlist(lapply(unique(rs),
                function(x)
                  sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
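Conceptually, ff() collapses each row into a string key, sums the weights within each key group, and maps the group totals back to the rows. A rough equivalent sketch of those steps (using the abc and weight objects from the question):
keys <- apply(abc, 1, paste, collapse = ";")   # one string key per row
head(keys, 2)                                  # "0;0;0" "1;1;0"
tapply(weight, keys, sum)[keys]                # per-group weight totals mapped back to each row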
And comparing with your function:
ffOP <- function(mat, weights)
{
  sum1 <- as.numeric(rep.int(NA, nrow(mat)))
  for(j in 1:nrow(mat)) {
    a <- mat[j,]
    sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
  }
  sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
For the record, I also implemented your approach in C, and here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10