Select matrix rows that are permutations of a given vector - r

I have a matrix X:
     one two three four
[1,]   1   3     2    4
[2,]   2   0     1    5
[3,]   3   2     1    4
[4,]   4   9    11   19
[5,]   4   3     2    1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
     one two three four
[1,]   1   3     2    4
[3,]   3   2     1    4
[5,]   4   3     2    1
What function or command should I use?

mat <- rbind(
  c(1, 3,  2,  4),
  c(2, 0,  1,  5),
  c(3, 2,  1,  4),
  c(4, 9, 11, 19),
  c(4, 3,  2,  1)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]
#      [,1] [,2] [,3] [,4]
# [1,]    1    3    2    4
# [2,]    3    2    1    4
# [3,]    4    3    2    1

Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
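For the extended example this keeps the four non-permutation rows:
# [,1] [,2] [,3] [,4]
#[1,] 2 0 1 5
#[2,] 4 9 11 19
#[3,] 1 1 1 1
#[4,] 1 2 1 2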
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear, asymmetric weight function w(x), the weighted sum:
1 x w(1) + 2 x w(2) + 3 x w(3) + 4 x w(4)
is a unique score or identifier that is invariant to permutations. For example, reordering the terms gives the same value:
2 x w(2) + 1 x w(1) + 3 x w(3) + 4 x w(4)
But anything else will give a different value, like:
1 x w(1) + 3 x w(3) + 3 x w(3) + 4 x w(4)
0 x w(0) + 3 x w(3) + 0 x w(0) + 4 x w(4)
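A quick numeric sanity check of this idea (my own illustration, using the cosine weight defined below with n = 4):
w <- pi / 5                           # w = pi / (n + 1) with n = 4
score <- function(x) sum(x * cos(w * x))
score(c(1, 2, 3, 4))                  # reference score
score(c(2, 1, 3, 4))                  # a permutation: same score
score(c(1, 3, 3, 4))                  # not a permutation: different score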
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("nonconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulation
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
  X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
  v <- 1:nc
  microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
                                 "cos" = FindPerm(X, v, "cos"),
                                 check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775

Update: since there's so much interest in this question, here's a method using indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index into a length(v)-dimensional array when v is small, or to index into the precomputed weights v*sin(w*v) using the results of match (instead of calculating X*sin(w*X)) when v is large:
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
  n <- length(v)
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  w <- pi/(n + 1)
  rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}
f2 <- function(X, v) {
  n <- length(v)
  if (n < 6) {
    # index an n-dimensional array
    m <- array(FALSE, rep(n + 1L, n))
    m[permuteGeneral(n)] <- TRUE
    X[] <- match(X, v, nomatch = length(v) + 1L)
    m[X]
  } else {
    nn <- 1:n
    u <- c(nn*sin(pi*nn/(n + 1L)), 0)
    X[] <- u[match(X, v, nomatch = n + 1L)]
    rowSums(X) == sum(u)
  }
}
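To see why the n-dimensional lookup in f2 works, here is a tiny illustration (my own toy example, relying on RcppAlgos loaded above) of indexing an array with a matrix that supplies one index per dimension:
n <- 3
m <- array(FALSE, rep(n + 1L, n))  # a 4 x 4 x 4 lookup table
m[permuteGeneral(n)] <- TRUE       # mark the 3! = 6 permutations of 1:3
m[rbind(c(2, 1, 3), c(1, 1, 2))]   # row-wise lookup: TRUE FALSE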
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
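As a quick check (mine, not from the original post), the mask on the question's original 5-row X is as below; the 0 in row 2 maps to exp(Inf) = Inf, so that row is safely excluded:
rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4)))
# [1] TRUE FALSE TRUE FALSE TRUE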
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10

This question is HOT. I am learning, so I take this as a good opportunity to learn. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark of all of them.
I would like to convert each answer into a function that returns a TRUE/FALSE vector flagging the rows. I also want these functions to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaptation.
Mohamed Desouky's answer is also easy to adapt, by taking the function applied in Filter() and apply()ing it over matrix rows.
jblood94's answer is challenging. It was commented that for other matrices and vectors, a conversion using match is needed. I don't know the appropriate way, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult). None of the snippets seems to work: I don't see any comparison between matrix rows and the vector in the base R solution, and for the other tidyverse code I don't know what df_matrix is. I have asked TarJae to please revise the answer.
harre's answer uses the tidyverse and does not return TRUE/FALSE, so I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("nonconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulation
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}
Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)
M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))
jblood94 <- function (X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:length(v)
  rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know how time is transformed for plotting, but clearly the axis is not on the usual scale: the methods on the left are far faster than the plot suggests!
Conclusion: Zheyuan Li's "cos" method is the winner.

We can try this
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1

Another option is using the Filter() function:
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4),
         data.frame(t(X))))
Just for fun: who can give me 4 distinct digits whose sum equals 6? There is just {0,1,2,3}, so we can reduce the values modulo 4 using %%:
X[apply(X, 1, \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4), ]
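One caveat (my observation, not part of the original answer): this trick assumes no value outside 1:4 collides with 1:4 modulo 4. A hypothetical row like c(5, 1, 2, 3) would slip through, although no such row occurs in the question's X:
x <- c(5, 1, 2, 3)        # not a permutation of 1:4
sum(unique(x %% 4)) == 6  # TRUE, because 5 %% 4 collides with 1
length(unique(x)) == 4    # TRUE, so this row would be kept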
Or, using a pure for loop:
ans <- data.frame(matrix(NA, ncol = ncol(X)))
r <- 1
for (i in 1:nrow(X)) {
  if (all((X[i, ] %in% 1:4) & length(unique(X[i, ])) == 4)) {
    ans[r, ] <- X[i, ]
    r <- r + 1
  }
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)

Just for the fun of tidyverse solutions, even though I think we'd rather work on the matrices directly. However, we can use rowwise() and c_across():
With set operations (inspired by @Stéphane Laurent):
library(dplyr)
mat |>
  as_tibble() |>
  rowwise() |>
  filter(setequal(c_across(everything()), c(1, 2, 3, 4))) |>
  ungroup() |>
  as.matrix()
Or without set-operations:
library(dplyr)
mat |>
  as_tibble() |>
  rowwise() |>
  filter(1 %in% c_across(everything()) &
           2 %in% c_across(everything()) &
           3 %in% c_across(everything()) &
           4 %in% c_across(everything())) |>
  ungroup() |>
  as.matrix()
Or inspired by @Mohamed Desouky:
mat |>
  as_tibble() |>
  rowwise() |>
  filter(all(c_across(everything()) %in% 1:4) & n_distinct(c_across(everything())) == 4) |>
  ungroup() |>
  as.matrix()
And so on..

The algorithm library in C++ offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
#include <algorithm>  // std::is_permutation
#include <vector>

// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
    const int nRows = mat.nrow();
    const int nCols = mat.ncol();
    std::vector<int> test(nCols);
    Rcpp::LogicalVector res(nRows);

    for (int i = 0; i < nRows; ++i) {
        // copy row i into a contiguous buffer
        for (int j = 0; j < nCols; ++j) {
            test[j] = mat(i, j);
        }
        res[i] = std::is_permutation(
            test.begin(), test.end(), v.begin()
        );
    }

    return res;
}
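Assuming the C++ code is saved in a file, say perm_idx.cpp (the file name is just for illustration), it can be compiled and made available to R with:
Rcpp::sourceCpp("perm_idx.cpp")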
And calling it in R we have (N.B. we use match to get integer indices as in @Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
  perm_idx_cpp(Xi, seq_along(v))
}
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.

Related

For a dataset of 0's and 1's, set all but the first 1 in each row to 0's

I have a data.frame of 1,480 rows and 1,400 columns like:
1 2 3 4 5 6 ..... 1399 1400
1 0 0 0 1 0 0 ..... 1 0 #first occurrence would be at 4
2 0 0 0 0 0 1 ..... 0 1
3 1 0 0 1 0 0 ..... 0 0
## and etc
Each row contains a series of 0's and 1's - predominantly 0's. For each row, I want to find at which column the first 1 shows up and set the remaining values to 0's.
My current implementation can efficiently find the occurrence of the first 1, but I've only figured out how to zero out the remaining values iteratively by row. In repeated simulations, this iterative process is taking too long.
Here is the current implementation:
N <- length(df[which(df$arm == 0), "pt_id"])  # number of patients
M <- max_days
#
# df is like the data frame shown above
#
df[which(df$arm == 0), 5:length(colnames(df))] <- unlist(lapply(matrix(data = rep(pbo_hr, M*N), nrow = N, ncol = M), rbinom, n = 1, size = 1))
event_day_post_rand <- apply(df[, 5:length(colnames(df))], MARGIN = 1, FUN = function(x) which(x > 0)[1])
df <- add_column(df, "event_day_post_rand" = event_day_post_rand, .after = "arm_id")
##
## From here trial days start on column 6 for df
##
# zero out events that occurred after the first event, since each patient can
# only have 1 event, which is taken as the earliest one
for (pt_id in df[which(!is.na(df$event_day_post_rand)), "pt_id"]) {
  event_idx = df[which(df$pt_id == pt_id), "event_day_post_rand"]
  df[which(df$pt_id == pt_id), as.character(5 + event_idx + 1):"1400"] <- 0
}
We can do (note that max.col assumes every row contains at least one 1; an all-zero row would get a spurious 1 in column 1):
mat <- as.matrix(df)  ## data frame to matrix
j <- max.col(mat, ties.method = "first")
mat[] <- 0
mat[cbind(1:nrow(mat), j)] <- 1
df <- data.frame(mat)  ## matrix to data frame
I also suggest just using a matrix to store these values. In addition, the result will be a sparse matrix. So I recommend
library(Matrix)
sparseMatrix(i = 1:nrow(mat), j = j, x = rep(1, length(j)))
We can get a little more performance by setting to 0 every 1 that is not the first (leftmost) 1 in its row.
Since the OP is open to starting with a matrix rather than a data.frame, I'll do the same.
# dummy data
m <- matrix(sample(0:1, 1480L*1400L, TRUE, c(0.9, 0.1)), 1480L, 1400L)
# proposed solution
f1 <- function(m) {
  ones <- which(m == 1L)
  m[ones[duplicated((ones - 1L) %% nrow(m), nmax = nrow(m))]] <- 0L
  m
}
# Zheyuan Li's solution
f2 <- function(m) {
  j <- max.col(m, ties.method = "first")
  m[] <- 0L
  m[cbind(1:nrow(m), j)] <- 1L
  m
}
microbenchmark::microbenchmark(f1 = f1(m),
                               f2 = f2(m),
                               check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 9.1457 11.45020 12.04258 11.9011 12.3529 37.6716 100
#> f2 12.8424 14.92955 17.31811 15.3251 16.0550 43.6314 100
Zheyuan Li's suggestion to go with a sparse matrix is a good idea.
# convert to a memory-efficient nsparseMatrix
library(Matrix)
m1 <- as(Matrix(f1(m), dimnames = list(NULL, NULL), sparse = TRUE), "nsparseMatrix")
object.size(m)
#> 8288216 bytes
object.size(m1)
#> 12864 bytes
# proposed function to go directly to a sparse matrix
f3 <- function(m) {
  n <- nrow(m)
  ones <- which(m == 1L) - 1L
  i <- ones %% n
  idx <- which(!duplicated(i, nmax = n))
  sparseMatrix(i[idx], ones[idx] %/% n, dims = dim(m), index1 = FALSE, repr = "C")
}
# going directly to a sparse matrix using Zheyuan Li's solution
f4 <- function(m) {
  sparseMatrix(1:nrow(m), max.col(m, ties.method = "first"), dims = dim(m), repr = "C")
}
identical(m1, f3(m))
#> [1] TRUE
identical(m1, f4(m))
#> [1] TRUE
microbenchmark::microbenchmark(f1 = f1(m),
f3 = f3(m),
f4 = f4(m))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 9.1719 9.30715 11.12569 9.52300 11.92740 83.8518 100
#> f3 7.4330 7.59875 12.62412 7.69610 11.08815 84.8291 100
#> f4 8.9607 9.31115 14.01477 9.49415 11.44825 87.1577 100

Checking whether a number of vectors exist in a matrix, need speed

I have two large numerical matrices and need to check which rows in one of them exist in the other one (exist as in being equal). This is my code:
myMatrix1 <- rbind(c(1,2,3), c(4,5,6), c(7,8,9))
myMatrix2 <- rbind(c(10,11,12), c(4,5,6), c(13,14,15))
# helper functions (defined before use)
innerFcn <- function(x, oneRow) {
  isTRUE(all.equal(x, oneRow))
}
checkForEquality <- function(x) {
  apply(myMatrix2, 1, innerFcn, oneRow = x)
}
logicalMatrix <- apply(myMatrix1, 1, checkForEquality)
result <- apply(logicalMatrix, 1, any)
result is
[1] FALSE TRUE FALSE
With two 2067*198 matrices this takes 350 seconds on my machine. With CPU-parallelization I guess I could bring it down to around 15 seconds. Unfortunately anything above 1 second is unacceptable. I need some direction. The matrices contain only 0's, 1's and 2's, if that matters.
You can use (squared) Euclidean distance, which can be expressed as vectorized linear-algebra operations, since ||x - y||^2 = ||x||^2 + ||y||^2 - 2*(x . y); a row of y equals a row of x exactly when the corresponding distance is 0:
dist_xy <- outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y)
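For instance, on the question's matrices (taking x = myMatrix1 and y = myMatrix2), the only zero should be at row 2 vs row 2:
x <- myMatrix1
y <- myMatrix2
dist_xy <- outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y)
which(dist_xy == 0, arr.ind = TRUE)
#      row col
# [1,]   2   2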
Benchmark:
nr = 1e5
nc = 200
x = t(sample(0:2, size = nc, replace = TRUE))  # a 1 x nc matrix
y = matrix(sample(0L:2L, size = nc * nr, replace = TRUE), nrow = nr)
all.equal(apply(y, 1, function(z) identical(z, drop(x))),  # drop() so both sides are plain vectors
          drop(outer(rowSums(x^2), rowSums(y^2), '+') == tcrossprod(x, 2 * y)))
# [1] TRUE
microbenchmark::microbenchmark(
  A = apply(y, 1, function(z) identical(z, drop(x))),
  B = apply(y, 1, function(z) all(z == x)),
  C = drop(outer(rowSums(x^2), rowSums(y^2), '+') == tcrossprod(x, 2 * y)),
  times = 3
)
Unit: milliseconds
expr min lq mean median uq max neval
A 543.1362 559.9647 585.3627 576.7931 606.4760 636.1589 3
B 609.7400 636.5922 667.3405 663.4445 696.1408 728.8370 3
C 368.2808 416.4194 441.9118 464.5580 478.7273 492.8965 3
This should benefit even more with larger matrices.
Calculate the distance matrix:
X <- rbind(c(1,2,3),c(4,5,6),c(7,8,9))
Y <- rbind(c(10,11,12),c(4,5,6),c(13,14,15))
library(pracma)
ind <- which(distmat(X, Y) == 0L, arr.ind = TRUE)
# row col
#[1,] 2 2
X[ind[, 1],]
#[1] 4 5 6
Y[ind[, 2],]
#[1] 4 5 6
If you need to allow for floating-point error, compare against a small tolerance (e.g. tol <- 1e-8) instead:
ind <- which(abs(distmat(X, Y)) < tol, arr.ind = TRUE)
You can get a roughly 10x speedup by switching away from all.equal: all(x == y), pracma::distmat, and identical(x, y) are all about 10x faster, with identical(x, y) the fastest (note: this is for comparisons of length 200, as in your data; for longer vectors, all(x == y) may be faster).
# sample data
## demo on single row vs matrix comparison
nr = 1e5
nc = 200
x = sample(0:2, size = nc, replace = TRUE)
y = matrix(sample(0L:2L, size = nc * nr, replace = TRUE), nrow = nr)
library(pracma)
microbenchmark::microbenchmark(
apply(y, 1, function(z) identical(z, x)),
apply(y, 1, function(z) all(z == x)),
apply(y, 1, function(z) all.equal(z, x)),
which(distmat(x, y) == 0L, arr.ind = TRUE),
times = 2
)
# Unit: milliseconds
# expr min lq mean median uq max
# apply(y, 1, function(z) identical(z, x)) 619.1912 619.1912 644.3034 644.3034 669.4156 669.4156
# apply(y, 1, function(z) all(z == x)) 759.4982 759.4982 789.4765 789.4765 819.4548 819.4548
# apply(y, 1, function(z) all.equal(z, x)) 7618.6853 7618.6853 7657.7665 7657.7665 7696.8477 7696.8477
# which(distmat(x, y) == 0L, arr.ind = TRUE) 824.8337 824.8337 899.2349 899.2349 973.6360 973.6360
You could obviously go faster with Rcpp, but even doing row-wise comparisons in a for loop will be pretty quick in R (thanks to JIT compilation).
The next spot I'd try to optimize would be early stopping: if you find a match, you don't need to check any more comparisons with that row, so you can go on to the next one. You can't do this well with apply, but with a for loop you have greater control.
Subtract b from a, then take the rowSums of the absolute values, so that positive and negative differences cannot cancel out. The rows whose sum is zero are identical:
d <- a - b  # renamed from 'diff' to avoid masking base::diff
idrows <- which(rowSums(abs(d)) == 0)
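For example, on the question's matrices (assuming a = myMatrix1 and b = myMatrix2; note this only detects rows that match at the same position):
d <- myMatrix1 - myMatrix2
which(rowSums(abs(d)) == 0)
# [1] 2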
Just try:
asplit(m2,1) %in% asplit(m1,1)
An example with some medium matrices:
dims <- c(1920, 198)
set.seed(4)
m1 <- matrix(sample(0:2, prod(dims), TRUE), ncol = dims[2])
m2 <- matrix(sample(0:2, prod(dims), TRUE), ncol = dims[2])
# simulate some rows in m2 which are equal to some rows in m1
samind <- sample(nrow(m1), 50)
samind2 <- sample(nrow(m2), 50)
m2[samind2, ] <- m1[samind, ]
system.time(res <- asplit(m2, 1) %in% asplit(m1, 1))
#   user  system elapsed
#  0.407   0.000   0.406
# check whether the result is correct
identical(which(res), sort(samind2))
# [1] TRUE

Rolling min without replacement

Given a vector (vec) and a window size of 5,
winsz <- 5L
vec <- c(9, 3, 10, 5, 6, 2, 4, 8, 7, 1)
Is there a faster way to calculate rolling minimum without replacement?
Rolling min w/o replacement: Using vec as an example and window size of 5.
In the first pass, the min is extracted from first 5 elements (9, 3, 10, 5, 6). Min is 3 in the first pass.
In the 2nd pass, the min is extracted from 4 elements left over from the first pass (9, 10, 5, 6) and the new windowed element (2). Min is 2 in the second pass.
In the 3rd pass, the min is extracted from elements left over from the previous pass (9, 10, 5, 6) and the new windowed element (4). Min is 4 in the third pass. So on and so forth.
The result from the example is
[1] 3 2 4 5 6 1 7 8 9 10
Please kindly exclude Rcpp implementation for now.
Current implementation and timing code:
#rolling min without replacement
set.seed(0L)
N <- 10e5
winsz <- 5L
vec <- sample(N)
mtd0 <- compiler::cmpfun(function(x) {
  subx <- x[seq_len(winsz)]
  n <- 1L
  drawn <- rep(NA_integer_, length(x))
  while (n <= length(x) - winsz) {
    idx <- which.min(subx)
    drawn[n] <- subx[idx]
    subx[idx] <- x[n + winsz]
    n <- n + 1
  }
  drawn[tail(seq_along(drawn), winsz)] <- sort(subx)
  drawn
})
library(microbenchmark)
microbenchmark(mtd0(vec), times=3L)
Around 8s for a window size of 5 and vector of length 1e6.
Not sure how this is going to clock in but here is another option
f <- function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (length(x) > 0) {
    idx.min <- which.min(x[1:window])
    ret[i] <- x[idx.min]
    x <- x[-idx.min]
    i <- i + 1
  }
  return(ret)
}
f(vec)
# [1] 3 2 4 5 6 1 7 8 9 10
Or
f2 <- function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (i <= length(x)) {
    idx.min <- which.min(x[1:(window + i - 1)])
    ret[i] <- x[idx.min]
    x[idx.min] <- NA
    i <- i + 1
  }
  return(ret)
}
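which should reproduce the same result on the example vector:
f2(vec)
# [1] 3 2 4 5 6 1 7 8 9 10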
On a side note: kudos for the numeric(length = length(x)) part goes to @RonakShah; it's interesting that numeric(length = length(x)) is much faster than rep(0, length(x)) (which is what I wrote originally ;-)
res <- microbenchmark(
rep = rep(0, 10^6),
numeric = numeric(length = 10^6)
)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# rep 1392.582 2549.219 3682.897 2694.137 3098.073 14726.81 100 a
# numeric 424.257 1592.110 2902.232 1727.431 2174.159 11747.87 100 a
Timings so far:
#rolling min without replacement
set.seed(0L)
N <- 10e4
winsz <- 5L
vec <- sample(N)
f <- compiler::cmpfun(function(x, window = 5) {
  ret <- numeric(length = length(x))
  i <- 1L
  while (length(x) > 0) {
    idx.min <- which.min(x[1:window])
    ret[i] <- x[idx.min]
    x <- x[-idx.min]
    i <- i + 1
  }
  return(ret)
})
mtd0 <- compiler::cmpfun(function(x) {
  subx <- x[seq_len(winsz)]
  n <- 1L
  drawn <- rep(NA_integer_, length(x))
  while (n <= length(x) - winsz) {
    idx <- which.min(subx)
    drawn[n] <- subx[idx]
    subx[idx] <- x[n + winsz]
    n <- n + 1
  }
  drawn[tail(seq_along(drawn), winsz)] <- sort(subx)
  drawn
})
mtd1 <- compiler::cmpfun(function(x) {
  ## carry the current window ("students") forward, drawing the min each step
  res <- Reduce(function(ans, s) {
    v <- ans$students
    idx <- which.min(v)
    list(students = c(v[-idx], s), drawn = v[idx])
  },
  x = as.list(x[seq_along(x)[-seq_len(winsz)]]),
  init = list(students = x[seq_len(winsz)], drawn = NULL),
  accumulate = TRUE)
  c(unlist(lapply(res, `[[`, "drawn")), sort(res[[length(res)]]$students))
})
#all.equal(f(vec), mtd0(vec))
# [1] TRUE
#all.equal(mtd0(vec), mtd1(vec))
# [1] TRUE
library(microbenchmark)
microbenchmark(f(vec), mtd0(vec), mtd1(vec), times=3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval cld
f(vec) 16234.97047 16272.00705 16457.05138 16309.04363 16568.092 16827.1400 3 b
mtd0(vec) 75.18676 83.34443 96.03222 91.50209 106.455 121.4078 3 a
mtd1(vec) 301.56747 342.36437 427.33052 383.16127 490.212 597.2628 3 a

Vectorise R code to randomly select 2 columns from each row

Does anyone have suggestions of how I could vectorise this code or otherwise speed it up? I'm creating a matrix, potentially very large. In each row, I want to select 2 columns at random, and flip them from 0 to 1.
I cannot select the same row and column number, i.e. the diagonal of the matrix must stay zero, hence the (1:N)[-j] in sample(). Because the excluded column changes with each row, I can't see a way to vectorise this, but parallelisation could be an option here?
I use library(Matrix) for sparse matrix functionality.
library(Matrix)
N <- 100
m <- Matrix(0, nrow = N, ncol = N)
for (j in 1:N) {
  cols <- sample((1:N)[-j], 2)  # choose 2 columns not equal to the row index j
  m[j, cols] <- 1
}
Any ideas?
library(Matrix)
N <- 7
desired_output <- Matrix(0, nrow = N, ncol = N)
set.seed(1)
for (j in 1:N) {
  cols <- sample((1:N)[-j], 2)  # choose 2 columns not equal to the row index j
  desired_output[j, cols] <- 1
}
# 7 x 7 sparse Matrix of class "dgCMatrix"
#
# [1,] . . 1 . . . 1
# [2,] . . . . 1 1 .
# [3,] . 1 . . . 1 .
# [4,] . . . . 1 . 1
# [5,] 1 . . 1 . . .
# [6,] 1 1 . . . . .
# [7,] . 1 . . 1 . .
res <- Matrix(0, nrow = N, ncol = N)
set.seed(1)
ind <- cbind(rep(1:N, each = 2), c(sapply(1:N, function(j) sample((1:N)[-j], 2))))
res[ind] <- 1
all.equal(res, desired_output)
# [1] TRUE
Quick bench:
microbenchmark::microbenchmark(
  OP = {
    desired_output <- Matrix(0, nrow = N, ncol = N)
    set.seed(1)
    for (j in 1:N) {
      cols <- sample((1:N)[-j], 2)
      desired_output[j, cols] <- 1
    }
  },
  Aurele = {
    res <- Matrix(0, nrow = N, ncol = N)
    set.seed(1)
    ind <- cbind(rep(1:N, each = 2), c(sapply(1:N, function(j) sample((1:N)[-j], 2))))
    res[ind] <- 1
  }
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# OP 10.240969 10.509384 11.065336 10.804949 11.044846 14.903377 100 b
# Aurele 1.185001 1.258037 1.392021 1.363503 1.434818 4.553614 100 a
EDIT: I've edited my answer to make it simpler and to follow the way R and RcppArmadillo implement such sampling. It now seems to be linear in N (as I thought it would be).
There are two "problems" in your code:
sample((1:N)[-j], 2) unnecessarily allocates memory, making your solution quadratic in N. The fix is to use rejection sampling; as N is large, rejections will rarely occur.
you replace indices that are not "contiguous" in the matrix.
It is true that because you sample without replacement, it is not straightforward to vectorise your problem. But again, it would be possible using rejection sampling. Here I prefer an Rcpp solution:
Rcpp:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
IntegerMatrix sample2(int N) {
  IntegerMatrix res(2 * N, 2);
  int j, ind1, ind2;

  for (j = 0; j < N; j++) {
    res(2 * j, 0) = res(2 * j + 1, 0) = j + 1;
    // sample first one
    do {
      ind1 = N * unif_rand();
    } while (ind1 == j);
    res(2 * j, 1) = ind1 + 1;
    // sample second one
    do {
      ind2 = N * unif_rand();
    } while (ind2 == j || ind2 == ind1);
    res(2 * j + 1, 1) = ind2 + 1;
  }

  return res;
}
R:
# table(replicate(1e5, sample2(5))) # Verify that the sampling is OK
library(Matrix)
N <- 1000
m <- Matrix(0, nrow = N, ncol = N)
m[sample2(N)] <- 1
Benchmark:
microbenchmark::microbenchmark(
  OP = {
    desired_output <- Matrix(0, nrow = N, ncol = N)
    for (j in 1:N) {
      cols <- sample((1:N)[-j], 2)
      desired_output[j, cols] <- 1
    }
  },
  Aurele = {
    res <- Matrix(0, nrow = N, ncol = N)
    ind <- cbind(rep(1:N, each = 2), c(sapply(1:N, function(j) sample((1:N)[-j], 2))))
    res[ind] <- 1
  },
  privefl = {
    m <- Matrix(0, nrow = N, ncol = N)
    m[sample2(N)] <- 1
  },
  times = 20
)
Results with N = 1000:
Unit: milliseconds
expr min lq mean median uq max neval
OP 599.996226 605.868229 618.479868 615.653853 625.908794 679.292360 20
Aurele 12.315795 12.633971 14.183891 13.148149 15.118948 19.649716 20
privefl 1.401824 1.493371 1.649015 1.622826 1.704273 2.520239 20
Results with N = 10,000:
Unit: milliseconds
expr min lq mean median uq max neval
Aurele 812.018743 845.434915 903.387191 863.851661 967.08294 1084.738882 20
privefl 3.657525 4.083799 4.409226 4.239576 4.49501 6.413498 20

Compare each row with other rows of matrix

I am looking for an efficient solution for the following problem:
b <- matrix(c(0,0,0,1,1,0), nrow = 2, byrow = TRUE)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify = FALSE))
weight <- rep.int(weight, times)
sum1 <- as.numeric(rep.int(NA, nrow(abc)))
## Rprof()
for (j in 1:nrow(abc)) {
  a <- abc[j, ]
  sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
## Rprof(NULL)
## summaryRprof()
Is there a faster way to do this? Rprof() shows that rowSums(), t(), == and + are quite slow. With 20,000 rows it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight whose length equals nrow(abc); the first value of weight corresponds to the first row of abc, and so on. I would like to determine which rows of abc are equal, and for each row sum up the weights of all rows equal to it, storing that sum for the row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
  rs <- apply(mat, 1, paste, collapse = ";")
  unlist(lapply(unique(rs),
                function(x)
                  sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
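The same grouping idea can be written more compactly with ave() (my suggestion, equivalent to ff): hash each row into a string key and sum the weights per key:
rs <- apply(abc, 1, paste, collapse = ";")
ave(weight, rs, FUN = sum)
# [1] 5 5 5 5 5 5 5 5 5 5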
And comparing with your function:
ffOP <- function(mat, weights)
{
  sum1 <- as.numeric(rep.int(NA, nrow(mat)))
  for (j in 1:nrow(mat)) {
    a <- mat[j, ]
    sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
  }
  sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
For the record, I also implemented your approach in C; here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10
