Related
I have a matrix X:
one two three four
[1,] 1 3 2 4
[2,] 2 0 1 5
[3,] 3 2 1 4
[4,] 4 9 11 19
[5,] 4 3 2 1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
one two three four
[1,] 1 3 2 4
[3,] 3 2 1 4
[5,] 4 3 2 1
What function or command should I use?
mat <- rbind(
c(1, 3, 2, 4),
c(2, 0, 1, 5),
c(3, 2, 1, 4)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]
Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear and asymmetric weight function w(x), the following weighted sum:
1 x w(1) + 2 x w(2) + 3 x w(3) + 4 x w(4)
is a unique score or identifier and is invariant to permutations. So for example, the following gives the same value:
2 x w(2) + 1 x w(1) + 3 x w(3) + 4 x w(4)
But anything else will give different values, like:
1 x w(1) + 3 x w(1) + 3 x w(3) + 4 x w(4)
0 x w(0) + 3 x w(1) + 0 x w(0) + 4 x w(4)
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775
Update since there's so much interest in this question, here's a method using indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index on a length(v)-dimensional array for small v, or to index on v*sin(w*v) using the results of match instead of calculating X*sin(W*X) when v is large:
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
n <- length(v)
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
w <- pi/(n + 1)
rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}
f2 <- function(X, v) {
n <- length(v)
if (n < 6) {
# index an n-dimensional array
m <- array(FALSE, rep(n + 1L, n))
m[permuteGeneral(n)] <- TRUE
X[] <- match(X, v, nomatch = length(v) + 1L)
m[X]
} else {
nn <- 1:n
u <- c(nn*sin(pi*nn/(n + 1L)), 0)
X[] <- u[match(X, v, nomatch = n + 1L)]
rowSums(X) == sum(u)
}
}
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10
This question is HOT. I am learning so I take this as a good opportunity to learn. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark for all of them.
I would like to convert each answer to a function that returns a TRUE/FALSE vector for flagging rows. I also want this function to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaption.
Mohamed Desouky's answer is also easy to adapt, by taking out the function applied in Filter() and apply() it over matrix rows.
jblood94's answer is challenging. It was commented that for other matrices and vectors, conversion is needed using match. I don't know what is the appropriate way, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult). None of them seems to work. I don't see any comparison between matrix rows and vectors in the base R solution. For other tidyverse codes, I don't know what df_matrix is. I have requested TarJae to please revise the answer.
harre's answer uses tidyverse and dose not return TRUE/FALSE. So I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)
M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))
jblood94 <- function (X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:length(v)
rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know how time is transformed for plotting, but clearly, they are not on the usual scale. Those to the left are far faster than it is suggested from the plot!
Conclusion: Zheyuan Li's "cos" method is the winner.
We can try this
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1
Another option is using Filter function
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4) ,
data.frame(t(X))))
Just for fun
Who can give me 4 distinct digits whose sum equal 6 ?
there is just {0,1,2,3}
then we can use the module of 4 using %%
X[apply(X , 1 , \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4) , ]
OR
with Using pure for loop
ans <- data.frame(matrix(NA , ncol = ncol(X)))
r <- 1
for(i in 1:nrow(X)){
if(all((X[i,] %in% 1:4) & length(unique(X[i,])) == 4)){
ans[r,] <- X[i,]
r <- r + 1
}
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)
For the fun of tidyverse-solutions, even if I think we'd rather work on the matrices directly. However, we could use rowwise() and c_across():
With set-operations (inspired by #Stéphane Laurent):
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(setequal(c_across(), c(1, 2, 3, 4))) |>
ungroup() |>
as.matrix()
Or without set-operations:
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(1 %in% c_across(everything()) &
2 %in% c_across(everything()) &
3 %in% c_across(everything()) &
4 %in% c_across(everything())
) |>
ungroup() |>
as.matrix()
Or inspired by #Mohamed Desouky:
mat %>%
as_tibble() |>
rowwise() |>
filter(all(c_across() %in% 1:4) & n_distinct(c_across()) == 4) |>
ungroup() |>
as.matrix()
And so on..
The algorithm library in C++ offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
const int nRows = mat.nrow();
const int nCols = mat.ncol();
std::vector<int> test(nCols);
Rcpp::LogicalVector res(nRows);
for (int i = 0; i < nRows; ++i) {
for (int j = 0; j < nCols; ++j) {
test[j] = mat(i, j);
}
res[i] = std::is_permutation(
test.begin(), test.end(), v.begin()
);
}
return res;
}
And calling it in R we have (N.B. We use match to get integer indices as in #Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
perm_idx_cpp(Xi, seq_along(v))
}
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.
I am trying to create a function which returns the min & max value of a vector.
Currently I have created 2 seperate functions but I need the one to return similar output like so.
min max
-2.078793 2.041260
Vector
vec <- rnorm(20)
Functions
minmax <- function(x) {
my_min = Inf
for (i in seq_along(x)) {
if (x[i] < my_min) my_min = x[i]
}
return(min = my_min)
}
minmax <- function(x) {
my_max = 0
for (i in seq_along(x)) {
if (x[i] > my_max) my_max = x[i]
}
return(max = my_max)
}
Try this function
minmax <- function(x) {
my_min = Inf
my_max = - Inf
for (i in seq_along(x)) {
if (x[i] < my_min) my_min = x[i]
if (x[i] > my_max) my_max = x[i]
}
cat("min , max :" , my_min , " , " , my_max)
invisible(c(min = my_min , max = my_max))
}
Using first element as starting value.
f <- function(x) {
r <- x[c(1L, 1L)]
for (i in 2:length(x)) {
if (x[i] < r[1L]) r[1L] <- x[i]
if (x[i] > r[2L]) r[2L] <- x[i]
}
r
}
However, such loops are slow in R, but we could implement it using Rcpp,
rcppfun <- "
Rcpp::NumericVector myrange(Rcpp::NumericVector x) {
std::vector<double> r(2);
r[0] = x[0];
r[1] = x[0];
for (int i = 1; i < x.size(); ++i) {
if (x[i] < r[0]) {
r[0] = x[i];
}
if (x[i] > r[1]) {
r[1] = x[i];
}
}
return Rcpp::wrap(r);
}
"
library(Rcpp)
f_rcpp <- cppFunction(rcppfun)
set.seed(42)
x <- rnorm(1e7)
stopifnot(all.equal(range(x), f(x)) & all.equal(range(x), f_rcpp(x)))
f(x)
# [1] -5.522383 5.537123
f_rcpp(x)
# [1] -5.522383 5.537123
which appears to be much faster than range(). The reason for this is that base:::range.default concatenates min(x) and min(x), i.e. essentially two for loops are used whereas f_rcpp uses only one. Notice, that f_rcpp also works with matrices f_rcpp(mat), and with data frames, f_rcpp(as.matrix(df)) works.
microbenchmark::microbenchmark(
f(x), f_rcpp(x), range(x), minmax(x), times=3L
)
Unit: milliseconds
expr min lq mean median uq max neval cld
f(x) 1478.53334 1478.54111 1488.13588 1478.54889 1492.93715 1507.32542 3 b
f_rcpp(x) 53.66378 53.77902 54.28918 53.89426 54.60187 55.30949 3 a
range(x) 97.38360 107.07452 113.62282 116.76545 121.74244 126.71942 3 a
minmax(x) 1443.86547 1444.31277 1484.25910 1444.76007 1504.45592 1564.15176 3 b
Consider head() or tail() after sorting:
minmax <- function(x) {
sorted_vec <- sort(x)
c(min=head(sorted_vec, 1), max=tail(sorted_vec, 1))
}
Alternatively, by indexing after sorting:
minmax <- function(x) {
sorted_vec <- sort(x)
c(min=sorted_vec[1], max=sorted_vec[length(x)])
}
I have a random vector vec and want make a new vector L without using a loop. New element of L depends on old elements of L and vec.
set.seed(0)
vec <- rnorm(20,0)
i = 2;
N <- length(vec) -1
L <- numeric(N-1)
constant <- 0.6
while (i < N){
L[i] = vec[i + 1] - vec[i] - constant * L[i - 1]
i <- i + 1
}
L
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480 -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427
# [12] -0.4549334 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687
You want
L[1] = 0
L[i] = -constant * L[i - 1] + (vec[i + 1] - vec[i]), i = 2, 3, ...,
Let dv <- diff(vec), the 2nd line becomes
L[i] = -constant * L[i - 1] + dv[i], i = 2, 3, ...
an AR1 process with lag-1 auto-correlation -constant and innovation dv[-1]. AR1 process can be efficiently generated by filter with "recursive" method.
dv <- diff(vec)
L <- c(0, filter(dv[-1], -constant, "recursive"))
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480
# [7] -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427 -0.4549334
#[13] 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687
#[19] -2.9860629
I guess you mean while (i <= N) in your question. If you do want i < N, then you have to get rid of the last element above. Which can be done by
dv <- diff(vec)
L <- c(0, filter(dv[2:(length(dv) - 1)], -constant, "recursive"))
hours later...
I was brought to attention by Rui Barradas's benchmark. For short vec, any method is fast enough. For long vec, filter is definitely faster, but practically suffers from coercion as filter expects and returns a "ts" (time series) object. It is better to call its workhorse C routine straightaway:
AR1_FILTER <- function (x, filter, full = TRUE) {
n <- length(x)
AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
if (!full) AR1 <- AR1[-1L]
AR1
}
dv <- diff(vec)
L <- AR1_FILTER(dv[-1], -constant)
#L <- AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
I am not interested in comparing AR1_FILTER with R-level loop. I will just compare it with filter.
library(microbenchmark)
v <- runif(100000)
microbenchmark("R" = c(0, filter(v, -0.6, "recursive")),
"C" = AR1_FILTER(v, -0.6))
Unit: milliseconds
expr min lq mean median uq max neval
R 6.803945 7.987209 11.08361 8.074241 9.131967 54.672610 100
C 2.586143 2.606998 2.76218 2.644068 2.660831 3.845041 100
When you have to compute values based on previous values the general purpose answer is no, there is no way around a loop.
In your case I would use a for loop. It's simpler.
M <- numeric(N - 1)
for(i in seq_len(N)[-N])
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
identical(L, M)
#[1] TRUE
Note the use of seq_len, not 2:(N - 1).
Edit.
I have timed the solutions by myself and by user 李哲源. The results are clearly favorable to my solution.
f1 <- function(vec, constant = 0.6){
N <- length(vec) - 1
M <- numeric(N - 1)
for(i in seq_len(N)[-c(1, N)]){
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
}
M
}
f2 <- function(vec, constant = 0.6){
dv <- diff(vec)
c(0, c(stats::filter(dv[2:(length(dv) - 1)], -constant, "recursive")) )
}
L1 <- f1(vec)
L2 <- f2(vec)
identical(L, L1)
identical(L, L2)
microbenchmark::microbenchmark(
loop = f1(vec),
filter = f2(vec)
)
On my PC the ratio of the medians gives my code 11 times faster.
I was thinking about using Rcpp for this, but one of the answer mentioned rfilter built internally in R, so I had a check:
/* recursive filtering */
SEXP rfilter(SEXP x, SEXP filter, SEXP out)
{
if (TYPEOF(x) != REALSXP || TYPEOF(filter) != REALSXP
|| TYPEOF(out) != REALSXP) error("invalid input");
R_xlen_t nx = XLENGTH(x), nf = XLENGTH(filter);
double sum, tmp, *r = REAL(out), *rx = REAL(x), *rf = REAL(filter);
for(R_xlen_t i = 0; i < nx; i++) {
sum = rx[i];
for (R_xlen_t j = 0; j < nf; j++) {
tmp = r[nf + i - j - 1];
if(my_isok(tmp)) sum += tmp * rf[j];
else { r[nf + i] = NA_REAL; goto bad3; }
}
r[nf + i] = sum;
bad3:
continue;
}
return out;
}
This function is already pretty look and I don't think I could write an Rcpp one to beat it with great improvement. I did a benchmark with this rfilter and the f1 function in the accepted answer:
f1 <- function(vec, constant = 0.6){
N <- length(vec) - 1
M <- numeric(N - 1)
for(i in seq_len(N)[-c(1, N)]){
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
}
M
}
AR1_FILTER <- function (x, filter, full = TRUE) {
n <- length(x)
AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
if (!full) AR1 <- AR1[-1L]
AR1
}
f2 <- function (vec, constant) {
dv <- diff(vec)
AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
}
library(microbenchmark)
Bench <- function (n) {
vec <- runif(n)
microbenchmark("R" = f1(vec, 0.6), "C" = f2(vec, 0.6))
}
For short vectors with length 100, I got
Bench(100)
Unit: microseconds
expr min lq mean median uq max neval
R 68.098 69.8585 79.05593 72.456 74.6210 244.148 100
C 66.423 68.5925 73.18702 69.793 71.1745 150.029 100
For large vectors with length 10000, I got
Bench(10000)
Unit: microseconds
expr min lq mean median uq max neval
R 6168.742 6699.9170 6870.277 6786.0415 6997.992 8921.279 100
C 876.934 904.6175 1192.000 931.9345 1034.273 2962.006 100
Yeah, there is no way that R is going to beat a compiled language.
library(dplyr)
L2 <- c(0,lead(vec) - vec - constant * lag(L))
L2 <- L2[!is.na(L2)]
L2
[1] 0.00000000 1.09605531 -0.62765133 1.81529867 -2.10535596 3.10864280 -4.36975556 1.41375965
[9] -1.08809820 2.16767510 -1.82140234 1.14748512 -0.89245650 0.03962074 -0.10930073 1.48162072
[17] -1.63074832 2.21593009
all.equal(L,L2)
[1] TRUE
Let's suppose we have the following vector:
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
Given a sequence of numbers, for instance c(2,3,5,8), I am trying to find what the position of this sequence of numbers is in the vector v. The result I expect is something like:
FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
I am trying to use which(v == c(2,3,5,8)), but it doesn't give me what I am looking for.
Using base R you could do the following:
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
x <- c(2,3,5,8)
idx <- which(v == x[1])
idx[sapply(idx, function(i) all(v[i:(i+(length(x)-1))] == x))]
# [1] 2 12
This tells you that the exact sequence appears twice, starting at positions 2 and 12 of your vector v.
It first checks the possible starting positions, i.e. where v equals the first value of x and then loops through these positions to check if the values after these positions also equal the other values of x.
Two other approaches using the shift-function trom data.table:
library(data.table)
# option 1
which(rowSums(mapply('==',
shift(v, type = 'lead', n = 0:(length(x) - 1)),
x)
) == length(x))
# option 2
which(Reduce("+", Map('==',
shift(v, type = 'lead', n = 0:(length(x) - 1)),
x)
) == length(x))
both give:
[1] 2 12
To get a full vector of the matching positions:
l <- length(x)
w <- which(Reduce("+", Map('==',
shift(v, type = 'lead', n = 0:(l - 1)),
x)
) == l)
rep(w, each = l) + 0:(l-1)
which gives:
[1] 2 3 4 5 12 13 14 15
The benchmark which was included earlier in this answer has been moved to a separate community wiki answer.
Used data:
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
x <- c(2,3,5,8)
You can use rollapply() from zoo
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
x <- c(2,3,5,8)
library("zoo")
searchX <- function(x, X) all(x==X)
rollapply(v, FUN=searchX, X=x, width=length(x))
The result TRUEshows you the beginning of the sequence.
The code could be simplified to rollapply(v, length(x), identical, x) (thanks to G. Grothendieck):
set.seed(2)
vl <- as.numeric(sample(1:10, 1e6, TRUE))
# vm <- vl[1:1e5]
# vs <- vl[1:1e4]
x <- c(2,3,5)
library("zoo")
searchX <- function(x, X) all(x==X)
i1 <- rollapply(vl, FUN=searchX, X=x, width=length(x))
i2 <- rollapply(vl, width=length(x), identical, y=x)
identical(i1, i2)
For using identical() both arguments must be of the same type (num and int are not the same).
If needed == coerces int to num; identical() does not any coercion.
I feel like looping should be efficient:
w = seq_along(v)
for (i in seq_along(x)) w = w[v[w+i-1L] == x[i]]
w
# [1] 2 12
This should be writable in C++ following #SymbolixAU approach for extra speed.
A basic comparison:
# create functions for selected approaches
redjaap <- function(v,x)
which(Reduce("+", Map('==', shift(v, type = 'lead', n = 0:(length(x) - 1)), x)) == length(x))
loop <- function(v,x){
w = seq_along(v)
for (i in seq_along(x)) w = w[v[w+i-1L] == x[i]]
w
}
# check consistency
identical(redjaap(v,x), loop(v,x))
# [1] TRUE
# check speed
library(microbenchmark)
vv <- rep(v, 1e4)
microbenchmark(redjaap(vv,x), loop(vv,x), times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# redjaap(vv, x) 5.883809 8.058230 17.225899 9.080246 9.907514 96.35226 100 b
# loop(vv, x) 3.629213 5.080816 9.475016 5.578508 6.495105 112.61242 100 a
# check consistency again
identical(redjaap(vv,x), loop(vv,x))
# [1] TRUE
Here are two Rcpp solutions. The first one returns the location of v that is the starting position of the sequence.
library(Rcpp)
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
x <- c(2,3,5,8)
cppFunction('NumericVector SeqInVec(NumericVector myVector, NumericVector mySequence) {
int vecSize = myVector.size();
int seqSize = mySequence.size();
NumericVector comparison(seqSize);
NumericVector res(vecSize);
for (int i = 0; i < vecSize; i++ ) {
for (int j = 0; j < seqSize; j++ ) {
comparison[j] = mySequence[j] == myVector[i + j];
}
if (sum(comparison) == seqSize) {
res[i] = 1;
}else{
res[i] = 0;
}
}
return res;
}')
SeqInVec(v, x)
#[1] 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
This second one returns the index values (as per the other answers) of every matched entry in the sequence.
cppFunction('NumericVector SeqInVec(NumericVector myVector, NumericVector mySequence) {
int vecSize = myVector.size();
int seqSize = mySequence.size();
NumericVector comparison(seqSize);
NumericVector res(vecSize);
int foundCounter = 0;
for (int i = 0; i < vecSize; i++ ) {
for (int j = 0; j < seqSize; j++ ) {
comparison[j] = mySequence[j] == myVector[i + j];
}
if (sum(comparison) == seqSize) {
for (int j = 0; j < seqSize; j++ ) {
res[foundCounter] = i + j + 1;
foundCounter++;
}
}
}
IntegerVector idx = seq(0, (foundCounter-1));
return res[idx];
}')
SeqInVec(v, x)
# [1] 2 3 4 5 12 13 14 15
Optimising
As #MichaelChirico points out in their comment, further optimisations can be made. For example, if we know the first entry in the sequence doesn't match a value in the vector, we don't need to do the rest of the comparison
cppFunction('NumericVector SeqInVecOpt(NumericVector myVector, NumericVector mySequence) {
int vecSize = myVector.size();
int seqSize = mySequence.size();
NumericVector comparison(seqSize);
NumericVector res(vecSize);
int foundCounter = 0;
for (int i = 0; i < vecSize; i++ ) {
if (myVector[i] == mySequence[0]) {
for (int j = 0; j < seqSize; j++ ) {
comparison[j] = mySequence[j] == myVector[i + j];
}
if (sum(comparison) == seqSize) {
for (int j = 0; j < seqSize; j++ ) {
res[foundCounter] = i + j + 1;
foundCounter++;
}
}
}
}
IntegerVector idx = seq(0, (foundCounter-1));
return res[idx];
}')
The answer with benchmarks shows the performance of these approaches
A benchmark on the posted answers:
Load the needed packages:
library(data.table)
library(microbenchmark)
library(Rcpp)
library(zoo)
Creating vector with which the benchmarks will be run:
set.seed(2)
vl <- sample(1:10, 1e6, TRUE)
vm <- vl[1:1e5]
vs <- vl[1:1e4]
x <- c(2,3,5)
Testing whether all solution give the same outcome on the small vector vs:
> all.equal(jaap1(vs,x), jaap2(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), docendo(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), a5c1(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), jogo1(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), moody(vs,x))
[1] "Numeric: lengths (24, 873) differ"
> all.equal(jaap1(vs,x), cata1(vs,x))
[1] "Numeric: lengths (24, 0) differ"
> all.equal(jaap1(vs,x), u989(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), frank(vs,x))
[1] TRUE
> all.equal(jaap1(vs,x), symb(vs,x))
[1] TRUE
> all.equal(jaap1(vs, x), symbOpt(vs, x))
[1] TRUE
Further inspection of the cata1 and moody solutions learns that they don't give the desired output. They are therefore not included in the benchmarks.
The benchmark for the smallest vector vs:
mbs <- microbenchmark(jaap1(vs,x), jaap2(vs,x), docendo(vs,x), a5c1(vs,x),
jogo1(vs,x), u989(vs,x), frank(vs,x), symb(vs,x), symbOpt(vs, x),
times = 100)
gives:
print(mbs, order = "median")
Unit: microseconds
expr min lq mean median uq max neval
symbOpt(vs, x) 40.658 47.0565 78.47119 51.5220 56.2765 2170.708 100
symb(vs, x) 106.208 112.7885 151.76398 117.0655 123.7450 1976.360 100
frank(vs, x) 121.303 129.0515 203.13616 132.1115 137.9370 6193.837 100
jaap2(vs, x) 187.973 218.7805 322.98300 235.0535 255.2275 6287.548 100
jaap1(vs, x) 306.944 341.4055 452.32426 358.2600 387.7105 6376.805 100
a5c1(vs, x) 463.721 500.9465 628.13475 516.2845 553.2765 6179.304 100
docendo(vs, x) 1139.689 1244.0555 1399.88150 1313.6295 1363.3480 9516.529 100
u989(vs, x) 8048.969 8244.9570 8735.97523 8627.8335 8858.7075 18732.750 100
jogo1(vs, x) 40022.406 42208.4870 44927.58872 43733.8935 45008.0360 124496.190 100
The benchmark for the medium vector vm:
mbm <- microbenchmark(jaap1(vm,x), jaap2(vm,x), docendo(vm,x), a5c1(vm,x),
jogo1(vm,x), u989(vm,x), frank(vm,x), symb(vm,x), symbOpt(vm, x),
times = 100)
gives:
print(mbm, order = "median")
Unit: microseconds
expr min lq mean median uq max neval
symbOpt(vm, x) 357.452 405.0415 974.9058 763.0205 1067.803 7444.126 100
symb(vm, x) 1032.915 1117.7585 1923.4040 1422.1930 1753.044 17498.132 100
frank(vm, x) 1158.744 1470.8170 1829.8024 1826.1330 1935.641 6423.966 100
jaap2(vm, x) 1622.183 2872.7725 3798.6536 3147.7895 3680.954 14886.765 100
jaap1(vm, x) 3053.024 4729.6115 7325.3753 5607.8395 6682.814 87151.774 100
a5c1(vm, x) 5487.547 7458.2025 9612.5545 8137.1255 9420.684 88798.914 100
docendo(vm, x) 10780.920 11357.7440 13313.6269 12029.1720 13411.026 21984.294 100
u989(vm, x) 83518.898 84999.6890 88537.9931 87675.3260 90636.674 105681.313 100
jogo1(vm, x) 471753.735 512979.3840 537232.7003 534780.8050 556866.124 646810.092 100
The benchmark for the largest vector vl:
mbl <- microbenchmark(jaap1(vl,x), jaap2(vl,x), docendo(vl,x), a5c1(vl,x),
jogo1(vl,x), u989(vl,x), frank(vl,x), symb(vl,x), symbOpt(vl, x),
times = 100)
gives:
print(mbl, order = "median")
Unit: milliseconds
expr min lq mean median uq max neval
symbOpt(vl, x) 4.679646 5.768531 12.30079 6.67608 11.67082 118.3467 100
symb(vl, x) 11.356392 12.656124 21.27423 13.74856 18.66955 149.9840 100
frank(vl, x) 13.523963 14.929656 22.70959 17.53589 22.04182 132.6248 100
jaap2(vl, x) 18.754847 24.968511 37.89915 29.78309 36.47700 145.3471 100
jaap1(vl, x) 37.047549 52.500684 95.28392 72.89496 138.55008 234.8694 100
a5c1(vl, x) 54.563389 76.704769 116.89269 89.53974 167.19679 248.9265 100
docendo(vl, x) 109.824281 124.631557 156.60513 129.64958 145.47547 296.0214 100
u989(vl, x) 1380.886338 1413.878029 1454.50502 1436.18430 1479.18934 1632.3281 100
jogo1(vl, x) 4067.106897 4339.005951 4472.46318 4454.89297 4563.08310 5114.4626 100
The used functions of each solution:
jaap1 <- function(v,x) {
l <- length(x);
w <- which(rowSums(mapply('==', shift(v, type = 'lead', n = 0:(length(x) - 1)), x) ) == length(x));
rep(w, each = l) + 0:(l-1)
}
jaap2 <- function(v,x) {
l <- length(x);
w <- which(Reduce("+", Map('==', shift(v, type = 'lead', n = 0:(length(x) - 1)), x)) == length(x));
rep(w, each = l) + 0:(l-1)
}
docendo <- function(v,x) {
l <- length(x);
idx <- which(v == x[1]);
w <- idx[sapply(idx, function(i) all(v[i:(i+(length(x)-1))] == x))];
rep(w, each = l) + 0:(l-1)
}
a5c1 <- function(v,x) {
l <- length(x);
w <- which(colSums(t(embed(v, l)[, l:1]) == x) == l);
rep(w, each = l) + 0:(l-1)
}
jogo1 <- function(v,x) {
l <- length(x);
searchX <- function(x, X) all(x==X);
w <- which(rollapply(v, FUN=searchX, X=x, width=l));
rep(w, each = l) + 0:(l-1)
}
moody <- function(v,x) {
l <- length(x);
v2 <- as.numeric(factor(c(v,NA),levels = x));
v2[is.na(v2)] <- l+1;
which(diff(v2) == 1)
}
cata1 <- function(v,x) {
l <- length(x);
w <- which(sapply(lapply(seq(length(v)-l)-1, function(i) v[seq(x)+i]), identical, x));
rep(w, each = l) + 0:(l-1)
}
u989 <- function(v,x) {
l <- length(x);
s <- paste(v, collapse = '-');
p <- paste0('\\b', paste(x, collapse = '-'), '\\b');
i <- c(1, unlist(gregexpr(p, s)));
m <- substring(s, head(i,-1), tail(i,-1));
ln <- lengths(strsplit(m, '-'));
w <- cumsum(c(ln[1], ln[-1]-1));
rep(w, each = l) + 0:(l-1)
}
frank <- function(v,x) {
l <- length(x);
w = seq_along(v);
for (i in seq_along(x)) w = w[v[w+i-1L] == x[i]];
rep(w, each = l) + 0:(l-1)
}
cppFunction('NumericVector SeqInVec(NumericVector myVector, NumericVector mySequence) {
int vecSize = myVector.size();
int seqSize = mySequence.size();
NumericVector comparison(seqSize);
NumericVector res(vecSize);
int foundCounter = 0;
for (int i = 0; i < vecSize; i++ ) {
for (int j = 0; j < seqSize; j++ ) {
comparison[j] = mySequence[j] == myVector[i + j];
}
if (sum(comparison) == seqSize) {
for (int j = 0; j < seqSize; j++ ) {
res[foundCounter] = i + j + 1;
foundCounter++;
}
}
}
IntegerVector idx = seq(0, (foundCounter-1));
return res[idx];
}')
symb <- function(v,x) {SeqInVec(v, x)}
cppFunction('NumericVector SeqInVecOpt(NumericVector myVector, NumericVector mySequence) {
int vecSize = myVector.size();
int seqSize = mySequence.size();
NumericVector comparison(seqSize);
NumericVector res(vecSize);
int foundCounter = 0;
for (int i = 0; i < vecSize; i++ ) {
if (myVector[i] == mySequence[0]) {
for (int j = 0; j < seqSize; j++ ) {
comparison[j] = mySequence[j] == myVector[i + j];
}
if (sum(comparison) == seqSize) {
for (int j = 0; j < seqSize; j++ ) {
res[foundCounter] = i + j + 1;
foundCounter++;
}
}
}
}
IntegerVector idx = seq(0, (foundCounter-1));
return res[idx];
}')
symbOpt <- function(v,x) {SeqInVecOpt(v,x)}
Since this is a cw-answer I'll add my own benchmark of some of the answers.
library(data.table)
library(microbenchmark)
set.seed(2); v <- sample(1:100, 5e7, TRUE); x <- c(2,3,5)
jaap1 <- function(v, x) {
which(rowSums(mapply('==',shift(v, type = 'lead', n = 0:(length(x) - 1)),
x)) == length(x))
}
jaap2 <- function(v, x) {
which(Reduce("+", Map('==',shift(v, type = 'lead', n = 0:(length(x) - 1)),
x)) == length(x))
}
dd1 <- function(v, x) {
idx <- which(v == x[1])
idx[sapply(idx, function(i) all(v[i:(i+(length(x)-1))] == x))]
}
dd2 <- function(v, x) {
idx <- which(v == x[1L])
xl <- length(x) - 1L
idx[sapply(idx, function(i) all(v[i:(i+xl)] == x))]
}
frank <- function(v, x) {
w = seq_along(v)
for (i in seq_along(x)) w = w[v[w+i-1L] == x[i]]
w
}
all.equal(jaap1(v, x), dd1(v, x))
all.equal(jaap2(v, x), dd1(v, x))
all.equal(dd2(v, x), dd1(v, x))
all.equal(frank(v, x), dd1(v, x))
bm <- microbenchmark(jaap1(v, x), jaap2(v, x), dd1(v, x), dd2(v, x), frank(v, x),
unit = "relative", times = 25)
plot(bm)
bm
Unit: relative
expr min lq mean median uq max neval
jaap1(v, x) 4.487360 4.591961 4.724153 4.870226 4.660023 3.9361093 25
jaap2(v, x) 2.026052 2.159902 2.116204 2.282644 2.138106 2.1133068 25
dd1(v, x) 1.078059 1.151530 1.119067 1.257337 1.201762 0.8646835 25
dd2(v, x) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 25
frank(v, x) 1.400735 1.376405 1.442887 1.427433 1.611672 1.3440097 25
Bottom line: without knowing the real data, all these benchmarks don't tell the whole story.
Here's a solution that leverages binary search on secondary indices in data.table. (Great vignette here)
This method has quite a bit of overhead so it's not particularly competitive on the 1e4 length vector in the benchmark, but it hangs near the top of the pack as the size increases.
matt <- function(v,x){
l <- length(x);
SL <- seq_len(l-1);
DT <- data.table(Seq_0 = v);
for (i in SL) set(DT, j = eval(paste0("Seq_",i)), value = shift(DT[["Seq_0"]],n = i, type = "lead"));
w <- DT[as.list(x),on = paste0("Seq_",c(0L,SL)), which = TRUE];
rep(w, each = l) + 0:(l-1)
}
Benchmarking
library(data.table)
library(microbenchmark)
library(Rcpp)
library(zoo)
set.seed(2)
vl <- sample(1:10, 1e6, TRUE)
vm <- vl[1:1e5]
vs <- vl[1:1e4]
x <- c(2,3,5)
Vector Length 1e4
Unit: microseconds
expr min lq mean median uq max neval
symb(vs, x) 138.342 143.048 161.6681 153.1545 159.269 259.999 10
frank(vs, x) 176.634 184.129 198.8060 193.2850 200.701 257.050 10
jaap2(vs, x) 282.231 299.025 342.5323 316.5185 337.760 524.212 10
jaap1(vs, x) 490.013 528.123 568.6168 538.7595 547.268 731.340 10
a5c1(vs, x) 706.450 742.270 751.3092 756.2075 758.859 793.446 10
dd2(vs, x) 1319.098 1348.082 2061.5579 1363.2265 1497.960 7913.383 10
docendo(vs, x) 1427.768 1459.484 1536.6439 1546.2135 1595.858 1696.070 10
dd1(vs, x) 1377.502 1406.272 2217.2382 1552.5030 1706.131 8084.474 10
matt(vs, x) 1928.418 2041.597 2390.6227 2087.6335 2430.470 4762.909 10
u989(vs, x) 8720.330 8821.987 8935.7188 8882.0190 9106.705 9163.967 10
jogo1(vs, x) 47123.615 47536.700 49158.2600 48449.2390 50957.035 52496.981 10
Vector Length 1e5
Unit: milliseconds
expr min lq mean median uq max neval
symb(vm, x) 1.319921 1.378801 1.464972 1.423782 1.577006 1.682156 10
frank(vm, x) 1.671155 1.739507 1.806548 1.760738 1.844893 2.097404 10
jaap2(vm, x) 2.298449 2.380281 2.683813 2.432373 2.566581 4.310258 10
matt(vm, x) 3.195048 3.495247 3.577080 3.607060 3.687222 3.844508 10
jaap1(vm, x) 4.079117 4.179975 4.776989 4.496603 5.206452 6.295954 10
a5c1(vm, x) 6.488621 6.617709 7.366226 6.720107 6.877529 12.500510 10
dd2(vm, x) 12.595699 12.812876 14.990739 14.058098 16.758380 20.743506 10
docendo(vm, x) 13.635357 13.999721 15.296075 14.729947 16.151790 18.541582 10
dd1(vm, x) 13.474589 14.177410 15.676348 15.446635 17.150199 19.085379 10
u989(vm, x) 94.844298 95.026733 96.309658 95.134400 97.460869 100.536654 10
jogo1(vm, x) 575.230741 581.654544 621.824297 616.474265 628.267155 723.010738 10
Vector Length 1e6
Unit: milliseconds
expr min lq mean median uq max neval
symb(vl, x) 13.34294 13.55564 14.01556 13.61847 14.78210 15.26076 10
frank(vl, x) 17.35628 17.45602 18.62781 17.56914 17.88896 25.38812 10
matt(vl, x) 20.79867 21.07157 22.41467 21.23878 22.56063 27.12909 10
jaap2(vl, x) 22.81464 22.92414 22.96956 22.99085 23.02558 23.10124 10
jaap1(vl, x) 40.00971 40.46594 43.01407 41.03370 42.81724 55.90530 10
a5c1(vl, x) 65.39460 65.97406 69.27288 66.28000 66.72847 83.77490 10
dd2(vl, x) 127.47617 132.99154 161.85129 134.63168 157.40028 342.37526 10
dd1(vl, x) 140.06140 145.45085 154.88780 154.23280 161.90710 171.60294 10
docendo(vl, x) 147.07644 151.58861 162.20522 162.49216 165.49513 183.64135 10
u989(vl, x) 2022.64476 2041.55442 2055.86929 2054.92627 2066.26187 2088.71411 10
jogo1(vl, x) 5563.31171 5632.17506 5863.56265 5872.61793 6016.62838 6244.63205 10
Here is a string-based approach in base R:
str <- paste(v, collapse = '-')
# "2-2-3-5-8-0-32-1-3-12-5-2-3-5-8-33-1"
pattern <- paste0('\\b', paste(x, collapse = '-'), '\\b')
# "\\b2-3-5-8\\b"
inds <- unlist(gregexpr(pattern, str)) # (1)
# 3 25
sapply(inds, function(i) lengths(strsplit(substr(str, 1, i),'-'))) # (2)
# [1] 2 12
\\b is used for exact matching.
(1) Finds the positions at which pattern is seen within str.
(2) Getting back the respective indices within the original vector v.
As for running-time efficiency, here is a much faster solution than my first solution:
str <- paste(v, collapse = '-')
pattern <- paste0('\\b', paste(x, collapse = '-'), '\\b')
inds <- c(1, unlist(gregexpr(pattern, str)))
m <- substring(str, head(inds,-1), tail(inds,-1))
ln <- lengths(strsplit(m, '-'))
cumsum(c(ln[1], ln[-1]-1))
Note: This doesn't always give the desired output
We can convert v to factors and keep only consecutive values in our transformed vector:
v2 <- as.numeric(factor(c(v,NA),levels = x)) # [1] 1 1 2 3 4 NA NA NA ...
v2[is.na(v2)] <- length(x)+1 # [1] 1 1 2 3 4 5 5 5 ...
output <- diff(v2) ==1
# [1] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
Data
v <- c(2,2,3,5,8,0,32,1,3,12,5,2,3,5,8,33,1)
x <- c(2,3,5,8)
I have modified talat's solution as I found this didn't work in all scenarios.
Firstly, if this step idx[sapply(idx, function(i) all(v[i:(i+(length(x)-1))] == x))] will produce NAs if v[i:(i+(length(x)-1))] == x)) contains NAs and no FALSE. Secondly, in order to match the desired outcome I have used the indices to create the final logical vector as desired.
seq_detect <- function(v, x) {
#If the integer is not detected then return early a vector of all falses
if(!any(v == x[1])){
return(vector(length = length(v)))
}
#Create an index of v where the first value in x appears
idx <- which(v == x[1])
#See if each of those indices do indeed match the whole pattern
index_seq_start_raw <- idx[sapply(idx, function(i) all(v[i:(i+(length(x)-1))] == x))]
#These may return NAs if above index outside range of 1:length(v)
if(all(is.na(index_seq_start_raw))){
return(vector(length = length(v)))
}
#If some NAs then remove these
(index_seq_start <- index_seq_start_raw[!is.na(index_seq_start_raw)])
#Create template of FALSES for output
output <- vector(length = length(v))
#Loop over index_seq_start and replace any matches with TRUEs
for(i in seq_along(1:length(index_seq_start))){
output[(index_seq_start[i]):(index_seq_start[i]+3)] <- TRUE
}
output
}
#This works on both the following pairs of vectors, where as due to indexing
#issues #talat's solution causes an error with v1 and x1.
v <- c(2, 2, 3, 5, 8, 0, 32, 1, 3, 12, 5, 2, 3, 5, 8, 33, 1)
x <- c(2, 3, 5, 8)
[1] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
v1 <- c(1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1)
x1 <- c(1, 2, 2, 1)
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
Here is a MATLAB code for performing Gram Schmidt in page 1
http://web.mit.edu/18.06/www/Essays/gramschmidtmat.pdf
I am trying for hours and hours to perform this with R since I don't have MATLAB
Here is my R
f=function(x){
m=nrow(x);
n=ncol(x);
Q=matrix(0,m,n);
R=matrix(0,n,n);
for(j in 1:n){
v=x[,j,drop=FALSE];
for(i in 1:j-1){
R[i,j]=t(Q[,i,drop=FALSE])%*%x[,j,drop=FALSE];
v=v-R[i,j]%*%Q[,i,drop=FALSE]
}
R[j,j]=max(svd(v)$d);
Q[,j,,drop=FALSE]=v/R[j,j]}
return(list(Q,R))
}
}
It keeps on saying there is errors in either:
v=v-R[i,j]%*%Q[,i,drop=FALSE]
or
R[j,j]=max(svd(v)$d);
What is it that I am doing wrong translating MATLAB code to R???
Just for fun I added an Armadillo version of this code and benchmark it
Armadillo code :
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;
//[[Rcpp::export]]
List grahm_schimdtCpp(arma::mat A) {
int n = A.n_cols;
int m = A.n_rows;
arma::mat Q(m, n);
Q.fill(0);
arma::mat R(n, n);
R.fill(0);
for (int j = 0; j < n; j++) {
arma::vec v = A.col(j);
if (j > 0) {
for(int i = 0; i < j; i++) {
R(i, j) = arma::as_scalar(Q.col(i).t() * A.col(j));
v = v - R(i, j) * Q.col(i);
}
}
R(j, j) = arma::norm(v, 2);
Q.col(j) = v / R(j, j);
}
return List::create(_["Q"] = Q,
_["R"] = R
);
}
R code not optimized (directly based on algorithm)
grahm_schimdtR <- function(A) {
m <- nrow(A)
n <- ncol(A)
Q <- matrix(0, nrow = m, ncol = n)
R <- matrix(0, nrow = n, ncol = n)
for (j in 1:n) {
v <- A[ , j, drop = FALSE]
if (j > 1) {
for(i in 1:(j-1)) {
R[i, j] <- t(Q[,i,drop = FALSE]) %*% A[ , j, drop = FALSE]
v <- v - R[i, j] * Q[ ,i]
}
}
R[j, j] = norm(v, type = "2")
Q[ ,j] = v / R[j, j]
}
list("Q" = Q, "R" = R)
}
Native QR decomposition in R
qrNative <- function(A) {
qrdec <- qr(A)
list(Q = qr.R(qrdec), R = qr.Q(qrdec))
}
We will test it with the same matrix as in original document (link in the post above)
A <- matrix(c(4, 3, -2, 1), ncol = 2)
all.equal(grahm_schimdtR(A)$Q %*% grahm_schimdtR(A)$R, A)
## [1] TRUE
all.equal(grahm_schimdtCpp(A)$Q %*% grahm_schimdtCpp(A)$R, A)
## [1] TRUE
all.equal(qrNative(A)$Q %*% qrNative(A)$R, A)
## [1] TRUE
Now let's benchmark it
require(rbenchmark)
set.seed(123)
A <- matrix(rnorm(10000), 100, 100)
benchmark(qrNative(A),
grahm_schimdtR(A),
grahm_schimdtCpp(A),
order = "elapsed")
## test replications elapsed relative user.self
## 3 grahm_schimdtCpp(A) 100 0.272 1.000 0.272
## 1 qrNative(A) 100 1.013 3.724 1.144
## 2 grahm_schimdtR(A) 100 84.279 309.849 95.042
## sys.self user.child sys.child
## 3 0.000 0 0
## 1 0.872 0 0
## 2 72.577 0 0
I really love how easy to port code into Rcpp....
If you are translating code in Matlab into R, then code semantics (code logic) should remain same. For example, in your code, you are transposing Q in t(Q[,i,drop=FALSE]) as per the given Matlab code. But Q[,i,drop=FALSE] does not return the column in column vector. So, we can make it a column vector by using the statement:
matrix(Q[,i],n,1); # n is the number of rows.
There is no error in R[j,j]=max(svd(v)$d) if v is a vector (row or column).
Yes, there is an error in
v=v-R[i,j]%*%Q[,i,drop=FALSE]
because you are using a matrix multiplication. Instead you should use a normal multiplication:
v=v-R[i,j] * Q[,i,drop=FALSE]
Here R[i,j] is a number, whereas Q[,i,drop=FALSE] is a vector. So, dimension mismatch arises here.
One more thing, if j is 3 , then 1:j-1 returns [0,1,2]. So, it should be changed to 1:(j-1), which returns [1,2] for the same value for j. But there is a catch. If j is 2, then 1:(j-1) returns [1,0]. So, 0th index is undefined for a vector or a matrix. So, we can bypass 0 value by putting a conditional expression.
Here is a working code for Gram Schmidt algorithm:
A = matrix(c(4,3,-2,1),2,2)
m = nrow(A)
n = ncol(A)
Q = matrix(0,m,n)
R = matrix(0,n,n)
for(j in 1:n)
{
v = matrix(A[,j],n,1)
for(i in 1:(j-1))
{
if(i!=0)
{
R[i,j] = t(matrix(Q[,i],n,1))%*%matrix(A[,j],n,1)
v = v - (R[i,j] * matrix(Q[,i],n,1))
}
}
R[j,j] = svd(v)$d
Q[,j] = v/R[j,j]
}
If you need to wrap the code into a function, you can do so as per your convenience.
You could simply use Hans W. Borchers' pracma package, which provides many Octave/Matlab functions translated in R.
> library(pracma)
> gramSchmidt
function (A, tol = .Machine$double.eps^0.5)
{
stopifnot(is.numeric(A), is.matrix(A))
m <- nrow(A)
n <- ncol(A)
if (m < n)
stop("No. of rows of 'A' must be greater or equal no. of colums.")
Q <- matrix(0, m, n)
R <- matrix(0, n, n)
for (k in 1:n) {
Q[, k] <- A[, k]
if (k > 1) {
for (i in 1:(k - 1)) {
R[i, k] <- t(Q[, i]) %*% Q[, k]
Q[, k] <- Q[, k] - R[i, k] * Q[, i]
}
}
R[k, k] <- Norm(Q[, k])
if (abs(R[k, k]) <= tol)
stop("Matrix 'A' does not have full rank.")
Q[, k] <- Q[, k]/R[k, k]
}
return(list(Q = Q, R = R))
}
<environment: namespace:pracma>
Here a version very similar to yours but without the use of the extra variabale v. I use directly the Q matrix. So no need to use drop. Of course since you have j-1 in the index you need to add the condition j>1.
f=function(x){
m <- nrow(x)
n <- ncol(x)
Q <- matrix(0, m, n)
R <- matrix(0, n, n)
for (j in 1:n) {
Q[, j] <- x[, j]
if (j > 1) {
for (i in 1:(j - 1)) {
R[i, j] <- t(Q[, i]) %*% Q[, j]
Q[, j] <- Q[, j] - R[i, j] * Q[, i]
}
}
R[j, j] <- max(svd(Q[, j])$d)
Q[, j] <- Q[, j]/R[j, j]
}
return(list(Q = Q, R = R))
}
EDIT add some benchmarking:
To get some real case I use the Hilbert matrix from the Matrix package.
library(microbenchmark)
library(Matrix)
A <- as.matrix(Hilbert(100))
microbenchmark(grahm_schimdtR(A),
grahm_schimdtCpp(A),times = 100L)
Unit: milliseconds
expr min lq median uq max neval
grahm_schimdtR(A) 330.77424 335.648063 337.443273 343.72888 601.793201 100
grahm_schimdtCpp(A) 1.45445 1.510768 1.615255 1.66816 2.062018 100
As expected CPP solution is really fster.
A verbatim implementation of the following matlab code (shown in the next figure) in base R to obtain orthonormal basis vectors with Gram-Schmidt algorithm is shown below:
Gram_Schmidt <- function(A) {
n <- ncol(A)
Q <- 0*A
R <- matrix(rep(0, n*n), nrow=n)
for (j in 1:n) {
v <- A[,j]
if (j > 1) # the first basis vector to be included in Q anyway (after normalization)
for (i in 1:(j-1)) {
R[i, j] <- t(Q[,i]) %*% A[,j]
v <- v - R[i,j] * Q[,i] # subtract the projections on other orthonormal basis vectors constructed so far
}
R[j,j] <- sqrt(v %*% v)
Q[,j] <- v / R[j,j]
}
return(list(Q=Q, R=R))
}
Given the matrix A, we obtain the following results as expected:
A <- matrix(c(4,3,-2,1), nrow=2)
Gram_Schmidt(A)
#$Q
# [,1] [,2]
# [1,] 0.8 -0.6
# [2,] 0.6 0.8
#$R
# [,1] [,2]
#[1,] 5 -1
#[2,] 0 2
Using QR decomposition with base R again,
Gram_Schmidt_QR <- function(A) {
res <- qr(A)
return(list(Q=qr.Q(res), R=qr.R(res)))
}
Gram_Schmidt_QR(A)
#$Q
# [,1] [,2]
# [1,] 0.8 -0.6
# [2,] 0.6 0.8
#$R
# [,1] [,2]
#[1,] 5 -1
#[2,] 0 2
Also, we could use R library matlib's implementation, it only outputs the orthonormal Q matrix though and not the upper triangular matrix R:
library(matlib)
GramSchmidt(A)
# [,1] [,2]
#[1,] 0.8 -0.6
#[2,] 0.6 0.8
Finally, some performance benchmarking gives the following result:
library(ggplot2)
library(microbenchmark)
autoplot(microbenchmark(Gram_Schmidt(A),
Gram_Schmidt_QR(A),
GramSchmidt(A), times=1000L))