Efficiently compute a sum based on sequences in R

Efficiently compute a sum based on sequences in R - r

I'm trying to compute a specific sum in R as quickly as possible. The object of interest is
and the relevant input objects are two L times K matrices x (contains only positive integers) and alpha (contains only positive real values). A is equivalent to rowSums(alpha) and N is equivalent to rowSums(x). Subscripts l and k denote a row / a column of alpha or x, respectively.
At first I thought it's going to be easy to come up with something that's super-quick, but I couldn't find an elegant solution. I think a matrix-valued version of seq() would be very helpful here. Does anyone have a creative solution to implement this efficiently?
Here's an easy-to-read, but obviously inefficient, loop-based version for reference:
# parameters
L = 20
K = 5
# x ... L x K matrix of integers
x = matrix(1 : (L * K), L, K)
# alpha ... L x K matrix of positive real numbers
alpha = matrix(1 : (L * K) / 100, L, K)
# N ... sum over rows of x
N = rowSums(x)
# A ... sum over rows of alpha
A = rowSums(alpha)
# implementation
stacksum = function(x, alpha, N, A){
# parameters
K = ncol(x)
L = nrow(x)
result = 0
for(ll in 1:L){
# first part of sum
first.sum = 0
for(kk in 1:K){
# create sequence
sequence.k = seq(alpha[ll, kk], (alpha[ll, kk] + x[ll, kk] - 1), 1)
# take logs and sum
first.sum = first.sum + sum(log(sequence.k))
}
# second part of sum
second.sum = sum(log(seq(A[ll], (A[ll] + N[ll] - 1), 1)))
# add to result
result = result + first.sum - second.sum
}
return(result)
}
# test
stacksum(x, alpha, N, A)

Update with a lgamma solution based on #RobertDodier comments.
Using sequence and rep.int.
# parameters
L <- 20
K <- 5
# x ... L x K matrix of integers
x <- matrix(1 : (L * K), L, K)
# alpha ... L x K matrix of positive real numbers
alpha <- matrix(1 : (L * K) / 100, L, K)
# N ... sum over rows of x
N <- rowSums(x)
# A ... sum over rows of alpha
A <- rowSums(alpha)
# proposed solution
stacksum2 <- function(x, alpha, N, A) {
sum(log(sequence(x, alpha) + rep.int(alpha %% 1, x))) - sum(log(sequence(N, A) + rep.int(A %% 1, N)))
}
# solution from Robert Dodier's comments
stacksum3 <- function(x, alpha, N, A) {
sum(lgamma(alpha + x) - lgamma(alpha)) - sum(lgamma(A + N) - lgamma(A))
}
# OP solution
stacksum1 = function(x, alpha, N, A){
# parameters
K = ncol(x)
L = nrow(x)
result = 0
for(ll in 1:L){
# first part of sum
first.sum = 0
for(kk in 1:K){
# create sequence
sequence.k = seq(alpha[ll, kk], (alpha[ll, kk] + x[ll, kk] - 1), 1)
# take logs and sum
first.sum = first.sum + sum(log(sequence.k))
}
# second part of sum
second.sum = sum(log(seq(A[ll], (A[ll] + N[ll] - 1), 1)))
# add to result
result = result + first.sum - second.sum
}
result
}
res <- list(
stacksum1(x, alpha, N, A),
stacksum2(x, alpha, N, A),
stacksum3(x, alpha, N, A)
)
all.equal(res[1:2], res[-1])
#> [1] TRUE
microbenchmark::microbenchmark(stacksum1 = stacksum1(x, alpha, N, A),
stacksum2 = stacksum2(x, alpha, N, A),
stacksum3 = stacksum3(x, alpha, N, A),
check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> stacksum1 1654.2 1704.60 1899.384 1740.80 1964.75 4234.4 100
#> stacksum2 238.2 246.45 258.284 252.35 268.40 319.4 100
#> stacksum3 18.5 19.05 20.981 20.55 21.70 36.4 100

Related

Select matrix rows that are permutations of a given vector

I have a matrix X:
one two three four
[1,] 1 3 2 4
[2,] 2 0 1 5
[3,] 3 2 1 4
[4,] 4 9 11 19
[5,] 4 3 2 1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
one two three four
[1,] 1 3 2 4
[3,] 3 2 1 4
[5,] 4 3 2 1
What function or command should I use?

mat <- rbind(
c(1, 3, 2, 4),
c(2, 0, 1, 5),
c(3, 2, 1, 4)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]

Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear and asymmetric weight function w(x), the following weighted sum:
1 x w(1) + 2 x w(2) + 3 x w(3) + 4 x w(4)
is a unique score or identifier and is invariant to permutations. So for example, the following gives the same value:
2 x w(2) + 1 x w(1) + 3 x w(3) + 4 x w(4)
But anything else will give different values, like:
1 x w(1) + 3 x w(1) + 3 x w(3) + 4 x w(4)
0 x w(0) + 3 x w(1) + 0 x w(0) + 4 x w(4)
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775

Update since there's so much interest in this question, here's a method using indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index on a length(v)-dimensional array for small v, or to index on v*sin(w*v) using the results of match instead of calculating X*sin(W*X) when v is large:
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
n <- length(v)
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
w <- pi/(n + 1)
rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}
f2 <- function(X, v) {
n <- length(v)
if (n < 6) {
# index an n-dimensional array
m <- array(FALSE, rep(n + 1L, n))
m[permuteGeneral(n)] <- TRUE
X[] <- match(X, v, nomatch = length(v) + 1L)
m[X]
} else {
nn <- 1:n
u <- c(nn*sin(pi*nn/(n + 1L)), 0)
X[] <- u[match(X, v, nomatch = n + 1L)]
rowSums(X) == sum(u)
}
}
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10

This question is HOT. I am learning so I take this as a good opportunity to learn. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark for all of them.
I would like to convert each answer to a function that returns a TRUE/FALSE vector for flagging rows. I also want this function to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaption.
Mohamed Desouky's answer is also easy to adapt, by taking out the function applied in Filter() and apply() it over matrix rows.
jblood94's answer is challenging. It was commented that for other matrices and vectors, conversion is needed using match. I don't know what is the appropriate way, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult). None of them seems to work. I don't see any comparison between matrix rows and vectors in the base R solution. For other tidyverse codes, I don't know what df_matrix is. I have requested TarJae to please revise the answer.
harre's answer uses tidyverse and dose not return TRUE/FALSE. So I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
## number of elements
n <- length(v)
if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
## convert everything to integers 0, 1, 2, ..., n
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:n
## branches
if (method == "tab") {
## row-wise tabulating
rtab <- matrixStats::rowTabulates(Xi, values = vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
matrixStats::rowCounts(rtab, value = 0L) == 0L
} else if (method == "cos") {
## evaluate cosine weights for Xi and vi
w <- pi / (n + 1)
cos.Xi <- cos(w * Xi)
cos.vi <- cos(w * vi)
## weighted sum for Xi
wtsum.Xi <- rowSums(Xi * cos.Xi)
## weighted sum for vi
wtsum.vi <- sum(vi * cos.vi)
## the i-th value is TRUE if X[i, ] is a permutation of v
wtsum.Xi == wtsum.vi
} else {
stop("unknown method!")
}
}
Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)
M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))
jblood94 <- function (X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
vi <- 1:length(v)
rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know how time is transformed for plotting, but clearly, they are not on the usual scale. Those to the left are far faster than it is suggested from the plot!
Conclusion: Zheyuan Li's "cos" method is the winner.

We can try this
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1

Another option is using Filter function
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4) ,
data.frame(t(X))))
Just for fun
Who can give me 4 distinct digits whose sum equal 6 ?
there is just {0,1,2,3}
then we can use the module of 4 using %%
X[apply(X , 1 , \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4) , ]
OR
with Using pure for loop
ans <- data.frame(matrix(NA , ncol = ncol(X)))
r <- 1
for(i in 1:nrow(X)){
if(all((X[i,] %in% 1:4) & length(unique(X[i,])) == 4)){
ans[r,] <- X[i,]
r <- r + 1
}
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)

For the fun of tidyverse-solutions, even if I think we'd rather work on the matrices directly. However, we could use rowwise() and c_across():
With set-operations (inspired by #Stéphane Laurent):
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(setequal(c_across(), c(1, 2, 3, 4))) |>
ungroup() |>
as.matrix()
Or without set-operations:
library(dplyr)
mat %>%
as_tibble() |>
rowwise() |>
filter(1 %in% c_across(everything()) &
2 %in% c_across(everything()) &
3 %in% c_across(everything()) &
4 %in% c_across(everything())
) |>
ungroup() |>
as.matrix()
Or inspired by #Mohamed Desouky:
mat %>%
as_tibble() |>
rowwise() |>
filter(all(c_across() %in% 1:4) & n_distinct(c_across()) == 4) |>
ungroup() |>
as.matrix()
And so on..

The algorithm library in C++ offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
const int nRows = mat.nrow();
const int nCols = mat.ncol();
std::vector<int> test(nCols);
Rcpp::LogicalVector res(nRows);
for (int i = 0; i < nRows; ++i) {
for (int j = 0; j < nCols; ++j) {
test[j] = mat(i, j);
}
res[i] = std::is_permutation(
test.begin(), test.end(), v.begin()
);
}
return res;
}
And calling it in R we have (N.B. We use match to get integer indices as in #Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
perm_idx_cpp(Xi, seq_along(v))
}
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.

More efficient function and for loop

I am trying to do a more efficient for loop. I know the existence of sapply, laaply, etc. but I don't know how to implement it in my code.
I have my function which I don't know if it is very efficient. I think I should improve this but I don't know how.
myfun <- function(a, b, c) {
sum <- 0
iter <- 0
while (sum < c) {
nr <- runif(1, a, b)
sum <- sum + nr
iter <- iter + 1
}
return(iter)
}
And here is the part which I would like to use an sapply or something similar.
a <- 0
b <- 1
c <- 2
x <- 0
for (i in 1:10^9) {
x <- x + myfun(a, b, c)
}
Also, I need to make a sapply similar to this
sapply(1:10^9, functie(a ,b ,c))
But the sapply uses 1:10^9 as parameters, instead of a, b, c.

I think replicate() is what you may be looking for (I changed your n to something smaller).
set.seed(1234)
n <- 10^2
y <- replicate(n, myfun(a,b,c))
sum(y)
# [1] 462
This matches your prior result.
set.seed(1234)
a <-0
b <-1
c <-2
x <-0
for (i in 1:n){
x <- x + myfun(a,b,c)
}
x
# [1] 462

Recursion
Here is a recursive function f() that does the same job as myfun().
f <- function(s=0) {
if (s[length(s)] >= 2) {
return(length(s) - 1L)
} else {
f(c(s, s[length(s)] + runif(1, 0L, 1L)))
}
}
set.seed(42)
f()
# [1] 3
replicate(8, f())
# [1] 4 5 4 4 3 5 3 5
stopifnot(all.equal({set.seed(42);f()}, {set.seed(42);myfun(0, 1, 2)}))
However (and most likely for that reason), it's just cooler, not faster.
Rcpp
Learning from that, we may define the while loop in Rcpp.
library(Rcpp)
cppFunction('
double myfun_cpp() {
double s = 0;
int i = 0;
while (s < 2) {
s = s + R::runif(0, 1);
i++;
}
return i;
}
')
set.seed(42)
myfun_cpp()
# [1] 3
replicate(8, myfun_cpp())
# [1] 4 5 4 4 3 5 3 5
stopifnot(all.equal({set.seed(42);myfun_cpp()}, {set.seed(42);myfun(0, 1, 2)}))
Now it's lightning fast:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# f 22.244076 22.639439 25.345203 22.927777 24.089196 35.82683 100 c
# myfun_cpp 3.204448 3.260542 3.843632 3.294618 3.347971 13.71213 100 a
# myfun 16.823981 17.125346 20.605663 17.516248 27.385791 28.63267 100 b
set.seed(42); R <- 1e3
microbenchmark::microbenchmark(
f=replicate(R, f()),
myfun_cpp=replicate(R, myfun_cpp()),
myfun=replicate(R, myfun(0, 1, 2)), times=1e2L,
control=list(warmup=1e1L))

Here are some options
A base R recursion method
f_TIC <- function(x, y, z) ifelse(z <= 0, 0, f_TIC(x, y, z - runif(1, x, y)) + 1)
Rcpp implementation of f_TIC
library(Rcpp)
cppFunction("
int f_TIC_cpp(double x, double y, double z) {
if (z <= 0) {
return 0;
} else {
return f_TIC_cpp(x, y, z- R::runif(0,1))+1;
}
}
")
Benchmarking
library(Rcpp)
f <- function(s = 0) {
if (s[length(s)] >= 2) {
return(length(s) - 1L)
} else {
f(c(s, s[length(s)] + runif(1, 0L, 1L)))
}
}
f_TIC <- function(x, y, z) ifelse(z <= 0, 0, f_TIC(x, y, z - runif(1, x, y)) + 1)
cppFunction("
double myfun_cpp() {
double s = 0;
int i = 0;
while (s < 2) {
s = s + R::runif(0, 1);
i++;
}
return i;
}
")
cppFunction("
int f_TIC_cpp(double x, double y, double z) {
if (z <= 0) {
return 0;
} else {
return f_TIC_cpp(x, y, z- R::runif(0,1))+1;
}
}
")
myfun <- function(a, b, c) {
sum <- 0
iter <- 0
while (sum < c) {
nr <- runif(1, a, b)
sum <- sum + nr
iter <- iter + 1
}
return(iter)
}
set.seed(42)
R <- 1e3
microbenchmark::microbenchmark(
f = replicate(R, f()),
f_TIC = replicate(R, f_TIC(0, 1, 2)),
f_TIC_cpp = replicate(R, f_TIC_cpp(0,1,2)),
myfun_cpp = replicate(R, myfun_cpp()),
myfun = replicate(R, myfun(0, 1, 2)),
times = 1e2L,
control = list(warmup = 1e1L)
)
and we will see
Unit: milliseconds
expr min lq mean median uq max neval
f 11.9342 12.50330 14.161982 13.02100 14.96575 22.7116 100
f_TIC 20.1925 21.69420 23.678240 22.28255 24.86350 34.1577 100
f_TIC_cpp 2.0293 2.10080 2.639625 2.17505 2.36190 7.9715 100
myfun_cpp 1.7351 1.79415 2.094577 1.83810 2.00495 6.7481 100
myfun 9.1408 9.45240 11.783504 10.32355 14.68815 19.5400 100

I would probably solve this using purrr::map(). E.g. like this:
c(1:1e9) %>%
purrr::map_dbl(
~ myfun(a, b, c)
) %>%
sum()
This first calls myfun() the same number of times as the length of c(1:1e9), and stores the results in a numeric vector, then it uses sum() to add the results together.
My tests shows it's a bit faster than using replicate().

You're doing it right, in my honest opinion.
Since you don't need to return a vectorized or multi-dimensional result but instead update an existing object at each iteration, the for loop you're suggesting is more than adequate.
If you want to take a look at some great discussion about this topic I suggest you to look at this link: https://r4ds.had.co.nz/iteration.html
Edit: just to address the speed argument
start <- Sys.time()
purrr::map_dbl(1:1000, function(x) y + myfun(a, b, c)) %>% sum
end <- Sys.time()
end - start
# Time difference of 0.02593184 secs
start <- Sys.time()
y <- replicate(1000, myfun(a,b,c))
cumsum(y)[1000]
end <- Sys.time()
end - start
# Time difference of 0.01755929 secs
y <- 0
start <- Sys.time()
for(i in 1:1000){
y<- y + myfun(a,b,c)
}
end <- Sys.time()
end - start
# Time difference of 0.01459098 secs

How to deal with very big numbers producing NaN

I want to define very simple function following:
where:
My work so far
prob <- function(x, n) {
quan <- qgamma(0.95, n, 1)
temp <- quan / (x)^2
first_term <- exp(-temp)
second_term <- temp^(0:(n - 1)) / factorial(0:(n - 1))
second_term <- sum(second_term)
first_term * second_term
}
The problem here is that in the sum (second term) for big n we are dealing with very big numbers, so R treats those as infinity.
So for example:
prob(0.5, n = 1000)
[1] NaN
Because quantile for n = 1000 equals to 1052.577, in nominator we have to calculate 1052.577^999 and in denominator factorial of 999. R understands those two numbers as infinity:
> factorial(999)
[1] Inf
> 1052.577^999
[1] Inf
So when it tries to divide them NaN is produced. However the output of this function is always in interval (0, 1), since its a probability. Is there any possibility to calculate value of this function in this point?

Your prob function is just the cumulative Poisson with lambda = temp and k = n - 1. Use ppois:
prob <- function(x, n) {
return(ppois(n - 1, qgamma(0.95, n, 1)/x^2))
}
prob(0.5, n = 1000)
# [1] 0
prob(0.5, n = 1000) = 0 because n - 1 = 999 is so far from the mean (lambda = qgamma(0.95, 1000, 1)/0.5^2 = 4210.308).

vectorizing an R-loop with backward dependency

I have a random vector vec and want make a new vector L without using a loop. New element of L depends on old elements of L and vec.
set.seed(0)
vec <- rnorm(20,0)
i = 2;
N <- length(vec) -1
L <- numeric(N-1)
constant <- 0.6
while (i < N){
L[i] = vec[i + 1] - vec[i] - constant * L[i - 1]
i <- i + 1
}
L
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480 -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427
# [12] -0.4549334 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687

You want
L[1] = 0
L[i] = -constant * L[i - 1] + (vec[i + 1] - vec[i]), i = 2, 3, ...,
Let dv <- diff(vec), the 2nd line becomes
L[i] = -constant * L[i - 1] + dv[i], i = 2, 3, ...
an AR1 process with lag-1 auto-correlation -constant and innovation dv[-1]. AR1 process can be efficiently generated by filter with "recursive" method.
dv <- diff(vec)
L <- c(0, filter(dv[-1], -constant, "recursive"))
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480
# [7] -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427 -0.4549334
#[13] 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687
#[19] -2.9860629
I guess you mean while (i <= N) in your question. If you do want i < N, then you have to get rid of the last element above. Which can be done by
dv <- diff(vec)
L <- c(0, filter(dv[2:(length(dv) - 1)], -constant, "recursive"))
hours later...
I was brought to attention by Rui Barradas's benchmark. For short vec, any method is fast enough. For long vec, filter is definitely faster, but practically suffers from coercion as filter expects and returns a "ts" (time series) object. It is better to call its workhorse C routine straightaway:
AR1_FILTER <- function (x, filter, full = TRUE) {
n <- length(x)
AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
if (!full) AR1 <- AR1[-1L]
AR1
}
dv <- diff(vec)
L <- AR1_FILTER(dv[-1], -constant)
#L <- AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
I am not interested in comparing AR1_FILTER with R-level loop. I will just compare it with filter.
library(microbenchmark)
v <- runif(100000)
microbenchmark("R" = c(0, filter(v, -0.6, "recursive")),
"C" = AR1_FILTER(v, -0.6))
Unit: milliseconds
expr min lq mean median uq max neval
R 6.803945 7.987209 11.08361 8.074241 9.131967 54.672610 100
C 2.586143 2.606998 2.76218 2.644068 2.660831 3.845041 100

When you have to compute values based on previous values the general purpose answer is no, there is no way around a loop.
In your case I would use a for loop. It's simpler.
M <- numeric(N - 1)
for(i in seq_len(N)[-N])
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
identical(L, M)
#[1] TRUE
Note the use of seq_len, not 2:(N - 1).
Edit.
I have timed the solutions by myself and by user 李哲源. The results are clearly favorable to my solution.
f1 <- function(vec, constant = 0.6){
N <- length(vec) - 1
M <- numeric(N - 1)
for(i in seq_len(N)[-c(1, N)]){
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
}
M
}
f2 <- function(vec, constant = 0.6){
dv <- diff(vec)
c(0, c(stats::filter(dv[2:(length(dv) - 1)], -constant, "recursive")) )
}
L1 <- f1(vec)
L2 <- f2(vec)
identical(L, L1)
identical(L, L2)
microbenchmark::microbenchmark(
loop = f1(vec),
filter = f2(vec)
)
On my PC the ratio of the medians gives my code 11 times faster.

I was thinking about using Rcpp for this, but one of the answer mentioned rfilter built internally in R, so I had a check:
/* recursive filtering */
SEXP rfilter(SEXP x, SEXP filter, SEXP out)
{
if (TYPEOF(x) != REALSXP || TYPEOF(filter) != REALSXP
|| TYPEOF(out) != REALSXP) error("invalid input");
R_xlen_t nx = XLENGTH(x), nf = XLENGTH(filter);
double sum, tmp, *r = REAL(out), *rx = REAL(x), *rf = REAL(filter);
for(R_xlen_t i = 0; i < nx; i++) {
sum = rx[i];
for (R_xlen_t j = 0; j < nf; j++) {
tmp = r[nf + i - j - 1];
if(my_isok(tmp)) sum += tmp * rf[j];
else { r[nf + i] = NA_REAL; goto bad3; }
}
r[nf + i] = sum;
bad3:
continue;
}
return out;
}
This function is already pretty look and I don't think I could write an Rcpp one to beat it with great improvement. I did a benchmark with this rfilter and the f1 function in the accepted answer:
f1 <- function(vec, constant = 0.6){
N <- length(vec) - 1
M <- numeric(N - 1)
for(i in seq_len(N)[-c(1, N)]){
M[i] = vec[i + 1] - vec[i] - constant*M[i - 1]
}
M
}
AR1_FILTER <- function (x, filter, full = TRUE) {
n <- length(x)
AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
if (!full) AR1 <- AR1[-1L]
AR1
}
f2 <- function (vec, constant) {
dv <- diff(vec)
AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
}
library(microbenchmark)
Bench <- function (n) {
vec <- runif(n)
microbenchmark("R" = f1(vec, 0.6), "C" = f2(vec, 0.6))
}
For short vectors with length 100, I got
Bench(100)
Unit: microseconds
expr min lq mean median uq max neval
R 68.098 69.8585 79.05593 72.456 74.6210 244.148 100
C 66.423 68.5925 73.18702 69.793 71.1745 150.029 100
For large vectors with length 10000, I got
Bench(10000)
Unit: microseconds
expr min lq mean median uq max neval
R 6168.742 6699.9170 6870.277 6786.0415 6997.992 8921.279 100
C 876.934 904.6175 1192.000 931.9345 1034.273 2962.006 100
Yeah, there is no way that R is going to beat a compiled language.

library(dplyr)
L2 <- c(0,lead(vec) - vec - constant * lag(L))
L2 <- L2[!is.na(L2)]
L2
[1] 0.00000000 1.09605531 -0.62765133 1.81529867 -2.10535596 3.10864280 -4.36975556 1.41375965
[9] -1.08809820 2.16767510 -1.82140234 1.14748512 -0.89245650 0.03962074 -0.10930073 1.48162072
[17] -1.63074832 2.21593009
all.equal(L,L2)
[1] TRUE

Select a subset of combinations

Suppose that I have a 20 X 5 matrix, I would like to select subsets of the matrix and do some computation with them. Further suppose that each sub-matrix is 7 X 5. I could of course do
ncomb <- combn(20, 7)
which gives me all possible combinations of 7 row indices, and I can use these to obtain sub-matrices. But with a small, 20 X 5 matrix, there are already 77520 possible combination. So I would like to instead randomly sample some of the combinations, e.g., 5000 of them.
One possibility is the following:
ncomb <- combn(20, 7)
ncombsub <- ncomb[, sample(77520, 5000)]
In other words, I obtain all possible combinations, and then randomly select only 5000 of the combinations. But I imagine it would be problematic to compute all possible combinations if I had a larger matrix - say, 100 X 7.
So I wonder if there is a way to get subsets of combinations without first obtaining all possible combinations.

Your approach:
op <- function(){
ncomb <- combn(20, 7)
ncombsub <- ncomb[, sample(choose(20,7), 5000)]
return(ncombsub)
}
A different strategy that simply samples seven rows from the original matrix 5000 times (replacing any duplicate samples with a new sample until 5000 unique row combinations are found):
me <- function(){
rowsample <- replicate(5000,sort(sample(1:20,7,FALSE)),simplify=FALSE)
while(length(unique(rowsample))<5000){
rowsample <- unique(rowsample)
rowsample <- c(rowsample,
replicate(5000-length(rowsample),
sort(sample(1:20,7,FALSE)),simplify=FALSE))
}
return(do.call(cbind,rowsample))
}
This should be more efficient because it prevents you from having to calculate all of the combinations first, which will get costly as the matrix gets larger.
And yet, some benchmarking reveals that is not the case. At least on this matrix:
library(microbenchmark)
microbenchmark(op(),me())
Unit: milliseconds
expr min lq median uq max neval
op() 184.5998 201.9861 206.3408 241.430 299.9245 100
me() 411.7213 422.9740 429.4767 474.047 490.3177 100

I ended up doing what #Roland suggested, by modifying combn(), and byte-compiling the code:
combn_sub <- function (x, m, nset = 5000, seed=123, simplify = TRUE, ...) {
stopifnot(length(m) == 1L)
if (m < 0)
stop("m < 0", domain = NA)
if (is.numeric(x) && length(x) == 1L && x > 0 && trunc(x) ==
x)
x <- seq_len(x)
n <- length(x)
if (n < m)
stop("n < m", domain = NA)
m <- as.integer(m)
e <- 0
h <- m
a <- seq_len(m)
len.r <- length(r <- x[a] )
count <- as.integer(round(choose(n, m)))
if( count < nset ) nset <- count
dim.use <- c(m, nset)
##-----MOD 1: Change the output matrix size--------------
out <- matrix(r, nrow = len.r, ncol = nset)
if (m > 0) {
i <- 2L
nmmp1 <- n - m + 1L
##----MOD 2: Select a subset of indices
set.seed(seed)
samp <- sort(c(1, sample( 2:count, nset - 1 )))
##----MOD 3: Start a counter.
counter <- 2L
while (a[1L] != nmmp1 ) {
if (e < n - h) {
h <- 1L
e <- a[m]
j <- 1L
}
else {
e <- a[m - h]
h <- h + 1L
j <- 1L:h
}
a[m - h + j] <- e + j
#-----MOD 4: Whenever the counter matches an index in samp,
#a combination of row indices is produced and stored in the matrix `out`
if(samp[i] == counter){
out[, i] <- x[a]
if( i == nset ) break
i <- i + 1L
}
#-----Increase the counter by 1 for each iteration of the while-loop
counter <- counter + 1L
}
}
array(out, dim.use)
}
library("compiler")
comb_sub <- cmpfun(comb_sub)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Efficiently compute a sum based on sequences in R - r

Related

Select matrix rows that are permutations of a given vector

More efficient function and for loop

How to deal with very big numbers producing NaN

vectorizing an R-loop with backward dependency

Select a subset of combinations

Categories

Resources