Vectorizing nested ifelse - R

I'm trying to speed up my function in R. It consists of three ifelse statements, one of which is nested. For the single one I applied vectorization, which reduced my computation time. Unfortunately, I don't see how I can vectorize the nested one; every way I try returns an error. Furthermore, is there any other trick I can use to speed it up?
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = sum(ifelse(epsilon > theta, 1,
                    ifelse(epsilon < -theta, -1, 0)))
    r[t] <- Zt / (l * n)
    theta <- ifelse(runif(n) < s, abs(r[t]), theta)
  }
  return(mean(r))
}
system.time(cont.run())
Here is my attempt:
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = rep(NA, length(theta))
    Zt = sum(Zt[epsilon > theta, 1])
    Zt = sum(Zt[epsilon < -theta, -1])
    r[t] <- Zt / (l * n)
    theta = rep(theta, length(s))
    theta[runif(n) < s] = abs(r[t])
  }
  return(mean(r))
}
system.time(cont.run())

Here's slightly improved code.
The main change is that we don't use a double ifelse; instead we perform two sums over logical vectors (sum(epsilon > theta) - sum(epsilon < -theta)), since we don't care about the zeroes here. I added a couple of other improvements (e.g., replaced rep with numeric, moved some operations outside the for loop).
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  return(mean(r))
}
library(microbenchmark)
microbenchmark(cont.run(), contRun())
Unit: seconds
expr min lq mean median uq max neval
cont.run() 13.652324 13.749841 13.769848 13.766342 13.791573 13.853786 100
contRun() 6.533654 6.559969 6.581068 6.577265 6.596459 6.770318 100
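As a quick sanity check (my own sketch, not part of the original answer), the two-sum identity can be verified against the nested ifelse directly:
set.seed(42)
theta <- runif(10, -1, 1)
epsilon <- rnorm(1, 0, 0.005)
a <- sum(ifelse(epsilon > theta, 1, ifelse(epsilon < -theta, -1, 0)))
b <- sum(epsilon > theta) - sum(epsilon < -theta)
a == b
# [1] TRUE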
PS. For this kind of computation you might want to set a seed (set.seed() before the for loop) to make sure that you can reproduce your results.

Furthermore, is there any other trick I can use to speed it up?

In addition to PoGibas' answer, you can avoid calling ifelse and get a faster function as follows:
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  mean(r)
}
contRun2 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # avoid ifelse
    theta[runif(n) < s] <- absr
  }
  mean(r)
}
contRun3 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # replace runif
    theta[sample(c(TRUE, FALSE), prob = c(s, 1 - s), size = n, replace = TRUE)] <- absr
  }
  mean(r)
}
# gives the same
set.seed(1)
o1 <- contRun()
set.seed(1)
o2 <- contRun2()
set.seed(1)
o3 <- contRun3()
all.equal(o1, o2)
#R [1] TRUE
all.equal(o1, o3) # likely will not match
#R [1] "Mean relative difference: 0.1508537"
# but distribution is the same
set.seed(1)
c1 <- replicate(10000, contRun2(reps = 100, n = 100))
c2 <- replicate(10000, contRun3(reps = 100, n = 100))
par(mfcol = c(1, 2), mar = c(5, 4, 2, .5))
hist(c1, breaks = seq(-.015, .015, length.out = 26))
hist(c2, breaks = seq(-.015, .015, length.out = 26))
# the latter is faster
microbenchmark::microbenchmark(
  contRun  = {set.seed(1); contRun()},
  contRun2 = {set.seed(1); contRun2()},
  contRun3 = {set.seed(1); contRun3()},
  times = 5)
#R Unit: seconds
#R expr min lq mean median uq max neval
#R contRun 7.121264 7.371242 7.388159 7.384997 7.443940 7.619352 5
#R contRun2 3.811267 3.887971 3.892523 3.892158 3.921148 3.950070 5
#R contRun3 1.920594 1.920754 1.998829 1.999755 2.009035 2.144005 5
The only bottleneck now is runif in contRun2. Replacing it with sample yields quite an improvement.
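To see why all.equal(o1, o3) fails even with identical seeds (a minimal illustration of my own): sample() with a prob argument consumes the random number stream differently than runif(), so the two calls make different selections even though the marginal probability is the same s:
set.seed(1)
runif(5) < 0.1
set.seed(1)
sample(c(TRUE, FALSE), size = 5, replace = TRUE, prob = c(0.1, 0.9))
# the two logical vectors will generally differ, though both are TRUE with probability 0.1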

Related

How can I fix the while loop problem in R?

I wrote the code below; sometimes it returns a proper value, but sometimes it fails to finish for a long time.
I suspect there is an infinite-loop problem with the while statements, but I can't figure out how to fix it.
I've already searched for information about while loops, and I believe I wrote mine properly, yet I can't see why it sometimes runs fine and sometimes doesn't.
Could you please give me advice or a proper modification?
Thank you.
rm(list=ls())
library(readxl)
library(dplyr)
library(ggplot2)
library(MASS)
# Mean Vector, Covariance Matrix Construction
mu <- c(0,0,0)
mu <- t(mu)
mu <- t(mu)
mu
# Construct 40 random variables for Phase II
mu2 <- c(1, 2, 1)
mu2 <- t(mu2)
mu2 <- t(mu2)
mu2
Sigma <- matrix(c(1, 0.9, 0.9, 0.9, 1, 0.9, 0.9, 0.9, 1), 3)
Sigma
getResult <- function(Result) {
  # Construct 50 Random Variables for Phase I
  Obs <- mvrnorm(50, mu = mu, Sigma = Sigma)
  VecT2 <- apply(Obs, 2, mean)
  VecT2 <- round(VecT2, 3)
  ST2 <- cov(Obs)
  ST2 <- round(ST2, 3)
  Obs <- as.matrix(Obs)
  T2All <- rep(0, nrow(Obs))
  for (i in 1:nrow(Obs)) {
    T2All[i] = t(Obs[i, ] - VecT2) %*% solve(ST2) %*% (Obs[i, ] - VecT2)
  }
  # Construct Control Limit
  Alpha <- 0.005
  M <- nrow(Obs)
  M
  p <- ncol(Obs)
  p
  UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
  UCL <- round(UCL, 3)
  Compare <- which(T2All > UCL)
  # Repeat when there are Out of Control points in Phase I, eliminating them
  while (isTRUE(Compare > UCL)) {
    Obs <- Obs[-Compare, ]
    Alpha <- 0.005
    M <- nrow(Obs)
    p <- ncol(Obs)
    UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
    Compare <- which(T2All > UCL)
  }
  UCL <- round(UCL, 3)
  # Prepare two sets of observations: Obs20_1, Obs20_2
  Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
  Obs20_2 <- mvrnorm(20, mu = mu2, Sigma = Sigma)
  Obs40 <- rbind(Obs20_1, Obs20_2)
  Obs40 <- as.matrix(Obs40)
  T2 <- rep(0, nrow(Obs40))
  for (i in 1:nrow(Obs40)) {
    T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
  }
  Result <- which(T2 > UCL)[1]
  # Repeat when Out of Control occurs in the ARL0 section
  while (isTRUE(Result < 20)) {
    Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
    Obs40 <- rbind(Obs20_1, Obs20_2)
    Obs40 <- as.matrix(Obs40)
    T2 <- rep(0, nrow(Obs40))
    for (i in 1:nrow(Obs40)) {
      T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
    }
    Result <- which(T2 > UCL)[1]
  }
  Result
}
# Result
Final <- replicate(n = 200, expr = getResult(Result))
Final <- Final - 20
Final
mean(Final)
You could try using a for loop with a fixed maximum number of iterations instead of an unbounded while loop, as sketched below.
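A minimal sketch (reusing the question's variables; max_iter is an assumed cap, not from the original answer) of bounding the second while loop so the simulation always terminates:
max_iter <- 10000  # assumed upper bound on retries; tune to your model
for (iter in 1:max_iter) {
  Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
  Obs40 <- rbind(Obs20_1, Obs20_2)
  T2 <- rep(0, nrow(Obs40))
  for (i in 1:nrow(Obs40)) {
    T2[i] <- t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
  }
  Result <- which(T2 > UCL)[1]
  if (!isTRUE(Result < 20)) break  # accept this run once no signal occurs before observation 20
}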

Convert for loops into foreach loops

I want to make the code below more efficient using the foreach package. I have tried for a long time, but I can't get the same result as with the for loops. I would like to use a nested foreach loop with parallelization... As output I would like two matrices of dim [R, b1]. I would be very grateful for suggestions!
library(doParallel)
library(doRNG)  # provides %dorng%
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
calcPower <- function(R, b0, grid) {
  cl <- makeCluster(3)
  registerDoParallel(cl)
  ## n and b1 coefficients
  n <- grid$n
  b1 <- grid$b1
  ## ensures reproducibility
  set.seed(2020)
  x <- runif(n, 18, 80)
  x.dich <- factor(ifelse(x < median(x), 0, 1))
  ## enables storing two outputs
  solution <- list()
  ## .options.RNG ensures reproducibility
  res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
    p.val <- list()
    p.val.d <- list()
    for (j in seq_along(b1)) {
      y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
      mod.lm <- lm(y ~ x)
      mod.lm.d <- lm(y ~ x.dich)
      p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2, 4] <= 0.05, 1, 0))
      p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2, 4] <= 0.05, 1, 0))
    }
    solution[[1]] <- p.val
    solution[[2]] <- p.val.d
    return(solution)
  }
  dp.val <- matrix(unlist(res[, 1], use.names = FALSE), R, length(b1), byrow = TRUE)
  dp.val.d <- matrix(unlist(res[, 2], use.names = FALSE), R, length(b1), byrow = TRUE)
  stopCluster(cl)
  df <- data.frame(
    effectS = b1,
    power = apply(dp.val, 2, function(x) mean(x) * 100),
    power.d = apply(dp.val.d, 2, function(x) mean(x) * 100),
    n = factor(n))
  return(df)
}
## simulation for different n
tmp <- with(grid,
            by(grid, n,
               calcPower, R = R, b0 = b0))
## combine the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in the following code. A few changes had to be made: in foreach it is a lot easier to return a list than a matrix, since results are combined with rbind, especially when you want to return multiple objects. My solution here is to save everything in a list and afterwards transform it into a matrix of length 100.
Note: there is one mistake in your code. summary(mod.lm.d)$coef[2,4] does not exist; I changed it to [2]. Adjust to your needs.
solution <- list()
df2 <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE) %dopar% {
  set.seed(i)
  p.val <- list()
  p.val.d <- list()
  counter <- list()
  for (j in seq_along(b1)) {
    x <- sort(runif(n, 18, 80))
    x.dich <- factor(ifelse(x < median(x), 0, 1))
    y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
    mod.lm <- lm(y ~ x)
    mod.lm.d <- lm(y ~ x.dich)
    p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2] <= 0.05, 1, 0))
    p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2] <= 0.05, 1, 0))
    counter <- c(counter, j)
  }
  solution[[1]] <- p.val
  solution[[2]] <- p.val.d
  solution[[3]] <- counter
  return(solution)
}
dp.val <- unlist(df2[, 1], use.names = FALSE)
dp.val.d <- unlist(df2[, 2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
Regarding your comment: foreach does work with a normal for loop inside. Minimal reproducible example:
df <- foreach(i = 1:R, .combine = cbind, .inorder = TRUE) %dopar% {
  x <- list()
  for (j in 1:3) {
    x <- c(x, j)
  }
  return(x)
}
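Note: %dopar% needs a registered parallel backend, and the stopCluster(cl) call above assumes a cluster object cl was created earlier; a typical setup (a sketch, assuming the doParallel backend) is:
library(doParallel)
cl <- makeCluster(2)  # assumed number of workers
registerDoParallel(cl)
# ... run the foreach code above ...
stopCluster(cl)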

How can I find minimum value for this inequality?

n <- 100
r <- rnorm(n, 0.5, 0.01)
xbar <- mean(r)
mu <- seq(from = 0, to = 1, by=0.0001)
F.mu <- function(xbar, mu) {
  hstar <- xbar * log(xbar/mu) + (1-xbar) * log((1-xbar)/(1-mu))
  ifelse(xbar >= mu, 1, exp(-n*hstar))
}
Then I want to find the minimum value which satisfies F.mu > 0.05.
Something like this maybe:
x <- F.mu(xbar, mu)
mu[which(x == min(x[x > 0.05]))]
# 0.6211
You could run your function with the previously specified values, then take the minimum of all results > 0.05.
set.seed(927254) # to make results comparable
n <- 100
r <- rnorm(n, 0.5, 0.01)
xbar <- mean(r)
mu <- seq(from=0.0, to=1, by=0.0001)
F.mu <- function(xbar. = xbar, mu. = mu) {
  hstar <- xbar * log(xbar/mu) + (1-xbar) * log((1-xbar)/(1-mu))
  ifelse(xbar >= mu, 1, exp(-n*hstar))
}
min(F.mu()[F.mu()>0.05])
# [1] 0.05018463
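As an alternative to the grid search (a sketch of my own, not from the answers above): since F.mu decreases in mu once mu > xbar, you could locate the boundary where F.mu crosses 0.05 directly with uniroot(); grid values just below it satisfy F.mu > 0.05:
# uses the two-argument F.mu from the question
g <- function(m) F.mu(xbar, m) - 0.05
root <- uniroot(g, interval = c(xbar, 1 - 1e-6))
root$root  # mu at which F.mu crosses 0.05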

Split a vector into chunks such that sum of each chunk is approximately constant

I have a large data frame with more than 100 000 records where the values are sorted
For example, consider the following dummy data set
df <- data.frame(values = c(1,1,2,2,3,4,5,6,6,7))
I want to create 3 groups of the above values (in sequence only) such that the sum of each group is more or less the same.
So for the above data, if I decide to divide the sorted df into 3 groups as follows, their sums will be
1. 1 + 1 + 2 +2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can I create this optimization in R? Any logic?
So, let's use pruning. I think the other solutions give a good split, but not the best one.
First, we want to minimize

D(p, q) = (S_p - S*)^2 + (S_q - S_p - S*)^2 + (S_n - S_q - S*)^2, with S* = S_n / 3,

where S_n is the cumulative sum of the first n elements and p < q are the two cut points.
computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}
I think the other solutions optimize over p and q independently, which won't give the global minimum (as expected for some particular cases).
optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2*S_star)^2)
  print(min <- computeD(p_star, q_star, S))
  count <- 0
  for (q in 2:(n-1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3*S3 < min) {
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind <- which.min(D)
      if (D[ind] < min) {
        # update optimal values
        p_star <- ind
        q_star <- q
        min <- D[ind]
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes a lot of iterations via the condition S3*S3 < min. But it gives the optimal solution; see optiCut(c(1, 2, 3, 3, 5, 10)).
For the solution with K >= 3, I basically reimplemented trees with nested tibbles, that was fun!
optiCut_K <- function(v, K) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i*S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)
  compute_children <- function(level, ind, val) {
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      }
    }
    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)
    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)
    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }
  compute_children(K, n, 0)
}
This gives you all the solutions that are at least as good as the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
Here is one approach:
splitter <- function(values, N) {
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}
How good is it? I calculate the mean and sd of the maximal difference between the split sums over 100 runs.
# split into 15 parts
set.seed(5)
z1 = as.data.frame(matrix(1:15, nrow = 1))
repeat {
  values = sort(sample(1:1000, 1000000, replace = TRUE))
  z = splitter(values, 15)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
[1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
[1] 210.6653
# with fewer splits (4)
set.seed(5)
z1 = as.data.frame(matrix(1:4, nrow = 1))
repeat {
  values = sort(sample(1:1000, 1000000, replace = TRUE))
  z = splitter(values, 4)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
#632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
#260.9864
library(microbenchmark)
# timing on a 1M-element vector:
values = sort(sample(1:1000, 1000000, replace = TRUE))
microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3)
)
Unit: milliseconds
expr min lq mean median uq max neval cld
sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044 100 b
sp_3 108.3283 116.2223 209.4777 173.0522 291.8669 409.7050 100 a
By the way, F. Privé is correct: this function does not give the globally optimal split. It is greedy, which is not a good characteristic for such a problem. It will produce splits with sums closer to the global sum / N in the initial part of the vector, but doing so compromises the splits in the later part of the vector; see the example below.
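For example (my own trace, worth double-checking), on the small vector from F. Privé's answer the greedy cuts land at the locally closest positions:
splitter(c(1, 2, 3, 3, 5, 10), 3)
#$`1`
#[1] 1 2 3 3
#$`2`
#[1] 5
#$`3`
#[1] 10
# group sums 9, 5, 10, whereas the optimal split (1,2,3), (3,5), (10) has sums 6, 8, 10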
Here is a test comparison of the three functions posted so far:
db = function(values, N) {
  temp = floor(sum(values)/N)
  inds = c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N-1)], length(values))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
} # had to change it a bit since the posted one would not work - the core
  # which calculates the splitting positions is the same
missuse <- function(values, N) {
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}
prive = function(v, N) { # added dummy N argument because of the tester function
  dummy = N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2*S_star)^2)
    print(min <- computeD(p_star, q_star, S))
    count <- 0
    for (q in 2:(n-1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3*S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind <- which.min(D)
        if (D[ind] < min) {
          # update optimal values
          p_star <- ind
          q_star <- q
          min <- D[ind]
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 = optiCut(v)
  inds = c(0, z3[1:2], length(v))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(v, re))
} # added output to be more in line with the other two
Function for testing:
tester = function(split, seed) {
  set.seed(seed)
  z1 = as.data.frame(matrix(1:3, nrow = 1))
  repeat {
    values = sort(sample(1:1000, 1000000, replace = TRUE))
    z = split(values, 3)
    z = lapply(z, sum)
    z = unlist(z)
    z1 = rbind(z1, z)
    if (nrow(z1) > 101) {
      break
    }
  }
  m = mean(apply(z1, 1, function(x) max(x) - min(x)))
  s = sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} # tests 100 random 1M-length vectors with elements drawn from 1:1000
tester(db, 5)
#mean sd
#779.5686 349.5717
tester(missuse, 5)
#mean sd
#481.4804 216.9158
tester(prive, 5)
#mean sd
#451.6765 174.6303
prive is the clear winner; however, it takes quite a bit longer than the other two, and it can handle splitting into 3 groups only.
microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
expr min lq mean median uq max neval cld
missuse(values, 3) 100.85978 111.1552 185.8199 120.1707 304.0303 393.4031 100 a
prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357 100 b
db(values, 3) 96.86879 104.5141 194.0085 117.6270 306.7143 500.6455 100 a
Here is another approach, using a cumulative-sum modulo trick:
N = 3
temp = floor(sum(df$values)/N)
inds = c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N-1)], NROW(df))
split(df$values, rep(1:N, if (N == 1) NROW(df) else diff(inds)))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
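To illustrate how the cut points are found (my own walk-through on the question's dummy data): cumsum(values) %% temp drops whenever the running sum passes a multiple of temp, so a negative diff marks a group boundary:
cs <- cumsum(c(1, 1, 2, 2, 3, 4, 5, 6, 6, 7))
cs
# [1]  1  2  4  6  9 13 18 24 30 37
cs %% 12  # temp = floor(37/3) = 12
# [1] 1 2 4 6 9 1 6 0 6 1
which(c(0, diff(cs %% 12)) < 0)
# [1]  6  8 10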

R - Vectorized implementation of ternary function

I have three vectors X, Y and Z of equal length n. I need to create an n x n x n array of a function f(X[i], Y[j], Z[k]). The straightforward way to do this is to loop sequentially through each element of each of the 3 vectors. However, the time required to compute the array grows rapidly (as n^3) with n. Is there a way to implement this using vectorized operations?
EDIT: As mentioned in the comments, I have added a simple example of what's needed.
set.seed(1)
X = rnorm(10)
Y = seq(11, 20)
Z = seq(21, 30)
F = array(0, dim = c(length(X), length(Y), length(Z)))
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
Thanks.
You can use nested outer calls:
set.seed(1)
X = rnorm(10)
Y = seq(11, 20)
Z = seq(21, 30)
F = array(0, dim = c(length(X), length(Y), length(Z)))
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
F2 <- outer(X, outer(Y, Z, "+"), "*")
> identical(F, F2)
[1] TRUE
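To see why the nesting works (a short note of my own): outer(Y, Z, "+") builds the length(Y) x length(Z) matrix of Y[j] + Z[k], and the outer product with X then prepends X's dimension, matching the i, j, k layout of the loop:
M <- outer(Y, Z, "+")  # 10 x 10 matrix with M[j, k] = Y[j] + Z[k]
dim(outer(X, M, "*"))
# [1] 10 10 10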
A microbenchmark including the expand.grid solution proposed by Nick K:
X = rnorm(100)
Y = 1:100
Z = 101:200
forLoop <- function(X, Y, Z) {
  F = array(0, dim = c(length(X), length(Y), length(Z)))
  for (i in 1:length(X))
    for (j in 1:length(Y))
      for (k in 1:length(Z))
        F[i,j,k] = X[i] * (Y[j] + Z[k])
  return(F)
}
nestedOuter <- function(X, Y, Z) {
  outer(X, outer(Y, Z, "+"), "*")
}
expandGrid <- function(X, Y, Z) {
  df <- expand.grid(X = X, Y = Y, Z = Z)
  G <- df$X * (df$Y + df$Z)
  dim(G) <- c(length(X), length(Y), length(Z))
  return(G)
}
library(microbenchmark)
mbm <- microbenchmark(
forLoop = F1 <- forLoop(X, Y, Z),
nestedOuter = F2 <- nestedOuter(X, Y, Z),
expandGrid = F3 <- expandGrid(X, Y, Z),
times = 50L)
> mbm
Unit: milliseconds
expr min lq mean median uq max neval
forLoop 3261.872552 3339.37383 3458.812265 3388.721159 3524.651971 4074.40422 50
nestedOuter 3.293461 3.36810 9.874336 3.541637 5.126789 54.24087 50
expandGrid 53.907789 57.15647 85.612048 88.286431 103.516819 235.45443 50
Here is, as an additional option, a possible Rcpp implementation (in case you like your loops). I wasn't able to outperform @Julien's solution (maybe someone can), but they have more or less the same timing:
library(Rcpp)
cppFunction('NumericVector RCPP(NumericVector X, NumericVector Y, NumericVector Z){
  int nrow = X.size(), ncol = 3, indx = 0;
  double temp(1);
  NumericVector out(pow(nrow, ncol));
  IntegerVector dim(ncol);
  for (int l = 0; l < ncol; l++){
    dim[l] = nrow;
  }
  for (int j = 0; j < nrow; j++) {
    for (int k = 0; k < nrow; k++) {
      temp = Y[j] + Z[k];
      for (int i = 0; i < nrow; i++) {
        out[indx] = X[i] * temp;
        indx += 1;
      }
    }
  }
  out.attr("dim") = dim;
  return out;
}')
Validating
identical(RCPP(X, Y, Z), F)
## [1] TRUE
A quick benchmark
set.seed(123)
X = rnorm(100)
Y = 1:100
Z = 101:200
nestedOuter <- function(X, Y, Z) outer(X, outer(Y, Z, "+"), "*")
library(microbenchmark)
microbenchmark(
  nestedOuter = nestedOuter(X, Y, Z),
  RCPP = RCPP(X, Y, Z),
  unit = "relative",
  times = 1e4)
# Unit: relative
# expr min lq mean median uq max neval
# nestedOuter 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 10000
# RCPP 1.164254 1.141713 1.081235 1.100596 1.080133 0.7092394 10000
You could use expand.grid as follows:
df <- expand.grid(X = X, Y = Y, Z = Z)
G <- df$X * (df$Y + df$Z)
dim(G) <- c(length(X), length(Y), length(Z))
all.equal(F, G)
If you had a vectorised function, this would work just as well. If not, you could use plyr::daply.
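For instance (a sketch assuming f is a scalar-only function; f here is hypothetical), Vectorize() can adapt such a function to the expand.grid approach:
f <- function(x, y, z) x * (y + z)  # hypothetical non-vectorised scalar function
fv <- Vectorize(f)
G2 <- fv(df$X, df$Y, df$Z)  # df as built by expand.grid above
dim(G2) <- c(length(X), length(Y), length(Z))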
