Pre-allocation and optimization loop - r

My R script have the form:
for (j in 1:N) {
#construct the DF2 data frame
#operations on the DF2 data frame
}
Where N can be large (like a 1 mln). The columns of DF2 are defined
one after the other with the formula:
DF2$column_i <- function(x,f..) #or constant or ....
DF$column_i can are a constant, a function or a loop "while". I tried to pre allocate defining DF2 before with:
DF2 <- data.frame(matrix(nrow=..,ncol=..))
and computing after the columns DF2$column_i, but I have not had any benefits.
Does anyone have any ideas?

My code is of the type:
par <- data.frame(CA=runif(n = 50, min = 70000, max = 100000),
D=round(runif(n = 50, min = 70, max = 90),0),
P=runif(n = 50, min = 900, max = 20000),
A=round(runif(n = 50, min = 50, max = 70),0))
parpa <- data.frame(matrix(nrow = nrow(par), ncol = 3*V))
comp <- function(CA, D, P, A){
vect <- rep('numeric', 3*V)
b <- 1
k <- 1
while (((b+1) <= (D+1))&(k < V)) {
a <- b+1
b <- min((a+8-1), (D+1))
vect[c(1+4*k, 2+4*k, 3+4*k, 4+4*k)] <- c(mean(DF2$Z[a:b]), sum(DF2$X[a:b]),
mean(DF2$Q[a:b]), sum(DF2$AE[a:b]))
k <- k+1
}
return(vect)
}
#loop
for (j in 1:nrow(par)) {
CA <- par$CA[j]
D <- par$D[j]
R <- 0.01*D
P <- par$P[j]
A <- par$A[j]
COST <- 500
V <- 5
#DF2
DF2 <- data.frame(M=0:D)
OB <- function(x) {
c <- COST*D*DF2$M/R
return(c)
}
DF2$O <- O(D)
DF2$E <- (D*DF2$M+2)/D*(D+4)
DF2$Q <- (CA-DF2$M)*D
DF2$X <- (CA-DF2$O)*(DF2$E+P)
Func <- function(x) {return(round(x/30, 2))}
DF2$Z[(A+2):(D+1)] <- sapply(DF2$E[(A+2):(D+1)], Func)
parpa[j,] <- comp(CA, D, P, A)
}

Related

Using app function from {terra} package on raster stacks? (in parallel)

I have four high resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell, of each 'stack' of the raster tiles, to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish) but I'm convinced that there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach and particularly optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <-rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel ? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
one_a_tile <- rast(unlist(a_tiles))
one_b_tile <- rast(unlist(b_tiles))
one_c_tile <- rast(unlist(c_tiles))
one_d_tile <- rast(unlist(d_tiles))
output_file <- output_files
# I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
# I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
one_a_tile[is.na(one_a_tile)] <- 0
one_b_tile[is.na(one_b_tile)] <- 0
one_c_tile[is.na(one_c_tile)] <- 0
one_d_tile[is.na(one_d_tile)] <- 0
z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
## Inner function that actually does the work I want doing
inner_function <- function(z) {
names(z) <- c('a', 'b', 'c', 'd')
if (z['a'] <= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 10),
(z['c'] + z['a'] * 20)))
}
if (z['a'] >= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 40),
(z['c'] + z['a'] * 10)))
}
if (z['a'] == z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 60),
(z['c'] + z['a'] * 10)))
}
y <- ifelse(y == 0, NA, y)
return(y)
}
app(z,
inner_function,
filename = output_file,
overwrite = TRUE,
wopt = list(datatype = "INT4U"))
return(output_file)
}
results <- mapply(outer_function,
a_tiles = a_tiles,
b_tiles = b_tiles,
c_tiles = c_tiles,
d_tiles = d_tiles,
output_files = output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif"))
names(results) <- NULL
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}

multiple data frames with similar names

I needed to generate array or many data frames from other data frames which only varied in names. This required me to do a lot of copy-paste works. Is it possible that I can make it cleaner but not keep copying and pasting? Follows are two examples from many similar cases of the analysis I am doing now (I will provide codes for reproduction at the end of the question), which I think may be able to make them cleaner with the same approach.
case 1, create an array with data from per_d1,per_d1,per_d3,per_d4,per_d5
perd <- array(dim=c(7,15,5))
perd [,,1] <- as.matrix(per_d$per_d1)
perd [,,2] <- as.matrix(per_d$per_d2)
perd [,,3] <- as.matrix(per_d$per_d3)
perd [,,4] <- as.matrix(per_d$per_d4)
perd [,,5] <- as.matrix(per_d$per_d5)
case 2, create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function (i){
res <- lapply(exps, function(x){
g <- as.matrix (m0 * exp (-x * i))
for (i in 1:l) {
for (j in 1:t){
g [i,j] <- abs((round (rnorm(1,mean = g[i,j],sd=3), digits = 3)))
colnames(g) <- paste('t', 1:ncol(g), sep = "")
g <- as.data.frame(g)
}}
return(g)
}
)
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
names(ge[[i]]) <- paste("g", seq_along(ge), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the followings. But the codes, unfortunately, are not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
pred <- array(b, dim = c(7,15,5))
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for(i in 1:5) {
a <- per_d[[i]]
b <- ge[[i]][[1]]
c <- ge[[i]][[2]]
d <- ge[[i]][[3]]
e <- ge[[i]][[4]]
f <- ge[[i]][[5]]
arg <- list(a, b, c, d, e, f)
name[[i]] <- paste0("df_", i)
out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)

Convert for loops into foreach loops

I want to make the code below more efficient by using the foreach package. I tried it for a very long time but I don't manage to get the same result as when using the for-loops. I would like to use a nested foreach-loop including parallelization... And as output I would like to have two matrices with dim [R,b1] I would be very grateful for some suggestions!!
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
calcPower <- function( R, b0, grid) {
cl <- makeCluster(3)
registerDoParallel(cl)
## n and b1 coefficients
n <- grid$n
b1 <- grid$b1
## ensures reproducibility
set.seed(2020)
x <- runif(n, 18, 80)
x.dich <- factor( ifelse( x < median( x), 0, 1))
## enables to store two outputs
solution <- list()
## .options.RNG ensures reproducibility
res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
p.val <- list()
p.val.d <- list()
for( j in seq_along(b1)) {
y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c( p.val, ifelse( summary(mod.lm)$coef[2,4] <= 0.05, 1, 0))
p.val.d <- c( p.val.d, ifelse( summary(mod.lm.d)$coef[2,4] <= 0.05, 1, 0))
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
return(solution)
}
dp.val <- matrix( unlist(res[,1], use.names = FALSE), R, length(b1), byrow = TRUE)
dp.val.d <- matrix( unlist(res[,2], use.names = FALSE), R, length(b1), byrow = TRUE)
stopCluster(cl)
df <- data.frame(
effectS = b1,
power = apply( dp.val, 2, function(x){ mean(x) * 100}),
power.d = apply( dp.val.d, 2, function(x){ mean(x) * 100}),
n = factor(n))
return(df)
}
## simulation for different n
tmp <- with(grid,
by( grid, n,
calcPower, R = R, b0 = b0))
## combines the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in following code. There had to be some changes made. It is a lot easier to return a list then a matrix in foreach, since it's combined with rbind. Especially when you want to return multiple ones. My solution here is to save everything in a list and afterwards transform it into a matrix of length 100.
Note: there is one mistake in your code. summary( mod.lm.d)$coef[2,4] does not exist. I changed it to [2]. Adjust to your needing
solution <- list()
df2<-foreach(i = 1:R, .combine = rbind, .inorder=TRUE) %dopar%{
set.seed(i)
p.val <- list()
p.val.d <- list()
counter <- list()
for( j in seq_along(b1)){
x <- sort( runif(n, 18, 80))
x.dich <- factor( ifelse( x < median(x), 0, 1))
y <- b0 + b1[j] * x + rnorm( n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c(p.val, ifelse( summary( mod.lm)$coef[2] <= 0.05, 1, 0))
p.val.d <- c(p.val.d, ifelse( summary( mod.lm.d)$coef[2] <= 0.05, 1, 0))
counter <- c(counter, j)
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
solution[[3]] <- counter
return(solution)
}
dp.val <- unlist(df2[,1], use.names = FALSE)
dp.val.d <- unlist(df2[,2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
for your comment:
A foreach does work with a normal for loop. Minimal reproducible example:
df<-foreach(i = 1:R, .combine = cbind, .inorder=TRUE) %dopar%{
x <- list()
for(j in 1:3){
x <- c(x,j)
}
return(x)
}

Avoiding Loops to Generate a Complex Dataframe with Nested Lists

Here is a kind of DF, I have to generate to store simulations data).
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric() ,
distribution = character(),
simul = numeric() ,
sampleSize = numeric() ,
censoredRate = numeric() ,
dta = I(list()) ,
quantileLD = I(list()) ,
stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
for(j in 1:nbSampleSizes){
for(i in 1:nbSimul){
# Scenario Id + Other info
v <- v + 1
df.sampled[v,"cas"] <- v
df.sampled[v,"distribution"] <- "logNormal"
df.sampled[v,"simul"] <- i
df.sampled[v,"sampleSize"] <- sampleSize[j]
df.sampled[v,"censoredRate"] <- censoredRate[k]
X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
estimatedLD <- array(9)
for(w in 1:9){
estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]]
}
df.sampled$dta[v] <- list(X)
df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
}
}
}
Which is quite difficult to read.
I would like to find a way to avoid loops, and to reference easily scenarios (v) and attached variables.
Any idea?

Resources