I need to repeat a process 1000 times and save the results as I go along, but I’m not sure how to do it.
Here’s what I have:
x1 = runif(4000, min = 0, max = 1)
x2 = runif(4000, min = 0, max = 1)
y <- 1*x1 - 2*x2 + rnorm(4000)
df <- data.frame(y, x1, x2)
# part that needs to be replicated 1000 times:
set.seed(2)
df2 <- df[sample(1 : nrow(df), 4000, replace = T ), ]
x= as.matrix(df2[,-1])
y= as.matrix(df2[,1])
OLS <- solve(t(x)%*%x)%*%t(x)%*%y
# What I think might work
set.seed(2)
n = 1000
out <- replicate(n, {df2 <- df[sample(1 : nrow(df), 4000, replace = T ), ]
})
The problem is that I cannot figure out how to build the x and y matrices and compute the OLS estimate inside the replicate() call. Perhaps a loop would be better?
What is the most efficient way to do this?
Add simplify = FALSE to replicate(), which will save the results in a list. You can put multiple commands into one call.
out <- replicate(n, {
  df2 <- df[sample(1:nrow(df), 4000, replace = TRUE), ]
  x <- as.matrix(df2[, -1])
  y <- as.matrix(df2[, 1])
  solve(t(x) %*% x) %*% t(x) %*% y  # the last expression of the block is what replicate() stores
}, simplify = FALSE)
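Each element of out is then a 2 x 1 matrix of coefficient estimates. As a small usage sketch (assuming you want the bootstrap distribution of the two slopes), you can stack the list into a matrix:
# stack the 1000 coefficient vectors into a 1000 x 2 matrix
est <- t(sapply(out, as.vector))
colnames(est) <- c("x1", "x2")
colMeans(est)      # bootstrap means of the coefficients
apply(est, 2, sd)  # bootstrap standard errors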
I need to generate an array and many data frames from other data frames that differ only in their names. This currently requires a lot of copy-and-paste work. Is it possible to make this cleaner without all the copying and pasting? Below are two examples from the many similar cases in my analysis (I provide code for reproduction at the end of the question); I think both can be cleaned up with the same approach.
Case 1: create an array with data from per_d1, per_d2, per_d3, per_d4, per_d5.
perd <- array(dim = c(7, 15, 5))
perd[,,1] <- as.matrix(per_d$per_d1)
perd[,,2] <- as.matrix(per_d$per_d2)
perd[,,3] <- as.matrix(per_d$per_d3)
perd[,,4] <- as.matrix(per_d$per_d4)
perd[,,5] <- as.matrix(per_d$per_d5)
Case 2: create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
Code for reproduction:
N <- 1
CS <- 10.141
S <- seq(7.72, 13, 0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
  per <- matrix(nrow = length(S)*N, ncol = t+1)
  for (i in 1:dim(per)[1]) {
    for (j in 1:t+1){  # note: 1:t+1 is (1:t)+1, i.e. columns 2 to t+1
      per[,1] <- replicate(n = N, S)
      per[i,j] <- round(abs(rnorm(1, mean = per[i,1], sd = x)), digits = 3)
      colnames(per) <- c('physical', paste('t', 1:15, sep = ""))
      per <- as.data.frame(per)
    }
  }
  per <- per[,-1]
  return(per)
})
per_d <- lapply(per, function(x) abs(x - 10.141))
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function(i){
  res <- lapply(exps, function(x){
    g <- as.matrix(m0 * exp(-x * i))  # i here is the per_d element passed in
    for (i in 1:l) {
      for (j in 1:t){
        g[i,j] <- abs(round(rnorm(1, mean = g[i,j], sd = 3), digits = 3))
        colnames(g) <- paste('t', 1:ncol(g), sep = "")
        g <- as.data.frame(g)
      }
    }
    return(g)
  })
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
  names(ge[[i]]) <- paste("g", seq_along(ge[[i]]), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the following. The code, unfortunately, is not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
perd <- array(b, dim = c(7, 15, 5))
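The three steps also collapse into a one-liner if you prefer; this is just a compact variant of the code above:
# unlist() flattens the list of 7 x 15 matrices in order, which matches
# how array() fills its dimensions
perd <- array(unlist(lapply(per_d, as.matrix)), dim = c(7, 15, 5))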
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for (i in 1:5) {
  a <- per_d[[i]]
  b <- ge[[i]][[1]]
  c <- ge[[i]][[2]]
  d <- ge[[i]][[3]]
  e <- ge[[i]][[4]]
  f <- ge[[i]][[5]]
  arg <- list(a, b, c, d, e, f)
  name[[i]] <- paste0("df_", i)
  out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)
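The loop can be condensed further; here is a sketch of the same idea as a single lapply() call (it assumes each ge[[i]] contains the five g data frames in order; unname() is needed so that do.call() matches the six pieces positionally to dfmak's arguments):
out <- lapply(1:5, function(i) do.call(dfmak, unname(c(per_d[i], ge[[i]]))))
names(out) <- paste0("df_", 1:5)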
My R script has the form:
for (j in 1:N) {
#construct the DF2 data frame
#operations on the DF2 data frame
}
where N can be large (like 1 million). The columns of DF2 are defined one after the other with expressions of the form:
DF2$column_i <- function(x,f..) #or constant or ....
DF2$column_i can be a constant, a function, or a while loop. I tried to preallocate by defining DF2 beforehand with:
DF2 <- data.frame(matrix(nrow=..,ncol=..))
and then computing the columns DF2$column_i, but I have not seen any benefit.
Does anyone have any ideas?
My code is of the type:
# V and COST do not change across iterations, so define them up front
# (V is needed here to size parpa before the loop)
V <- 5
COST <- 500
par <- data.frame(CA = runif(n = 50, min = 70000, max = 100000),
                  D = round(runif(n = 50, min = 70, max = 90), 0),
                  P = runif(n = 50, min = 900, max = 20000),
                  A = round(runif(n = 50, min = 50, max = 70), 0))
parpa <- data.frame(matrix(nrow = nrow(par), ncol = 3*V))
comp <- function(CA, D, P, A){
  vect <- vector('numeric', 3*V)  # note: rep('numeric', 3*V) would create a character vector
  b <- 1
  k <- 1
  while (((b+1) <= (D+1)) & (k < V)) {
    a <- b+1
    b <- min((a+8-1), (D+1))
    vect[c(1+4*k, 2+4*k, 3+4*k, 4+4*k)] <- c(mean(DF2$Z[a:b]), sum(DF2$X[a:b]),
                                             mean(DF2$Q[a:b]), sum(DF2$AE[a:b]))
    k <- k+1
  }
  return(vect)
}
#loop
for (j in 1:nrow(par)) {
  CA <- par$CA[j]
  D <- par$D[j]
  R <- 0.01*D
  P <- par$P[j]
  A <- par$A[j]
  #DF2
  DF2 <- data.frame(M = 0:D)
  OB <- function(x) {
    COST*D*DF2$M/R
  }
  DF2$O <- OB(D)  # the function is named OB; it was previously called as O(D), which does not exist
  DF2$E <- (D*DF2$M+2)/D*(D+4)
  DF2$Q <- (CA-DF2$M)*D
  DF2$X <- (CA-DF2$O)*(DF2$E+P)
  Func <- function(x) round(x/30, 2)
  DF2$Z[(A+2):(D+1)] <- sapply(DF2$E[(A+2):(D+1)], Func)
  parpa[j,] <- comp(CA, D, P, A)
}
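Not a drop-in rewrite, but the usual remedy in loops like this is to avoid creating and indexing data.frame columns a million times. A rough sketch of that pattern (assuming the per-iteration arithmetic stays as above):
# preallocate the result as a numeric matrix: assigning a matrix row is
# much cheaper than assigning a data.frame row inside a long loop
parpa <- matrix(NA_real_, nrow = nrow(par), ncol = 3*V)
for (j in seq_len(nrow(par))) {
  D <- par$D[j]
  # plain vectors instead of data.frame columns
  M <- 0:D
  E <- (D*M + 2)/D*(D + 4)
  Q <- (par$CA[j] - M)*D
  # ... remaining per-iteration computations on these vectors ...
  # parpa[j, ] <- comp(...)
}
parpa <- as.data.frame(parpa)  # convert once at the end, if a data frame is needed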
Let's start with some generated data, which are pretty realistic:
library(data.table)
tmp <- data.table(
label = sprintf( "X%03d", 1:500),
start = sample( 50:950, 500, replace=TRUE ),
length = round( 20 * rf( rep(1, 500), 5, 5 ), 0 )
)
DT <- tmp[ , list( t = seq( start, length.out=length ) ), by = label ]
DT[ , I := sample(1:100, 1) * dbeta( seq(from=0,to=1, length.out=length(t)), sample(3:6,1), sample(5:10,1) ), by = label ]
DT <- DT[ I > 1E-2 ]
DT represents time series data for (in this case) 500 labels:
library(ggplot2)
ggplot( DT[ t %between% c(100,200) ], aes( x = t, y = I, group = label ) ) +
geom_line()
I want to correlate the data by all label pairs, given that they have sufficient overlap. This is my approach:
# feel free to use just a subset here
labs <- DT[ , unique( label ) ][1:50]
# is needed for fast intersecting
setkey( DT, t )
# just needed for tracking progress
count <- 0
progress <- round(seq( from = 1, to = length(labs) * (length(labs) -1) / 2, length.out=100 ),0)
corrs <-
combn( labs, m=2, simplify=TRUE, minOverlap = 5, FUN = function( x, minOverlap ) {
# progress
count <<- count + 1
if( count %in% progress ){
cat( round( 100*count/max(progress),0 ), ".." )
}
# check overlap and correlate
a <- DT[label == x[1]]
b <- DT[label == x[2]]
iscectT <- intersect( a[ , t], b[ , t] )
n <- length(iscectT)
if( n >= minOverlap ){
R <- cor( a[J(iscectT)][, I], b[J(iscectT)][, I] )
return( c( x[1], x[2], n, min(iscectT), max(iscectT), R) )
}
else{
# only needed because of simplify = TRUE
return( rep(NA, 6) )
}
})
This works fine, but it is much slower than expected; in this particular case it takes up to 10 minutes on my machine.
Any help on improving the performance of this approach is highly appreciated. Questions that came to my mind:
Do I have to expect any side effects on DT if I deploy one of R's parallelization mechanisms, e.g. foreach? Is there a parallelization interface for data.table as there is, for example, for plyr?
Is there a way of using combn with simplify = FALSE without runtimes getting ever worse as the process goes on? I assume a lot of list copying takes place because of growing list capacities.
Is there anything I can do on the algorithmic side to make this faster?
As Roland suggested in his comment, using combn just to calculate the combinations of labels and then performing the joins directly on the data.table is orders of magnitude faster:
minOverlap <- 5  # same threshold as in the combn() approach above

# all label pairs as a two-column table
corrs <- as.data.frame(do.call(rbind, combn(labs, m = 2, simplify = FALSE)),
                       stringsAsFactors = FALSE)
names(corrs) <- c("a", "b")
setDT(corrs)

# attach the full series of label a to each pair ...
setkey(DT, label)
setkey(corrs, a)
corrs <- corrs[DT, nomatch = 0, allow.cartesian = TRUE]

# ... and keep only the time points also present for label b
setkey(corrs, b, t)
setkey(DT, label, t)
corrs <- corrs[DT, nomatch = 0]

# drop pairs with insufficient overlap, then correlate
corrs[, overlap := .N >= minOverlap, by = list(a, b)]
corrs <- corrs[(overlap)]
corrs <- corrs[, list(start = min(t), end = max(t), R = cor(I, I.1)),
               by = list(a, b)]
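The speedup comes from moving the work out of R-level code: instead of subsetting DT and calling intersect() once per pair, the two keyed joins let data.table compute all pair/time matches in one pass, and the overlap filter and the correlations then run as grouped operations by (a, b).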