Avoiding Loops to Generate a Complex Dataframe with Nested Lists - r

Here is a kind of DF, I have to generate to store simulations data).
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric() ,
distribution = character(),
simul = numeric() ,
sampleSize = numeric() ,
censoredRate = numeric() ,
dta = I(list()) ,
quantileLD = I(list()) ,
stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
for(j in 1:nbSampleSizes){
for(i in 1:nbSimul){
# Scenario Id + Other info
v <- v + 1
df.sampled[v,"cas"] <- v
df.sampled[v,"distribution"] <- "logNormal"
df.sampled[v,"simul"] <- i
df.sampled[v,"sampleSize"] <- sampleSize[j]
df.sampled[v,"censoredRate"] <- censoredRate[k]
X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
estimatedLD <- array(9)
for(w in 1:9){
estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]]
}
df.sampled$dta[v] <- list(X)
df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
}
}
}
Which is quite difficult to read.
I would like to find a way to avoid loops, and to reference easily scenarios (v) and attached variables.
Any idea?

Related

How to see the distribution of variables after clustering with gower distance in R. How can i see the distribution of variables accross the clusters?

I have calculated dendrograms of my dataset with the divisive and agglomerative method
library(cluster)
library(fpc)
gower.dist <- daisy(data.cluster, metric=c("gower"))
divisive.clust <- diana(as.matrix(gower.dist),
diss = TRUE, keep.diss = TRUE)
plot(divisive.clust, main = "Divisive")
aggl.clust.c <- hclust(gower.dist, method = "complete")
plot(aggl.clust.c,
main = "Agglomerative, complete linkages")
I also have the results in a table with the amounts of cases in the clusters, etc.
cstats.table <- function(dist, tree, k) {
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between",
"wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
for(i in c(1:k)){
row.clust[i] <- paste("Cluster-", i, " size")
}
for(i in c(2:k)){
stats.names[i] <- paste("Test", i-1)
for(j in seq_along(clust.assess)){
output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
}
for(d in 1:k) {
cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
cluster.sizes[d, i]
}
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
stats.df.divisive <- cstats.table(gower.dist, divisive.clust, 7)
stats.df.divisive
stats.df.aggl <-cstats.table(gower.dist, aggl.clust.c, 7)
#complete linkages looks like the most balanced approach
stats.df.aggl

how to organize the output of MLE using R

I wrote down this function for MLE estimation and then I apply it for different settings of parameters.
Finally, I bind all results for an output.
But is not working i have problem with the output and also I need to organize the output like the attached image using R program.
enter image description here
could some one help me please?
What should I fix and how can I print the results like the picture attached.
thank you in advance
rbssn<- function(n,alpha,beta)
{
if(!is.numeric(n)||!is.numeric(alpha)||!is.numeric(beta))
{stop("non-numeric argument to mathematical function")}
if(alpha<=0){ stop("alpha must be positive")}
if(beta<=0) { stop("beta must be positive") }
z <- rnorm(n,0,1)
r <- beta*((alpha*z*0.5)+sqrt((alpha*z*0.5)^2+1))^2
return(r)
}
#Function
mymle <- function(n,alpha,beta,rep)
{
theta=c(alpha,beta) # store starting values
#Tables
LHE=array(0, c(2,rep));
rownames(LHE)= c("MLE_alpha", "MLE_beta")
#Bias
bias= array(0, c(2,rep));
rownames(bias)= c("bias_alpha", "bias_beta")
#Simulation
set.seed(1)
#Loop
for(i in 1:rep){
myx <- exp(-rbssn(n, alpha, beta))
Score <- function(x) {
y <- numeric(2)
y[1] <- (-n/x[1])*(1+2/(x[1]^2)) - (1/(x[2]*x[1]^3))*sum(log(myx)) - (x[2]/(x[1]^3))*sum(1/log(myx))
y[2] <- -(n/(2*x[2])) + sum((1/(x[2]-log(myx)))) - (1/(2*(x[1]^2)*(x[2]^2)))*sum(log(myx)) + (1/(2*x[1]^2))*sum(1/(log(myx)))
y
}
Sin <- c(alpha,beta)
mle<- nleqslv(Sin, Score, control=list(btol=.01))[1]
LHE[i,]= mle
bias[i,]= c(mle[1]-theta[1], mle[2]-theta[2])
}
# end for i
#Format results
L <-round(apply(LHE, 1, mean), 3) # MLE of all the applied iterations
bs <-round(apply(bias,1, mean),3) # bias of all the applied iterations
row<- c(L, bs)
#Format a label
lab <- paste0('n= ',n,';',' alpha= ',alpha,';',' beta= ',beta)
row2 <- c(lab,row)
row2 <- as.data.frame(t(row2))
return(row2)
}
#Bind all
#Example 1
ex1 <- mymle(n = 20,alpha = 1,beta = 0.5,rep = 100)
ex2 <- mymle(n = 50,alpha = 2,beta = 0.5,rep = 100)
ex3 <- mymle(n = 100,alpha = 3,beta = 0.5,rep = 100)
#Example 2
ex4 <- mymle(n = 20,alpha = 0.5,beta = 0.5,rep = 100)
ex5 <- mymle(n = 50,alpha = 0.5,beta = 1,rep = 100)
ex6 <- mymle(n = 100,alpha = 0.5,beta = 1,rep = 100)
df <- rbind(ex1,ex2,ex3,ex4,ex5,ex6)
Any help will be appreciated.

for loop for every matrices in a list

I am trying to apply the following for-loop to every matrices in the list per_d and create a new list called per_hole. I am not sure how to do this, should I use lapply?
Thank you very much in advance for your helps!
per_hole <- per_d
for (i in 1:S) {
for (j in 1:t){
if (per_hole [i,j] > CS) {
per_hole [i,j] <- per_hole [i,j] - rnorm (1, mean = 1, sd = 0.5)
} else {
per_hole [i,j] <- per_hole [i,j] + rnorm (1, mean = 1, sd = 0.5)
}}}
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd_per <- c(0.2, 0.5, 0.8, 1.3, 1.8)
sd_noise <- 3
per <- lapply(sd_per, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
names(per) <- paste("per", seq_along(sd_per), sep = "")
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd_per), sep = "")
You can try
per_hole <- lapply(per_d,function(x) x + ifelse(x>CS,-1,1)*rnorm(prod(dim(x)),1,0.5))
or
per_hole <- lapply(per_d, function(x) x + rnorm(prod(dim(x)), 1-2*(x > CS), 0.5))

multiple data frames with similar names

I needed to generate array or many data frames from other data frames which only varied in names. This required me to do a lot of copy-paste works. Is it possible that I can make it cleaner but not keep copying and pasting? Follows are two examples from many similar cases of the analysis I am doing now (I will provide codes for reproduction at the end of the question), which I think may be able to make them cleaner with the same approach.
case 1, create an array with data from per_d1,per_d1,per_d3,per_d4,per_d5
perd <- array(dim=c(7,15,5))
perd [,,1] <- as.matrix(per_d$per_d1)
perd [,,2] <- as.matrix(per_d$per_d2)
perd [,,3] <- as.matrix(per_d$per_d3)
perd [,,4] <- as.matrix(per_d$per_d4)
perd [,,5] <- as.matrix(per_d$per_d5)
case 2, create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function (i){
res <- lapply(exps, function(x){
g <- as.matrix (m0 * exp (-x * i))
for (i in 1:l) {
for (j in 1:t){
g [i,j] <- abs((round (rnorm(1,mean = g[i,j],sd=3), digits = 3)))
colnames(g) <- paste('t', 1:ncol(g), sep = "")
g <- as.data.frame(g)
}}
return(g)
}
)
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
names(ge[[i]]) <- paste("g", seq_along(ge), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the followings. But the codes, unfortunately, are not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
pred <- array(b, dim = c(7,15,5))
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for(i in 1:5) {
a <- per_d[[i]]
b <- ge[[i]][[1]]
c <- ge[[i]][[2]]
d <- ge[[i]][[3]]
e <- ge[[i]][[4]]
f <- ge[[i]][[5]]
arg <- list(a, b, c, d, e, f)
name[[i]] <- paste0("df_", i)
out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)

Convert for loops into foreach loops

I want to make the code below more efficient by using the foreach package. I tried it for a very long time but I don't manage to get the same result as when using the for-loops. I would like to use a nested foreach-loop including parallelization... And as output I would like to have two matrices with dim [R,b1] I would be very grateful for some suggestions!!
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
calcPower <- function( R, b0, grid) {
cl <- makeCluster(3)
registerDoParallel(cl)
## n and b1 coefficients
n <- grid$n
b1 <- grid$b1
## ensures reproducibility
set.seed(2020)
x <- runif(n, 18, 80)
x.dich <- factor( ifelse( x < median( x), 0, 1))
## enables to store two outputs
solution <- list()
## .options.RNG ensures reproducibility
res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
p.val <- list()
p.val.d <- list()
for( j in seq_along(b1)) {
y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c( p.val, ifelse( summary(mod.lm)$coef[2,4] <= 0.05, 1, 0))
p.val.d <- c( p.val.d, ifelse( summary(mod.lm.d)$coef[2,4] <= 0.05, 1, 0))
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
return(solution)
}
dp.val <- matrix( unlist(res[,1], use.names = FALSE), R, length(b1), byrow = TRUE)
dp.val.d <- matrix( unlist(res[,2], use.names = FALSE), R, length(b1), byrow = TRUE)
stopCluster(cl)
df <- data.frame(
effectS = b1,
power = apply( dp.val, 2, function(x){ mean(x) * 100}),
power.d = apply( dp.val.d, 2, function(x){ mean(x) * 100}),
n = factor(n))
return(df)
}
## simulation for different n
tmp <- with(grid,
by( grid, n,
calcPower, R = R, b0 = b0))
## combines the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in following code. There had to be some changes made. It is a lot easier to return a list then a matrix in foreach, since it's combined with rbind. Especially when you want to return multiple ones. My solution here is to save everything in a list and afterwards transform it into a matrix of length 100.
Note: there is one mistake in your code. summary( mod.lm.d)$coef[2,4] does not exist. I changed it to [2]. Adjust to your needing
solution <- list()
df2<-foreach(i = 1:R, .combine = rbind, .inorder=TRUE) %dopar%{
set.seed(i)
p.val <- list()
p.val.d <- list()
counter <- list()
for( j in seq_along(b1)){
x <- sort( runif(n, 18, 80))
x.dich <- factor( ifelse( x < median(x), 0, 1))
y <- b0 + b1[j] * x + rnorm( n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c(p.val, ifelse( summary( mod.lm)$coef[2] <= 0.05, 1, 0))
p.val.d <- c(p.val.d, ifelse( summary( mod.lm.d)$coef[2] <= 0.05, 1, 0))
counter <- c(counter, j)
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
solution[[3]] <- counter
return(solution)
}
dp.val <- unlist(df2[,1], use.names = FALSE)
dp.val.d <- unlist(df2[,2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
for your comment:
A foreach does work with a normal for loop. Minimal reproducible example:
df<-foreach(i = 1:R, .combine = cbind, .inorder=TRUE) %dopar%{
x <- list()
for(j in 1:3){
x <- c(x,j)
}
return(x)
}

Resources