multiple data frames with similar names - r
I needed to generate array or many data frames from other data frames which only varied in names. This required me to do a lot of copy-paste works. Is it possible that I can make it cleaner but not keep copying and pasting? Follows are two examples from many similar cases of the analysis I am doing now (I will provide codes for reproduction at the end of the question), which I think may be able to make them cleaner with the same approach.
case 1, create an array with data from per_d1,per_d1,per_d3,per_d4,per_d5
perd <- array(dim=c(7,15,5))
perd [,,1] <- as.matrix(per_d$per_d1)
perd [,,2] <- as.matrix(per_d$per_d2)
perd [,,3] <- as.matrix(per_d$per_d3)
perd [,,4] <- as.matrix(per_d$per_d4)
perd [,,5] <- as.matrix(per_d$per_d5)
case 2, create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function (i){
res <- lapply(exps, function(x){
g <- as.matrix (m0 * exp (-x * i))
for (i in 1:l) {
for (j in 1:t){
g [i,j] <- abs((round (rnorm(1,mean = g[i,j],sd=3), digits = 3)))
colnames(g) <- paste('t', 1:ncol(g), sep = "")
g <- as.data.frame(g)
}}
return(g)
}
)
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
names(ge[[i]]) <- paste("g", seq_along(ge), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the followings. But the codes, unfortunately, are not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
pred <- array(b, dim = c(7,15,5))
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for(i in 1:5) {
a <- per_d[[i]]
b <- ge[[i]][[1]]
c <- ge[[i]][[2]]
d <- ge[[i]][[3]]
e <- ge[[i]][[4]]
f <- ge[[i]][[5]]
arg <- list(a, b, c, d, e, f)
name[[i]] <- paste0("df_", i)
out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)
Related
for loop for every matrices in a list
I am trying to apply the following for-loop to every matrices in the list per_d and create a new list called per_hole. I am not sure how to do this, should I use lapply? Thank you very much in advance for your helps! per_hole <- per_d for (i in 1:S) { for (j in 1:t){ if (per_hole [i,j] > CS) { per_hole [i,j] <- per_hole [i,j] - rnorm (1, mean = 1, sd = 0.5) } else { per_hole [i,j] <- per_hole [i,j] + rnorm (1, mean = 1, sd = 0.5) }}} codes for reproduction N <- 1 CS <- 10.141 S <- seq (7.72,13,0.807) t <- 15 l <- length (S) m0 <- 100 exps <- c(0.2, 0.5, 0.9, 1.5, 2) sd_per <- c(0.2, 0.5, 0.8, 1.3, 1.8) sd_noise <- 3 per <- lapply(sd_per, function(x){ per <- matrix(nrow = length(S)*N, ncol = t+1) for (i in 1:dim(per)[1]) { for (j in 1:t+1){ per [,1] <- replicate (n = N, S) per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3) colnames(per) <- c('physical',paste('t', 1:15, sep = "")) per <- as.data.frame (per) } } per <- per [,-1] return(per) } ) names(per) <- paste("per", seq_along(sd_per), sep = "") per_d <- lapply(per, function(x){ per_d <- abs (x - 10.141) } ) names(per_d) <- paste("per_d", seq_along(sd_per), sep = "")
You can try per_hole <- lapply(per_d,function(x) x + ifelse(x>CS,-1,1)*rnorm(prod(dim(x)),1,0.5)) or per_hole <- lapply(per_d, function(x) x + rnorm(prod(dim(x)), 1-2*(x > CS), 0.5))
Pre-allocation and optimization loop
My R script have the form: for (j in 1:N) { #construct the DF2 data frame #operations on the DF2 data frame } Where N can be large (like a 1 mln). The columns of DF2 are defined one after the other with the formula: DF2$column_i <- function(x,f..) #or constant or .... DF$column_i can are a constant, a function or a loop "while". I tried to pre allocate defining DF2 before with: DF2 <- data.frame(matrix(nrow=..,ncol=..)) and computing after the columns DF2$column_i, but I have not had any benefits. Does anyone have any ideas?
My code is of the type: par <- data.frame(CA=runif(n = 50, min = 70000, max = 100000), D=round(runif(n = 50, min = 70, max = 90),0), P=runif(n = 50, min = 900, max = 20000), A=round(runif(n = 50, min = 50, max = 70),0)) parpa <- data.frame(matrix(nrow = nrow(par), ncol = 3*V)) comp <- function(CA, D, P, A){ vect <- rep('numeric', 3*V) b <- 1 k <- 1 while (((b+1) <= (D+1))&(k < V)) { a <- b+1 b <- min((a+8-1), (D+1)) vect[c(1+4*k, 2+4*k, 3+4*k, 4+4*k)] <- c(mean(DF2$Z[a:b]), sum(DF2$X[a:b]), mean(DF2$Q[a:b]), sum(DF2$AE[a:b])) k <- k+1 } return(vect) } #loop for (j in 1:nrow(par)) { CA <- par$CA[j] D <- par$D[j] R <- 0.01*D P <- par$P[j] A <- par$A[j] COST <- 500 V <- 5 #DF2 DF2 <- data.frame(M=0:D) OB <- function(x) { c <- COST*D*DF2$M/R return(c) } DF2$O <- O(D) DF2$E <- (D*DF2$M+2)/D*(D+4) DF2$Q <- (CA-DF2$M)*D DF2$X <- (CA-DF2$O)*(DF2$E+P) Func <- function(x) {return(round(x/30, 2))} DF2$Z[(A+2):(D+1)] <- sapply(DF2$E[(A+2):(D+1)], Func) parpa[j,] <- comp(CA, D, P, A) }
Sum value by Combine all variable using R
Can somebody help me with data manipulation using R? i have data (data.train) like this datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1) datex <- as.Date(datex, "%d/%m/%y") Ax <- rep("A1",18) Bx <- rep(c(rep("B1",3),rep("B2",3)),3) Cx <- rep(c("C1","C2","C3"),6) valx <- 100 for(i in 1:17){valx[i+1] <- valx[i]+1} data.train <- data.frame(datex, Ax, Bx, Cx, valx) i need all combination from variable and the final form is like this I have tried this code: ### Library library(dplyr) ## datex datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1) datex <- as.Date(datex, "%d/%m/%y") Ax <- rep("A1",18) Bx <- rep(c(rep("B1",3),rep("B2",3)),3) Cx <- rep(c("C1","C2","C3"),6) valx <- 100 for(i in 1:17){valx[i+1] <- valx[i]+1} data.train <- data.frame(datex, Ax, Bx, Cx, valx) names.group <- names(data.train)[1:length(data.train)-1] data.group <- Map(combn, list(names.group), seq_along(names.group), simplify = F) %>% unlist(recursive = F) find.index <- sapply(data.group, function(x, find.y){ any(find.y %in% x) }, find.y = c("datex")) index.group <- NULL for(i in 2:length(find.index)){ if(find.index[i] == "TRUE"){ index.group[i] <- i } } index.group[is.na(index.group)] <- 0 for(i in 1:length(data.group)){ if(index.group[i] == 0){ data.group[[i]] <- 0 } else { data.group[[i]] <- data.group[[i]] } } data.group2 <- data.group[sapply(data.group, function(x) any(x != 0))] combination.result <- lapply(data.group2, FUN = function(x) { do.call(what = group_by_, args = c(list(data.train), x)) %>% summarise(sumVar = sum(valx)) }) combination.result but i don't produce what i want. Thanks
You can generate for combinations of length 1 then for combinations of length 2. Use paste to create your Variable column. Then rbindlist all your results to get the final output. library(data.table) setDT(data.train) sumCombi <- function(x, mySep="_") { data.train[ , sum(Val), by=c("Date", x)][, list(Date, Variable=do.call(paste, c(.SD[,x,with=FALSE], list(sep=mySep))), SumVal=V1)] } rbindlist(c( #combinations with 1 element in each combi lapply(c("A", "B", "C"), sumCombi) , #combinations with 2 elements in each combi lapply(combn(c("A","B","C"), 2, simplify=FALSE), sumCombi) ), use.names=FALSE) or more generically/programmatically: #assuming that your columns are in the middle of the columns while excl. first and last columns myCols <- names(data.train)[-c(1, ncol(data.train))] rbindlist(unlist( lapply(seq_along(myCols), function(n) combn(myCols, n, sumCombi, simplify=FALSE) ), recursive=FALSE), use.names=FALSE)
Avoiding Loops to Generate a Complex Dataframe with Nested Lists
Here is a kind of DF, I have to generate to store simulations data). nbSimul <- 100 nbSampleSizes <- 4 nbCensoredRates <- 4 sampleSize <- c(100, 50, 30, 10) censoredRate <- c(0.1, 0.3, 0.5, 0.8) df.sampled <- data.frame(cas = numeric() , distribution = character(), simul = numeric() , sampleSize = numeric() , censoredRate = numeric() , dta = I(list()) , quantileLD = I(list()) , stringsAsFactors = FALSE) v <- 0 # Scenario indicator for(k in 1:nbCensoredRates){ for(j in 1:nbSampleSizes){ for(i in 1:nbSimul){ # Scenario Id + Other info v <- v + 1 df.sampled[v,"cas"] <- v df.sampled[v,"distribution"] <- "logNormal" df.sampled[v,"simul"] <- i df.sampled[v,"sampleSize"] <- sampleSize[j] df.sampled[v,"censoredRate"] <- censoredRate[k] X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1) estimatedLD <- array(9) for(w in 1:9){ estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]] } df.sampled$dta[v] <- list(X) df.sampled$quantileLD[v] <- list(estimatedLD[1:9]) } } } Which is quite difficult to read. I would like to find a way to avoid loops, and to reference easily scenarios (v) and attached variables. Any idea?
How to do calculations on elements from a sublist in R
my code is as follows: x <- data.frame(matrix(rnorm(20), nrow=10)) colnames(x) <- c("z", "m") n_boot<-4 bs <- list() for (i in 1:n_boot) { bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ] } bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE) colnames(bt) <- rep(c("z","m"),times=n_boot) M_to_boot <- bt[,seq(2,8,by=2)] funct<-function(M_boot_max) { od<-(1/((10*((10^((16-M_boot_max-25)/5))^3)/3)*((max(M_boot_max)-min(M_boot_max))/50))) } V_boot<-apply(M_to_boot,2,funct) rows.combined <- nrow(M_to_boot) cols.combined <- ncol(M_to_boot) + ncol(V_boot) matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined) matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot matrix.combined[, seq(2, cols.combined, 2)] <- V_boot colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot) df<-as.data.frame(matrix.combined) start0 <- seq(1, by = 2, length = ncol(df) / 2) start <- lapply(start0, function(i, df) df[i:(i+1)], df = df) tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5))) Now I want to prepare some calculations on values V_boot from a sublists. To be specific I want to for each subsample calculate the sum of V_boot. So, for example I want for a bin M_boot "[[4]]$(0.811,1.25]" to have a value of sum(V_boot) for that bin. But I cannot figure out how to get to that each V_boot values. Please help me.