library('dplyr')
num_ens <- 10
I have a piece of code that takes num_ens files of data from a directory, reads them in, takes the average across them, and saves them as 1 object
A_tree <- lapply(1:num_ens, function(i) {
# importing data on each index i
r <- read.csv(
paste0("/Users/sethparker/vox_LA_max/top_down_individ_sd1/ens_",i,"_0_tree_from_data.txt"),
header = FALSE
)
# creating add columns
colnames(r) <- c("GPP","NPP","LA")
r$month <- seq.int(nrow(r))
r$run <- i
return(r)
})
A_tree <- bind_rows(A_tree)
A_tree <- A_tree %>% group_by(month) %>% summarize(across(c(GPP,NPP,LA), mean))
I would like to auto automate this same process to go across 7 directories:
/top_down_individ_sd1/ through /top_down_individ_sd7/
and to generate a series of objects:
A_tree through G_tree
I have unsuccessfully tried to achieve this with a few variations of the following for loop, which have yielded errors
letters <- LETTERS[seq(from = 1, to = 7)]
sd <- c("sd1","sd2","sd3","sd4","sd5","sd6","sd7")
for (j in 1:7) {
paste0(letters[j],"_tree") <- lapply(1:num_ens, function(i) {
# importing data on each index i
r <- read.csv(paste0(paste0("/Users/sethparker/vox_LA_max/top_down_individ_",sd[j]),"/ens_",i,"_0_tree_from_data.txt"),
header = FALSE)
# creating add columns
colnames(r) <- c("GPP","NPP","LA")
r$month <- seq.int(nrow(r))
r$run <- i
return(r)
})
paste0(letters[j],"_tree") <- bind_rows(paste0(letters[j],"_tree"))
paste0(letters[j],"_tree") <- paste0(letters[j],"_tree") %>% group_by(month) %>% summarize(across(c(GPP,NPP,LA), mean))
}
How can I achieve this goal without errors
tree <- list()
for (j in 1:7) {
tree[[j]] <- lapply(1:num_ens, function(i) {
# importing data on each index i
r <- read.csv(paste0(paste0("/Users/sethparker/vox_LA_max/top_down_individ_",sd[j]),"/ens_",i,"_0_tree_from_data.txt"),
header = FALSE)
# creating add columns
colnames(r) <- c("GPP","NPP","LA")
r$month <- seq.int(nrow(r))
r$run <- i
return(r)
})
tree[[j]] <- bind_rows(tree[[j]])
tree[[j]] <- tree[[j]] %>% group_by(month) %>% summarize(across(c(GPP,NPP,LA), mean))
}
Related
I have a set of lists which I would like to convert into the nested list of a certain structure. My initial data look like list_1_1 ... list_2_2. I would like them to be like final_desired_output.
I can do this step by step by extracting desired variable and appending to the output list one by one. However, this dummy example contains only 2 data subsets (first_lists and list second_lists), while the real life data are far >1 GB. Thus, I would like to do it with a function, which I unfortunatly do not know how to do, as nested lists are not well covered in tutorials. Any assistance?
# some dummy data
one_1 <- c(1:10)
one_2 <- c(2:15)
one_3 <- c(3:20)
starting_one_1 <- 1
starting_one_2 <- 2
starting_one_3 <- 3
ending_one_1 <- c(11)
ending_one_2 <- c(16)
ending_one_3 <- c(21)
two_1 <- c(1:100)
two_2 <- c(1:15)
starting_two_1 <- 5
starting_two_2 <- 10
ending_two_1 <- c(101)
ending_two_2 <- c(16)
# lists mimicking output I currently have
list_1_1 <- list(one_1, one_2, one_3)
list_1_2 <- list(starting_one_1, starting_one_2, starting_one_3)
list_1_3 <- list(ending_one_1, ending_one_2, ending_one_3)
list_2_1 <- list(two_1, two_2)
list_2_2 <- list(starting_two_1, starting_two_2)
list_2_3 <- list(ending_two_1, ending_two_2)
# producing desired otput
list_1_1_desired <- list()
list_1_1_desired[["sequence"]] <- one_1
list_1_1_desired[["starting"]] <- starting_one_1
list_1_1_desired[["ending"]] <- ending_one_1
list_1_2_desired <- list()
list_1_2_desired[["sequence"]] <- one_2
list_1_2_desired[["starting"]] <- starting_one_2
list_1_2_desired[["ending"]] <- ending_one_2
list_1_3_desired <- list()
list_1_3_desired[["sequence"]] <- one_3
list_1_3_desired[["starting"]] <- starting_one_3
list_1_3_desired[["ending"]] <- ending_one_3
list_2_1_desired <- list()
list_2_1_desired[["sequence"]] <- two_1
list_2_1_desired[["starting"]] <- starting_two_1
list_2_1_desired[["ending"]] <- ending_two_1
list_2_2_desired <- list()
list_2_2_desired[["sequence"]] <- two_2
list_2_2_desired[["starting"]] <- starting_two_2
list_2_2_desired[["ending"]] <- ending_two_2
first_lists <- list(list_1_1_desired, list_1_2_desired, list_1_3_desired)
names(first_lists) <- c("one_1", "one_2", "one_3")
second_lists <- list(list_2_1_desired, list_2_2_desired)
names(second_lists) <- c("two_1", "two_2")
# this is what I would like to obtain
final_desired_output <- list()
final_desired_output[["one"]] <- first_lists
final_desired_output[["two"]] <- second_lists
You could use purrr::transpose:
out <- mget(ls(pattern = '^list.*\\d$')) %>%
split(sub("_\\d+$", '', names(.))) %>%
map(~transpose(set_names(.,c('sequence', 'starting', 'ending'))))
all.equal(out, final_desired_output, check.attributes = FALSE)
[1] TRUE
I am looking to save the values from "roc_full_resolution" into a vector. Any ideas?
for(i in 1:10) repeat {
CarefulR<- merge(data,reliable,by.x ="Response_ID" )
CarefullRespondents <- CarefulR %>% select(Response_ID,Category,Q17_1,Q17_2,Q17_3,Q17_4,Q17_5,Q17_6,Q17_7,Q17_8,Q17_9,Q17_10,Q17_11,Q17_12,Q17_13,Q17_14,Q17_15,Q17_16,Q17_17,Q17_18,Q17_19,Q17_20,Q17_21,Q17_22,Q17_23,Q17_24)
CarelessRespondents<- unreliable %>% select(Response_ID,Category,Q17_1,Q17_2,Q17_3,Q17_4,Q17_5,Q17_6,Q17_7,Q17_8,Q17_9,Q17_10,Q17_11,Q17_12,Q17_13,Q17_14,Q17_15,Q17_16,Q17_17,Q17_18,Q17_19,Q17_20,Q17_21,Q17_22,Q17_23,Q17_24)
CR2<- sample_n(CarelessRespondents, 146,replace = TRUE)
df <- rbind(CarefullRespondents,CR2)[-1]
simulation1_mahad <- mahad_raw <- mahad(df )
rounded_scores <- round(simulation1_mahad, digits=1)
roc_rounded <- roc(df$Category, rounded_scores)
roc_full_resolution <- roc(df$Category,rounded_scores)
print(roc_full_resolution)
break}
I would create the roc_full_resolution variable before the for loop, then append the results on each iteration. I would also move the setup outside the loop, and I don't think the repeat and break are needed in this situation.
CarefulR <- merge(data, reliable, by.x = "Response_ID")
CarefullRespondents <- CarefulR %>% select(Response_ID,Category,Q17_1,Q17_2,Q17_3,Q17_4,Q17_5,Q17_6,Q17_7,Q17_8,Q17_9,Q17_10,Q17_11,Q17_12,Q17_13,Q17_14,Q17_15,Q17_16,Q17_17,Q17_18,Q17_19,Q17_20,Q17_21,Q17_22,Q17_23,Q17_24)
CarelessRespondents <- unreliable %>% select(Response_ID,Category,Q17_1,Q17_2,Q17_3,Q17_4,Q17_5,Q17_6,Q17_7,Q17_8,Q17_9,Q17_10,Q17_11,Q17_12,Q17_13,Q17_14,Q17_15,Q17_16,Q17_17,Q17_18,Q17_19,Q17_20,Q17_21,Q17_22,Q17_23,Q17_24)
roc_full_resolution<-NULL
for(i in 1:10) {
CR2 <- sample_n(CarelessRespondents, 146, replace = TRUE)
df <- rbind(CarefullRespondents, CR2)[-1]
simulation1_mahad <- mahad_raw <- mahad(df)
rounded_scores <- round(simulation1_mahad, digits = 1)
roc_rounded <- roc(df$Category, rounded_scores)
roc_full_resolution <- append(roc_full_resolution, roc_rounded)
}
print(roc_full_resolution)
I have some code which uses a loop to calculate a water balance for catchments (watersheds) for 8 catchments. I would like the loop to write it's output to a dataframe in R but the only way I can work out how to do this is to write it to csv (inside the loop), then outside of the loop read each of the csv files separately. I feel as though there could be a better way to do this - any ideas?
This is my code: (note it is part of a shiny app, hence my desire to avoid reading and writing csv files)
WB_catchments <- function (){
for (i in 1:8){
file_name <- gsub(" ", "", paste("outputs\\", Lake_name[i], "_catchment_water_balance.csv"))
p <- pts()[[i]]
Rain_in_WB <- RAIN() %>% filter(Grid_id %in% p)
Rain_in_WB$Grid_id <- NULL #remove Grid_id column
Rain_in_WB <- colSums(Rain_in_WB, na.rm = TRUE) # sum over catchment
AET_out_WB <- AET() %>% filter(Grid_id %in% p)
AET_out_WB$Grid_id <- NULL #remove Grid_id column
AET_out_WB <- -1*colSums(AET_out_WB, na.rm = TRUE) # sum over catchment and multiply by -1 as is an output
Evap_WB <- -1*EVAP_lakes[i,]
SW_in_WB <- SW_in_C[i,]
GW_in_WB <- GW_in_C[i,]
SW_out_WB <- -1*SW_out_C[i,]
GW_out_WB <- -1*GW_out_C[i,]
stor_WB <- STOR[i,]
out_catchment <- -1*outside[i,]
bal <- as.data.frame(cbind(WY, Rain_in_WB, SW_in_WB, GW_in_WB, AET_out_WB, Evap_WB, SW_out_WB, GW_out_WB, stor_WB, out_catchment))
bal <- mutate(bal, "res" = rowSums(bal[,2:10], na.rm = TRUE))
colnames(bal) <- c("WaterYear", "Rain", "SW_in", "GW_in", "AET", "Evap", "SW_out", "GW_out", "Storage", "Water_out_of_Greater_Tarawera_Catchments", "Residual")
write.csv(bal, file_name)
}
}
WB_catchments()
Okareka_WB_C <- read.csv("outputs\\Okareka_catchment_water_balance.csv")
Okaro_WB_C <- read.csv("outputs\\Okaro_catchment_water_balance.csv")
Okataina_WB_C <- read.csv("outputs\\Okataina_catchment_water_balance.csv")
Rerewhakaaitu_WB_C <- read.csv("outputs\\Rerewhakaaitu_catchment_water_balance.csv")
Rotokakahi_WB_C <- read.csv("outputs\\Rotokakahi_catchment_water_balance.csv")
Rotomahana_WB_C <- read.csv("outputs\\Rotomahana_catchment_water_balance.csv")
Tarawera_WB_C <- read.csv("outputs\\Tarawera_catchment_water_balance.csv")
Tikitapu_WB_C <- read.csv("outputs\\Tikitapu_catchment_water_balance.csv")
Instead of posting some very special code snippets, it is in most cases to post a toy example. Here an artificial example how to fill a data frame in a loop. As R is a vectorized language, it is often to avoid a loop at all. Compare the two cases below:
## number of cases
N <- 10
### looped version =====
df <- data.frame(
rain=rep(0, N),
evap=rep(0, N)
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(1, min=0, max=10)
evap <- runif(1, min=1, max=5)
df[i, ] <- c(rain, evap)
}
df
### vectorized version =====
rain <- runif(N, min=0, max=10)
evap <- runif(N, min=1, max=5)
df2 <- data.frame(
rain=rain,
evap=evap
)
df2
If your calculations return more than one row in each iteration and you don't know beforehand how many, grow the data frame like this:
## empty data frame
df3 <- data.frame(
rain=NULL,
evap=NULL
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df3 <- rbind(df3, cbind(rain, evap))
}
df3
Edit: Create several data frames (as elements of a list)
If separate data frames are needed, it is a good idea to put them together in a list. INstead of a loop,l we can use lapply:
create_df <- function(i) {
# optionally: do something with i, e.g. select file name
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df <- data.frame(
rain=rain,
evap=evap
)
}
## lapply does the "loop" and returns a list of data frames
df_list <- lapply(1:8, create_df)
df_list[[7]] # returns 7th data frame
Another way I got this working was by using assign(file_name, bal, envir = .GlobalEnv) instead of write.csv(bal, file_name) in the last line of my function
I have a list of data frames (generated by the permutation order of an initial dataframe) to which I would like to apply complicated calculus using group_by_at() and mutate(). It works well with a single data frame but fail using a for loop since mutate requires the name of the dataframe and some of my calculus as well. So I thought, well, let's create a list of different dataframes all having the same name and loop over the initial sequence of names. Unfortunately the trick does not work and I get the following message:
Error: object of type 'closure' is not subsettable.
Here is the self contained example showing all my steps. I think the problem comes from mutate. So, how could I force the use of for loop with mutate?
data <- read.table(text = 'obs gender ageclass weight year subdata income
1 F 1 10 yearA sub1 1000
2 M 2 25 yearA sub1 1200
3 M 2 5 yearB sub2 1400
4 M 1 11 yearB sub1 1350',
header = TRUE)
library(dplyr)
library(GiniWegNeg)
dataA <- select(data, gender, ageclass)
dataB <- select(data, -gender, -ageclass)
rm(data)
# Generate permutation of indexes based on the number of column in dataA
library(combinat)
index <- permn(ncol(dataA))
# Attach dataA to the previous list of index
res <- lapply(index, function(x) dataA[x])
# name my list keeping track of permutation order in dataframe name
names(res) <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Create a list containing the name of each data.frame name
NameList <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Define as N the number of columns/permutation/dataframes
N <- length(res)
# Merge res and dataB for all permutation of dataframes
res <- lapply(res,function(x) cbind(x,dataB))
# Change the name of res so that all data frames are named data
names(res) <- rep("data", N)
# APPLY FOR LOOP TO ALL DATAFRAMES
for (j in NameList){
runCalc <- function(data, y){
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_1, data$weight), Gini_RSV(data$Income_2,data$weight))
Gini <- data.frame(gini)
colnames(Gini) <- c("Income_1","Income_2")
rownames(Gini) <- c(paste0("Gini_", y))
return(Gini)
}
runOtherCalc <- function(df, y){
Contrib <- (1/5) * df$Income_1 + df$Income_2
Contrib <- data.frame(Contrib)
colnames(Contrib) <- c("myresult")
rownames(Contrib) <- c(paste0("Contrib_", y)
return(Contrib)
}
# Run runCalc over dataframe data by year
df1_List <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
# runCalc returns df which then passes to runOtherCalc, again by year
df1_OtherList <- lapply(unique(data$year), function(i)
byperiod <- subset(data, year == i)
df <- runCalc(byperiod, i)
runOtherCalc(df, i)
})
# Run runCalc over dataframe data by subdata
df2_List <- lapply(unique(data$subdata), function(i) {
byperiod <- subset(data, subdata == i)
runCalc(bysubdata, i)
})
# runCalc returns df which then passes to runOtherCalc, again by subdata
df2_OtherList <- lapply(unique(data$subdata), function(i)
bysubdata <- subset(data, subdata == i)
df <- runCalc(bysubdata, i)
runOtherCalc(df, i)
})
# Return all results in separate frames, then append by row in 2 frames
Gini_df1 <- do.call(rbind, df1_List)
Contrib_df1 <- do.call(rbind,df1_OtherList)
Gini_df2 <- do.call(rbind, df1_List)
Contrib_df2 <- do.call(rbind,df1_OtherList)
Gini <- rbind(Gini_df1, Gini_df2)
Contrib <- rbind(Contrib_df1, Contrib_df2)
}
Admittedly, the R error you receive below is a bit cryptic but usually it means you are running an operation on an object that does not exist.
Error: object of type 'closure' is not subsettable.
Specifically, it comes with your lapply call as data is not defined anywhere globally (only within the runCalc method) and as above you remove it with rm(data).
dfList <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
By, the way the use of lapply...unique...subset can be replaced with the underused grouping base R function, by().
Gathering from your text and code, I believe you intend to run a year grouping on each dataframe of your list, res. Then consider two by calls, wrapped in a larger function that receives as a parameter a dataframe, df. Then run lapply across all items of list to return a new list of nested dataframe pairs.
# SECONDARY FUNCTIONS
runCalc <- function(data) {
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
Gini <- data.frame(
year = data$year[[1]],
Income_1 = unname(Gini_RSV(data$Income_1, data$weight)),
Income_2 = unname(Gini_RSV(data$Income_2, data$weight)),
row.names = paste0("Gini_", data$year[[1]])
)
return(Gini)
}
runOtherCalc <- function(df){
Contrib <- data.frame(
myresult = (1/5) * df$Income_1 + df$Income_2,
row.names = paste0("Contrib_", df$year[[1]])
)
return(Contrib)
}
# PRIMARY FUNCTION
runDfOperations <- function(df) {
gList <- by(df, df$year, runCalc)
gTmp <- do.call(rbind, gList)
cList <- by(gTmp, gTmp$year, runOtherCalc)
cTmp <- do.call(rbind, cList)
gtmp$year <- NULL
return(list(gTmp, cTmp))
}
# RETURNS NESTED LIST OF TWO DFs FOR EACH ORIGINAL DF
new_res <- lapply(res, runDfOperations)
# SEPARATE LISTS IF NEEDED (EQUAL LENGTH)
Gini <- lapply(new_res, "[[", 1)
Contrib <- lapply(new_res, "[[", 2)
I want to visualize a mosaic plot in form of a tree. For example
mosaicplot(~ Sex + Age + Survived, data = Titanic, color = TRUE)
Now what I want is to represent this in a tree form where the first node
for example be sex the second node be age and at the terminal node be number of people survived. May be it should something like http://addictedtor.free.fr/graphiques/RGraphGallery.php?graph=84 where instead of p giving the number of counts.
Is there an function in R to do this or should I write it on my own by taking at a look
at the party:::plot.BinaryTree function
Here is how I managed to get what I wanted with the lovely igraph package. The code is an ugly hack. It will be great to have you suggestions
library(igraph)
rm(list=ls())
req.data <- as.data.frame(Titanic)
lookup <- c("M","F","C","A","N","Y")
names(lookup) <- c("Male","Female","Child","Adult","Yes","No")
req.data$board <- "board"
req.data$Class.m <- paste(req.data$board,req.data$Class,sep="_")
req.data$Sex.m <- paste(req.data$board,req.data$Class,req.data$Sex,
sep="_")
req.data$Age.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,sep="_")
req.data$Survived.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,req.data$Survived,sep="_")
tmp <- data.frame(from=
do.call("c",lapply(req.data[,c("board",
"Class.m",
"Sex.m",
"Age.m")],as.character)),
to=do.call("c",lapply(req.data[,c("Class.m",
"Sex.m",
"Age.m",
"Survived.m")],as.character)),
stringsAsFactors=FALSE)
tmp <- tmp [!duplicated(tmp ),];rownames(tmp) <- NULL
tmp$num <- unlist(lapply(strsplit(tmp$to,"_"),
FUN=function(x){
check1 <- req.data$Class==x[2]
check2 <- req.data$Sex == x[3]
check3 <- req.data$Age == x[4]
check4 <- req.data$Survived == x[5]
sum(req.data$Freq[ifelse(is.na(check1),TRUE,check1) &
ifelse(is.na(check2),TRUE,check2) &
ifelse(is.na(check3),TRUE,check3) &
ifelse(is.na(check4),TRUE,check4)])}))
g <- graph.data.frame(tmp, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5,vertex.size=7)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
### To find the case for crew members
tmp1 <- tmp [grepl("Crew",tmp$from),];rownames(tmp1) <- NULL
g <- graph.data.frame(tmp1, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp1$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
Here is the plot I generate. You can modify the vertex/edge colors/size as you want
This is pretty close and looks a lot easier to me.. I post it here in case it may be of use. First I convert the ftable to a more traditional long data frame using expand.dft https://stat.ethz.ch/pipermail/r-help/2009-January/185561.html Then I just use the plot.dendrite function from the plotrix package.
expand.dft <- function(x, var.names = NULL, freq = "Freq", ...)
{
# allow: a table object, or a data frame in frequency form
if(inherits(x, "table"))
x <- as.data.frame.table(x, responseName = freq)
freq.col <- which(colnames(x) == freq)
if (length(freq.col) == 0)
stop(paste(sQuote("freq"), "not found in column names"))
DF <- sapply(1:nrow(x),
function(i) x[rep(i, each = x[i, freq.col]), ],
simplify = FALSE)
DF <- do.call("rbind", DF)[, -freq.col]
for (i in 1:ncol(DF))
{
DF[[i]] <- type.convert(as.character(DF[[i]]), ...)
}
rownames(DF) <- NULL
if (!is.null(var.names))
{
if (length(var.names) < dim(DF)[2])
{
stop(paste("Too few", sQuote("var.names"), "given."))
} else if (length(var.names) > dim(DF)[2]) {
stop(paste("Too many", sQuote("var.names"), "given."))
} else {
names(DF) <- var.names
}
}
DF
}
library(plotrix)
r = ftable(Titanic)
plot.dendrite(makeDendrite(expand.dft(data.frame(r))))