Adding vector next to data.frame under new column name - r

I am experimenting with R and would like to implement a loop which runs 1000000 times and creates a vector of length 10 and adds each vector to a data frame under the name cycle and the number it has iterated.
This is my current code:
loser <- 100
winner <- 500
percentageWinner <- 70
runns <- 1000000
numbs <- 10
for(i in runns ) {
randNumb <- runif(numbs, min=0, max=100)
outcome <- ifelse(randNumb < percentageWinner, winner, loser) # true are winners and false are losers
df <- data.frame(outcome)
colnames(df)[which(names(df) == "outcome")] <- paste("cycle",i)
}
df
I am struggeling to add the vector next to the other data.frame column.
Any suggestions, how to do that?
I appreciate your replies!

In your code, at each iteration of your for loop, you overwrite i by 1 (i <- 1). And if you remove it, it will be always equal to runns, i.e only 1 loop.
You need to change your code for something like:
loser <- 100
winner <- 500
percentageWinner <- 70
runns <- 1000000
numbs <- 10
outcome <- matrix(NA, numbs, runns)
for(i in seq_len(runns)) {
randNumb <- runif(numbs, min=0, max=100)
outcome[,i] <- ifelse(randNumb < percentageWinner, winner, loser)
}
df <- data.frame(outcome)
colnames(df) <- paste0("cycle",seq_len(runns))
Or you can avoid the loop:
randNumb <- runif(numbs*runns, min=0, max=100)
outcome <- ifelse(randNumb < percentageWinner, winner, loser)
outcome <- matrix(outcome, numbs, runns)
df <- data.frame(outcome)
colnames(df) <- paste0("cycle",seq_len(runns))

Related

undefined columns selected and cannot xtfrm data frame error

I am trying to write a code that checks for outliers based on IQR and change those respective values to "NA". So I wrote this:
dt <- rnorm(200)
dg <- rnorm(200)
dh <- rnorm(200)
l <- c(1,3) #List of relevant columns
df <- data.frame(dt,dg,dh)
To check if the column contains any outliers and change their value to NA:
vector.is.empty <- function(x) return(length(x) ==0)
#Checks for empty values in vector and returns booleans.
for (i in 1:length(l)){
IDX <- l[i]
BP <- boxplot.stats(df[IDX])
OutIDX <- which(df[IDX] %in% BP$out)
if (vector.is.empty(OutIDX)==FALSE){
for (u in 1:length(OutIDX)){
IDX2 <- OutIDX[u]
df[IDX2,IDX] <- NA
}
}
}
So, when I run this code, I get these error messages:
I've tried to search online for any good answers. but I'm not sure why they claim that the column is unspecified. Any clues here?
I would do something like that in order to replace the outliers:
# Set a seed (to make the example reproducible)
set.seed(31415)
# Generate the data.frame
df <- data.frame(dt = rnorm(100), dg = rnorm(100), dh = rnorm(100))
# A list to save the result of boxplot.stats()
l <- list()
for (i in 1:ncol(df)){
l[[i]] <- boxplot.stats(df[,i])
df[which(df[,i]==l[[i]]$out),i] <- NA
}
# Which values have been replaced?
lapply(l, function(x) x$out)

Creating R data frames within a loop

I have some code which uses a loop to calculate a water balance for catchments (watersheds) for 8 catchments. I would like the loop to write it's output to a dataframe in R but the only way I can work out how to do this is to write it to csv (inside the loop), then outside of the loop read each of the csv files separately. I feel as though there could be a better way to do this - any ideas?
This is my code: (note it is part of a shiny app, hence my desire to avoid reading and writing csv files)
WB_catchments <- function (){
for (i in 1:8){
file_name <- gsub(" ", "", paste("outputs\\", Lake_name[i], "_catchment_water_balance.csv"))
p <- pts()[[i]]
Rain_in_WB <- RAIN() %>% filter(Grid_id %in% p)
Rain_in_WB$Grid_id <- NULL #remove Grid_id column
Rain_in_WB <- colSums(Rain_in_WB, na.rm = TRUE) # sum over catchment
AET_out_WB <- AET() %>% filter(Grid_id %in% p)
AET_out_WB$Grid_id <- NULL #remove Grid_id column
AET_out_WB <- -1*colSums(AET_out_WB, na.rm = TRUE) # sum over catchment and multiply by -1 as is an output
Evap_WB <- -1*EVAP_lakes[i,]
SW_in_WB <- SW_in_C[i,]
GW_in_WB <- GW_in_C[i,]
SW_out_WB <- -1*SW_out_C[i,]
GW_out_WB <- -1*GW_out_C[i,]
stor_WB <- STOR[i,]
out_catchment <- -1*outside[i,]
bal <- as.data.frame(cbind(WY, Rain_in_WB, SW_in_WB, GW_in_WB, AET_out_WB, Evap_WB, SW_out_WB, GW_out_WB, stor_WB, out_catchment))
bal <- mutate(bal, "res" = rowSums(bal[,2:10], na.rm = TRUE))
colnames(bal) <- c("WaterYear", "Rain", "SW_in", "GW_in", "AET", "Evap", "SW_out", "GW_out", "Storage", "Water_out_of_Greater_Tarawera_Catchments", "Residual")
write.csv(bal, file_name)
}
}
WB_catchments()
Okareka_WB_C <- read.csv("outputs\\Okareka_catchment_water_balance.csv")
Okaro_WB_C <- read.csv("outputs\\Okaro_catchment_water_balance.csv")
Okataina_WB_C <- read.csv("outputs\\Okataina_catchment_water_balance.csv")
Rerewhakaaitu_WB_C <- read.csv("outputs\\Rerewhakaaitu_catchment_water_balance.csv")
Rotokakahi_WB_C <- read.csv("outputs\\Rotokakahi_catchment_water_balance.csv")
Rotomahana_WB_C <- read.csv("outputs\\Rotomahana_catchment_water_balance.csv")
Tarawera_WB_C <- read.csv("outputs\\Tarawera_catchment_water_balance.csv")
Tikitapu_WB_C <- read.csv("outputs\\Tikitapu_catchment_water_balance.csv")
Instead of posting some very special code snippets, it is in most cases to post a toy example. Here an artificial example how to fill a data frame in a loop. As R is a vectorized language, it is often to avoid a loop at all. Compare the two cases below:
## number of cases
N <- 10
### looped version =====
df <- data.frame(
rain=rep(0, N),
evap=rep(0, N)
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(1, min=0, max=10)
evap <- runif(1, min=1, max=5)
df[i, ] <- c(rain, evap)
}
df
### vectorized version =====
rain <- runif(N, min=0, max=10)
evap <- runif(N, min=1, max=5)
df2 <- data.frame(
rain=rain,
evap=evap
)
df2
If your calculations return more than one row in each iteration and you don't know beforehand how many, grow the data frame like this:
## empty data frame
df3 <- data.frame(
rain=NULL,
evap=NULL
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df3 <- rbind(df3, cbind(rain, evap))
}
df3
Edit: Create several data frames (as elements of a list)
If separate data frames are needed, it is a good idea to put them together in a list. INstead of a loop,l we can use lapply:
create_df <- function(i) {
# optionally: do something with i, e.g. select file name
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df <- data.frame(
rain=rain,
evap=evap
)
}
## lapply does the "loop" and returns a list of data frames
df_list <- lapply(1:8, create_df)
df_list[[7]] # returns 7th data frame
Another way I got this working was by using assign(file_name, bal, envir = .GlobalEnv) instead of write.csv(bal, file_name) in the last line of my function

range problems needed to solve

I want to combine range, mean and sd using cbind (each has 10 numbers), and I used range function to calculate range for each variable in my dataset. However, the range is atomic, and the output is like this:
my output
This is my data
BAA BAF Data Science
1 1 1
0 0 0
1 0 0
In my output, R has separated the range and produced 20 numbers. Line 1 and Line 2 should be the range for my first variable. Does anyone know how to solve this? This is my code below that produced this output:
range_com <- c()
mean_com <- c()
sd_com <- c()
dyad.realized_subset <- subset(dyad.realized, select=c(BAA,BAF,`Data
Science`,`Life Science`,Engineer,`Previous Raised`,`Max Raise`,Age,
Patent,`Committed Amount ($K)`))
range1 <- range(dyad.realized_subset$BAA,na.rm=T)
range2 <- range(dyad.realized_subset$BAF,na.rm=T)
range3 <- range(dyad.realized_subset$`Data Science`,na.rm=T)
range4 <- range(dyad.realized_subset$`Life Science`,na.rm=T)
range5 <- range(dyad.realized_subset$Engineer,na.rm=T)
range6 <- range(dyad.realized_subset$`Previous Raised`,na.rm=T)
range7 <- range(dyad.realized_subset$`Max Raise`,na.rm=T)
range8 <- range(dyad.realized_subset$Age,na.rm=T)
range9 <- range(dyad.realized_subset$Patent,na.rm=T)
range10 <- range(dyad.realized_subset$`Committed Amount ($K)`,na.rm=T)
range_com <-c(range1,range2,range3,range4,range5,range6,range7,range8,range9,range10)
for(i in seq(dyad.realized_subset)){
mean_com[i] <- mean(dyad.realized_subset[[i]], na.rm=T)
sd_com[i] <- sd(dyad.realized_subset[[i]],na.rm=T) }
# Bind and output table ####
desc_dyads <- cbind(range_dyads,mean_dyads,sd_dyads)
You can try using sapply to calculate mean, sd and range for each column in dyad.realized_subset together.
t(sapply(dyad.realized_subset, function(x) {
setNames(c(mean(x), sd(x), toString(range(x))), c('mean', 'sd', 'range'))
})) -> desc_dyads
desc_dyads

For loop function is looping too many times

I am calculating a community weighted mean of functional trait values (studying forestry). I have to multiply the relative abundances of each species (tree) by the trait values. I have 2dataframes, 1 with the relative abundances of each species within each site and one with the average trait values for each species. I made a loop to automize the calculation, but the endresults return the multiplication 13 times instead of 1 time (I have 13plots, so maybe it has something to do with this) I'm already busy with this script for several days since i'm new to R, but i have to do this for my masterthesis. I think I reached my limit of logical thinking today and can't find my error :) can someone help me please? I'll paste the script below:
load data, apply some column names, fill NAs with 0
library(data.table)
traits <- read.csv("Trait value.csv", sep = ";")
plots_Maiz <- read.csv("CWM Maiz plot.csv", sep = ";")
plots_Maiz[is.na(plots_Maiz)] <- 0
colnames(plots_Maiz) <- c("site", "species","y0","y1", "y2", "y3", "y4", "y5")
traits[,1:17][is.na(traits[,1:17])] <- 0
#function for finding the corresponding species for a plot in the traitlist
traitsf <- function(df, traitlist){
plottraits <- subset(traitlist, species %in% df[,2])
return(plottraits)
}
traitcalc <- function(traits, plots_Maiz){
multlist <- list()
blist <- list()
vmult <- vector()
tickcount <- 0
plotsplit <- split.data.frame(plots_Maiz, plots_Maiz$site)
testlist <- lapply(plotsplit, traitsf, traitlist = traits)
for (q in 1:length(plotsplit)){
df1 <- testlist[[q]]
df2 <- plotsplit[[q]]
plot <- as.character(plotsplit[[q]][1,1])
for (i in 1:nrow(df1)){
v <- as.numeric(as.vector(t(df1[i,2:ncol(df1)])))
species <- as.character(df1[i,1])
for (j in 1:(ncol(df2)-2)){
tickcount <- tickcount + 1
vmult <-as.vector(v * (as.numeric(as.vector(df2[i,j+2]))))
vmult <- as.list(c(vmult, j-1, species, plot))
multlist[[tickcount]] <- vmult
}
}
b <- do.call(rbind, multlist)
b <- data.table::rbindlist(multlist)
blist[[q]] <- b
}
return(blist)
}
endresults <- traitcalc(traits,plots_Maiz)
endresultsdf2<- do.call("rbind", endresults)

creating a function for processing my dataframe calculations

I am doing systematic calculations for my created dataframe. I have the code for the calculations but I would like to:
1) Wite it as a function and calling it for the dataframe I created.
2) reset the calculations for next ID in the dataframe.
I would appreciate your help and advice on this.
The dataframe is created in R using the following code:
#Create a dataframe
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
The other thing is that the previous code assumes the subject ID=1 for all time points. If subject ID=2 when the WT (weight) changes to 120. How can I reset the calculations and make it automated for all subject IDs in the dataframe? In this case the original dataframe would be like this:
#code:
rm(list=ls(all=TRUE))
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
df$ID[(df$WT>=120)==T] <- 2
df$TIME[df$ID==2] <- c(seq(0,20,1))
Thank you in advance!
In general, when doing calculations on different subject's data, I like to split the dataframe by ID, pass the vector of individual subject data into a for loop, do all the calculations, build a vector containing all the newly calculated data and then collapse the resultant and return the dataframe with all the numbers you want. This allows for a lot of control over what you do for each subject
subjects = split(df, df$ID)
forResults = vector("list", length=length(subjects))
# initialize these constants
C <- 2
V <- 10
k <- C/V
myFunc = function(data, resultsArray){
for(k in seq_along(subjects)){
df = subjects[[k]]
df$A1 = 100 # I assume this should be 100 for t=0 for each subject?
# you could vectorize this nested for loop..
for(i in 2:nrow(df)) {
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
# you can add all sorts of other calculations you want to do on each subject's data
# when you're done doing calculations, put the resultant into
# the resultsArray and we'll rebuild the dataframe with all the new variables
resultsArray[[k]] = df
# if you're not using RStudio, then you want to use dev.new() to instantiate a new plot canvas
# dev.new() # dont need this if you're using RStudio (which doesnt allow multiple plots open)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
# collapse the results vector into a dataframe
resultsDF = do.call(rbind, resultsArray)
return(resultsDF)
}
results = myFunc(subjects, forResults)
Do you want this:
ddf <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
myfn = function(df){
dosetimes <- c(0,6,12,18)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
myfn(ddf)
For multiple calls:
for(i in 1:N) {
myfn(ddf[ddf$ID==i,])
readline(prompt="Press <Enter> to continue...")
}

Resources