creating a function for processing my dataframe calculations - r

I am doing systematic calculations for my created dataframe. I have the code for the calculations but I would like to:
1) Wite it as a function and calling it for the dataframe I created.
2) reset the calculations for next ID in the dataframe.
I would appreciate your help and advice on this.
The dataframe is created in R using the following code:
#Create a dataframe
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
The other thing is that the previous code assumes the subject ID=1 for all time points. If subject ID=2 when the WT (weight) changes to 120. How can I reset the calculations and make it automated for all subject IDs in the dataframe? In this case the original dataframe would be like this:
#code:
rm(list=ls(all=TRUE))
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
df$ID[(df$WT>=120)==T] <- 2
df$TIME[df$ID==2] <- c(seq(0,20,1))
Thank you in advance!

In general, when doing calculations on different subject's data, I like to split the dataframe by ID, pass the vector of individual subject data into a for loop, do all the calculations, build a vector containing all the newly calculated data and then collapse the resultant and return the dataframe with all the numbers you want. This allows for a lot of control over what you do for each subject
subjects = split(df, df$ID)
forResults = vector("list", length=length(subjects))
# initialize these constants
C <- 2
V <- 10
k <- C/V
myFunc = function(data, resultsArray){
for(k in seq_along(subjects)){
df = subjects[[k]]
df$A1 = 100 # I assume this should be 100 for t=0 for each subject?
# you could vectorize this nested for loop..
for(i in 2:nrow(df)) {
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
# you can add all sorts of other calculations you want to do on each subject's data
# when you're done doing calculations, put the resultant into
# the resultsArray and we'll rebuild the dataframe with all the new variables
resultsArray[[k]] = df
# if you're not using RStudio, then you want to use dev.new() to instantiate a new plot canvas
# dev.new() # dont need this if you're using RStudio (which doesnt allow multiple plots open)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
# collapse the results vector into a dataframe
resultsDF = do.call(rbind, resultsArray)
return(resultsDF)
}
results = myFunc(subjects, forResults)

Do you want this:
ddf <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
myfn = function(df){
dosetimes <- c(0,6,12,18)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
myfn(ddf)
For multiple calls:
for(i in 1:N) {
myfn(ddf[ddf$ID==i,])
readline(prompt="Press <Enter> to continue...")
}

Related

undefined columns selected and cannot xtfrm data frame error

I am trying to write a code that checks for outliers based on IQR and change those respective values to "NA". So I wrote this:
dt <- rnorm(200)
dg <- rnorm(200)
dh <- rnorm(200)
l <- c(1,3) #List of relevant columns
df <- data.frame(dt,dg,dh)
To check if the column contains any outliers and change their value to NA:
vector.is.empty <- function(x) return(length(x) ==0)
#Checks for empty values in vector and returns booleans.
for (i in 1:length(l)){
IDX <- l[i]
BP <- boxplot.stats(df[IDX])
OutIDX <- which(df[IDX] %in% BP$out)
if (vector.is.empty(OutIDX)==FALSE){
for (u in 1:length(OutIDX)){
IDX2 <- OutIDX[u]
df[IDX2,IDX] <- NA
}
}
}
So, when I run this code, I get these error messages:
I've tried to search online for any good answers. but I'm not sure why they claim that the column is unspecified. Any clues here?
I would do something like that in order to replace the outliers:
# Set a seed (to make the example reproducible)
set.seed(31415)
# Generate the data.frame
df <- data.frame(dt = rnorm(100), dg = rnorm(100), dh = rnorm(100))
# A list to save the result of boxplot.stats()
l <- list()
for (i in 1:ncol(df)){
l[[i]] <- boxplot.stats(df[,i])
df[which(df[,i]==l[[i]]$out),i] <- NA
}
# Which values have been replaced?
lapply(l, function(x) x$out)

Creating R data frames within a loop

I have some code which uses a loop to calculate a water balance for catchments (watersheds) for 8 catchments. I would like the loop to write it's output to a dataframe in R but the only way I can work out how to do this is to write it to csv (inside the loop), then outside of the loop read each of the csv files separately. I feel as though there could be a better way to do this - any ideas?
This is my code: (note it is part of a shiny app, hence my desire to avoid reading and writing csv files)
WB_catchments <- function (){
for (i in 1:8){
file_name <- gsub(" ", "", paste("outputs\\", Lake_name[i], "_catchment_water_balance.csv"))
p <- pts()[[i]]
Rain_in_WB <- RAIN() %>% filter(Grid_id %in% p)
Rain_in_WB$Grid_id <- NULL #remove Grid_id column
Rain_in_WB <- colSums(Rain_in_WB, na.rm = TRUE) # sum over catchment
AET_out_WB <- AET() %>% filter(Grid_id %in% p)
AET_out_WB$Grid_id <- NULL #remove Grid_id column
AET_out_WB <- -1*colSums(AET_out_WB, na.rm = TRUE) # sum over catchment and multiply by -1 as is an output
Evap_WB <- -1*EVAP_lakes[i,]
SW_in_WB <- SW_in_C[i,]
GW_in_WB <- GW_in_C[i,]
SW_out_WB <- -1*SW_out_C[i,]
GW_out_WB <- -1*GW_out_C[i,]
stor_WB <- STOR[i,]
out_catchment <- -1*outside[i,]
bal <- as.data.frame(cbind(WY, Rain_in_WB, SW_in_WB, GW_in_WB, AET_out_WB, Evap_WB, SW_out_WB, GW_out_WB, stor_WB, out_catchment))
bal <- mutate(bal, "res" = rowSums(bal[,2:10], na.rm = TRUE))
colnames(bal) <- c("WaterYear", "Rain", "SW_in", "GW_in", "AET", "Evap", "SW_out", "GW_out", "Storage", "Water_out_of_Greater_Tarawera_Catchments", "Residual")
write.csv(bal, file_name)
}
}
WB_catchments()
Okareka_WB_C <- read.csv("outputs\\Okareka_catchment_water_balance.csv")
Okaro_WB_C <- read.csv("outputs\\Okaro_catchment_water_balance.csv")
Okataina_WB_C <- read.csv("outputs\\Okataina_catchment_water_balance.csv")
Rerewhakaaitu_WB_C <- read.csv("outputs\\Rerewhakaaitu_catchment_water_balance.csv")
Rotokakahi_WB_C <- read.csv("outputs\\Rotokakahi_catchment_water_balance.csv")
Rotomahana_WB_C <- read.csv("outputs\\Rotomahana_catchment_water_balance.csv")
Tarawera_WB_C <- read.csv("outputs\\Tarawera_catchment_water_balance.csv")
Tikitapu_WB_C <- read.csv("outputs\\Tikitapu_catchment_water_balance.csv")
Instead of posting some very special code snippets, it is in most cases to post a toy example. Here an artificial example how to fill a data frame in a loop. As R is a vectorized language, it is often to avoid a loop at all. Compare the two cases below:
## number of cases
N <- 10
### looped version =====
df <- data.frame(
rain=rep(0, N),
evap=rep(0, N)
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(1, min=0, max=10)
evap <- runif(1, min=1, max=5)
df[i, ] <- c(rain, evap)
}
df
### vectorized version =====
rain <- runif(N, min=0, max=10)
evap <- runif(N, min=1, max=5)
df2 <- data.frame(
rain=rain,
evap=evap
)
df2
If your calculations return more than one row in each iteration and you don't know beforehand how many, grow the data frame like this:
## empty data frame
df3 <- data.frame(
rain=NULL,
evap=NULL
)
for (i in 1:N) {
# instead of runif, do your calculations
# ...
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df3 <- rbind(df3, cbind(rain, evap))
}
df3
Edit: Create several data frames (as elements of a list)
If separate data frames are needed, it is a good idea to put them together in a list. INstead of a loop,l we can use lapply:
create_df <- function(i) {
# optionally: do something with i, e.g. select file name
rain <- runif(7, min=0, max=10)
evap <- runif(7, min=1, max=5)
df <- data.frame(
rain=rain,
evap=evap
)
}
## lapply does the "loop" and returns a list of data frames
df_list <- lapply(1:8, create_df)
df_list[[7]] # returns 7th data frame
Another way I got this working was by using assign(file_name, bal, envir = .GlobalEnv) instead of write.csv(bal, file_name) in the last line of my function

How to calculate correlation against 2 data frames and output by condition using R?

I am going to calculate correlation between 2 gene expression data frames, which are protein and RNA data frames. Data shows here.https://drive.google.com/file/d/1S5bYm8baqLf40KqmEbOO59YrDLp4sWnL/view
https://drive.google.com/file/d/1wmZF4v8Ehq2giKldFWHv6UUBzu8Ncsy2/view
Rownames are gene, and colnames are samples. let's say, gene1 represents gene in 1st df, gene2 represents gene in 2nd df.
The output I want is a table, only contain the correlation meet the condition, not all correlation value. It looks like this:enter image description here
Here is what and how I do:
A. I need to calculate correlation of gene1 and gene2.
B. And I want to filter that |correlation(gene1, gene1)|<0.3 & |correlation(gene1, gene2)|>0.6, which means correlation of same gene in 2 df smaller than 0.3, and correlation of different gene in 2 df higher than 0.6
C. Return a table with columns are 'gene, gene, correlation', the output I want is a table like this:enter image description here
This is my code, which cannot make what I need, and there are more than 10,000 rows and 77 cols of each data frames, the job was been killed in GPU, which run more than 2 hours, please make code easy and use less memory as less as possible.
func.cor3 <- function(x,y){#func.cor
na1 <- which(is.na(x)==TRUE)
na2 <- which(is.na(y)==TRUE)
nas <- union(na1,na2)
if(length(nas)!=0){
x <- x[-nas]
y <- y[-nas]
}
nn <- cor(x,y)
nn <- round(nn,5)
nn <- format(nn, nsmall = 5)
if(nn>0.6 & nn< -0.6){
w <-as.character(str_c(nn,"_r1p2>0.6"))
return(w)
}
else if(nn<0.3 & nn> -0.3){
w <-as.character(str_c(nn,"_r1p1<0.3"))
return(w)
}else{
}
P2 <- P[,-1]
rownames(P2) <- P[,1]
nn <- setdiff(colnames(P2),colnames(R2))
ns <- vector();n=1
for (i in 1:length(nn)) {
nn.i <- nn[i]
w.i <- which(colnames(P2)==nn.i)
ns[n] <- w.i
n=n+1
}
P2 <- P2[,-ns]
nk <- colnames(P2)
ns <- order(nk)
P2 <- P2[,ns]
P2_1000 <- P2[1:1000,1:77]# try first 1000 rows of data
R <- read.csv(file = "RNA_Breast_2.csv",header = TRUE)
R2 <- R[,-1]
rownames(R2) <- R[,1]
nk <- colnames(R2)
ns <- order(nk)
R2 <- R2[,ns]
R2_1000 <- R2[1:1000,]
D_M <- matrix(rep(NA,3*nrow(R2_1000)*nrow(P2_1000)),ncol =3 )
colnames(D_M) <- c("gene1_RNA","gene_Protein","correlation")
n=1
for (i in 1:nrow(R2_1000)) {
for (j in 1:nrow(P2_1000)) {
D_M[n,1] <- rownames(R2_1000)[i]
D_M[n,2] <- rownames(P2_1000)[j]
D_M[n,3] <- func.cor3(as.numeric(R2_1000[i,]),as.numeric(P2_1000[j,]))
n=n+1
}
}
D_M

For loop function is looping too many times

I am calculating a community weighted mean of functional trait values (studying forestry). I have to multiply the relative abundances of each species (tree) by the trait values. I have 2dataframes, 1 with the relative abundances of each species within each site and one with the average trait values for each species. I made a loop to automize the calculation, but the endresults return the multiplication 13 times instead of 1 time (I have 13plots, so maybe it has something to do with this) I'm already busy with this script for several days since i'm new to R, but i have to do this for my masterthesis. I think I reached my limit of logical thinking today and can't find my error :) can someone help me please? I'll paste the script below:
load data, apply some column names, fill NAs with 0
library(data.table)
traits <- read.csv("Trait value.csv", sep = ";")
plots_Maiz <- read.csv("CWM Maiz plot.csv", sep = ";")
plots_Maiz[is.na(plots_Maiz)] <- 0
colnames(plots_Maiz) <- c("site", "species","y0","y1", "y2", "y3", "y4", "y5")
traits[,1:17][is.na(traits[,1:17])] <- 0
#function for finding the corresponding species for a plot in the traitlist
traitsf <- function(df, traitlist){
plottraits <- subset(traitlist, species %in% df[,2])
return(plottraits)
}
traitcalc <- function(traits, plots_Maiz){
multlist <- list()
blist <- list()
vmult <- vector()
tickcount <- 0
plotsplit <- split.data.frame(plots_Maiz, plots_Maiz$site)
testlist <- lapply(plotsplit, traitsf, traitlist = traits)
for (q in 1:length(plotsplit)){
df1 <- testlist[[q]]
df2 <- plotsplit[[q]]
plot <- as.character(plotsplit[[q]][1,1])
for (i in 1:nrow(df1)){
v <- as.numeric(as.vector(t(df1[i,2:ncol(df1)])))
species <- as.character(df1[i,1])
for (j in 1:(ncol(df2)-2)){
tickcount <- tickcount + 1
vmult <-as.vector(v * (as.numeric(as.vector(df2[i,j+2]))))
vmult <- as.list(c(vmult, j-1, species, plot))
multlist[[tickcount]] <- vmult
}
}
b <- do.call(rbind, multlist)
b <- data.table::rbindlist(multlist)
blist[[q]] <- b
}
return(blist)
}
endresults <- traitcalc(traits,plots_Maiz)
endresultsdf2<- do.call("rbind", endresults)

Adding vector next to data.frame under new column name

I am experimenting with R and would like to implement a loop which runs 1000000 times and creates a vector of length 10 and adds each vector to a data frame under the name cycle and the number it has iterated.
This is my current code:
loser <- 100
winner <- 500
percentageWinner <- 70
runns <- 1000000
numbs <- 10
for(i in runns ) {
randNumb <- runif(numbs, min=0, max=100)
outcome <- ifelse(randNumb < percentageWinner, winner, loser) # true are winners and false are losers
df <- data.frame(outcome)
colnames(df)[which(names(df) == "outcome")] <- paste("cycle",i)
}
df
I am struggeling to add the vector next to the other data.frame column.
Any suggestions, how to do that?
I appreciate your replies!
In your code, at each iteration of your for loop, you overwrite i by 1 (i <- 1). And if you remove it, it will be always equal to runns, i.e only 1 loop.
You need to change your code for something like:
loser <- 100
winner <- 500
percentageWinner <- 70
runns <- 1000000
numbs <- 10
outcome <- matrix(NA, numbs, runns)
for(i in seq_len(runns)) {
randNumb <- runif(numbs, min=0, max=100)
outcome[,i] <- ifelse(randNumb < percentageWinner, winner, loser)
}
df <- data.frame(outcome)
colnames(df) <- paste0("cycle",seq_len(runns))
Or you can avoid the loop:
randNumb <- runif(numbs*runns, min=0, max=100)
outcome <- ifelse(randNumb < percentageWinner, winner, loser)
outcome <- matrix(outcome, numbs, runns)
df <- data.frame(outcome)
colnames(df) <- paste0("cycle",seq_len(runns))

Resources