Ok, so I have code that works out the annualized/cumulative return of a stock price series. I want to do the same thing over many files, so I made a loop to do so.
First some dummy data:
# Create dummy data
# Use lubridate to change timestamp to date format
# Use dplyr to arrange by ascending order
# Use fread from data.table to read .csv to data frame
require(lubridate)
require(data.table)
require(dplyr)
MSFT <- fread("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=MSFT&outputsize=full&apikey=6RSYX9BPXKZVXUS9&datatype=csv")
MSFT$timestamp <- ymd(MSFT$timestamp)
MSFT <- arrange(MSFT,timestamp)
AAPL <- fread("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=AAPL&outputsize=full&apikey=6RSYX9BPXKZVXUS9&datatype=csv")
AAPL$timestamp <- ymd(AAPL$timestamp)
AAPL <- arrange(AAPL,timestamp)
NFLX <- fread("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=NFLX&outputsize=full&apikey=6RSYX9BPXKZVXUS9&datatype=csv")
NFLX$timestamp <- ymd(NFLX$timestamp)
NFLX <- arrange(NFLX,timestamp)
TSLA <- fread("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=TSLA&outputsize=full&apikey=6RSYX9BPXKZVXUS9&datatype=csv")
TSLA$timestamp <- ymd(TSLA$timestamp)
TSLA <- arrange(TSLA,timestamp)
# Place data frames in a list
df.list <- list(MSFT,AAPL,NFLX,TSLA)
# Specify file names
file.names <- c("MSFT","AAPL","NFLX","TSLA")
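As an aside, the four near-identical download blocks above could be collapsed into one helper plus lapply; a sketch under the same URL format (fetch_daily is a made-up name):
# Hypothetical helper: fetch and tidy one symbol (same steps as above)
fetch_daily <- function(symbol) {
url <- paste0("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=",
symbol, "&outputsize=full&apikey=6RSYX9BPXKZVXUS9&datatype=csv")
x <- fread(url)
x$timestamp <- ymd(x$timestamp)
arrange(x, timestamp)
}
df.list <- lapply(file.names, fetch_daily)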
That prepares the data.
Next I want to calculate the cumulative and annualized returns for each series. I put this in a function, then call the function in a loop:
# Create function for performing commands.
genAnnualized = function(x){
next.file <- data.frame(df.list[[1]],stringsAsFactors=FALSE)
next.name <- paste0(file.names[i])
new.df <- data.frame(next.file)
# Calculate annualized return
# Make prices vector
prices <- new.df[, "close", drop = FALSE]
# Let n denote the number of time periods:
n <- nrow(prices)
# Calculate close to close returns
# lead in with rep(NA, 1) to keep the vector length comparable to the data frame
close_ret <- c(rep(NA, 1),(prices[2:n, 1] - prices[1:(n-1), 1])/prices[1:(n-1), 1])
close_ret[1] <- 0
# Compute continuously compounded returns (log returns)
close_ccret <- log(prices[2:n, 1]) - log(prices[1:(n-1), 1])
# Compute gross returns
close_gret <- 1 + close_ret # use close to close ret
# Compute future values
close_fv <- cumprod(close_gret)
# Obtain first and last values
ret.last <- tail(close_fv, n=1)
ret.first <- head(close_fv, n=1)
cum.ret <- (ret.last-ret.first)/ret.first
# Get First And Last row to calculate time between
ret.first.row <- head(new.df$timestamp, n=1)
ret.last.row <- tail(new.df$timestamp, n=1)
# Time diff
#trading.years.between <- as.numeric(difftime(as.Date(ret.last.row), as.Date(ret.first.row), unit="weeks"))/52.25
# Find time diff
ret.time <- ret.last.row - ret.first.row
ret.trading.years.between <- ret.time/365 #252 trading days or 365
ret.trading.years.between <- as.numeric(ret.trading.years.between, units="days") # Extract numerical value from time difference 'Time difference of 2837.208 days'
# Annualized return: (1 + cumulative return) ^ (1 / years between first and last observation) - 1
ret.annual.return <- (1+cum.ret) ^ (1/ret.trading.years.between) -1
########## Store annualized and cumulative return in data frame for each iteration #########
# Store file name as a row name :: next.name variable
# Store final annualized return :: ret.annual.return
# Store final cumulative return :: cum.ret
output.df <- cbind(cum.ret,ret.annual.return)
rownames(output.df) <- next.name
##################################################################
# Sanity check, use PerformanceAnalytics for annualized return
# TTR for returns
# Calculate Close-to-Close returns
require(TTR)
require(PerformanceAnalytics)
new.df$clret <- ROC(new.df$close, type = c("discrete"))
new.df$clret[1] <- 0
# Make time series object of returns and date
require(xts)
xts1 = xts(new.df$clret, order.by=as.Date(new.df$timestamp, format="%m/%d/%Y"))
Return.annualized(xts1)
Return.cumulative(xts1, geometric=TRUE)
}
And call the function to loop through each data frame in the data frame list:
for (i in 1:length(df.list)){
tryCatch({
genAnnualized(df.list[[i]])
}, error = function(e) { print(paste("i =", i, "failed:", conditionMessage(e))) })
}
That should make a reproducible example.
On each iteration, I wish to store the cumulative and annualized return of each series along with the name of the data set (so it's identifiable later).
I am attempting this with the below within my function:
output.df <- cbind(cum.ret,ret.annual.return)
rownames(output.df) <- next.name
I am specifying the names with:
file.names <- c("MSFT","AAPL","NFLX","TSLA")
and then in the function calling it:
next.name <- paste0(file.names[i])
I was hoping to paste the file name so I can tag the final output in the data frame.
I think I might need to repeat the name when naming each row or column, so that it tags both the cumulative return and the annualized return.
I think I have the general idea, but I have been wrestling with this for a few weeks, so I am looking for some assistance.
With the output data frame I can then organise the results into quartiles, etc., for further analysis.
My simplest solution was to rbind a one-row data frame for each element of file.names, naming that row after the corresponding file.
I've deleted the original comments for clarity (and added some of mine).
'genAnnualized' = function(df_list) {
next.file <- data.frame(df_list, stringsAsFactors=FALSE) # Put the parameter of the function here
next.name <- paste0(file.names[i])
new.df <- data.frame(next.file)
prices <- new.df[, "close", drop = FALSE]
n <- nrow(prices)
close_ret <- c(rep(NA, 1),(prices[2:n, 1] - prices[1:(n-1), 1])/prices[1:(n-1), 1])
close_ret[1] <- 0
close_ccret <- log(prices[2:n, 1]) - log(prices[1:(n-1), 1])
close_gret <- 1 + close_ret
close_fv <- cumprod(close_gret)
ret.last <- tail(close_fv, n=1)
ret.first <- head(close_fv, n=1)
cum.ret <- (ret.last-ret.first)/ret.first
ret.first.row <- head(new.df$timestamp, n=1)
ret.last.row <- tail(new.df$timestamp, n=1)
ret.time <- ret.last.row - ret.first.row
ret.trading.years.between <- ret.time/365
ret.trading.years.between <- as.numeric(ret.trading.years.between, units="days")
ret.annual.return <- (1+cum.ret) ^ (1/ret.trading.years.between) -1
output.df <- cbind(cum.ret,ret.annual.return) # retained from the original; superseded by out_df below
rownames(output.df) <- next.name
##################################################################
new.df$clret <- TTR::ROC(new.df$close, type = c("discrete"))
new.df$clret[1] <- 0
xts1 = xts::xts(new.df$clret, order.by=as.Date(new.df$timestamp, format="%m/%d/%Y"))
# Create the output of the function : a named data.frame
out_df <- data.frame("Annualized Return" = PerformanceAnalytics::Return.annualized(xts1),
"Cumulative Return" = PerformanceAnalytics::Return.cumulative(xts1, geometric=TRUE))
return(out_df)
}
# Initialize the output dataframe to which we will rowbind the results
cum_ret <- data.frame()
for (i in 1:length(df.list)){
temp <- genAnnualized(df.list[[i]] )
rownames(temp) <- file.names[i]
cum_ret <- rbind.data.frame(cum_ret, temp)
}
This gives a data frame whose named rows correspond to the files in df.list, with two columns for the annualized and cumulative returns.
> cum_ret
Annualized.Return Cumulative.Return
MSFT -0.02279597 -0.3361359
AAPL 0.02039616 0.4314812
NFLX 0.17454862 10.8991045
TSLA 0.44666765 13.8233571
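If you prefer to avoid growing cum_ret with rbind inside a loop, the same table can be built in one shot; a sketch using the genAnnualized defined above:
# Build the results list, bind it, and label the rows in one pass
result.list <- lapply(df.list, genAnnualized)
cum_ret <- do.call(rbind.data.frame, result.list)
rownames(cum_ret) <- file.names
From there, binning into quartiles for further analysis is one more step, e.g.:
cum_ret$quartile <- cut(cum_ret$Annualized.Return, breaks = quantile(cum_ret$Annualized.Return), include.lowest = TRUE, labels = 1:4)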
So I have sampled a set of lakes at x time points throughout the year. I have also deployed loggers etc. in the water, and I want to use daily averages from these loggers, from the time point of the visit back to x days/hours before. Sometimes I also just grab a sample at the time point of the visit.
This is my solution; it works just fine, but since I experiment a lot with model assumptions and perform sensitivity analyses, it runs unsatisfactorily slowly.
I seem to have solved most of my R problems with loops, and I often encounter more efficient scripts, so it would be very interesting to see some more effective alternatives to my code.
The code below just generates some dummy data.
library(dplyr)
library(lubridate)
do.pct.sat <- function(x,y,z){
t <- x
do <- y
p <- z
atm <- (p*100)/101325
do.sat <- atm*exp(-139.34411+157570.1/(t+273.15)-66423080/(t+273.15)^2+12438000000/(t+273.15)^3-862194900000/(t+273.15)^4)
do.pct.sat <- (do/do.sat)*100
return(do.pct.sat)
}#function for calculating the % oxygen saturation
#here's some dummy data resembling real data
date.initial <- as.POSIXct("2022-06-01")#deployment date
date.end <- as.POSIXct("2022-10-01")#date of retrieval
id <- c("a","b","c")#lake id
lake <- list()#make dataset list for each lake
s <- list()#list of dataframes for the samples from the lake logger timelines
#loop below generates dummy data. this is not part of the real script that I want to improve.
for(i in 1:3){
datetime <- seq(from = date.initial,to = date.end,by=10*60)#10 minute intervals from deploy to retrieve
l <- length(datetime)#vector length of datetime
#set dummy data
do <- rnorm(l,mean = 10,sd=3)#o2 conc.
pressure <- rnorm(l,mean = 980,sd=50)#baro pressure
temp <- rnorm(l,mean=15,sd=5)#water temp
k.z <- rnorm(l,mean=0.35,sd=0.1)#gas exchange coefficient / mixed layer depth
dosat.pct <- do.pct.sat(temp,do,pressure)#oxygen sat in %
iso <- as.data.frame(cbind(datetime,do,dosat.pct,temp,pressure,k.z))#bind dummy dataframe to resemble real data
iso$datetime <- as.POSIXct(iso$datetime,origin = "1970-01-01")
lake[[i]] <- iso#save the data frame to the lake logger list
samples <- as.POSIXct(sample((date.initial+5*24*60*60):date.end, 7, replace=FALSE),origin = "1970-01-01")#randomize 7 timepoints
s[[i]] <- as.data.frame(samples)#save it in empty data frame
s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s)
samples$samples <- round_date(samples$samples,unit="10 minutes")#rounds my random samples to closest 10 minute
Below is the function that I want to make more efficient (same libraries). I think it runs slowly because I process one date at a time before taking the next:
sample.lakes <- function(average=3){
dts <- list()#empty list
for(i in 1:length(lake)){
print(id[i])
data = lake[[i]]
y <- samples[grepl(id[i],samples$lake),]
dates <- y$samples
#empty vectors to fill with values sampled in loop
avg.kz <- vector()
sd.kz <- vector()
do.mgl <- vector()
dosat.pct <- vector()
temp.c <- vector()
for (k in 1:length(dates)){
print(k)
#below I filter the logger data to contain timepoint of sampling minus number of days I want the average from 'averages'.
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
#fill the empty vectors with value I desire, mean and sd k.z and point sample of the other variables.
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
temp.c[k] <- data[grepl(dates[k],data$datetime),]$temp
do.mgl[k] <- data[grepl(dates[k],data$datetime),]$do
dosat.pct[k] <- data[grepl(dates[k],data$datetime),]$dosat.pct
}
sd.kz[is.na(sd.kz)] <- 0
#add them to data frame y
y$dosat.pct <- dosat.pct
y$do.mgl <- do.mgl
y$temp.c <- temp.c
y$avg.kz <- avg.kz
y$sd.kz <- sd.kz
dts[[i]] <- y#add this lake's data frame to the list
}
iso <- bind_rows(dts)#make a complete dataframe with samples.
return(iso)
}
iso <- sample.lakes(average=4)#do not set average to > 5 in this example script
I would appreciate any suggestions a lot!
My guess is that this part using grepl:
data[grepl(dates[k],data$datetime),]
inside your inner for loop is slow.
Couldn't you instead try just seeing if the datetimes are the same with ==?
In addition, you only need to subset data once.
Try this as an alternative:
for (k in 1:length(dates)){
print(k)
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
sub_data <- data[data$datetime == dates[k], ]
temp.c[k] <- sub_data$temp
do.mgl[k] <- sub_data$do
dosat.pct[k] <- sub_data$dosat.pct
}
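You could go further and drop the point-sample lookups from the loop entirely: since the sample times were rounded onto the logger's 10-minute grid, match() finds every row at once. A sketch, assuming exact datetime matches:
# One vectorized lookup instead of three subsets per iteration
idx <- match(dates, data$datetime) # row index of each point sample (NA if no exact match)
temp.c <- data$temp[idx]
do.mgl <- data$do[idx]
dosat.pct <- data$dosat.pct[idx]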
I made an empty matrix to populate with time series data and forecasts using
pred <- matrix(rep(NA,80),20,4)
But when I try to populate the matrix with a for loop, I get an error message: "Error in pred[i, 2] <- forecast(fit.season, h = 1) : number of items to replace is not a multiple of replacement length".
library(fpp) # provides the ausbeer data and loads the forecast package
beer1 <- window(ausbeer, start=1990,end=c(2009,4))
n.end <- 2004.75 # 2004Q4
fit.season <- tslm(beer1 ~ season, data=beer1)
fit.trend <- tslm(beer1 ~ season + trend, data=beer1)
for(i in 1:20){
tmp0 <- 1990
tmp1 <- n.end+(i-1)*.25
tmp <- window(beer1,tmp0,tmp1)
pred[i,1] <- window(beer1,tmp1+.25,tmp1+.25) # actual data
# compute forecasts
pred[i,2] <- forecast(fit.season, h=1)
pred[i,3] <- forecast(fit.trend, h=1)
}
I know that the error message means the lengths aren't equal, so I checked the matrix: only the first element (row 1, column 1) was populated.
My window call seems okay, so I tried another set of functions in the loop:
for(i in 1:20){
tmp0 <- 1992
tmp1 <- n.end+(i-1)*.25
tmp <- window(beer1,tmp0,tmp1)
pred[i,1] <- window(beer1,tmp1+.25,tmp1+.25) # actual
# compute forecasts
pred[i,2] <- meanf(tmp, h=1)$mean
pred[i,3] <- rwf(tmp, h=1)$mean
pred[i,4] <- snaive(tmp, h=1)$mean
}
And the whole matrix was populated.
What's wrong with the one I initially did?
The forecast function returns an object of class forecast, not a vector. Replace the last two lines in your loop by
pred[i,2] <- forecast(fit.season, h=1)$mean
pred[i,3] <- forecast(fit.trend, h=1)$mean
to extract just the point forecasts.
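A quick way to see what forecast actually returns (a sketch):
library(forecast)
f <- forecast(fit.season, h = 1)
class(f) # "forecast" -- a list-like object, not a number
names(f) # components such as mean, lower, upper, ...
f$mean # the point forecast, a ts of length h
Note also that fit.season and fit.trend are fit once on the full beer1 series before the loop, so forecast(fit.season, h=1) returns the same value for every i; if you intended expanding-window forecasts, re-fit the models on tmp inside the loop.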
My script and one of the first 3 .csv files can be found in my GitHub folder.
I have split a list of NDVI and climate data into small .csv files with 34 years of data each.
Each 34-year block should then be split into two parts depending on a conflict year (saved in the same table) and a certain time range, but this part of the code already works.
Now I want to model the second part of the list against the climate data of the first part using multiple linear regression, which is also done.
I basically need a loop to store the coefficients from every run of the lm function on each .csv file in a new list.
I know that I can use lapply to loop and get the output as a list, but there are some missing parts to actually loop through the .csv files.
#load libraries
library(ggplot2)
library(readr)
library(tidyr)
library(dplyr)
library(ggpubr)
library(plyr)
library(tidyverse)
library(fs)
file_paths <- fs::dir_ls("E:\\PYTHON_ST\\breakCSV_PYTHON\\AIM_2_regions\\Afghanistan")
file_paths
#create empty list and fill with file paths and loop through them
file_contents <- list()
for (i in seq_along(file_paths)) { #seq_along for vectors (list of file paths is a vector)
file_contents[[i]] <- read_csv(file = file_paths[[i]])
for (i in seq_len(file_contents[[i]])){ # redundant?
# do all the following steps in every file
# Step 1)
# Define years to divide table
#select conflict year in df
ConflictYear = file_contents[[i]][1,9]
ConflictYear
# select Start year of regression in df
SlopeYears = file_contents[[i]][1,7] #to get slope years (e.g.17)
BCStartYear = ConflictYear-SlopeYears #to get start year for regression
BCStartYear
#End year of regression
ACEndYear = ConflictYear+(SlopeYears-1) # -1 because the conflict year is included
ACEndYear
# Step 2
#select needed rows from df
#no headers but row numbers. NDVI.Year = [r1-r34,c2]
NDVI.Year <- file_contents[[i]][1:34,2]
NDVI <- file_contents[[i]][1:34,21]
T.annual.max <- file_contents[[i]][1:34,19]
Prec.annual.max <- file_contents[[i]][1:34,20]
soilM.annual.max <- file_contents[[i]][1:34,18]
#Define BeforeConf and AfterConf depending on Slope Year number and Conflict Years
#Go through NDVI.Year till Conflict.Year (-1 year) since the conflict year is not included in bc
BeforeConf1 <- file_contents[[i]][ which(file_contents[[i]]$NDVI.Year >= BCStartYear & file_contents[[i]]$NDVI.Year < ConflictYear),] #eg. 1982 to 1999
BeforeConf2 <- c(NDVI.Year, NDVI, T.annual.max, Prec.annual.max, soilM.annual.max) #which columns to include
BeforeConf <- BeforeConf1[BeforeConf2] #create table
AfterConf1 <- myFiles[ which(file_contents[[i]]$NDVI.Year >= ConflictYear & file_contents[[i]]$NDVI.Year <= ACEndYear),] #eg. 1999 to 2015
AfterConf2 <- c(NDVI.Year, NDVI, T.annual.max, Prec.annual.max, soilM.annual.max)
AfterConf <- AfterConf1[AfterConf2]
#Step 3)a)
#create empty list, to fill with coefficient results from each model results for each csv file and safe in new list
#Create an empty df for the output coefficients
names <- c("(Intercept)","BeforeConf$T.annual.max","BeforeConf$Prec.annual.max","BeforeConf$soilM.annual.max")
coef_df <- data.frame()
for (k in names) coef_df[[k]] <- as.character()
#Apply Multiple Linear Regression
plyrFunc <- function(x){
model <- lm(NDVI ~ T.annual.max + Prec.annual.max + soilM.annual.max, data = BeforeConf)
return(summary(model)$coefficients[1,1:4])
}
coef_df <- ddply(BeforeConf, .(), x)
coef_DF
}}
Since you have code working for a single CSV, consider separating process and loop. Specifically:
Create a function that receives a single csv path as input parameter and does everything you need for a single file.
get_coeffs <- function(csv_path) {
df <- read.csv(csv_path)
### Step 1
# select conflict year, start year, and end year in df
ConflictYear <- df[1,9]
SlopeYears <- df[1,7] # to get slope years (e.g.17)
BCStartYear <- ConflictYear - SlopeYears # to get start year for regression
ACEndYear <- ConflictYear + (SlopeYears-1) # -1 because the conflict year is included
### Step 2
# select needed rows from df
#no headers but row numbers. NDVI.Year = [r1-r34,c2]
NDVI.Year <- df[1:34, 2]
NDVI <- df[1:34, 21]
T.annual.max <- df[1:34, 19]
Prec.annual.max <- df[1:34, 20]
soilM.annual.max <- df[1:34, 18]
# Define BeforeConf and AfterConf depending on Slope Year number and Conflict Years
# Go through NDVI.Year till Conflict.Year (-1 year) since the conflict year is not included in bc
BeforeConf1 <- df[ which(df$NDVI.Year >= BCStartYear & df$NDVI.Year < ConflictYear),]
BeforeConf2 <- c(NDVI.Year, NDVI, T.annual.max, Prec.annual.max, soilM.annual.max)
BeforeConf <- BeforeConf1[BeforeConf2] #create table
AfterConf1 <- myFiles[ which(df$NDVI.Year >= ConflictYear & df$NDVI.Year <= ACEndYear),]
AfterConf2 <- c(NDVI.Year, NDVI, T.annual.max, Prec.annual.max, soilM.annual.max)
AfterConf <- AfterConf1[AfterConf2]
### Step 3
tryCatch({
# Run model and return coefficients
model <- lm(NDVI ~ T.annual.max + Prec.annual.max + soilM.annual.max, data = BeforeConf)
return(summary(model)$coefficients[1,1:4])
}, error = function(e) {
print(e)
return(rep(NA, 4))
})
}
Then loop through the csv paths, passing each file into your function and building a list of results: use lapply for a list return, or sapply (or vapply, which specifies the length and type) for a simplified return such as a vector or matrix/array, where applicable.
mypath <- "E:\\PYTHON_ST\\breakCSV_PYTHON\\AIM_2_regions\\Afghanistan"
file_paths <- list.files(path=mypath, pattern="\\.csv$", full.names=TRUE)
# LIST RETURN
result_list <- lapply(file_paths, get_coeffs)
# MATRIX RETURN
results_matrix <- sapply(file_paths, get_coeffs)
results_matrix <- vapply(file_paths, get_coeffs, numeric(4))
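Since get_coeffs returns four numbers per file, the sapply/vapply versions give a 4 x n matrix with one column per file; transposing gives one row per csv (a sketch):
results_df <- as.data.frame(t(results_matrix)) # rows = files, columns = coefficients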
My dataset is called data and I have a column called time that contains time in mm:ss format. I also wrote a function functime(var1,var2).
I would like ultimately to use apply or vapply, with var2 set to a constant (let's say var2 = 6) and var1 taking each value of the column data$time.
Something like:
If
data$time <- c("10:10","11:00", "09:30"), when I do vapply(), I would like to get a
data$output <-c(functime(data$time[1],6),functime(data$time[2],6),functime(data$time[3],6))
which in this example is the same as
data$output <- c(functime("10:10",6),functime("11:00",6),functime("09:30",6))
My lame attempt at that is something like:
vapply(data$time,functime,var2 = 6,FUN.VALUE = 1)
The documentation for vapply says the call should be: vapply(x, fun, fun.value).
I am confused about how to tell vapply to take all rows of the data$time column as its first argument while holding the second argument fixed at 6.
Ultimately I would like to add my data$output to the original dataset using mutate.
Edit: (Include lines of data and function)
data$id <- c(9,6,5763,4)
data$time <- c("5:06","5:06","5:11","5:08")
data$city <-c("Kyle","Oklahoma","Monterey","Austin")
The function is:
calctime <- function(racePace, raceDistance){
# racePace is the per unit pace in mm:ss - character
# raceDistance is the total race distance - numeric
# Pace and race distance must use same units (km or mi or whatever)
# Seconds to character time function
CharMinSec <- function(sec){
outMin <- floor(sec/60)
outSec <- ((sec/60)-outMin)*60
if(outSec==0 | round(outSec)<10){
outChar <- paste0(outMin,":0",round(outSec))
} else {
outChar <- paste(outMin,round(outSec),sep=":")
}
outChar
}
paceMinSec <- as.numeric(strsplit(racePace,':')[[1]])
paceSec <- paceMinSec[1]*60+ paceMinSec[2]
raceMin <- floor(paceSec*raceDistance/60)
raceSec <- ((paceSec*raceDistance/60)-raceMin)*60
raceTime <- CharMinSec(raceMin*60+raceSec)
list(Seconds=raceSec)
}
# Example of 4:15/km for a half-marathon
calctime("4:15",21.097494)
calcTime <- function(pace,distance){
return (lubridate::period_to_seconds(lubridate::ms(pace)) * distance)
}
pace <- c("10:10","11:00", "09:30")
vapply(pace,calcTime,6,FUN.VALUE = 1)
## 10:10 11:00 09:30
## 36960 39960 34560
d <- tibble::as_tibble(list(pace = pace))
dplyr::mutate(d, raceSeconds = calcTime(pace,6))
## A tibble: 3 x 2
## pace raceSeconds
## <chr> <dbl>
## 1 10:10 36960
## 2 11:00 39960
## 3 09:30 34560
I had to change 2 things, but your vapply call was right.
In the function, I changed the last line so it returns a value instead of a list with one value:
calctime <- function(racePace, raceDistance){
# racePace is the per unit pace in mm:ss - character
# raceDistance is the total race distance - numeric
# Pace and race distance must use same units (km or mi or whatever)
# Seconds to character time function
CharMinSec <- function(sec){
outMin <- floor(sec/60)
outSec <- ((sec/60)-outMin)*60
if(outSec==0 | round(outSec)<10){
outChar <- paste0(outMin,":0",round(outSec))
} else {
outChar <- paste(outMin,round(outSec),sep=":")
}
outChar
}
paceMinSec <- as.numeric(strsplit(racePace,':')[[1]])
paceSec <- paceMinSec[1]*60+ paceMinSec[2]
raceMin <- floor(paceSec*raceDistance/60)
raceSec <- ((paceSec*raceDistance/60)-raceMin)*60
raceTime <- CharMinSec(raceMin*60+raceSec)
raceSec
}
Now that the function returns a value, the vapply() works, but in my case I had to force the time column to be character:
data = data.frame(
id = c(9,6,5763,4),
time = c("5:06","5:06","5:11","5:08"),
city = c("Kyle","Oklahoma","Monterey","Austin")
)
data$time = as.character(data$time)
data$output = vapply(data$time,calctime,raceDistance = 6,FUN.VALUE=1) #works fine
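For the mutate step mentioned in the question, the same vapply call can sit inside mutate; a sketch using the calctime above:
library(dplyr)
data <- mutate(data, output = vapply(time, calctime, raceDistance = 6, FUN.VALUE = 1))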
I am doing systematic calculations on a data frame I created. I have the code for the calculations, but I would like to:
1) Write it as a function and call it on the data frame I created.
2) Reset the calculations for the next ID in the data frame.
I would appreciate your help and advice on this.
The dataframe is created in R using the following code:
#Create a dataframe
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
The other thing is that the previous code assumes subject ID=1 for all time points. Suppose subject ID=2 starts when the WT (weight) changes to 120. How can I reset the calculations and automate them for all subject IDs in the data frame? In this case the original data frame would be like this:
#code:
rm(list=ls(all=TRUE))
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
df$ID[(df$WT>=120)==T] <- 2
df$TIME[df$ID==2] <- c(seq(0,20,1))
Thank you in advance!
In general, when doing calculations on different subjects' data, I like to split the data frame by ID, pass the list of individual subject data into a for loop, do all the calculations, build a list containing the newly calculated data, and then collapse the result into a data frame with all the numbers you want. This allows a lot of control over what you do for each subject.
subjects = split(df, df$ID)
forResults = vector("list", length=length(subjects))
# initialize these constants
C <- 2
V <- 10
k <- C/V
myFunc = function(subjects, resultsArray){
# use a loop index other than k here, since k is already the elimination rate constant C/V
for(s in seq_along(subjects)){
df = subjects[[s]]
df$A1 = 100 # I assume this should be 100 for t=0 for each subject?
# you could vectorize this nested for loop..
for(i in 2:nrow(df)) {
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
# you can add all sorts of other calculations you want to do on each subject's data
# when you're done doing calculations, put the resultant into
# the resultsArray and we'll rebuild the dataframe with all the new variables
resultsArray[[s]] = df
# if you're not using RStudio, then you want to use dev.new() to instantiate a new plot canvas
# dev.new() # dont need this if you're using RStudio (which doesnt allow multiple plots open)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
# collapse the results vector into a dataframe
resultsDF = do.call(rbind, resultsArray)
return(resultsDF)
}
results = myFunc(subjects, forResults)
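The same split/loop/rbind pattern can be written more compactly with lapply; a sketch, where one_subject is a hypothetical helper holding the per-subject calculations:
one_subject <- function(df) {
df$A1[1] <- df$AMT[1] # initialize from the first dose
for (i in 2:nrow(df)) {
t <- df$TIME[i] - df$TIME[i-1]
df$A1[i] <- df$AMT[i] + df$A1[i-1]*exp(-t*k) # k = C/V from above
}
df
}
results <- do.call(rbind, lapply(split(df, df$ID), one_subject))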
Do you want this:
ddf <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
myfn = function(df){
dosetimes <- c(0,6,12,18)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
myfn(ddf)
For multiple calls, looping over each subject:
N <- length(unique(ddf$ID)) # number of subject IDs
for(i in 1:N) {
myfn(ddf[ddf$ID==i,])
readline(prompt="Press <Enter> to continue...")
}