I found some previous questions on this topic, especially R: Grouped rolling window linear regression with rollapply and ddply and R: Rolling / moving avg by group, but neither provides an exact solution to the problem I am facing. I am trying to estimate CAPM betas over panel data using linear regression. I have different funds (three fund groups in the example below) for which I would like to calculate the betas separately, row by row. To put this more abstractly: I want to run a linear regression over a moving window, by group, to estimate the coefficient for every row based on the data in that window.
install.packages(c("zoo", "dplyr"))
library(zoo);library(dplyr)
# Create dataframe
fund <- as.numeric(c(1,1,1,1,1,1,1,1,3,3,3,3,3,3,2,2,2,2,2,2,2))
return<- as.numeric(c(1:21))
benchmark <- as.numeric(c(1,13,14,20,14,32,4,1,5,7,1,0,7,1,-2,1,6,-7,9,10,9))
riskfree<-as.numeric(c(1,5,1,2,1,6,4,7,5,-5,10,0,3,1,2,1,6,7,8,9,10))
date <- as.Date(c("2010-07-30","2010-08-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30",
"2011-02-28","2010-07-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30",
"2010-07-30","2010-08-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30"))
funddata<-data.frame(date,fund,return,benchmark,riskfree)
# Creating variables of interest
funddata["ret_riskfree"]<-as.numeric(funddata$return-funddata$riskfree)
funddata["benchmark_riskfree"]<-as.numeric(funddata$benchmark-funddata$riskfree)
I want to run a rolling regression over two columns, df[6:7] (ret_riskfree and benchmark_riskfree), for every group indicated by the column "fund". The calculation should be done separately per group, so with a window of three the first two rows of the beta column for every fund group will always be NA. In the end I want a full dataframe with all fund groups and all beta values combined.
I managed to come up with new code that works but is pretty messy, and it requires the data to be ordered by fund and date before executing. I would welcome any suggestions on how to improve it.
funddata <- funddata[order(funddata$fund, funddata$date),]
beta_func <- function(x, benchmark_riskfree, ret_riskfree) {
  a <- coef(lm(as.formula(paste(ret_riskfree, "~", benchmark_riskfree, "-1")),
               data = x))
  return(a)
}
beta_list <- list()
for (i in c(1:3)) {
  beta_list[[paste(i, sep = "_")]] <- rollapplyr(
    funddata[funddata$fund == i, 6:7], width = 3,
    FUN = function(x) beta_func(as.data.frame(x), "benchmark_riskfree", "ret_riskfree"),
    by.column = FALSE, fill = NA)
}
beta_list <- unlist(beta_list, recursive = FALSE)
funddata$beta <- beta_list
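One way to tidy this loop without hard-coding the fund ids (a sketch, assuming the data stay ordered by fund and date as above) is to let split() handle the grouping:
# split() returns the funds in level order, which matches the row order
# produced by ordering on fund and date
beta_by_fund <- lapply(split(funddata[6:7], funddata$fund), function(d)
  rollapplyr(d, width = 3, by.column = FALSE, fill = NA,
             FUN = function(x) beta_func(as.data.frame(x),
                                         "benchmark_riskfree", "ret_riskfree")))
funddata$beta <- unlist(beta_by_fund, use.names = FALSE)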
As I mentioned in the comment above, this solution might be a bit off, since I'm not able to reproduce your desired output 100%. Still, the functionality of what you're trying to accomplish is there. Have a look at it and let me know if this is something you could use or that I could develop further.
EDIT: The code below does not reproduce the desired output as specified above, but turned out to be what the OP was looking for after all.
Here goes:
# Datasource
fund <- as.numeric(c(1,1,1,1,1,1,1,1,3,3,3,3,3,3,2,2,2,2,2,2,2))
return<- as.numeric(c(1:21))
benchmark <- as.numeric(c(1,13,14,20,14,32,4,1,5,7,1,0,7,1,-2,1,6,-7,9,10,9))
riskfree<-as.numeric(c(1,5,1,2,1,6,4,7,5,-5,10,0,3,1,2,1,6,7,8,9,10))
date <- as.Date(c("2010-07-30","2010-08-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30",
"2011-02-28","2010-07-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30",
"2010-07-30","2010-08-31","2010-09-30","2010-10-31","2010-11-30","2010-12-31","2011-01-30"))
funddata<-data.frame(date,fund,return,benchmark,riskfree)
# Creating variables of interest
funddata["ret_riskfree"]<-as.numeric(funddata$return-funddata$riskfree)
funddata["benchmark_riskfree"]<-as.numeric(funddata$benchmark-funddata$riskfree)
# Target check #################################################################
# Subset last three rows in original dataframe
df_check <- funddata[funddata$fund == 1,]
df_check <- tail(df_check,3)
# Run regression check
mod_check <- lm(df_check$ret_riskfree~df_check$benchmark_riskfree)
coef(mod_check)
# My suggestion ################################################################
# The following function takes five arguments:
# 1. a dataframe, myDf
# 2. the name of the column to subset myDf on, subCol
# 3./4. the names of the response and predictor columns, varY and varX
# 5. a window length for the sliding window, myWin
fun_rollreg <- function(myDf, subCol, varY, varX, myWin){
  df_main <- myDf
  # Make an empty data frame to store results in
  df_data <- data.frame()
  # Identify unique funds
  unFunds <- unique(unlist(df_main[subCol]))
  # Loop through your subset
  for (fundx in unFunds){
    # Subset on the grouping column
    df <- df_main
    df <- df[df[[subCol]] == fundx, ]
    # Keep a copy of the original until later
    df_new <- df
    # Specify a container for your beta estimates
    betas <- c()
    # Specify window length
    wlength <- myWin
    # Retrieve some data dimensions to loop on
    rows <- dim(df)[1]
    periods <- rows - wlength
    # Loop through each subset of the data
    # and run the regression
    for (i in rows:(rows - periods)){
      # Split dataframe in subsets
      # according to the window length
      df1 <- df[(i - (wlength - 1)):i, ]
      # Run regression
      beta <- coef(lm(df1[[varY]] ~ df1[[varX]]))[2]
      # Keep regression results
      betas[[i]] <- beta
    }
    # Add regression data to dataframe
    df_new <- data.frame(df, betas)
    # Keep the new dataset for later concatenation
    df_data <- rbind(df_data, df_new)
  }
  return(df_data)
}
# Run the function:
df_roll <- fun_rollreg(myDf = funddata, subCol = 'fund',
                       varY = 'ret_riskfree', varX = 'benchmark_riskfree',
                       myWin = 3)
# Show the results
print(head(df_roll,8))
For the first 8 rows in the new dataframe (fund = 1), this is the result:
date fund return benchmark riskfree ret_riskfree benchmark_riskfree betas
1 2010-07-30 1 1 1 1 0 0 NA
2 2010-08-31 1 2 13 5 -3 8 NA
3 2010-09-30 1 3 14 1 2 13 0.10465116
4 2010-10-31 1 4 20 2 2 18 0.50000000
5 2010-11-30 1 5 14 1 4 13 -0.20000000
6 2010-12-31 1 6 32 6 0 26 -0.30232558
7 2011-01-30 1 7 4 4 3 0 -0.11538462
8 2011-02-28 1 8 1 7 1 -6 -0.05645161
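As a footnote, the same per-group rolling regression can be written more compactly with dplyr and zoo (a sketch fitting the question's own no-intercept model, not the intercept model used in the answer above):
library(dplyr); library(zoo)
# Roll a 3-row window of row indices within each group and fit the model
res <- funddata %>%
  arrange(fund, date) %>%
  group_by(fund) %>%
  mutate(beta = rollapplyr(
    seq_len(n()), width = 3, fill = NA,
    FUN = function(ix) coef(lm(ret_riskfree[ix] ~ benchmark_riskfree[ix] - 1))[1]
  )) %>%
  ungroup()
head(res, 8)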
Problem: I have several (10+) custom functions, each defining a step in the workflow. I want to apply these steps as one composed function to a large data frame, iterating over n (50+) periods. My current function achieves the result, but it is too slow and not very elegant.
Example Input
id x_1975 z_1975
1 1 1 NA
2 2 2 NA
3 3 3 NA
4 4 4 NA
5 5 5 NA
Step 1:
Compare initial x values (x_1975) against a cutoff=3. If x is greater than 3, then the z value should be "Y".
Step 2:
If z value is "Y", then x value in next year should be x times 2. Otherwise, it should be x times 5. Although the z values can be skipped altogether, I need the categorical column to create summary stats.
Note:
The data set I am working with has 20 variables that need to be calculated based on similar logic.
Desired Output
id x_1975 z_1975 x_1976 z_1976 x_1977 z_1977 x_1978
1 1 1 <NA> 5 Y 10 Y 20
2 2 2 <NA> 10 Y 20 Y 40
3 3 3 <NA> 15 Y 30 Y 60
4 4 4 Y 8 Y 16 Y 32
5 5 5 Y 10 Y 20 Y 40
6 6 6 Y 12 Y 24 Y 48
What I have tried:
Tried setting the data in long format. But found it complicated to iterate over rows.
Pre-allocated all columns with appropriate class. That reduced run time a little although not enough.
I have been trying to use purrr::compose to nest all the functions, but I have not been able to make it work.
Reproducible Example
library(dplyr)
library(purrr)
# Create Data Frame
n <- 6
dat <- data.frame(id=1:n,
x_1975=seq(1,6,1),
z_1975=NA)
cut_off <- 3
# Functions
# Set a value for "z_" variables in period t by comparing the "x_" value in period t against the cut_off value.
func_1 <- function(dat, yr){
  # pre-define variables
  z <- paste0("z_", yr)
  x <- paste0("x_", yr)
  # Calculate values for "z_" in period t
  dat <- dat %>% mutate(!!sym(z) := case_when(!!sym(x) > cut_off ~ "Y",
                                              TRUE ~ as.character(NA)))
  dat
}
# Calculate the value for "x_" variables in period t+1 based on "z_" variables in period t.
func_2 <- function(dat, yr){
  # pre-define variables
  x <- paste0("x_", yr + 1)
  x_lag <- paste0("x_", yr)
  z <- paste0("z_", yr)
  # Calculate "x_" value for t+1
  dat <- dat %>% mutate(!!sym(x) := case_when(!!sym(z) == "Y" ~ !!sym(x_lag) * 2,
                                              TRUE ~ !!sym(x_lag) * 5))
  dat
}
# Join function 1 and function 2 together. The joined function iterates over `dat` from the beginning year to the ending year.
joined_func <- function(dat, beginning, ending){
  for (year in seq(beginning, ending, 1)){
    dat <- func_1(dat, year)
    # Output of step 1 is used as input for step 2
    dat <- func_2(dat, year)
  }
  return(dat)
}
# Run the code from 1975 to 1977 here (1975 to 2025 with the real data). data_output has the desired output, but I need to reduce the runtime.
data_output <- joined_func(dat,1975,1977)
# Tried to use the compose function from purrr, but I am getting an error.
my_funs <- c(func_1, func_2)
f1 <- invoke(compose, my_funs)
joined_func_2 <- function(dat, beginning, ending){
  for (year in seq(beginning, ending, 1)){
    dat <- f1(dat, year = year)
  }
}
data_output_2 <- joined_func_2(dat, 1975, 1977)
# Error message:
# Error in f1(dat, year = year) : unused argument (year = year).
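For context, purrr::compose(func_1, func_2) builds func_1(func_2(...)), and the extra arguments are forwarded only to the innermost call; since func_2's parameter is named yr rather than year, that call fails with the error above. A minimal sketch that sidesteps compose entirely, threading dat through the years with purrr::reduce() (assuming func_1 and func_2 as defined above):
library(purrr)
# Fold over the years: each step applies both functions to the
# accumulated data frame, starting from `dat`
data_output_3 <- reduce(
  1975:1977,
  function(d, yr) func_2(func_1(d, yr), yr),
  .init = dat
)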
Questions
a) How do I make purrr::compose work? b) Is there any other way to achieve efficiency?
I would really appreciate it if someone could help me with this!
Let's say I have data such as this:
library(dplyr)
dat <- mtcars %>% mutate(cyl2 = cyl * 2, cyl3 = cyl * 3)
I want to generate frequency tables such as this:
table(dat$cyl, dat$vs)
table(dat$cyl2, dat$vs)
table(dat$cyl3, dat$vs)
table(dat$cyl, dat$am)
table(dat$cyl2, dat$am)
table(dat$cyl3, dat$am)
Is there a way to automate the generation of these frequency tables so that I don't have to write out every call as in the example above? First, the actual data I have is a lot bigger, and second, the output isn't very easy to digest.
Now, if I wanted only the frequency of each variable, I could do something like this:
mylist <- mapply(table, dat); mylist
There must be a way to use an apply-style function for a two-way frequency distribution? Thank you for your insight.
Here is one way to put all of the tables you want into a list:
# All row/column variable combinations (as.matrix turns the factors into characters)
vars <- as.matrix(expand.grid(c("cyl", "cyl2", "cyl3"), c("vs", "am")))
# One two-way table per combination
tables <- lapply(seq(nrow(vars)), function(x) table(dat[, vars[x, ]]))
# Name each table after its variable pair, e.g. "cyl3_vs"
lbls <- apply(vars, 1, paste, collapse="_")
names(tables) <- lbls
You can access a table if you know the number (row number in vars) or the combination of factors, e.g.
tables[[3]]
# vs
# cyl3 0 1
# 12 1 10
# 18 3 4
# 24 14 0
tables[["cyl3_vs"]]
# vs
# cyl3 0 1
# 12 1 10
# 18 3 4
# 24 14 0
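For what it's worth, a roughly equivalent sketch builds the same named list with Map() instead of indexing into vars by row (this assumes dat, vars and lbls from above):
# Pair up the row and column variable names directly; dnn labels the dimensions
tables2 <- Map(function(v1, v2) table(dat[[v1]], dat[[v2]], dnn = c(v1, v2)),
               vars[, 1], vars[, 2])
names(tables2) <- lbls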
I have data that looks similar to the following
Site Unknown_Parameter X Y Z Predicted Actual
A 2 3 4 2 5 6
A 2 4 3 2 7 5
B 3 6 8 9 12 9
B 3 4 6 2 10 10
etc...
I am trying to create a function that minimizes the RMSE of each site by determining the optimal value for the unknown parameter. I can do this for a single site at a time using the following pseudocode
fn <- function(unknown_parameter) {
  df$Predicted <- calculations with unknown_parameter and X Y Z
  RMSE <- sqrt(mean((df$Predicted - df$Actual)^2))
  RMSE
}
optimize(fn, c(1, 10))
I am able to obtain the optimal value for the unknown parameter as well as the RMSE for a single site, but I would like to scale this to do it for every site since I have 100s. Ideally, I would want my output to look like the following
Site Optimal_Value RMSE
A 1.7 2.45
B 1.2 3.24
C 1.3 9.21
etc...
I have been trying to use the split command, but this transforms my data into a list, and I'm not really sure how to work with it. Any thoughts?
While split produces a list of dataframes subset by the input factor's values, consider by, which also subsets the dataframe by one or more factor(s) but can additionally pass each subset into a function. To bind all the resulting dataframes back together, run do.call(rbind, ...) on the returned list.
# USER-DEFINED METHOD RECEIVING subsetted df AS INPUT AND RETURNING dataframe AS OUTPUT
subset_process <- function(subdf) {
  fn <- function(unknown_parameter) {
    subdf$Predicted <- calculations with unknown_parameter and X Y Z
    RMSE <- sqrt(mean((subdf$Predicted - subdf$Actual)^2))
    return(RMSE)
  }
  opt <- optimize(fn, c(1, 10))
  tmp <- data.frame(Site = subdf$Site[[1]],
                    Optimal_Value = opt$minimum,
                    RMSE = opt$objective)
  return(tmp)
}
# SPLIT + RUN METHOD ON EACH SUBSET
df_list <- by(df, df$Site, FUN=subset_process)
# APPEND ALL DF ELEMENTS INTO MASTER DF
final_df <- do.call(rbind, df_list)
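To make the pattern concrete, here is a self-contained toy sketch; the one-parameter model Predicted = k * X and the data below are invented purely for illustration:
set.seed(42)
toy <- data.frame(Site = rep(c("A", "B"), each = 5),
                  X = runif(10, 1, 10),
                  Actual = runif(10, 5, 50))
toy_process <- function(subdf) {
  # RMSE of the hypothetical model k * X for this site's rows
  fn <- function(k) sqrt(mean((k * subdf$X - subdf$Actual)^2))
  opt <- optimize(fn, c(1, 10))
  data.frame(Site = subdf$Site[[1]],
             Optimal_Value = opt$minimum,
             RMSE = opt$objective)
}
do.call(rbind, by(toy, toy$Site, toy_process))
Here optimize() returns a list whose minimum element is the best parameter value and whose objective element is the RMSE at that value, which is why the two fields are pulled out separately.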
I'm dealing with a dataset of students' ratings of teachers. Some students rated the same teacher more than once.
What I would like to do with the data is to subset it with the following criteria:
1) Keep any unique student Ids and ratings
2) In cases where students rated a teacher twice keep only 1 rating, but to select which rating to keep randomly.
3) If possible I'd like to be able to run the code in a munging script at the top of every analysis file and ensure that the dataset created is exactly the same for each analysis (set seed?).
# data
student.id <- c(1,1,2,3,3,4,5,6,7,7,7,8,9)
teacher.id <- c(1,1,1,1,1,2,2,2,2,2,2,2,2)
rating <- c(100,99,89,100,99,87,24,52,100,99,89,79,12)
df <- data.frame(student.id,teacher.id,rating)
Thanks for any guidance for how to move forward.
Assuming that each student.id is only applied to one teacher, you could use the following method.
# get a list containing data.frames for each student
myList <- split(df, df$student.id)
# take a sample of each data.frame if more than one observation or the single observation
# bind the result together into a data.frame
set.seed(1234)
do.call(rbind, lapply(myList, function(x) if(nrow(x) > 1) x[sample(nrow(x), 1), ] else x))
This returns
student.id teacher.id rating
1 1 1 100
2 2 1 89
3 3 1 99
4 4 2 87
5 5 2 24
6 6 2 52
7 7 2 99
8 8 2 79
9 9 2 12
If the same student.id rates multiple teachers, then this method requires the construction of a new variable with the interaction function:
# create new interaction variable
df$stud.teach <- interaction(df$student.id, df$teacher.id)
myList <- split(df, df$stud.teach)
then the remainder of the code is identical to that above.
A potentially faster method is to use the data.table library and rbindlist.
library(data.table)
# convert into a data.table
setDT(df)
myList <- split(df, df$stud.teach)
# put together data.frame with rbindlist
rbindlist(lapply(myList, function(x) if(nrow(x) > 1) x[sample(nrow(x), 1), ] else x))
This can now be done much faster using data.table. Your question is equivalent to sampling rows from within groups, see
Sample random rows within each group in a data.table
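For reference, that grouped-sampling idiom is a one-liner in data.table (a sketch, assuming the df from the question):
library(data.table)
set.seed(1234)
setDT(df)
# One random row per (student, teacher) pair; sample(.N, 1) also
# handles single-row groups correctly
df[, .SD[sample(.N, 1)], by = .(student.id, teacher.id)]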
I am trying to use a huge dataframe (180000 x 400) to calculate another one that would be much smaller.
I have the following dataframe
df1=data.frame(LOCAT=c(1,2,3,4,5,6),START=c(120,345,765,1045,1347,1879),END=c(150,390,802,1120,1436,1935),CODE1=c(1,1,0,1,0,0),CODE2=c(1,0,0,0,-1,-1))
df1
LOCAT START END CODE1 CODE2
1 1 120 150 1 1
2 2 345 390 1 0
3 3 765 802 0 0
4 4 1045 1120 1 0
5 5 1347 1436 0 -1
6 6 1879 1935 0 -1
This is a sample dataframe. The rows continue until 180000 and the columns are over 400.
What I need to do is create a new dataframe, based on each CODE column, that tells me the extent of each contiguous run of "1" or "-1" and returns its location, size, and value.
Something like this for CODE1:
LOCAT SIZE VALUE
1 1 to 2 270 POS
2 4 to 4 75 POS
And like this for CODE2:
LOCAT SIZE VALUE
1 1 to 1 30 POS
2 5 to 6 588 NEG
Unfortunately I still haven't figured out how to do this. I have been trying several lines of code to develop a function that does it automatically, but I keep getting lost or stuck in loops, and nothing seems to work.
Any help would be appreciated.
Thanks in advance
Below is code that gives you the answer in the exact format that you wanted, except I split your "LOCAT" column into two columns entitled "Starts" and "Stops". This code will work for your entire data frame, no need to replicate it manually for each CODE (CODE1, CODE2, etc).
It assumes that the only non-CODE columns are named "LOCAT", "START" and "END".
# need package "plyr"
library("plyr")
# test2 is the example data frame that you gave in the question
test2 <- data.frame(
"LOCAT"=1:6,
"START"=c(120,345,765, 1045, 1347, 1879),
"END"=c(150,390,803,1120,1436, 1935),
"CODE1"=c(1,1,0,1,0,0),
"CODE2"=c(1,0,0,0,-1,-1)
)
codeNames <- names(test2)[!names(test2)%in%c("LOCAT","START","END")] # the names of columns that correspond to different codes
test3 <- reshape(test2, varying=codeNames, direction="long", v.names="CodeValue", timevar="Code") # reshape so the different codes are variables grouped into the same column
test4 <- test3[,!names(test3)%in%"id"] #remove the "id" column
sss <- function(x){ # sss gives the starting points, stopping points, and sizes (sss) in a data frame
  rleX <- rle(x[, "CodeValue"]) # rle() to get the size of consecutive values
  stops <- cumsum(rleX$lengths) # cumulative sum to get the end-points for the indices (the second value in your LOCAT column)
  starts <- c(1, head(stops, -1) + 1) # the starts are the first value in your LOCAT column
  ssX0 <- data.frame("Value" = rleX$values, "Starts" = starts, "Stops" = stops) # the starts and stops from X (ss from X)
  ssX <- ssX0[ssX0[, "Value"] != 0, ] # remove the rows that correspond to CODE_ values that are 0 (not POS or NEG)
  # The next 3 lines calculate the equivalent of your SIZE column
  sizeX1 <- x[ssX[, "Starts"], "START"]
  sizeX2 <- x[ssX[, "Stops"], "END"]
  sizeX <- sizeX2 - sizeX1
  sssX <- data.frame(ssX, "Size" = sizeX) # Combine the Size with the ssX (start stop of X) data frame
  return(sssX) # Added in EDIT
}
answer0 <- ddply(.data=test4, .variables="Code", .fun=sss) # use ddply() from the package "plyr" to apply the function to each Code (this is why we reshaped)
answer <- answer0 # duplicate the original, new version will be reformatted
answer[,"Value"] <- c("NEG",NA,"POS")[answer0[,"Value"]+2] # reformat slightly so that we have POS/NEG instead of 1/-1
Hopefully this helps, good luck!
Use run-length encoding to determine groups where CODE1 takes the same value.
rle_of_CODE1 <- rle(df1$CODE1)
For convenience, find the points where the value is non-zero, and the lengths of the corresponding blocks.
CODE1_is_nonzero <- rle_of_CODE1$values != 0
n <- rle_of_CODE1$lengths[CODE1_is_nonzero]
Ignore the parts of df1 where CODE1 is zero.
df1_with_nonzero_CODE1 <- subset(df1, CODE1 != 0)
Define a group based on the contiguous blocks we found with rle.
df1_with_nonzero_CODE1$GROUP <- rep(seq_along(n), times = n)
Use ddply to get summary stats for each group.
library(plyr)
summarised_by_CODE1 <- ddply(
  df1_with_nonzero_CODE1,
  .(GROUP),
  summarise,
  MinOfLOCAT = min(LOCAT),
  MaxOfLOCAT = max(LOCAT),
  SIZE = max(END) - min(START)
)
summarised_by_CODE1$VALUE <- ifelse(
rle_of_CODE1$values[CODE1_is_nonzero] == 1,
"POS",
"NEG"
)
summarised_by_CODE1
## GROUP MinOfLOCAT MaxOfLOCAT SIZE VALUE
## 1 1 1 2 270 POS
## 2 3 4 4 75 POS
Now repeat with CODE2.
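To avoid repeating those steps by hand for every CODE column, here is a sketch that wraps them in a function and applies it to each column in turn (it assumes df1 and plyr as above; the helper name summarise_code is mine):
summarise_code <- function(code_col) {
  # Runs of equal values in this CODE column
  r <- rle(df1[[code_col]])
  nonzero <- r$values != 0
  n <- r$lengths[nonzero]
  # Rows belonging to non-zero runs, tagged with a group id per run
  d <- df1[df1[[code_col]] != 0, ]
  d$GROUP <- rep(seq_along(n), times = n)
  out <- ddply(d, .(GROUP), plyr::summarise,
               MinOfLOCAT = min(LOCAT),
               MaxOfLOCAT = max(LOCAT),
               SIZE = max(END) - min(START))
  out$VALUE <- ifelse(r$values[nonzero] == 1, "POS", "NEG")
  out
}
codes <- c("CODE1", "CODE2")
results <- lapply(setNames(codes, codes), summarise_code)
results$CODE2 then holds the corresponding summary for CODE2.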