Regress portfolio excess return - r

My data :
dput(head(mydata))
structure(list(DATE = structure(c(-315619200, -312940800, -310435200,
-307756800, -305164800, -302486400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), RF = c(0.33, 0.29, 0.35, 0.19, 0.27, 0.24), RMRF = c(-6.99,
0.99, -1.46, -1.7, 3.08, 2.09), SMB = c(2.13, 0.71, -0.65, 0.32,
1.42, -0.24), UMD = c(-3.28, 3.59, 1.85, 2.6, 4.77, 1.03), HML = c(2.65,
-2.15, -2.69, -2.22, -3.83, -0.3), JANDUM = c(1, 0, 0, 0, 0,
0), R4 = c(-4.57, 1.5, -2.83, -1.98, 3.54, 2.15)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
So , my data contain:
R4 is the percentage return of a portfolio, RF is the return of a good
without risk (risk free rate), RMRF is the excess return of portfolio
Market Portfolio, SMB, UMD, and HML are 3 factors, and
JANDUM is a dummy variable for January (January Dummy).
The data is on a monthly frequency from 1/1960 to 12/2003 (there are 528 observations totaly).
Thats im trying to build and i am struggling is to Regress portfolio excess return (R4-RF) to
one constant and all other variables (RMRF, SMB, UMD, HML, and
JANDUM).
How can i achieve that ?

Perhaps this will get you started?
mydata$PER <-mydata$R4 - mydata$RF
mydata$JANDUM <- as.factor(mydata$JANDUM)
model <- lm(PER ~ DATE + RMRF + SMB + UMD + HML + JANDUM, data = mydata)
summary(model)

Related

Find the 3 nearest neighbours (dist()?) and calculate mean in new column

This is a sample of the data
structure(list(Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2018/2019",
"2019/2020"), class = "factor"), Date2 = structure(c(17860, 17888,
17916, 17940, 17945, 17952, 17953, 17954, 17978, 17999, 18005,
18188, 18209, 18223, 18237, 18320, 18322, 18334, 18447, 18476
), class = "Date"), HT.av.points = c(0.57, 1.5, 1.67, 1.8, 1.09,
2.18, 1.42, 1.45, 1.79, 1.35, 1.14, 1.83, 2, 1.17, 1.88, 1.83,
1.33, 0.92, 1.31, 1.06), AT.av.points = c(1.14, 2.33, 0.56, 1.2,
1.09, 1.6, 1.08, 1.9, 1.17, 0.9, 1.38, 0.67, 2.14, 1.33, 0.62,
1.08, 2.17, 1.38, 0.56, 0.94), HT_av.PointsTotal = c(0.86, 1.16,
1.18, 1.23, 0.86, 1.86, 1.2, 1.18, 1.5, 1.1, 1.07, 1.46, 1.6,
1.08, 1.75, 1.4, 1.16, 0.92, 1.03, 0.97), AT_av.PointsTotal = c(2.07,
2.21, 0.76, 1.42, 1.59, 1.5, 1.2, 1.91, 1.65, 1.43, 1.38, 0.54,
1.87, 1.58, 0.8, 1.6, 2.32, 1.42, 1.12, 1.32), DIFF.AV.POINTS.PREDICTION = c(-0.28,
-0.43, 0.51, 0.52, -0.36, 0.56, 0.28, -0.38, -0.2, 0.03, -0.43,
1.24, -0.32, -0.29, 1.44, 0.28, -0.85, -0.38, 1.01, 0.22), Over2.5G = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1)), row.names = c(NA,
-20L), .internal.selfref = <pointer: 0x1ca2448>, class = c("data.table",
"data.frame"), .Names = c("Season", "Date2", "HT.av.points",
"AT.av.points", "HT_av.PointsTotal", "AT_av.PointsTotal", "DIFF.AV.POINTS.PREDICTION",
"Over2.5G"))
What I want to do:
group by Season
After the group by, I want to find the 3 previous rows that are most similar (according to the following columns) to the current row.
(HT.av.points, AT.av.points, HT_av.PointsTotal, AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
I guess the dist() function is a possibility.
Finally I want to create a new column with the mean of the values of the Over2.5G column of those 3 most similar rows.
New column:
First 3 rows(of the Season) NAs.
In fourth row(of the Season) the 3 nearest neighbours (and their Over2.5G values) will always be the first 3 rows.
breaking below code up:
a helper function which returns row indices of nearest neighbours with a ready-made function, e.g. get.knn of package FNN
calling this function for increasingly large slices (from row one to current) of the input data df and storing the result as an extra column
extracting the row indices as integers from the result string to index the desired column of the input data for the aggregation (mean, in your case)
here we go:
## helper function returns row indices of nearest 3 neighbours
## as comma-separated string
find_nearest_predecessors <- function(df, ...){
ifelse(nrow(df) < 4, ## can't calculate n neighbours for <n rows:
paste(1:3, collapse = ','),
## otherwise = if sufficient rows,
## get row indices of 3 nearest neighbours:
get.knn(data = df,
k = 3,
algo = 'CR'
) %>%
.[['nn.index']] %>%
tail(1) %>% paste(collapse = ',')
)
}
## df being your input data:
df %>%
mutate(rownum = row_number()) %>%
rowwise %>%
mutate(nearest_neighbours = find_nearest_predecessors(
df = ## use previous data up to current row:
slice(df, 1:rownum) %>%
## choose features/dimensions of distance:
select(HT.av.points, AT.av.points, HT_av.PointsTotal,
AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
),
## calculate mean of OVER2.5G
mean_Over2.5G = mean(df$Over2.5G[
strsplit(nearest_neighbours,',') %>%
unlist %>% as.integer
], na.rm = TRUE)
)

Create subset of data using conditions from another data frame

I would like to use something like dplyr to create a subset of data from one data frame using conditions from another data frame. So in one data frame I have a set of data with minimum and maximum years and other sea-level data lsp , and in another frame I have a time series of ocean dynamics. For each row in the lsp dataframe, I would like to extract every year between the minimum and maximum ages in the dynamics data frame and create a sub set of data. I think this will require a for loop. Does anyone have any idea if this is possible?
Desired output using row 1 of LSP as an example:
Row 1 LSP (simplified) is:
Age min
Age max
1997
2007
I want to use this information to create a data frame like this from the dynamics file:
Subset
Year
Dynamics
1997
125
1998
109
1999
152
2000
161
2001
106
2002
120
2003
58
2004
68
2005
110
2006
144
2007
100
Many thanks
## LSP data
structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
## Dynamics (only head)
structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Here is a base R option with Map and subset -
Map(function(x, y) subset(dynamics, Year >= x & Year <= y),
LSP$Age.min, LSP$Age.max)
The same logic can be implemented using tidyverse functions as well.
library(dplyr)
library(purrr)
map2(LSP$Age.min, LSP$Age.max, ~dynamics %>% filter(Year >= .x & Year <= .y))
As long as your dataset isn't huge, I would take something like the following approach.
Add the (nested) dynamics dataset to each row of your lsp dataset
Unnest the dynamics dataset to get one row per year
Filter out years that aren't relevant
(Optional)
Renest the dynamics columns to you have one row per lsp record with a tibble for all relevant years from the dynamics set.
lsp %>%
add_column(dynamics %>% nest(data = everything())) %>%
unnest(data) %>%
filter(year >= min & year <= max) %>%
nest(filtered = c(year, value))
I guess this does what you want to do. First assign names to your input data, so later you know what my codes mean.
lsp <- structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
dynamics <- structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Then the actual codes to get the subset.
# first get info of years from the "lsp" dataset
# following your example in your comments
year_min <- list()
year_max <- list()
all_years <- list()
for(i in 1:nrow(lsp)){
year_min[[i]] <- round(lsp$Age.min[[i]])
year_max[[i]] <- round(lsp$Age.max[[i]])
all_years[[i]] <- c(year_min[[i]]:year_max[[i]])
all_years[[i]] <- as.data.frame(all_years[[i]])
colnames(all_years[[i]]) <- "Year"
}
# now join the info on "Year" from "lsp" data with "dynamics" data to get the subset
library(dplyr)
subset_output <- list()
for (i in 1:length(all_years)){
subset_output[[i]] <- left_join(dynamics,all_years[[i]])
}

Combining pheatmaps in R

I've been working around with no success in solving how 2 or more pheatmaps (heatmaps) can be combined in a final plot.
data1 <- structure(list(DC1 = c(NA, NA, 1.98), DC2 = c(NA, NA, 0.14),
DC3 = c(1.85, 1.51, 0.52), DC4 = c(0.89, 0.7, 1.47), DC5 = c(0,
0.78, 0), DC6 = c(0, 1.3, 0), DC7 = c(0, 1.47, 0), DC8 = c(0,
1.2, 0), DC9 = c(0, 0, 0), DC10 = c(0.51, 1.9, 0)), .Names = c("DC1",
"DC2", "DC3", "DC4", "DC5", "DC6", "DC7", "DC8", "DC9", "DC10"),
enter code here`class = "data.frame", row.names = c("A", "B", "C"))
data 2 <- structure(list(DC1 = c(9.56, 1.87, 2.07, 1.87, 2.07, 1.35), DC2 = c(5.51, 1.13, 1.25, 1.13, 0.99, 0.45), DC3 = c(4.84, 1.17, 0.66, 1.17,
0.34, 0.16), DC4 = c(4.18, 0.59, 0.05, 0.97, 0.43, 0.59), DC5 = c(3.26,
0, 0.14, 0.31, 0.79, 0.63), DC6 = c(3.35, 0, 1.12, 0.05, 1.12,
0), DC7 = c(4.18, 0.63, 1.27, 0.47, 1.27, 0), DC8 = c(4.37, 1.17,
1.3, 1.17, 0, 0), DC9 = c(4.3, 1.13, 0, 1.13, 0, 0), DC10 = c(7.47,
1.88, 0.71, 1.88, 0, 0)), .Names = c("DC1", "DC2", "DC3", "DC4",
"DC5", "DC6", "DC7", "DC8", "DC9", "DC10"), class = "data.frame", row.names = c("TD6 vs SH",
"TD6 vs SAP", "TD6 vs NEA", "SH vs SAP", "SH vs NEA", "SAP vs NEA"
))
I construct very easily a heatmap using pheatmap by using these two codes:
hm_data1 <- pheatmap(as.matrix(data1))
hm_data2 <- pheatmap(as.matrix(data2))
However, in no way I can get both printed in one figure. I would like to see both of them horizontally. However, my real figure will be composed by 16 pheatmaps, so they must be arrange in 4 columns and 4 rows.
I tried with par mfrow with no success.
How can I combine pheatmaps?
I know there are plenty of R packages that can plot heatmaps, but I would like to do it with pheatmap
This will work.
library(gridExtra); library(pheatmap)
m <- matrix(c(1:4), ncol=2)
n <- matrix(c(1,1,1,2), ncol=2)
a <- list(pheatmap(m)[[4]])
a[[2]] <- pheatmap(n)[[4]]
z <- do.call(grid.arrange,a)
plot(z)
Based on one of the comments. If you have many single plots; you can use a loop like this.
mn <- list(m, n)
a <- list()
for(i in 1:length(mn)){
a[i] <- list(pheatmap(mn[[i]])[[4]])
}
z <- do.call(grid.arrange,a)
plot(z)
The point is it to add all the data for your single plots in a list. You can then loop over the list, applying pheatmap.

for loop to find threshold values between different data frames

I have 2 data frame with some matching columns (pollutants).
The first data frame contains the observations while the second one contains different thresholds for some pollutants.
Here a small subset of both data frames:
dput(df1)
structure(list(sample = structure(27:76, .Label = c("A_1", "A_2",
"A_LS", "A_PC", "A_PM", "B_1", "B1_1", "B1_2", "B1-8_PC", "B1-8_PM",
"B1_LS", "B1_PC", "B1_PM", "B_2", "B2_1", "B2_2", "B2-8_PC",
"B2-8_PM", "B2_LS", "B2_PC", "B2_PM", "B_LS", "B_PC", "B_PM",
"C_1", "C_2", "C386", "C387", "C388", "C389", "C390", "C391",
"C392", "C393", "C394", "C395", "C396", "C397", "C398", "C399",
"C400", "C401", "C402", "C403", "C404", "C405", "C406", "C407",
"C408", "C409", "C410", "C411", "C412", "C413", "C414", "C415",
"C416", "C417", "C418", "C419", "C420", "C421", "C422", "C423",
"C424", "C425", "C426", "C427", "C428", "C429", "C430", "C431",
"C432", "C433", "C434", "C435", "C436", "C437", "C438", "C439",
"C440", "C441", "C442", "C443", "C444", "C445", "C446", "C447",
"C448", "C449", "C450", "C451", "C452", "C453", "C454", "C455",
"C456", "C457", "C458", "C459", "C460", "C461", "C462", "C463",
"C464", "C465", "C466", "C467", "C468", "C469", "C470", "C471",
"C472", "C473", "C474", "C475", "C476", "C477", "C478", "C479",
"C480", "C481", "C482", "C483", "C484", "C485", "C486", "C487",
"C488", "C489", "C490", "C491", "C492", "C493", "C494", "C495",
"C496", "C497", "C498", "C499", "C500", "C501", "C502", "C503",
"C504", "C505", "C506", "C507", "C508", "C509", "C510", "C511",
"C512", "C513", "C514", "C515", "C516", "C517", "C518", "C519",
"C520", "C521", "C522", "C523", "C524", "C-8_PC", "C-8_PM", "D_1",
"D_2", "E_1", "E_2", "F_1", "F_2"), class = "factor"), As = c(9,
8.75, 13.5, 7.75, 7.6, 8.33, 8, 8.75, 7.4, 8.25, 8.17, 7.75,
7.6, 7.5, 7.2, 8, 7.83, 7.75, 7, 7.5, 8.17, 8.75, 6.67, 7, 5.83,
6.75, 5.6, 6.4, 6.2, 6.2, 6.2, 6.25, 7, 6, 6, 6.4, 6, 5.8, 5.6,
6, 5.8, 7.25, 8.8, 8.5, 8, 8.25, 8.25, 8.5, 8.25, 8.25), Al = c(30245,
38060, 36280, 24355, 27776, 35190, 38733.8, 36400, 29624, 33699.75,
32163.33, 30645.75, 31373, 26647.5, 19987.6, 32210, 27158, 24220.25,
18598.5, 23081.75, 29393, 26800.5, 22581.67, 29290, 29651.67,
20947.5, 19762.6, 23815, 32784.8, 20696.2, 26880.6, 25087.75,
19497.2, 21794, 32232, 24253.4, 20034, 21270, 22510, 15170.25,
8956.6, 21612.25, 35828, 30006.25, 27128.75, 25835, 31118.75,
35614.5, 37440.25, 33736.75), Hg = c(0.25, 0.35, 0.48, 1.03,
1.12, 0.2, 1.14, 0.4, 2, 0.48, 0.85, 0.18, 0.76, 0.4, 0.48, 0.35,
0.32, 0.33, 0.4, 0.13, 0.15, 0.13, 0.87, 0.12, 0.03, 0.33, 0.2,
0.22, 0.04, 0.16, 0.1, 0.18, 0.11, 0.08, 0.03, 0.06, 0.06, 0.1,
0.03, 0.07, 0.03, 0.1, 0.08, 0.11, 0.1, 0.13, 0.08, 0.12, 0.07,
0.09)), .Names = c("sample", "As", "Al", "Hg"), row.names = c(NA,
50L), class = "data.frame")
and
dput(df2)
structure(list(As = c(25L, 32L), Hg = c(0.4, 0.8), Cr = c(100L,
360L), Element = structure(c(1L, 3L), .Label = c("LCB", "LCB_pelite",
"LCL"), class = "factor")), .Names = c("As", "Hg", "Cr", "Element"
), row.names = c(NA, -2L), class = "data.frame")
Actually the original data frames are bigger, but this subset gives the idea.
What I want now is to put in a 3rd data frames the values of each element of the first df that exceed the threshold values contained in the second df.
Be aware that there are 2 different threshold values (for each element) in df2 and df2 has some element not matched in df1 (for example Cr).
I've tried to write a for loop but I was able to do that just for 1 element at a time:
for (i in df2$As) {
print(length(which(df1$As > i)))
}
I've also tried to use nested for loops but without success..
I'm pretty sure this does not look good, but I think it works. I added some extra lines to match only the elements found in both data frames, which in this case is only 1. It might ned some changes for your full data:
df1.2 <- rbind(df1, df1) #Duplicate the df1 to compare to each threshold value
df1.2 <- df1.2[order(df1.2$sample),] #Order by sample again
cols2 <- na.omit(match(colnames(df1), colnames(df2)))[[1]] #Get the columns of df2 which are in df1
cols1 <- na.omit(match(colnames(df2), colnames(df1)))[[1]] #Get the columns of df1 which are in df2
df2.2 <- df2[rep(1:2, nrow(df1)),cols2] #Replicates df2 the number of times to allow matching the thresholds to each sample, once for each threshold
exceeds <- df1.2[,cols1]>df2.2 #Make the comparions and return a boolean
sum(exceeds) #You will need colSums() for more than one column
With your sample data it's also not clear from the answer which elements ir refers to, but this shouldn't happen if more than one element matches and your result is a matrix.
Maybe there's a more elegant way without replicating the dataframes and having to worry about number of element matches.
df3=data.frame(Pollutant="Z",LCB=0,LCL=0,stringsAsFactors=FALSE)
for (p in names(df1)[-1]) {
if(p %in% names(df2)[1:(length(df2)-1)]) {
df3 = rbind(df3,c(p,sum(df1[p]>df2[[p]][1]),sum(df1[p]>df2[[p]][2])))
}
}
df3=df3[-1,]
df3
Update:
Ah, each new row is rbound as a character vector. To finish up:
str(df3)
df3$LCB=as.numeric(df3$LCB)
df3$LCL=as.numeric(df3$LCL)
str(df3)
How about this?
foo <- function(x, y) {
sapply(x, function(i) sum(y>i))
}
cols = c("As", "Hg")
mapply(foo, df2[cols], df1[cols])
# As Hg
# [1,] 0 10
# [2,] 0 6
Convert this to a data.frame if necessary.

How can I stop the overwriting of lagged columns?

Reference: https://stackoverflow.com/a/28056113/3942806
I am using the code from the link above to create lagged values for two columns.
n<-4
odd2<-setDT(odd)[, paste("OBS_Q", 1:n) := shift(OBS_Q, 1:n)]
odd2<-setDT(odd)[, paste("sac", 1:n) := shift(sac, 1:n)]
This works great! I get 18 columns.
But for convenience, I tried to convert it into a function:
masterlag<-function(df,col,n){
setDT(df)[, paste(col,sep='_',1:n) := shift(df[[col]], 1:n)]
}
odd3<-masterlag(df=odd,col="OBS_Q",n=4)
odd3<-masterlag(df=odd,col="sac",n=4)
But in this case, the newly created columns of the first one ('OBS_Q') are getting replaced when I used the function the second time ('sac'). So, I am only left with 14 columns instead of 18.
Any pointers as to why?
odd<-structure(list(DATE = 19630101:19630104, PRECIP = c(0, 0, 0,0),
OBS_Q = c(1.61, 1.48, 1.4, 1.33), swb = c(1.75, 1.73, 1.7,1.67),
gr4j = c(1.9, 1.77, 1.67, 1.58), isba = c(0.83, 0.83,0.83, 0.83),
noah = c(1.31, 1.19, 1.24, 1.31), sac = c(1.99,1.8, 1.66, 1.57),
swap = c(1.1, 1.05, 1.08, 0.99), vic.mm.day. = c(2.1,1.75, 1.55, 1.43)),
.Names = c("DATE", "PRECIP", "OBS_Q", "swb","gr4j", "isba", "noah", "sac", "swap", "vic.mm.day."),
class = c("data.table","data.frame"), row.names = c(NA, -4L))
The dataframes are self-updating
odd<-masterlag(df=odd,col="OBS_Q",n=4)
odd<-masterlag(df=odd,col="sac",n=4)

Resources