How to extract confidence intervals from multiple regression models? - r

I am extracting the regression results for two different groups as shown in this example below. In the temp data.frame i get the estimate, std.error, statistic and p-value. However, i don't get the confidence intervals. Is there a simple way to extract them as well?
df <- tibble(
a = rnorm(1000),
b = rnorm(1000),
c = rnorm(1000),
d = rnorm(1000),
group = rbinom(n=1000, size=1, prob=0.5)
)
df$group = as.factor(df$group)
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = .))) %>%
gather(model_name, model, -group) %>%
unnest()

You are doing tidy on a lm object. If you check the help page, there is an option to include the confidence interval, conf.int=TRUE:
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = . ), conf.int=TRUE)) %>%
gather(model_name, model, -group) %>%
unnest()
# A tibble: 8 x 9
group model_name term estimate std.error statistic p.value conf.low conf.high
<fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 model1 (Int… 0.0616 0.0423 1.46 0.146 -0.0215 0.145
2 0 model1 b 0.00178 0.0421 0.0424 0.966 -0.0808 0.0844
3 0 model1 c -0.00339 0.0431 -0.0787 0.937 -0.0881 0.0813
4 0 model1 d -0.0537 0.0445 -1.21 0.228 -0.141 0.0337
5 1 model1 (Int… -0.0185 0.0454 -0.408 0.683 -0.108 0.0707
6 1 model1 b 0.00128 0.0435 0.0295 0.976 -0.0842 0.0868
7 1 model1 c -0.0972 0.0430 -2.26 0.0244 -0.182 -0.0126
8 1 model1 d 0.0734 0.0457 1.60 0.109 -0.0165 0.163

If your version of dplyr is higher than 1.0.0, you can use:
df %>%
group_by(group) %>%
summarise(tidy(lm(a ~ b + c + d), conf.int = TRUE), .groups = "drop")
#> # A tibble: 8 x 8
#> group term estimate std.error statistic p.value conf.low conf.high
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 (Intercept) 0.0734 0.0468 1.57 0.117 -0.0185 0.165
#> 2 0 b -0.101 0.0461 -2.19 0.0292 -0.191 -0.0102
#> 3 0 c 0.0337 0.0464 0.726 0.468 -0.0575 0.125
#> 4 0 d -0.101 0.0454 -2.23 0.0265 -0.190 -0.0118
#> 5 1 (Intercept) -0.0559 0.0468 -1.20 0.232 -0.148 0.0360
#> 6 1 b -0.0701 0.0474 -1.48 0.140 -0.163 0.0230
#> 7 1 c 0.0319 0.0477 0.668 0.504 -0.0619 0.126
#> 8 1 d -0.0728 0.0466 -1.56 0.119 -0.164 0.0188

Related

edit string text in dataframe variable

I want to tidy up a dataframe and automate the process. Given the following data.frame:
library(survival)
library(rms)
library(broom)
library(tidyverse)
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
rcs(meal.cal, 4), data = lung)
output <- tidy(res.cox)
output
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 3)age -0.00306 0.0219 -0.140 0.889
# 2 rcs(age, 3)age' 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 rcs(meal.cal, 4)meal.cal -0.000416 0.00104 -0.400 0.689
# 6 rcs(meal.cal, 4)meal.cal' 0.00118 0.00232 0.509 0.611
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659 0.0114 -0.577 0.564
I want to remove the rcs-spline information from term variable and be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age -0.00306 0.0219 -0.140 0.889
# 2 s2 age 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
# 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
# 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I want the solution to easily work for other cases too so when you increase the number of knots:
res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
sex + ph.ecog, data = lung)
output2 <- tidy(res.cox2)
output2
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 4)age 0.0419 0.0403 1.04 0.298
# 2 rcs(age, 4)age' -0.101 0.0806 -1.26 0.208
# 3 rcs(age, 4)age'' 0.569 0.388 1.47 0.142
# 4 rcs(meal.cal, 6)meal.cal -0.000974 0.00155 -0.631 0.528
# 5 rcs(meal.cal, 6)meal.cal' 0.00751 0.0115 0.655 0.512
# 6 rcs(meal.cal, 6)meal.cal'' -0.0217 0.0358 -0.607 0.544
# 7 rcs(meal.cal, 6)meal.cal''' 0.0614 0.123 0.501 0.616
# 8 rcs(meal.cal, 6)meal.cal'''' -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
you would be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age 0.0419 0.0403 1.04 0.298
# 2 s2 age -0.101 0.0806 -1.26 0.208
# 3 s3 age 0.569 0.388 1.47 0.142
# 4 s1 meal.cal -0.000974 0.00155 -0.631 0.528
# 5 s2 meal.cal 0.00751 0.0115 0.655 0.512
# 6 s3 meal.cal -0.0217 0.0358 -0.607 0.544
# 7 s4 meal.cal 0.0614 0.123 0.501 0.616
# 8 s5 meal.cal -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
etc...
My attempt so far gets me some of the way but I am not sure of the best way to deal with the ', '' (note the first term does not contain a ') etc.:
output %>%
mutate(rcs_indicator = str_detect(term, fixed("rcs(")),
term = str_replace_all(term, "rcs\\(.+?\\)", ""))
# term estimate std.error statistic p.value rcs_indicator
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 age -0.00306 0.0219 -0.140 0.889 TRUE
# 2 age' 0.0154 0.0261 0.592 0.554 TRUE
# 3 sex -0.525 0.192 -2.74 0.00620 FALSE
# 4 ph.ecog 0.421 0.131 3.22 0.00128 FALSE
# 5 meal.cal -0.000416 0.00104 -0.400 0.689 TRUE
# 6 meal.cal' 0.00118 0.00232 0.509 0.611 TRUE
# 7 meal.cal'' -0.00659 0.0114 -0.577 0.564 TRUE
It might be useful to just work with the terms I need to change directly:
unique(str_subset(output$term, fixed("rcs(")) %>%
str_replace_all("'", ""))
# [1] "rcs(age, 3)age" "rcs(meal.cal, 4)meal.cal"
I feel there is a way to do this in a simpler way than the steps I am doing.
Any suggestions?
Thanks
This one is clunky but it should work:
library(dplyr)
library(stringr)
output %>%
group_by(group =str_extract(term, 'rcs\\(.')) %>%
mutate(row = row_number()) %>%
mutate(term = str_replace_all(term, 'rcs\\(', paste0("s",row, " "))) %>%
mutate(term = ifelse(str_detect(term, 's\\d'),
str_extract(term, '.\\d\\s.*\\s'), term)) %>%
mutate(term = str_trim(term)) %>%
mutate(term = str_replace_all(term, '\\,', '')) %>%
ungroup() %>%
select(-c(group, row))
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 s1 age -0.00306 0.0219 -0.140 0.889
2 s2 age 0.0154 0.0261 0.592 0.554
3 sex -0.525 0.192 -2.74 0.00620
4 ph.ecog 0.421 0.131 3.22 0.00128
5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
6 s2 meal.cal 0.00118 0.00232 0.509 0.611
7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
This is also less elegant than desired, but should work for multiple knots
output %>%
mutate(is_spline = grepl("^rcs\\(.*?, \\d\\)", term),
n_term = str_count(term, "'") + 1,
pre = ifelse(is_spline, paste0('s', n_term, ' '), ""),
term = paste0(pre, gsub("(^rcs\\(.*?, \\d\\))|(\\'+$)", "", term))) %>%
select(-is_spline, -n_term, -pre)
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 s1 age -0.00306 0.0219 -0.140 0.889
#> 2 s2 age 0.0154 0.0261 0.592 0.554
#> 3 sex -0.525 0.192 -2.74 0.00620
#> 4 ph.ecog 0.421 0.131 3.22 0.00128
#> 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
#> 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
#> 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564

Combine predicted value of probit model into original dataframe

I have the data like this:
df <- tibble::tibble(
id = rep(c(1:50), each = 5),
y = runif(250,min = 0, max = 1),
x1 = rnorm(250, mean = 0, sd=1),
x2 = rnorm(250, mean = 0, sd=1),
x3 = rnorm(250, mean = 0, sd=1),
x4 = rnorm(250, mean = 0, sd=1),
x5 = rnorm(250, mean = 0, sd=1),
) %>%
group_by(id) %>%
mutate(year = rep(c(2001:2005)))
I would like to estimate the probit model for every year to get (1)coefficient estimates,and (2) predicted value of y, and (3) number of observations used to estimate the model:
probit_model <- function(df) {
glm (y ~ x1 + x2 + x3 + x4+ x5,
family = binomial(link = "probit"),
data = df)
}
Do you know how we can get the coefficient estimates, predicted value for every year and then combine them with the original data (that is df) here? I know what we can do with OLS model (by using map function to estimate for multiple models). But I do not know how to do with probit regression.
Thank you so much.
I think you need to do this, I used this post as reference.
library(dplyr)
df <- tibble::tibble(
id = rep(c(1:50), each = 5),
y = runif(250,min = 0, max = 1),
x1 = rnorm(250, mean = 0, sd=1),
x2 = rnorm(250, mean = 0, sd=1),
x3 = rnorm(250, mean = 0, sd=1),
x4 = rnorm(250, mean = 0, sd=1),
x5 = rnorm(250, mean = 0, sd=1),
) %>%
group_by(id) %>%
mutate(year = rep(c(2001:2005)))
fitted_models = df %>% group_by(year) %>% do(model = glm(y ~ x1 + x2 + x3 + x4+ x5,
family = binomial(link = "probit"),
data = .))
#fitted_models$year
#fitted_models$model[1]
fitted_models %>% summarise(broom::tidy(model))
## A tibble: 30 x 5
#term estimate std.error statistic p.value
#<chr> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) -0.160 0.187 -0.856 0.392
#2 x1 0.0860 0.230 0.375 0.708
#3 x2 0.0657 0.187 0.351 0.725
#4 x3 0.0472 0.160 0.296 0.767
#5 x4 0.216 0.191 1.13 0.257
#6 x5 -0.159 0.263 -0.604 0.546
#7 (Intercept) -0.0792 0.182 -0.434 0.664
#8 x1 0.0314 0.170 0.185 0.853
#9 x2 -0.0320 0.194 -0.164 0.869
#10 x3 0.167 0.218 0.763 0.445
## ... with 20 more rows
fitted_models %>% summarise(broom::glance(model))
## A tibble: 5 x 8
#null.deviance df.null logLik AIC BIC deviance df.residual nobs
#<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
# 1 21.7 49 -32.5 77.0 88.5 19.7 44 50
#2 16.4 49 -33.4 78.8 90.3 15.7 44 50
#3 15.5 49 -34.5 81.1 92.5 15.2 44 50
#4 16.6 49 -32.4 76.7 88.2 15.0 44 50
#5 19.6 49 -33.3 78.6 90.0 19.1 44 50
fitted_models %>% summarise(broom::augment(model, type.predict = "response"))
## A tibble: 250 x 12
#y x1 x2 x3 x4 x5 .fitted .resid .std.resid .hat .sigma .cooksd
#<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.819 0.0246 0.0176 0.280 0.192 0.840 0.407 0.846 0.875 0.0665 0.664 0.00894
#2 0.0418 1.41 0.297 1.15 -1.41 0.347 0.372 -0.792 -0.853 0.137 0.665 0.0144
#3 0.119 -0.265 -0.158 -1.37 -2.48 -0.504 0.237 -0.300 -0.327 0.156 0.676 0.00284
#4 0.0282 -0.836 -0.442 -1.63 0.506 0.910 0.355 -0.808 -0.858 0.114 0.665 0.0112
#5 0.893 -0.481 -0.384 -0.974 0.897 -0.662 0.510 0.819 0.850 0.0703 0.665 0.00792
#6 0.865 0.417 -0.0233 0.841 -0.268 -0.140 0.451 0.865 0.883 0.0395 0.664 0.00494
#7 0.809 1.30 -0.469 1.01 -0.0913 -0.106 0.486 0.669 0.702 0.0921 0.669 0.00778
#8 0.0220 0.119 -0.580 -0.533 -1.09 0.0142 0.326 -0.780 -0.801 0.0522 0.666 0.00406
#9 0.722 0.194 -1.50 -0.395 1.65 -0.868 0.592 0.271 0.297 0.167 0.676 0.00281
#10 0.131 1.24 0.600 1.14 -1.17 0.370 0.392 -0.579 -0.618 0.122 0.671 0.00756
## ... with 240 more rows
A similar answer to #cdcarrion's, from the same post, but using map (a slightly newer approach than do()):
fit the models
library(broom)
models <- (df
%>% group_by(year)
%>% nest()
%>% mutate(model = map(data, glm,
formula = y ~ x1 + x2 + x3 + x4+ x5,
family = binomial(link = "probit")))
)
get coefficients
coefs <- (models
%>% mutate(cc = map(model, tidy))
%>% select(year, cc)
%>% unnest(cols = cc)
)
get predictions
preds <- (models
%>% mutate(aug = map(model, augment, type.predict = "response"))
%>% select(year, aug)
%>% unnest(cols = aug)
%>% select(year:x5, .fitted)
)

Can I overlook a missing variable in a summing part of a function?

This is a shortened version of my real df. I have a function (called: calc) which creates a new variable called 'total', for simplicity this adds up three variables: a, b, c. When I add a dataframe, to that function, that does not feature one variable (say c) so only has a & b, the function falls over. Is there a 'function' / simple way that counts the variables regardless if they are missing?
calc <- function(x) {x %>% mutate(total = a + b + c)}
data.2 has two columns a & b with many rows of values, but when running that in the function it cannot find c so does not calculate.
new.df <- calc(data.2)
Many thanks.
If you want to perform rowwise sum or mean they have na.rm argument which you can use to ignore NA values.
library(dplyr)
calc <- function(x) {x %>% mutate(total = rowSums(select(., a:c), na.rm = TRUE))}
In general case if you are not able to find a function which gives you an out-of-box solution you can replace NA values with 0 maybe and then perform the operation that you want to perform.
calc <- function(x) {
x %>%
mutate(across(a:c, tidyr::replace_na, 0),
total = a + b + c)
}
You can use rowwise() and c_across() with any_of() (or any other tidyselect function) from dplyr (>= 1.0.0).
library(dplyr)
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(any_of(c("a", "b", "c"))))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 -0.884 0.851 -0.0339
#> 2 -1.56 -0.464 -2.02
#> 3 -0.884 0.815 -0.0689
#> 4 -1.46 -0.259 -1.71
#> 5 0.211 -0.528 -0.317
#> 6 1.85 0.190 2.04
#> 7 -1.31 -0.921 -2.23
#> 8 0.450 0.394 0.845
#> 9 -1.14 0.428 -0.714
#> 10 -1.11 0.417 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0868 0.632 1.81 2.36
#> 2 0.568 -0.523 0.240 0.286
#> 3 -0.0325 0.377 -0.437 -0.0921
#> 4 0.660 0.456 1.28 2.39
#> 5 -0.123 1.75 -1.03 0.599
#> 6 0.641 1.39 0.902 2.93
#> 7 0.266 0.520 0.904 1.69
#> 8 -1.53 0.319 0.439 -0.776
#> 9 0.942 0.468 -1.69 -0.277
#> 10 0.254 -0.600 -0.196 -0.542
If you want to be able to generalize beyond those 3 variables you can use any tidyselect methodology.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(everything()))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 0.775 1.17 1.95
#> 2 -1.05 1.21 0.155
#> 3 2.07 -0.264 1.81
#> 4 1.11 0.793 1.90
#> 5 -0.700 -0.216 -0.916
#> 6 -1.04 -1.03 -2.07
#> 7 -0.525 1.60 1.07
#> 8 0.354 0.828 1.18
#> 9 0.126 0.110 0.236
#> 10 -0.0954 -0.603 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.616 0.767 0.0462 0.196
#> 2 -0.370 -0.538 -0.186 -1.09
#> 3 0.337 1.11 -0.700 0.751
#> 4 -0.993 -0.531 -0.984 -2.51
#> 5 0.0538 1.50 -0.0808 1.47
#> 6 -0.907 -1.54 -0.734 -3.18
#> 7 -1.65 -0.242 1.43 -0.455
#> 8 -0.166 0.447 -0.281 -0.000524
#> 9 0.0637 -0.0185 0.754 0.800
#> 10 1.81 -1.09 -2.15 -1.42
Created on 2020-09-10 by the reprex package (v0.3.0)

Dynamic portfolio re-balancing if PF weights deviate by more than a threshold

It's not so hard to backtest a portfolio with given weights and a set rebalancing frequency (e.g. daily/weekly...). There are R packages doing this, for example PerformanceAnalytics, or tidyquant's tq_portfolio which uses that function.
I would like to backtest a portfolio that is re-balanced when the weights deviate by a certain threshold given in percentage points.
Say I have two equally-weighted stocks and a threshold of +/-15 percentage points, I would rebalance to the initial weights when one of the weights exceeds 65%.
For example I have 3 stocks with equal weights (we should also be able to set other weights).
library(dplyr)
set.seed(3)
n <- 6
rets <- tibble(period = rep(1:n, 3),
stock = c(rep("A", n), rep("B", n), rep("C", n)),
ret = c(rnorm(n, 0, 0.3), rnorm(n, 0, 0.2), rnorm(n, 0, 0.1)))
target_weights <- tibble(stock = c("A", "B", "C"), target_weight = 1/3)
rets_weights <- rets %>%
left_join(target_weights, by = "stock")
rets_weights
# # A tibble: 18 x 4
# period stock ret target_weight
# <int> <chr> <dbl> <dbl>
# 1 1 A -0.289 0.333
# 2 2 A -0.0878 0.333
# 3 3 A 0.0776 0.333
# 4 4 A -0.346 0.333
# 5 5 A 0.0587 0.333
# 6 6 A 0.00904 0.333
# 7 1 B 0.0171 0.333
# 8 2 B 0.223 0.333
# 9 3 B -0.244 0.333
# 10 4 B 0.253 0.333
# 11 5 B -0.149 0.333
# 12 6 B -0.226 0.333
# 13 1 C -0.0716 0.333
# 14 2 C 0.0253 0.333
# 15 3 C 0.0152 0.333
# 16 4 C -0.0308 0.333
# 17 5 C -0.0953 0.333
# 18 6 C -0.0648 0.333
Here are the actual weights without rebalancing:
rets_weights_actual <- rets_weights %>%
group_by(stock) %>%
mutate(value = cumprod(1+ret)*target_weight[1]) %>%
group_by(period) %>%
mutate(actual_weight = value/sum(value))
rets_weights_actual
# # A tibble: 18 x 6
# # Groups: period [6]
# period stock ret target_weight value actual_weight
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 A -0.289 0.333 0.237 0.268
# 2 2 A -0.0878 0.333 0.216 0.228
# 3 3 A 0.0776 0.333 0.233 0.268
# 4 4 A -0.346 0.333 0.153 0.178
# 5 5 A 0.0587 0.333 0.162 0.207
# 6 6 A 0.00904 0.333 0.163 0.238
# 7 1 B 0.0171 0.333 0.339 0.383
# 8 2 B 0.223 0.333 0.415 0.437
# 9 3 B -0.244 0.333 0.314 0.361
# 10 4 B 0.253 0.333 0.393 0.458
# 11 5 B -0.149 0.333 0.335 0.430
# 12 6 B -0.226 0.333 0.259 0.377
# 13 1 C -0.0716 0.333 0.309 0.349
# 14 2 C 0.0253 0.333 0.317 0.335
# 15 3 C 0.0152 0.333 0.322 0.371
# 16 4 C -0.0308 0.333 0.312 0.364
# 17 5 C -0.0953 0.333 0.282 0.363
# 18 6 C -0.0648 0.333 0.264 0.385
So I want that if in any period any stock's weight goes over or under the threshold (for example 0.33+/-0.1), the portfolio weights should be set back to the initial weights.
This has to be done dynamically, so we could have a lot of periods and a lot of stocks. Rebalancing could be necessary several times.
What I tried to solve it: I tried to work with lag and set the initial weights when the actual weights exceed the threshold, however I was unable to do so dynamically, as the weights depend on the returns given the rebalanced weights.
The approach to rebalance upon deviation by more than a certain threshold is called percentage-of-portfolio rebalancing.
My solution is to iterate period-by-period and check if the upper or lower threshold was passed. If so we reset to the initial weights.
library(tidyverse)
library(tidyquant)
rets <- FANG %>%
group_by(symbol) %>%
mutate(ret = adjusted/lag(adjusted)-1) %>%
select(symbol, date, ret) %>%
pivot_wider(names_from = "symbol", values_from = ret)
weights <- rep(0.25, 4)
threshold <- 0.05
r_out <- tibble()
i0 <- 1
trade_rebalance <- 1
pf_value <- 1
for (i in 1:nrow(rets)) {
r <- rets[i0:i,]
j <- 0
r_i <- r %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.numeric, list(v = ~ pf_value * weights[j <<- j + 1] * cumprod(1 + .))) %>%
mutate(pf = rowSums(select(., contains("_v")))) %>%
mutate_at(vars(ends_with("_v")), list(w = ~ ./pf))
touch_upper_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() > weights + threshold)
touch_lower_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() < weights - threshold)
if (touch_upper_band | touch_lower_band | i == nrow(rets)) {
i0 <- i + 1
r_out <- bind_rows(r_out, r_i %>% mutate(trade_rebalance = trade_rebalance))
pf_value <- r_i[[nrow(r_i), "pf"]]
trade_rebalance <- trade_rebalance + 1
}
}
r_out %>% head()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2013-01-02 0 0 0 0 0.25 0.25 0.25 0.25 1 0.25 0.25 0.25 0.25 1
# 2 2013-01-03 -0.00821 0.00455 0.0498 0.000581 0.248 0.251 0.262 0.250 1.01 0.245 0.248 0.259 0.247 1
# 3 2013-01-04 0.0356 0.00259 -0.00632 0.0198 0.257 0.252 0.261 0.255 1.02 0.251 0.246 0.255 0.249 1
# 4 2013-01-07 0.0229 0.0359 0.0335 -0.00436 0.263 0.261 0.270 0.254 1.05 0.251 0.249 0.257 0.243 1
# 5 2013-01-08 -0.0122 -0.00775 -0.0206 -0.00197 0.259 0.259 0.264 0.253 1.04 0.251 0.250 0.255 0.245 1
# 6 2013-01-09 0.0526 -0.000113 -0.0129 0.00657 0.273 0.259 0.261 0.255 1.05 0.261 0.247 0.249 0.244 1
r_out %>% tail()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2016-12-22 -0.0138 -0.00553 -0.00727 -0.00415 0.945 1.10 1.32 1.08 4.45 0.213 0.247 0.297 0.243 10
# 2 2016-12-23 -0.00111 -0.00750 0.0000796 -0.00171 0.944 1.09 1.32 1.08 4.43 0.213 0.246 0.298 0.243 10
# 3 2016-12-27 0.00631 0.0142 0.0220 0.00208 0.950 1.11 1.35 1.08 4.49 0.212 0.247 0.301 0.241 10
# 4 2016-12-28 -0.00924 0.000946 -0.0192 -0.00821 1.11 1.12 1.10 1.11 4.45 0.250 0.252 0.247 0.250 11
# 5 2016-12-29 -0.00488 -0.00904 -0.00445 -0.00288 1.11 1.11 1.10 1.11 4.42 0.250 0.252 0.248 0.251 11
# 6 2016-12-30 -0.0112 -0.0200 -0.0122 -0.0140 1.09 1.09 1.08 1.09 4.36 0.251 0.250 0.248 0.251 11
Here we would have rebalanced 11 times.
r_out %>%
mutate(performance = pf-1) %>%
ggplot(aes(x = date, y = performance)) +
geom_line(data = FANG %>%
group_by(symbol) %>%
mutate(performance = adjusted/adjusted[1L]-1),
aes(color = symbol)) +
geom_line(size = 1)
The approach is slow and using a loop is far from elegant. If anyone has a better solution, I would happily upvote and accept.

pmap_ variants operating on data.frames as lists

I have a recollection that purrr::pmap_* can treat a data.frame as a list but the syntax eludes me.
Imagine we wanted to fit a separate lm object for each value of mtcars$vs and mtcars$am
library(tidyverse)
library(broom)
d1 <- mtcars %>%
group_by(
vs, am
) %>%
nest %>%
mutate(
coef = data %>%
map(
~lm(mpg ~ wt, data =.) %>%
tidy
)
)
If I wanted to extract the coefficient estimates as an un-nested data.frame, and append the values of am and vs, I might try
d1[, -3] %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
But this results in an error. More explicitly declaring these variables as separate lists has the desired effect
list(
d1$vs,
d1$am,
d1$coef
) %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
Is there a succinct way for pmap_* to treat a data.frame as a list?
We can use the standard option to extract the components (..1, ..2, etc)
d1[, -3] %>%
pmap_dfr(~ ..3 %>%
mutate(vs = ..1, am = ..2))
# A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
#2 wt -7.91 1.14 -6.93 0.00227 0 1
#3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
#4 wt -7.77 3.36 -2.31 0.0689 1 1
#5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
#6 wt -3.38 2.80 -1.21 0.281 1 0
#7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
#8 wt -2.44 0.842 -2.90 0.0159 0 0
This is because the second list has no names attribute. If you unname d1 it works. The fact that you used the list function in the second example doesn't make a difference (except that it removed the names), because both objects are lists (data frames are lists).
d1[, -3] %>%
unname %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You can also name the arguments in your first code block's function to match (or use ..1 etc) for the same result
d1[, -3] %>%
pmap_dfr(
function(vs, am, coef)
coef %>%
mutate(
vs = vs,
am = am
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You could also use wap from the experimental rap package
library(rap)
d1[, -3] %>%
wap( ~ coef %>%
mutate(
vs = vs,
am = am)) %>%
bind_rows
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0

Resources