edit string text in dataframe variable - r

I want to tidy up a dataframe and automate the process. Given the following data.frame:
library(survival)
library(rms)
library(broom)
library(tidyverse)
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
rcs(meal.cal, 4), data = lung)
output <- tidy(res.cox)
output
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 3)age -0.00306 0.0219 -0.140 0.889
# 2 rcs(age, 3)age' 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 rcs(meal.cal, 4)meal.cal -0.000416 0.00104 -0.400 0.689
# 6 rcs(meal.cal, 4)meal.cal' 0.00118 0.00232 0.509 0.611
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659 0.0114 -0.577 0.564
I want to remove the rcs-spline information from term variable and be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age -0.00306 0.0219 -0.140 0.889
# 2 s2 age 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
# 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
# 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I want the solution to easily work for other cases too so when you increase the number of knots:
res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
sex + ph.ecog, data = lung)
output2 <- tidy(res.cox2)
output2
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 4)age 0.0419 0.0403 1.04 0.298
# 2 rcs(age, 4)age' -0.101 0.0806 -1.26 0.208
# 3 rcs(age, 4)age'' 0.569 0.388 1.47 0.142
# 4 rcs(meal.cal, 6)meal.cal -0.000974 0.00155 -0.631 0.528
# 5 rcs(meal.cal, 6)meal.cal' 0.00751 0.0115 0.655 0.512
# 6 rcs(meal.cal, 6)meal.cal'' -0.0217 0.0358 -0.607 0.544
# 7 rcs(meal.cal, 6)meal.cal''' 0.0614 0.123 0.501 0.616
# 8 rcs(meal.cal, 6)meal.cal'''' -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
you would be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age 0.0419 0.0403 1.04 0.298
# 2 s2 age -0.101 0.0806 -1.26 0.208
# 3 s3 age 0.569 0.388 1.47 0.142
# 4 s1 meal.cal -0.000974 0.00155 -0.631 0.528
# 5 s2 meal.cal 0.00751 0.0115 0.655 0.512
# 6 s3 meal.cal -0.0217 0.0358 -0.607 0.544
# 7 s4 meal.cal 0.0614 0.123 0.501 0.616
# 8 s5 meal.cal -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
etc...
My attempt so far gets me some of the way but I am not sure of the best way to deal with the ', '' (note the first term does not contain a ') etc.:
output %>%
mutate(rcs_indicator = str_detect(term, fixed("rcs(")),
term = str_replace_all(term, "rcs\\(.+?\\)", ""))
# term estimate std.error statistic p.value rcs_indicator
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 age -0.00306 0.0219 -0.140 0.889 TRUE
# 2 age' 0.0154 0.0261 0.592 0.554 TRUE
# 3 sex -0.525 0.192 -2.74 0.00620 FALSE
# 4 ph.ecog 0.421 0.131 3.22 0.00128 FALSE
# 5 meal.cal -0.000416 0.00104 -0.400 0.689 TRUE
# 6 meal.cal' 0.00118 0.00232 0.509 0.611 TRUE
# 7 meal.cal'' -0.00659 0.0114 -0.577 0.564 TRUE
It might be useful to just work with the terms I need to change directly:
unique(str_subset(output$term, fixed("rcs(")) %>%
str_replace_all("'", ""))
# [1] "rcs(age, 3)age" "rcs(meal.cal, 4)meal.cal"
I feel there is a way to do this in a simpler way than the steps I am doing.
Any suggestions?
Thanks

This one is clunky but it should work:
library(dplyr)
library(stringr)
output %>%
group_by(group =str_extract(term, 'rcs\\(.')) %>%
mutate(row = row_number()) %>%
mutate(term = str_replace_all(term, 'rcs\\(', paste0("s",row, " "))) %>%
mutate(term = ifelse(str_detect(term, 's\\d'),
str_extract(term, '.\\d\\s.*\\s'), term)) %>%
mutate(term = str_trim(term)) %>%
mutate(term = str_replace_all(term, '\\,', '')) %>%
ungroup() %>%
select(-c(group, row))
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 s1 age -0.00306 0.0219 -0.140 0.889
2 s2 age 0.0154 0.0261 0.592 0.554
3 sex -0.525 0.192 -2.74 0.00620
4 ph.ecog 0.421 0.131 3.22 0.00128
5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
6 s2 meal.cal 0.00118 0.00232 0.509 0.611
7 s3 meal.cal -0.00659 0.0114 -0.577 0.564

This is also less elegant than desired, but should work for multiple knots
output %>%
mutate(is_spline = grepl("^rcs\\(.*?, \\d\\)", term),
n_term = str_count(term, "'") + 1,
pre = ifelse(is_spline, paste0('s', n_term, ' '), ""),
term = paste0(pre, gsub("(^rcs\\(.*?, \\d\\))|(\\'+$)", "", term))) %>%
select(-is_spline, -n_term, -pre)
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 s1 age -0.00306 0.0219 -0.140 0.889
#> 2 s2 age 0.0154 0.0261 0.592 0.554
#> 3 sex -0.525 0.192 -2.74 0.00620
#> 4 ph.ecog 0.421 0.131 3.22 0.00128
#> 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
#> 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
#> 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564

Related

Purrr's Modify-In Function

I'm trying to use purrr's modify_in to modify elements of a list. An example of the list:
tib_list <- map(1:3, ~ tibble(col_one = runif(5),
col_two = runif(5), col_three = runif(5)))
Let's say I want to change elements 2 and 3 of the list to unselect col_one. I imagined doing this:
modify_in(tib_list, 2:length(tib_list), ~ select(.x, -col_one)
But this yields an error. I then thought of doing something like this, but this ends up duplicating the list
map(1:3, ~ modify_in(tib_list, .x, ~ select(.x, -col_one))
I think you wanted to use modify_at which lets you specify either element names or positions. modify_in allows us to use only one position like purrr::pluck.
library(tidyverse)
tib_list <- map(1:3, ~ tibble(col_one = runif(5), col_two = runif(5), col_three = runif(5)))
modify_at(tib_list, c(2,3), ~ select(.x, -col_one))
#> [[1]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.190 0.599 0.824
#> 2 0.214 0.172 0.106
#> 3 0.236 0.666 0.584
#> 4 0.373 0.903 0.252
#> 5 0.875 0.196 0.643
#>
#> [[2]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.513 0.113
#> 2 0.893 0.377
#> 3 0.275 0.675
#> 4 0.529 0.612
#> 5 0.745 0.405
#>
#> [[3]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.470 0.789
#> 2 0.181 0.289
#> 3 0.680 0.213
#> 4 0.772 0.114
#> 5 0.314 0.895
Created on 2021-08-27 by the reprex package (v0.3.0)
We can use modify_in with one position, but supplying a vector such as c(2,3) would mean that we want to access the third element of the second parent element in a nested list. This is why we see the error below.
# works
modify_in(tib_list, 2, ~ select(.x, -col_one))
#> [[1]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.109 0.697 0.0343
#> 2 0.304 0.645 0.851
#> 3 0.530 0.786 0.600
#> 4 0.708 0.0324 0.605
#> 5 0.898 0.232 0.567
#>
#> [[2]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.766 0.157
#> 2 0.0569 0.0422
#> 3 0.943 0.0850
#> 4 0.947 0.0806
#> 5 0.761 0.297
#>
#> [[3]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.878 0.864 0.540
#> 2 0.168 0.745 0.120
#> 3 0.943 0.338 0.535
#> 4 0.353 0.478 0.204
#> 5 0.267 0.669 0.478
# doesn't work
modify_in(tib_list, c(2,3), ~ select(.x, -col_one))
#> Error in UseMethod("select"): no applicable method for 'select' applied to an object of class "c('double', 'numeric')"
I never used modify_in, but you could use
library(purrr)
library(dplyr)
tib_list %>%
imap(~ if (.y > 1) { select(.x, -col_one) } else { .x })
to get
[[1]]
# A tibble: 5 x 3
col_one col_two col_three
<dbl> <dbl> <dbl>
1 0.710 0.189 0.644
2 0.217 0.946 0.955
3 0.590 0.770 0.0180
4 0.135 0.101 0.888
5 0.640 0.645 0.346
[[2]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.267 0.926
2 0.456 0.0902
3 0.659 0.707
4 0.421 0.0451
5 0.801 0.220
[[3]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.437 0.649
2 0.256 0.466
3 0.331 0.594
4 0.586 0.558
5 0.625 0.444
We can use modify_if
modify_if(tib_list,.f = ~ .x %>% select(-col_one),
.p = seq_along(tib_list) != 1)
-output
[[1]]
# A tibble: 5 x 3
col_one col_two col_three
<dbl> <dbl> <dbl>
1 0.819 0.666 0.384
2 0.183 0.549 0.0211
3 0.374 0.240 0.252
4 0.359 0.913 0.792
5 0.515 0.402 0.217
[[2]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.696 0.0269
2 0.433 0.147
3 0.235 0.743
4 0.589 0.748
5 0.635 0.851
[[3]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.707 0.976
2 0.0966 0.130
3 0.574 0.572
4 0.854 0.680
5 0.819 0.582

How to extract confidence intervals from multiple regression models?

I am extracting the regression results for two different groups as shown in this example below. In the temp data.frame i get the estimate, std.error, statistic and p-value. However, i don't get the confidence intervals. Is there a simple way to extract them as well?
df <- tibble(
a = rnorm(1000),
b = rnorm(1000),
c = rnorm(1000),
d = rnorm(1000),
group = rbinom(n=1000, size=1, prob=0.5)
)
df$group = as.factor(df$group)
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = .))) %>%
gather(model_name, model, -group) %>%
unnest()
You are doing tidy on a lm object. If you check the help page, there is an option to include the confidence interval, conf.int=TRUE:
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = . ), conf.int=TRUE)) %>%
gather(model_name, model, -group) %>%
unnest()
# A tibble: 8 x 9
group model_name term estimate std.error statistic p.value conf.low conf.high
<fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 model1 (Int… 0.0616 0.0423 1.46 0.146 -0.0215 0.145
2 0 model1 b 0.00178 0.0421 0.0424 0.966 -0.0808 0.0844
3 0 model1 c -0.00339 0.0431 -0.0787 0.937 -0.0881 0.0813
4 0 model1 d -0.0537 0.0445 -1.21 0.228 -0.141 0.0337
5 1 model1 (Int… -0.0185 0.0454 -0.408 0.683 -0.108 0.0707
6 1 model1 b 0.00128 0.0435 0.0295 0.976 -0.0842 0.0868
7 1 model1 c -0.0972 0.0430 -2.26 0.0244 -0.182 -0.0126
8 1 model1 d 0.0734 0.0457 1.60 0.109 -0.0165 0.163
If your version of dplyr is higher than 1.0.0, you can use:
df %>%
group_by(group) %>%
summarise(tidy(lm(a ~ b + c + d), conf.int = TRUE), .groups = "drop")
#> # A tibble: 8 x 8
#> group term estimate std.error statistic p.value conf.low conf.high
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 (Intercept) 0.0734 0.0468 1.57 0.117 -0.0185 0.165
#> 2 0 b -0.101 0.0461 -2.19 0.0292 -0.191 -0.0102
#> 3 0 c 0.0337 0.0464 0.726 0.468 -0.0575 0.125
#> 4 0 d -0.101 0.0454 -2.23 0.0265 -0.190 -0.0118
#> 5 1 (Intercept) -0.0559 0.0468 -1.20 0.232 -0.148 0.0360
#> 6 1 b -0.0701 0.0474 -1.48 0.140 -0.163 0.0230
#> 7 1 c 0.0319 0.0477 0.668 0.504 -0.0619 0.126
#> 8 1 d -0.0728 0.0466 -1.56 0.119 -0.164 0.0188

Dynamic portfolio re-balancing if PF weights deviate by more than a threshold

It's not so hard to backtest a portfolio with given weights and a set rebalancing frequency (e.g. daily/weekly...). There are R packages doing this, for example PerformanceAnalytics, or tidyquant's tq_portfolio which uses that function.
I would like to backtest a portfolio that is re-balanced when the weights deviate by a certain threshold given in percentage points.
Say I have two equally-weighted stocks and a threshold of +/-15 percentage points, I would rebalance to the initial weights when one of the weights exceeds 65%.
For example I have 3 stocks with equal weights (we should also be able to set other weights).
library(dplyr)
set.seed(3)
n <- 6
rets <- tibble(period = rep(1:n, 3),
stock = c(rep("A", n), rep("B", n), rep("C", n)),
ret = c(rnorm(n, 0, 0.3), rnorm(n, 0, 0.2), rnorm(n, 0, 0.1)))
target_weights <- tibble(stock = c("A", "B", "C"), target_weight = 1/3)
rets_weights <- rets %>%
left_join(target_weights, by = "stock")
rets_weights
# # A tibble: 18 x 4
# period stock ret target_weight
# <int> <chr> <dbl> <dbl>
# 1 1 A -0.289 0.333
# 2 2 A -0.0878 0.333
# 3 3 A 0.0776 0.333
# 4 4 A -0.346 0.333
# 5 5 A 0.0587 0.333
# 6 6 A 0.00904 0.333
# 7 1 B 0.0171 0.333
# 8 2 B 0.223 0.333
# 9 3 B -0.244 0.333
# 10 4 B 0.253 0.333
# 11 5 B -0.149 0.333
# 12 6 B -0.226 0.333
# 13 1 C -0.0716 0.333
# 14 2 C 0.0253 0.333
# 15 3 C 0.0152 0.333
# 16 4 C -0.0308 0.333
# 17 5 C -0.0953 0.333
# 18 6 C -0.0648 0.333
Here are the actual weights without rebalancing:
rets_weights_actual <- rets_weights %>%
group_by(stock) %>%
mutate(value = cumprod(1+ret)*target_weight[1]) %>%
group_by(period) %>%
mutate(actual_weight = value/sum(value))
rets_weights_actual
# # A tibble: 18 x 6
# # Groups: period [6]
# period stock ret target_weight value actual_weight
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 A -0.289 0.333 0.237 0.268
# 2 2 A -0.0878 0.333 0.216 0.228
# 3 3 A 0.0776 0.333 0.233 0.268
# 4 4 A -0.346 0.333 0.153 0.178
# 5 5 A 0.0587 0.333 0.162 0.207
# 6 6 A 0.00904 0.333 0.163 0.238
# 7 1 B 0.0171 0.333 0.339 0.383
# 8 2 B 0.223 0.333 0.415 0.437
# 9 3 B -0.244 0.333 0.314 0.361
# 10 4 B 0.253 0.333 0.393 0.458
# 11 5 B -0.149 0.333 0.335 0.430
# 12 6 B -0.226 0.333 0.259 0.377
# 13 1 C -0.0716 0.333 0.309 0.349
# 14 2 C 0.0253 0.333 0.317 0.335
# 15 3 C 0.0152 0.333 0.322 0.371
# 16 4 C -0.0308 0.333 0.312 0.364
# 17 5 C -0.0953 0.333 0.282 0.363
# 18 6 C -0.0648 0.333 0.264 0.385
So I want that if in any period any stock's weight goes over or under the threshold (for example 0.33+/-0.1), the portfolio weights should be set back to the initial weights.
This has to be done dynamically, so we could have a lot of periods and a lot of stocks. Rebalancing could be necessary several times.
What I tried to solve it: I tried to work with lag and set the initial weights when the actual weights exceed the threshold, however I was unable to do so dynamically, as the weights depend on the returns given the rebalanced weights.
The approach to rebalance upon deviation by more than a certain threshold is called percentage-of-portfolio rebalancing.
My solution is to iterate period-by-period and check if the upper or lower threshold was passed. If so we reset to the initial weights.
library(tidyverse)
library(tidyquant)
rets <- FANG %>%
group_by(symbol) %>%
mutate(ret = adjusted/lag(adjusted)-1) %>%
select(symbol, date, ret) %>%
pivot_wider(names_from = "symbol", values_from = ret)
weights <- rep(0.25, 4)
threshold <- 0.05
r_out <- tibble()
i0 <- 1
trade_rebalance <- 1
pf_value <- 1
for (i in 1:nrow(rets)) {
r <- rets[i0:i,]
j <- 0
r_i <- r %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.numeric, list(v = ~ pf_value * weights[j <<- j + 1] * cumprod(1 + .))) %>%
mutate(pf = rowSums(select(., contains("_v")))) %>%
mutate_at(vars(ends_with("_v")), list(w = ~ ./pf))
touch_upper_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() > weights + threshold)
touch_lower_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() < weights - threshold)
if (touch_upper_band | touch_lower_band | i == nrow(rets)) {
i0 <- i + 1
r_out <- bind_rows(r_out, r_i %>% mutate(trade_rebalance = trade_rebalance))
pf_value <- r_i[[nrow(r_i), "pf"]]
trade_rebalance <- trade_rebalance + 1
}
}
r_out %>% head()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2013-01-02 0 0 0 0 0.25 0.25 0.25 0.25 1 0.25 0.25 0.25 0.25 1
# 2 2013-01-03 -0.00821 0.00455 0.0498 0.000581 0.248 0.251 0.262 0.250 1.01 0.245 0.248 0.259 0.247 1
# 3 2013-01-04 0.0356 0.00259 -0.00632 0.0198 0.257 0.252 0.261 0.255 1.02 0.251 0.246 0.255 0.249 1
# 4 2013-01-07 0.0229 0.0359 0.0335 -0.00436 0.263 0.261 0.270 0.254 1.05 0.251 0.249 0.257 0.243 1
# 5 2013-01-08 -0.0122 -0.00775 -0.0206 -0.00197 0.259 0.259 0.264 0.253 1.04 0.251 0.250 0.255 0.245 1
# 6 2013-01-09 0.0526 -0.000113 -0.0129 0.00657 0.273 0.259 0.261 0.255 1.05 0.261 0.247 0.249 0.244 1
r_out %>% tail()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2016-12-22 -0.0138 -0.00553 -0.00727 -0.00415 0.945 1.10 1.32 1.08 4.45 0.213 0.247 0.297 0.243 10
# 2 2016-12-23 -0.00111 -0.00750 0.0000796 -0.00171 0.944 1.09 1.32 1.08 4.43 0.213 0.246 0.298 0.243 10
# 3 2016-12-27 0.00631 0.0142 0.0220 0.00208 0.950 1.11 1.35 1.08 4.49 0.212 0.247 0.301 0.241 10
# 4 2016-12-28 -0.00924 0.000946 -0.0192 -0.00821 1.11 1.12 1.10 1.11 4.45 0.250 0.252 0.247 0.250 11
# 5 2016-12-29 -0.00488 -0.00904 -0.00445 -0.00288 1.11 1.11 1.10 1.11 4.42 0.250 0.252 0.248 0.251 11
# 6 2016-12-30 -0.0112 -0.0200 -0.0122 -0.0140 1.09 1.09 1.08 1.09 4.36 0.251 0.250 0.248 0.251 11
Here we would have rebalanced 11 times.
r_out %>%
mutate(performance = pf-1) %>%
ggplot(aes(x = date, y = performance)) +
geom_line(data = FANG %>%
group_by(symbol) %>%
mutate(performance = adjusted/adjusted[1L]-1),
aes(color = symbol)) +
geom_line(size = 1)
The approach is slow and using a loop is far from elegant. If anyone has a better solution, I would happily upvote and accept.

Calculate all possible interactions in model_matrix

I'm simulating data with a fluctuating number of variables. As part of the situation, I am needing to calculate a model matrix with all possible combinations. See the following reprex for an example. I am able to get all two-interactions by specifying the formula as ~ .*.. However, this particular dataset has 3 variables (ndim <- 3). I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables that I need to calculate, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
ndim <- 3
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use use as.formula with paste in model_matrix:
model_matrix(data, as.formula(paste0("~ .^", ndim)))

Running regressions and extract model estimates to a dataframe in R

I have 3 exposure variables x1-x3, 10 outcome variables y1-y10 and 3 covariates cv1-cv3.
I would like to regress each outcome on each exposure adjusted for all covariates. Then I would like model estimates i.e. beta, SE, p-value placed in a dataframe. Is there a way to automate this in R. Thank you!
The models i want to run look like this:
y1 ~ x1+cv1+cv2+cv3 ... y10 ~ x1+cv1+cv2+cv3
y1 ~ x2+cv1+cv2+cv3 ... y10 ~ x2+cv1+cv2+cv3
y1 ~ x3+cv1+cv2+cv3 ... y10 ~ x3+cv1+cv2+cv3
Without data and a reproducible example, it is hard to help you, but here's an example with simulated data. First, create a fake dataset, called data:
library(tidyverse)
make_df <- function(y_i) {
data_frame(y_var = y_i, y_i = rnorm(100),
x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
cv1 = runif(100), cv2 = runif(100), cv3 = runif(100))
}
ys <- paste0("Y_", sprintf("%02d", 1:10))
ys
#> [1] "Y_01" "Y_02" "Y_03" "Y_04" "Y_05" "Y_06" "Y_07" "Y_08" "Y_09" "Y_10"
data <-
ys %>%
map_dfr(make_df)
data
#> # A tibble: 1,000 x 8
#> y_var y_i x1 x2 x3 cv1 cv2 cv3
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Y_01 0.504 0.892 -0.806 -1.56 0.145 0.436 0.701
#> 2 Y_01 0.967 1.24 -1.19 0.920 0.866 0.00100 0.567
#> 3 Y_01 -0.824 -0.729 -0.0855 -1.06 0.0665 0.780 0.471
#> 4 Y_01 0.294 2.37 -0.514 -0.955 0.397 0.0462 0.209
#> 5 Y_01 -0.893 0.0298 0.0369 0.0787 0.640 0.709 0.0485
#> 6 Y_01 0.670 -0.347 1.56 2.11 0.843 0.542 0.793
#> 7 Y_01 -1.59 1.04 0.228 0.573 0.185 0.151 0.558
#> 8 Y_01 -2.04 0.289 -0.435 -0.113 0.833 0.0898 0.653
#> 9 Y_01 -0.637 0.818 -0.454 0.606 0.294 0.378 0.315
#> 10 Y_01 -1.61 -0.628 -2.75 1.06 0.353 0.0863 0.332
#> # ... with 990 more rows
At this point, you have options. One way is to use the group_by %>% do(tidy(*)) recipe:
data %>%
gather(x_var, x_value, -c(y_var, y_i, cv1:cv3)) %>%
group_by(y_var, x_var) %>%
do(broom::tidy(lm(y_i ~ x_value + cv1 + cv2 + cv3, data = .)))
#> # A tibble: 150 x 7
#> # Groups: y_var, x_var [30]
#> y_var x_var term estimate std.error statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Y_01 x1 (Intercept) -0.111 0.344 -0.324 0.747
#> 2 Y_01 x1 x_value -0.0440 0.111 -0.396 0.693
#> 3 Y_01 x1 cv1 0.286 0.372 0.769 0.444
#> 4 Y_01 x1 cv2 0.0605 0.379 0.160 0.873
#> 5 Y_01 x1 cv3 -0.0690 0.378 -0.182 0.856
#> 6 Y_01 x2 (Intercept) -0.146 0.336 -0.434 0.665
#> 7 Y_01 x2 x_value 0.117 0.105 1.12 0.265
#> 8 Y_01 x2 cv1 0.287 0.362 0.793 0.430
#> 9 Y_01 x2 cv2 0.0564 0.376 0.150 0.881
#> 10 Y_01 x2 cv3 0.0125 0.379 0.0330 0.974
#> # ... with 140 more rows
Another approach is to use a split variable and then a map function from purrr:
data %>%
gather(x_var, x_value, -c(y_var, y_i, cv1:cv3)) %>%
mutate(y_var_x_var = paste0(y_var, x_var)) %>%
split(.$y_var_x_var) %>%
map(~ lm(y_i ~ x_value + cv1 + cv2 + cv3, data = .))
#> $Y_01x1
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.11144 -0.04396 0.28585 0.06051 -0.06896
#>
#>
#> $Y_01x2
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.14562 0.11732 0.28726 0.05642 0.01249
#>
#>
# ...and so on...
#>
#>
#> $Y_10x2
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.45689 -0.02530 0.61375 0.34377 -0.02357
#>
#>
#> $Y_10x3
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.44423 -0.18377 0.64739 0.27688 -0.02013

Resources