split dataframe with recurring columnames - r

I have imported an excel sheet in R which is a compilation of several dataframes with identical columnnames. To illustrate it looks like this:
df <- tibble( empty = c(runif(3), NA, NA, NA, NA),
A = c(runif(3), NA, NA, NA, NA),
B = c(runif(3), NA, NA, NA, NA),
C = c(runif(3), NA, NA, NA, NA),
empty = c(runif(6), NA),
A = c(runif(6), NA),
B = c(runif(6), NA),
C = c(runif(6), NA),
empty = c(runif(5), NA, NA),
A = c(runif(5), NA, NA),
B = c(runif(5), NA, NA),
C = c(runif(5), NA, NA),
.name_repair = "minimal")
How can I transform this dataframe in this result:
> df1
# A tibble: 3 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.200 0.0665 0.723 0.487
2 0.576 0.990 0.969 0.289
3 0.727 0.192 0.780 0.243
> df2
# A tibble: 6 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.556 0.698 0.796 0.357
2 0.308 0.542 0.867 0.103
3 0.643 0.792 0.385 0.882
4 0.675 0.504 0.489 0.0515
5 0.426 0.775 0.410 0.748
6 0.343 0.752 0.185 0.542
> df3
# A tibble: 5 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.229 0.0508 0.0880 0.486
2 0.146 0.295 0.562 0.731
3 0.292 0.804 0.133 0.0480
4 0.0404 0.399 0.366 0.152
5 0.226 0.702 0.476 0.416
The column with name empty has actually no name although I don't know how to assign this in this example.
The reason I ask this question is because I have several other sheets with a different number of similar columns per sheet (D, E etc).
I found a nice post here:
split data frame with recurring column names
although this post looks the same, it is quit different.
Thanks!

This puts the results in a list which should be more convenient than sequentially named data frames.
first_col = "empty"
name_groups = cumsum(names(df) == "empty")
result = split.default(df, name_groups)
# omit rows that have only missing values
result = lapply(result, \(x) x[rowSums(is.na(x)) < ncol(x), ])
result
# $`1`
# # A tibble: 3 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.590 0.602 0.527 0.900
# 2 0.0450 0.713 0.936 0.911
# 3 0.567 0.781 0.349 0.686
#
# $`2`
# # A tibble: 6 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.480 0.543 0.744 0.0684
# 2 0.0423 0.799 0.927 0.537
# 3 0.962 0.0745 0.851 0.0639
# 4 0.615 0.546 0.390 0.0985
# 5 0.258 0.857 0.139 0.172
# 6 0.944 0.375 0.356 0.715
#
# $`3`
# # A tibble: 5 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.790 0.572 0.600 0.701
# 2 0.732 0.610 0.0395 0.283
# 3 0.130 0.168 0.120 0.0682
# 4 0.112 0.682 0.586 0.640
# 5 0.211 0.267 0.0189 0.606
If you really want df1, df2, ... in your global environment, add these lines:
names(result) = paste0("df", names(result))
list2env(result, envir = .GlobalEnv)

When the number of repetition is constant (here 4) then we could do something likes this:
base R:
df1 <- df[,1:4]
df2 <- df[,5:8]
df3 <- df[,9:12]
> df1
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.120 0.448 0.0453 0.315
2 0.337 0.296 0.757 0.448
3 0.533 0.574 0.681 0.324
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
7 NA NA NA NA
> df2
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.420 0.306 0.472 0.107
2 0.639 0.666 0.349 0.768
3 0.469 0.311 0.100 0.744
4 0.00122 0.586 0.437 0.796
5 0.122 0.00989 0.289 0.408
6 0.570 0.253 0.877 0.197
7 NA NA NA NA
> df3
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.812 0.0464 0.473 0.638
2 0.340 0.482 0.269 0.164
3 0.0323 0.952 0.842 0.282
4 0.511 0.263 0.934 0.183
5 0.0711 0.483 0.763 0.639
6 NA NA NA NA
7 NA NA NA NA

df1 <- df[,1:4][1:3,]
df2 <- df[,5:8][1:6,]
df3 <- df[,9:12][1:5,]

Another possible solution, based on tidyverse:
library(tidyverse)
stack(df) %>%
filter(!is.na(values)) %>%
group_by(aux = cumsum(ind == "empty" & lag(ind, default = "") != "empty")) %>%
group_split() %>%
map(~ pivot_wider(.x %>% select(-aux), names_from = "ind",
values_from = "values", values_fn = list) %>% unnest(everything()))
#> [[1]]
#> # A tibble: 3 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.865 0.0634 0.127 0.136
#> 2 0.343 0.431 0.943 0.985
#> 3 0.482 0.635 0.150 0.263
#>
#> [[2]]
#> # A tibble: 6 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.0656 0.514 0.834 0.662
#> 2 0.977 0.657 0.878 0.427
#> 3 0.670 0.641 0.910 0.175
#> 4 0.402 0.0494 0.433 0.0241
#> 5 0.211 0.388 0.971 0.273
#> 6 0.681 0.355 0.749 0.0536
#>
#> [[3]]
#> # A tibble: 5 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.440 0.856 0.00734 0.0474
#> 2 0.0347 0.328 0.471 0.845
#> 3 0.106 0.393 0.303 0.811
#> 4 0.385 0.184 0.540 0.180
#> 5 0.564 0.579 0.414 0.0110

Related

Using dplyr to calculate geomean in a row wise fashion

I'd like to calculate the geomean using each row from three columns. I found solutions to calculate it from the values in one column (example), but not from a row.
Here's a simplified example:
data <- structure(list(fs_id = structure(1:8, levels = c("CON1", "NC",
"water", "SCR1", "FAN1_1", "CON2", "SCR2", "FAN1_2"), class = "factor"),
twodct_ATP5B = c(1.06960527260684, 0.00241424406360917, NA,
0.953100847649869, 0.404512354245938, 0.934924336678708,
1.32283164360403, 0.194667767059346), twodct_EIF4A2 = c(1.07741209897215,
NA, NA, 1.01873805854745, 0.467988708062081, 0.928149963188649,
1.31762036152893, 0.33377442013251), twodct_GAPDH = c(1.04388739915294,
0.000156497290441042, NA, 0.972431569982792, 0.547030142788418,
0.957957726869246, 0.942311505534324, 0.337842927620691)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
The table looks like this:
> data
# A tibble: 8 × 4
fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH
<fct> <dbl> <dbl> <dbl>
1 CON1 1.07 1.08 1.04
2 NC 0.00241 NA 0.000156
3 water NA NA NA
4 SCR1 0.953 1.02 0.972
5 FAN1_1 0.405 0.468 0.547
6 CON2 0.935 0.928 0.958
7 SCR2 1.32 1.32 0.942
8 FAN1_2 0.195 0.334 0.338
I want to get the row wise geomean of columns twodct_ATP5B, twodct_EIF4A2 and twodct_GAPDH.
I've had a crack like this, but doesn't seem to work:
data %>%
rowwise() %>%
dplyr::mutate(geomean = exp(mean(log(select(., c("twodct_ATP5B", "twodct_EIF4A2", "twodct_GAPDH")))))) %>%
ungroup()
This is a good time to use c_across within the rowwise:
data %>%
rowwise() %>%
dplyr::mutate(geomean = exp(mean(log(c_across(c(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH)))))) %>%
ungroup()
# # A tibble: 8 × 5
# fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH geomean
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 CON1 1.07 1.08 1.04 1.06
# 2 NC 0.00241 NA 0.000156 NA
# 3 water NA NA NA NA
# 4 SCR1 0.953 1.02 0.972 0.981
# 5 FAN1_1 0.405 0.468 0.547 0.470
# 6 CON2 0.935 0.928 0.958 0.940
# 7 SCR2 1.32 1.32 0.942 1.18
# 8 FAN1_2 0.195 0.334 0.338 0.280
You can vectorize the computation by rowMeans(). It's optional to set na.rm = TRUE in rowMeans() to omit missing values.
data %>%
mutate(geomean = exp(rowMeans(log(pick(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH)))))
# # A tibble: 8 × 5
# fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH geomean
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 CON1 1.07 1.08 1.04 1.06
# 2 NC 0.00241 NA 0.000156 NA
# 3 water NA NA NA NA
# 4 SCR1 0.953 1.02 0.972 0.981
# 5 FAN1_1 0.405 0.468 0.547 0.470
# 6 CON2 0.935 0.928 0.958 0.940
# 7 SCR2 1.32 1.32 0.942 1.18
# 8 FAN1_2 0.195 0.334 0.338 0.280
Note: pick() is a new function since dplyr v1.1.0. If you have not updated, replace it with across or select. pick(a, b, c) is equivalent to
across(c(a, b, c)) (c() is necessary)
select(., a, b, c) (The dot is necessary)

Purrr's Modify-In Function

I'm trying to use purrr's modify_in to modify elements of a list. An example of the list:
tib_list <- map(1:3, ~ tibble(col_one = runif(5),
col_two = runif(5), col_three = runif(5)))
Let's say I want to change elements 2 and 3 of the list to unselect col_one. I imagined doing this:
modify_in(tib_list, 2:length(tib_list), ~ select(.x, -col_one)
But this yields an error. I then thought of doing something like this, but this ends up duplicating the list
map(1:3, ~ modify_in(tib_list, .x, ~ select(.x, -col_one))
I think you wanted to use modify_at which lets you specify either element names or positions. modify_in allows us to use only one position like purrr::pluck.
library(tidyverse)
tib_list <- map(1:3, ~ tibble(col_one = runif(5), col_two = runif(5), col_three = runif(5)))
modify_at(tib_list, c(2,3), ~ select(.x, -col_one))
#> [[1]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.190 0.599 0.824
#> 2 0.214 0.172 0.106
#> 3 0.236 0.666 0.584
#> 4 0.373 0.903 0.252
#> 5 0.875 0.196 0.643
#>
#> [[2]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.513 0.113
#> 2 0.893 0.377
#> 3 0.275 0.675
#> 4 0.529 0.612
#> 5 0.745 0.405
#>
#> [[3]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.470 0.789
#> 2 0.181 0.289
#> 3 0.680 0.213
#> 4 0.772 0.114
#> 5 0.314 0.895
Created on 2021-08-27 by the reprex package (v0.3.0)
We can use modify_in with one position, but supplying a vector such as c(2,3) would mean that we want to access the third element of the second parent element in a nested list. This is why we see the error below.
# works
modify_in(tib_list, 2, ~ select(.x, -col_one))
#> [[1]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.109 0.697 0.0343
#> 2 0.304 0.645 0.851
#> 3 0.530 0.786 0.600
#> 4 0.708 0.0324 0.605
#> 5 0.898 0.232 0.567
#>
#> [[2]]
#> # A tibble: 5 x 2
#> col_two col_three
#> <dbl> <dbl>
#> 1 0.766 0.157
#> 2 0.0569 0.0422
#> 3 0.943 0.0850
#> 4 0.947 0.0806
#> 5 0.761 0.297
#>
#> [[3]]
#> # A tibble: 5 x 3
#> col_one col_two col_three
#> <dbl> <dbl> <dbl>
#> 1 0.878 0.864 0.540
#> 2 0.168 0.745 0.120
#> 3 0.943 0.338 0.535
#> 4 0.353 0.478 0.204
#> 5 0.267 0.669 0.478
# doesn't work
modify_in(tib_list, c(2,3), ~ select(.x, -col_one))
#> Error in UseMethod("select"): no applicable method for 'select' applied to an object of class "c('double', 'numeric')"
I never used modify_in, but you could use
library(purrr)
library(dplyr)
tib_list %>%
imap(~ if (.y > 1) { select(.x, -col_one) } else { .x })
to get
[[1]]
# A tibble: 5 x 3
col_one col_two col_three
<dbl> <dbl> <dbl>
1 0.710 0.189 0.644
2 0.217 0.946 0.955
3 0.590 0.770 0.0180
4 0.135 0.101 0.888
5 0.640 0.645 0.346
[[2]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.267 0.926
2 0.456 0.0902
3 0.659 0.707
4 0.421 0.0451
5 0.801 0.220
[[3]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.437 0.649
2 0.256 0.466
3 0.331 0.594
4 0.586 0.558
5 0.625 0.444
We can use modify_if
modify_if(tib_list,.f = ~ .x %>% select(-col_one),
.p = seq_along(tib_list) != 1)
-output
[[1]]
# A tibble: 5 x 3
col_one col_two col_three
<dbl> <dbl> <dbl>
1 0.819 0.666 0.384
2 0.183 0.549 0.0211
3 0.374 0.240 0.252
4 0.359 0.913 0.792
5 0.515 0.402 0.217
[[2]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.696 0.0269
2 0.433 0.147
3 0.235 0.743
4 0.589 0.748
5 0.635 0.851
[[3]]
# A tibble: 5 x 2
col_two col_three
<dbl> <dbl>
1 0.707 0.976
2 0.0966 0.130
3 0.574 0.572
4 0.854 0.680
5 0.819 0.582

Can I overlook a missing variable in a summing part of a function?

This is a shortened version of my real df. I have a function (called: calc) which creates a new variable called 'total', for simplicity this adds up three variables: a, b, c. When I add a dataframe, to that function, that does not feature one variable (say c) so only has a & b, the function falls over. Is there a 'function' / simple way that counts the variables regardless if they are missing?
calc <- function(x) {x %>% mutate(total = a + b + c)}
data.2 has two columns a & b with many rows of values, but when running that in the function it cannot find c so does not calculate.
new.df <- calc(data.2)
Many thanks.
If you want to perform rowwise sum or mean they have na.rm argument which you can use to ignore NA values.
library(dplyr)
calc <- function(x) {x %>% mutate(total = rowSums(select(., a:c), na.rm = TRUE))}
In general case if you are not able to find a function which gives you an out-of-box solution you can replace NA values with 0 maybe and then perform the operation that you want to perform.
calc <- function(x) {
x %>%
mutate(across(a:c, tidyr::replace_na, 0),
total = a + b + c)
}
You can use rowwise() and c_across() with any_of() (or any other tidyselect function) from dplyr (>= 1.0.0).
library(dplyr)
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(any_of(c("a", "b", "c"))))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 -0.884 0.851 -0.0339
#> 2 -1.56 -0.464 -2.02
#> 3 -0.884 0.815 -0.0689
#> 4 -1.46 -0.259 -1.71
#> 5 0.211 -0.528 -0.317
#> 6 1.85 0.190 2.04
#> 7 -1.31 -0.921 -2.23
#> 8 0.450 0.394 0.845
#> 9 -1.14 0.428 -0.714
#> 10 -1.11 0.417 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0868 0.632 1.81 2.36
#> 2 0.568 -0.523 0.240 0.286
#> 3 -0.0325 0.377 -0.437 -0.0921
#> 4 0.660 0.456 1.28 2.39
#> 5 -0.123 1.75 -1.03 0.599
#> 6 0.641 1.39 0.902 2.93
#> 7 0.266 0.520 0.904 1.69
#> 8 -1.53 0.319 0.439 -0.776
#> 9 0.942 0.468 -1.69 -0.277
#> 10 0.254 -0.600 -0.196 -0.542
If you want to be able to generalize beyond those 3 variables you can use any tidyselect methodology.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(everything()))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 0.775 1.17 1.95
#> 2 -1.05 1.21 0.155
#> 3 2.07 -0.264 1.81
#> 4 1.11 0.793 1.90
#> 5 -0.700 -0.216 -0.916
#> 6 -1.04 -1.03 -2.07
#> 7 -0.525 1.60 1.07
#> 8 0.354 0.828 1.18
#> 9 0.126 0.110 0.236
#> 10 -0.0954 -0.603 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.616 0.767 0.0462 0.196
#> 2 -0.370 -0.538 -0.186 -1.09
#> 3 0.337 1.11 -0.700 0.751
#> 4 -0.993 -0.531 -0.984 -2.51
#> 5 0.0538 1.50 -0.0808 1.47
#> 6 -0.907 -1.54 -0.734 -3.18
#> 7 -1.65 -0.242 1.43 -0.455
#> 8 -0.166 0.447 -0.281 -0.000524
#> 9 0.0637 -0.0185 0.754 0.800
#> 10 1.81 -1.09 -2.15 -1.42
Created on 2020-09-10 by the reprex package (v0.3.0)

Dynamic portfolio re-balancing if PF weights deviate by more than a threshold

It's not so hard to backtest a portfolio with given weights and a set rebalancing frequency (e.g. daily/weekly...). There are R packages doing this, for example PerformanceAnalytics, or tidyquant's tq_portfolio which uses that function.
I would like to backtest a portfolio that is re-balanced when the weights deviate by a certain threshold given in percentage points.
Say I have two equally-weighted stocks and a threshold of +/-15 percentage points, I would rebalance to the initial weights when one of the weights exceeds 65%.
For example I have 3 stocks with equal weights (we should also be able to set other weights).
library(dplyr)
set.seed(3)
n <- 6
rets <- tibble(period = rep(1:n, 3),
stock = c(rep("A", n), rep("B", n), rep("C", n)),
ret = c(rnorm(n, 0, 0.3), rnorm(n, 0, 0.2), rnorm(n, 0, 0.1)))
target_weights <- tibble(stock = c("A", "B", "C"), target_weight = 1/3)
rets_weights <- rets %>%
left_join(target_weights, by = "stock")
rets_weights
# # A tibble: 18 x 4
# period stock ret target_weight
# <int> <chr> <dbl> <dbl>
# 1 1 A -0.289 0.333
# 2 2 A -0.0878 0.333
# 3 3 A 0.0776 0.333
# 4 4 A -0.346 0.333
# 5 5 A 0.0587 0.333
# 6 6 A 0.00904 0.333
# 7 1 B 0.0171 0.333
# 8 2 B 0.223 0.333
# 9 3 B -0.244 0.333
# 10 4 B 0.253 0.333
# 11 5 B -0.149 0.333
# 12 6 B -0.226 0.333
# 13 1 C -0.0716 0.333
# 14 2 C 0.0253 0.333
# 15 3 C 0.0152 0.333
# 16 4 C -0.0308 0.333
# 17 5 C -0.0953 0.333
# 18 6 C -0.0648 0.333
Here are the actual weights without rebalancing:
rets_weights_actual <- rets_weights %>%
group_by(stock) %>%
mutate(value = cumprod(1+ret)*target_weight[1]) %>%
group_by(period) %>%
mutate(actual_weight = value/sum(value))
rets_weights_actual
# # A tibble: 18 x 6
# # Groups: period [6]
# period stock ret target_weight value actual_weight
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 A -0.289 0.333 0.237 0.268
# 2 2 A -0.0878 0.333 0.216 0.228
# 3 3 A 0.0776 0.333 0.233 0.268
# 4 4 A -0.346 0.333 0.153 0.178
# 5 5 A 0.0587 0.333 0.162 0.207
# 6 6 A 0.00904 0.333 0.163 0.238
# 7 1 B 0.0171 0.333 0.339 0.383
# 8 2 B 0.223 0.333 0.415 0.437
# 9 3 B -0.244 0.333 0.314 0.361
# 10 4 B 0.253 0.333 0.393 0.458
# 11 5 B -0.149 0.333 0.335 0.430
# 12 6 B -0.226 0.333 0.259 0.377
# 13 1 C -0.0716 0.333 0.309 0.349
# 14 2 C 0.0253 0.333 0.317 0.335
# 15 3 C 0.0152 0.333 0.322 0.371
# 16 4 C -0.0308 0.333 0.312 0.364
# 17 5 C -0.0953 0.333 0.282 0.363
# 18 6 C -0.0648 0.333 0.264 0.385
So I want that if in any period any stock's weight goes over or under the threshold (for example 0.33+/-0.1), the portfolio weights should be set back to the initial weights.
This has to be done dynamically, so we could have a lot of periods and a lot of stocks. Rebalancing could be necessary several times.
What I tried to solve it: I tried to work with lag and set the initial weights when the actual weights exceed the threshold, however I was unable to do so dynamically, as the weights depend on the returns given the rebalanced weights.
The approach to rebalance upon deviation by more than a certain threshold is called percentage-of-portfolio rebalancing.
My solution is to iterate period-by-period and check if the upper or lower threshold was passed. If so we reset to the initial weights.
library(tidyverse)
library(tidyquant)
rets <- FANG %>%
group_by(symbol) %>%
mutate(ret = adjusted/lag(adjusted)-1) %>%
select(symbol, date, ret) %>%
pivot_wider(names_from = "symbol", values_from = ret)
weights <- rep(0.25, 4)
threshold <- 0.05
r_out <- tibble()
i0 <- 1
trade_rebalance <- 1
pf_value <- 1
for (i in 1:nrow(rets)) {
r <- rets[i0:i,]
j <- 0
r_i <- r %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.numeric, list(v = ~ pf_value * weights[j <<- j + 1] * cumprod(1 + .))) %>%
mutate(pf = rowSums(select(., contains("_v")))) %>%
mutate_at(vars(ends_with("_v")), list(w = ~ ./pf))
touch_upper_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() > weights + threshold)
touch_lower_band <- any(r_i[nrow(r_i),] %>% select(ends_with("_w")) %>% unlist() < weights - threshold)
if (touch_upper_band | touch_lower_band | i == nrow(rets)) {
i0 <- i + 1
r_out <- bind_rows(r_out, r_i %>% mutate(trade_rebalance = trade_rebalance))
pf_value <- r_i[[nrow(r_i), "pf"]]
trade_rebalance <- trade_rebalance + 1
}
}
r_out %>% head()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2013-01-02 0 0 0 0 0.25 0.25 0.25 0.25 1 0.25 0.25 0.25 0.25 1
# 2 2013-01-03 -0.00821 0.00455 0.0498 0.000581 0.248 0.251 0.262 0.250 1.01 0.245 0.248 0.259 0.247 1
# 3 2013-01-04 0.0356 0.00259 -0.00632 0.0198 0.257 0.252 0.261 0.255 1.02 0.251 0.246 0.255 0.249 1
# 4 2013-01-07 0.0229 0.0359 0.0335 -0.00436 0.263 0.261 0.270 0.254 1.05 0.251 0.249 0.257 0.243 1
# 5 2013-01-08 -0.0122 -0.00775 -0.0206 -0.00197 0.259 0.259 0.264 0.253 1.04 0.251 0.250 0.255 0.245 1
# 6 2013-01-09 0.0526 -0.000113 -0.0129 0.00657 0.273 0.259 0.261 0.255 1.05 0.261 0.247 0.249 0.244 1
r_out %>% tail()
# # A tibble: 6 x 15
# date FB AMZN NFLX GOOG FB_v AMZN_v NFLX_v GOOG_v pf FB_v_w AMZN_v_w NFLX_v_w GOOG_v_w trade_rebalance
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2016-12-22 -0.0138 -0.00553 -0.00727 -0.00415 0.945 1.10 1.32 1.08 4.45 0.213 0.247 0.297 0.243 10
# 2 2016-12-23 -0.00111 -0.00750 0.0000796 -0.00171 0.944 1.09 1.32 1.08 4.43 0.213 0.246 0.298 0.243 10
# 3 2016-12-27 0.00631 0.0142 0.0220 0.00208 0.950 1.11 1.35 1.08 4.49 0.212 0.247 0.301 0.241 10
# 4 2016-12-28 -0.00924 0.000946 -0.0192 -0.00821 1.11 1.12 1.10 1.11 4.45 0.250 0.252 0.247 0.250 11
# 5 2016-12-29 -0.00488 -0.00904 -0.00445 -0.00288 1.11 1.11 1.10 1.11 4.42 0.250 0.252 0.248 0.251 11
# 6 2016-12-30 -0.0112 -0.0200 -0.0122 -0.0140 1.09 1.09 1.08 1.09 4.36 0.251 0.250 0.248 0.251 11
Here we would have rebalanced 11 times.
r_out %>%
mutate(performance = pf-1) %>%
ggplot(aes(x = date, y = performance)) +
geom_line(data = FANG %>%
group_by(symbol) %>%
mutate(performance = adjusted/adjusted[1L]-1),
aes(color = symbol)) +
geom_line(size = 1)
The approach is slow and using a loop is far from elegant. If anyone has a better solution, I would happily upvote and accept.

Calculate all possible interactions in model_matrix

I'm simulating data with a fluctuating number of variables. As part of the situation, I am needing to calculate a model matrix with all possible combinations. See the following reprex for an example. I am able to get all two-interactions by specifying the formula as ~ .*.. However, this particular dataset has 3 variables (ndim <- 3). I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables that I need to calculate, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
ndim <- 3
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use use as.formula with paste in model_matrix:
model_matrix(data, as.formula(paste0("~ .^", ndim)))

Resources