Basically, I have 14 columns of numerical variables (say v1 to v14), and I want R to do the following mutation:
v1=v1-v2
v3=v3-v4
v5=v5-v6
...
Here is what I tried. I called all the odd-column variables 'source_vars' and
the even-column variables 'use_vars', and tried the following
# Defining source and use vars
source_vars<-c("ofc_s","privnonfin_s","hh_s",
"ROW_s", "total_s", "banking_s", "totgov_s")
use_vars<-c("ofc_u","privnonfin_u","hh_u",
"ROW_u", "total_u", "banking_u", "totgov_u")
new<-sw_flows%>%filter(sector=="Total")%>%
mutate(sector="Source-Use", source_vars=source_vars-use_vars)
This didnt work.
Is there an efficient way to do this without having to name the variables?
Here is a purrr one-liner.
I first create a test data set, then get the odd and even column numbers with the modulus operator and finally use map2 to apply function - to the columns.
set.seed(2022)
df1 <- matrix(rnorm(10*14), ncol = 14)
df1 <- as.data.frame(df1)
source_vars <- which(seq.int(ncol(df1)) %% 2 == 1)
use_vars <- seq.int(ncol(df1))[-source_vars]
purrr::map2_dfc(df1[source_vars], df1[use_vars], `-`)
#> # A tibble: 10 x 7
#> V1 V3 V5 V7 V9 V11 V13
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.106 0.247 -1.96 -1.20 -1.76 -0.861 -1.38
#> 2 -0.988 -0.448 -0.188 -2.62 1.61 -0.221 0.636
#> 3 0.0843 2.67 -1.31 -0.746 0.648 1.25 -1.27
#> 4 -1.54 1.43 -2.67 0.0901 -0.706 0.667 1.43
#> 5 -0.278 0.469 -0.249 -2.14 -5.08 1.00 2.69
#> 6 -2.82 -1.94 -1.43 0.185 1.14 2.47 -0.0427
#> 7 -0.405 -0.430 0.317 -2.16 3.22 0.448 -2.23
#> 8 1.23 0.186 -1.44 -0.294 0.667 -0.749 0.205
#> 9 -0.270 -0.675 -0.851 0.838 2.50 -0.597 -0.246
#> 10 -0.617 -0.0703 -0.0478 -0.464 -0.513 2.17 1.94
Created on 2022-02-26 by the reprex package (v2.0.1)
And a base R way.
res <- Map(\(x, y) x - y, df1[source_vars], df1[use_vars])
do.call(cbind.data.frame, res)
Created on 2022-02-26 by the reprex package (v2.0.1)
If we want to subtract odd from even column sequence columns (assuming there are equal number of even/odd columns
sw_flows[c(TRUE, FALSE)] - sw_flows[c(FALSE, TRUE)]
Related
I would like a function that would find the nth lowest value across columns. In other words, a function that is similar to pmin() but rather than finding the lowest, I am hoping it returns the nth lowest. Thank you in advance!
df %>%
rowid_to_column() %>%
pivot_longer(-rowid)%>%
arrange(value)%>% #You could arrange with decreasing to find max
group_by(rowid) %>%
summarise(value = nth(value, 2)) # Find the second minimum
# A tibble: 10 x 2
rowid value
<int> <dbl>
1 1 -0.560
2 2 -0.218
3 3 0.401
4 4 0.0705
5 5 -0.556
6 6 1.72
7 7 0.498
8 8 -1.27
9 9 -0.687
10 10 -0.446
Here is a simple one (it could be modified to deal with NAs):
nth_lowest <- function(x,n) x[order(x)[n]]
Apply it to a data frame, using rowwise() and c_across() from the dplyr package.
df %>%
rowwise() %>%
mutate( second_lowest = f(c_across(x:z),2))
Output:
x y z second_lowest
<dbl> <dbl> <dbl> <dbl>
1 -0.560 1.22 -1.07 -0.560
2 -0.230 0.360 -0.218 -0.218
3 1.56 0.401 -1.03 0.401
4 0.0705 0.111 -0.729 0.0705
5 0.129 -0.556 -0.625 -0.556
6 1.72 1.79 -1.69 1.72
7 0.461 0.498 0.838 0.498
8 -1.27 -1.97 0.153 -1.27
9 -0.687 0.701 -1.14 -0.687
10 -0.446 -0.473 1.25 -0.446
Input:
set.seed(123)
df <- data.frame(x=rnorm(10), y=rnorm(10), z=rnorm(10))
We may also do this with pmap and nth
library(purrr)
library(dplyr)
pmap_dbl(df, ~ nth(sort(c(...)), n = 2))
I have a simplified tibble where I select two columns (manually) and pass them to a custom function, but in this case just using sum. Any ideas on how I could expand this to accommodate any number of ko. In this case there's only 2, but let's say there were 5?
library(dplyr)
library(purrr)
df <- tibble(l2fc_ko1 = rnorm(1:10), l2fc_ko2 = rnorm(1:10), ctrl_ko1 = rnorm(1:10), ctrl_ko2 = rnorm(1:10))
df %>% mutate(ko1_sum = map2_dbl(ctrl_ko1, l2fc_ko1, sum),
ko2_sum = map2_dbl(ctrl_ko2, l2fc_ko2, sum))
We can use pivot_longer to reshape the data, creating a column for each level of ko. Compute the sum, then pivot_wider to get back to your original format:
library(tidyverse)
df %>%
mutate(idx = row_number()) %>%
pivot_longer(-idx, names_sep = '_', names_to = c('group', 'ko')) %>%
pivot_wider(names_from = group, values_from = value) %>%
mutate(sum = l2fc + ctrl) %>%
pivot_wider(names_from = ko, values_from = c(l2fc, ctrl, sum))
idx l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 sum_ko1 sum_ko2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -1.04 -0.710 -0.288 -1.65 -1.33 -2.36
2 2 0.0338 0.400 -0.850 0.319 -0.816 0.719
3 3 2.08 0.723 0.325 0.314 2.40 1.04
4 4 0.740 -0.411 -0.307 1.77 0.433 1.36
5 5 0.347 -1.57 -0.153 0.657 0.195 -0.915
6 6 -0.998 -0.145 0.265 -1.95 -0.733 -2.09
7 7 2.05 -0.0876 -0.909 -0.190 1.14 -0.278
8 8 0.0735 -0.134 -2.04 -0.832 -1.96 -0.966
9 9 1.52 2.37 1.53 -0.596 3.05 1.78
10 10 1.42 -0.753 -1.61 1.84 -0.194 1.09
If you have a dynamic number of paired ctrl_/l2fc_ columns, then try this:
Ensure we have all ctrl_ that have a corresponding l2fc_ (and vice versa):
ctrls <- grep("^ctrl_ko", names(df), value = TRUE)
l2fcs <- gsub("^ctrl", "l2fc", ctrls)
ctrls <- ctrls[ l2fcs %in% names(df) ]
l2fcs <- l2fcs[ l2fcs %in% names(df) ] # or intersect(l2fcs, names(df))
Combine these into one vector (we'll split on it later) and convert this to the new _sum names we'll need.
nms <- c(l2fcs, ctrls)
nms
# [1] "l2fc_ko1" "l2fc_ko2" "ctrl_ko1" "ctrl_ko2"
newnms <- gsub("ctrl_(.*)", "\\1_sum", ctrls)
newnms
# [1] "ko1_sum" "ko2_sum"
Using split.default (which will split the df into groups of columns) and rowSums, we can devise two _sum columns:
setNames(as.data.frame(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums)), newnms)
# ko1_sum ko2_sum
# 1 1.0643199 1.7603198
# 2 -2.3460066 2.9914827
# 3 0.1912111 -0.3537572
# 4 1.8475373 -0.8877151
# 5 2.2994618 0.3716338
# 6 -0.5365936 -1.0810583
# 7 1.2542526 -1.0687119
# 8 -1.8578221 -3.5073630
# 9 2.4785211 -4.8546746
# 10 -0.7027090 1.3562360
We can cbind/bind_cols those in, or we can mutate them just as well. For the latter, we'll replace df with cur_data() for within the mutate environment, and we'll need to add as.data.frame)
Choose one of the following, all producing effectively the same results:
cbind(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
bind_cols(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
df %>%
mutate(
setNames(
as.data.frame(
lapply(split.default(cur_data()[nms], gsub(".*_ko", "", nms)), rowSums)),
newnms)
)
# # A tibble: 10 x 6
# l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 ko1_sum ko2_sum
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 1.06 1.76
# 2 -0.565 2.29 -1.78 0.705 -2.35 2.99
# 3 0.363 -1.39 -0.172 1.04 0.191 -0.354
# 4 0.633 -0.279 1.21 -0.609 1.85 -0.888
# 5 0.404 -0.133 1.90 0.505 2.30 0.372
# 6 -0.106 0.636 -0.430 -1.72 -0.537 -1.08
# 7 1.51 -0.284 -0.257 -0.784 1.25 -1.07
# 8 -0.0947 -2.66 -1.76 -0.851 -1.86 -3.51
# 9 2.02 -2.44 0.460 -2.41 2.48 -4.85
# 10 -0.0627 1.32 -0.640 0.0361 -0.703 1.36
How about rowwise? You can specify the columns you want with c or c_across.
df %>%
rowwise() %>%
mutate(total = sum(c_across(ends_with("ko1"))))
# A tibble: 10 x 5
# Rowwise:
l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 total
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.179 0.496 -1.10 -0.375 -1.27
2 -0.0887 -0.873 0.613 -0.348 0.525
3 -2.33 -0.322 -0.515 3.03 -2.84
4 -0.602 -0.0387 0.704 -0.118 0.102
5 -0.389 -0.00801 0.276 0.500 -0.113
6 -2.18 0.648 -0.485 -0.243 -2.66
7 0.0529 0.237 -0.371 -0.0382 -0.318
8 0.818 -0.181 1.11 -1.25 1.93
9 -0.271 -0.883 0.480 -0.296 0.209
10 -0.208 -1.11 1.09 -0.528 0.882
I am working on a project which is updated every few years. We then get new data for every quarter. Usually a table, in which columns 1:5 are IDs and discriptive information necessary for further analysis. And then there are consecutive pairs of columns (e.g. 6:7, 8:9, ...) which contain the updated data (variables) for the specific quarter. So far so good.
The problem is that given the unregular update interval, the input data is quite different. Sometimes I get CSVs for one quarter, sometimes for a number of quarters, and sometimes the date is even in SPSS format (sav). I managed to automatically join all past and current files, when I have the input data in the following format: columns 1:5 descriptive data, columns 6:7 updated data.
So what's left to do is to import data from CSVs or SAVs that cointain more than one quarter, split them into seperate dataframes in the requested format (columns 1:5 descriptive data, columns 6:7 new variables), and export the new dataframe as CSV to a specific location - if possible even automatically naming them.
While this is fairly easy to do manually, I could not come up with an automated solution, as there are so many variable factors. Manually I would use the dplyr package:
tops_2007_1 <- tops_2007_1_2015_2 %>% select(1:5, 6:7) #and so forth
tops_2007_2 <- tops_2007_1_2015_2 %>% select(1:5, 8:9)
...
write_excel_csv2(tops_2007_1, "Data/Tops/tops_2007_1.csv") # and so forth
write_excel_csv2(tops_2007_2, "Data/Tops/tops_2007_2.csv")
...
Is there a way to automate this? Information on the new dataframe names would be in the column headers (e.g. 20071_X, 20071_Y). I got plenty of hints towards for loops and apply functions, but failed to put it together. I would appreciate any hints that could get me started on this!
Thanks!
Using the same random data generated by #Limey, you can do:
library(tidyverse)
library(purrr)
Generate some random data
df <- tibble(
id1=rnorm(10),id2=rnorm(10),id3=rnorm(10),id4=rnorm(10),id5=rnorm(10),
var1x=rnorm(10),var1y=rnorm(10),var2x=rnorm(10),var2y=rnorm(10),var3x=rnorm(10),
var3y=rnorm(10),var4x=rnorm(10),var4y=rnorm(10),var5x=rnorm(10),var5y=rnorm(10)
)
spits <- split(names(df)[6:length(names(df))], ceiling(seq_along(names(df)[6:length(names(df))])/2))
pmap(list(spits,1:length(spits)), ~df%>%select(1:5,all_of(..1))%>%write_excel_csv2(.,paste0('tops_2007_',as.character(..2),'.csv')))
Which outputs:
$`1`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var1x var1y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.618 0.642
2 -1.07 -1.29 1.03 0.222 -0.266 -0.824 0.862
3 0.872 -1.62 0.225 1.31 -0.737 0.885 -0.569
4 -1.85 0.653 0.279 -1.28 -1.04 0.0762 0.769
5 0.913 -2.27 0.0821 1.10 1.04 1.50 0.791
6 1.36 0.203 -0.310 1.43 1.43 -0.489 0.259
7 -0.961 1.10 0.642 -1.42 2.01 -1.79 -0.732
8 0.861 -0.831 1.29 0.684 -0.124 0.0549 0.755
9 1.20 1.43 1.20 -0.287 -1.35 -1.48 0.237
10 -0.645 1.01 0.411 0.383 -0.413 1.75 0.386
$`2`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var2x var2y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 -0.602 -0.938
2 -1.07 -1.29 1.03 0.222 -0.266 1.47 0.122
3 0.872 -1.62 0.225 1.31 -0.737 -0.600 0.584
4 -1.85 0.653 0.279 -1.28 -1.04 -0.835 -0.918
5 0.913 -2.27 0.0821 1.10 1.04 -0.322 0.623
6 1.36 0.203 -0.310 1.43 1.43 0.333 1.22
7 -0.961 1.10 0.642 -1.42 2.01 -0.287 -0.471
8 0.861 -0.831 1.29 0.684 -0.124 0.0853 -1.34
9 1.20 1.43 1.20 -0.287 -1.35 0.800 -0.528
10 -0.645 1.01 0.411 0.383 -0.413 0.982 1.76
$`3`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var3x var3y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 -0.00896 0.320
2 -1.07 -1.29 1.03 0.222 -0.266 -0.321 0.213
3 0.872 -1.62 0.225 1.31 -0.737 0.785 1.81
4 -1.85 0.653 0.279 -1.28 -1.04 0.965 0.695
5 0.913 -2.27 0.0821 1.10 1.04 1.18 1.22
6 1.36 0.203 -0.310 1.43 1.43 -1.64 0.804
7 -0.961 1.10 0.642 -1.42 2.01 0.648 1.25
8 0.861 -0.831 1.29 0.684 -0.124 -0.933 -0.483
9 1.20 1.43 1.20 -0.287 -1.35 1.56 0.710
10 -0.645 1.01 0.411 0.383 -0.413 -2.24 0.964
$`4`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var4x var4y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.172 -0.901
2 -1.07 -1.29 1.03 0.222 -0.266 -0.336 -0.720
3 0.872 -1.62 0.225 1.31 -0.737 2.18 -0.557
4 -1.85 0.653 0.279 -1.28 -1.04 1.18 1.30
5 0.913 -2.27 0.0821 1.10 1.04 0.0288 0.369
6 1.36 0.203 -0.310 1.43 1.43 -1.34 0.506
7 -0.961 1.10 0.642 -1.42 2.01 0.193 0.0254
8 0.861 -0.831 1.29 0.684 -0.124 0.209 0.168
9 1.20 1.43 1.20 -0.287 -1.35 0.181 0.658
10 -0.645 1.01 0.411 0.383 -0.413 1.40 -0.360
$`5`
# A tibble: 10 x 7
id1 id2 id3 id4 id5 var5x var5y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.748 0.587 1.47 2.23 -1.06 0.0203 1.14
2 -1.07 -1.29 1.03 0.222 -0.266 -1.22 0.339
3 0.872 -1.62 0.225 1.31 -0.737 -0.423 -0.291
4 -1.85 0.653 0.279 -1.28 -1.04 0.815 -0.428
5 0.913 -2.27 0.0821 1.10 1.04 -0.00963 -0.690
6 1.36 0.203 -0.310 1.43 1.43 0.141 0.451
7 -0.961 1.10 0.642 -1.42 2.01 -1.77 1.60
8 0.861 -0.831 1.29 0.684 -0.124 0.370 0.438
9 1.20 1.43 1.20 -0.287 -1.35 -0.582 -2.92
10 -0.645 1.01 0.411 0.383 -0.413 1.33 -0.221
and creates the following files in your directory:
Something like:
library(tidyverse)
Generate some random data
df <- tibble(
id1=rnorm(10),id2=rnorm(10),id3=rnorm(10),id4=rnorm(10),id5=rnorm(10),
var1x=rnorm(10),var1y=rnorm(10),var2x=rnorm(10),var2y=rnorm(10),var3x=rnorm(10),
var3y=rnorm(10),var4x=rnorm(10),var4y=rnorm(10),var5x=rnorm(10),var5y=rnorm(10)
)
ans <- bind_rows(
lapply(
seq(6, ncol(df), 2),
function(x) {
df %>%
select(1:5, c(x, x+1)) %>%
rename(x=6, y=7) %>%
add_column(var=floor((x-5)/2)+1)
}
)
)
ans
Giving
# A tibble: 50 x 8
id1 id2 id3 id4 id5 x y var
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.816 0.315 -0.0932 0.232 -0.950 -1.17 -0.157 1
2 -0.0704 1.15 1.67 2.85 -2.61 -0.190 -1.60 1
3 -1.94 -0.343 0.873 -1.13 -0.638 0.802 -0.254 1
4 -1.19 0.247 0.0939 0.337 1.99 -0.806 1.43 1
5 0.100 -1.61 -1.39 -0.253 -1.53 -1.99 -1.04 1
6 -0.106 -0.244 -0.233 2.49 1.60 0.229 0.919 1
7 -0.979 -0.648 0.432 1.22 -1.16 -0.754 0.903 1
8 0.284 -0.220 -0.735 -0.804 0.0707 -0.714 -1.87 1
9 -0.0801 -1.53 -0.819 0.952 0.852 1.27 -0.702 1
10 -0.579 -0.0120 0.611 -0.621 -2.13 -0.812 1.43 1
# … with 40 more rows
How does it work?
seq(6, ncol(df), 2)
Defines the first of each pair of columns you wish to extract, starting with the 6th (the first after your first 5 id variables) and stepping through the columns in steps of two.
df %>%
select(1:5, c(x, x+1))
Selects the id columns and the current pair of data columns
rename(x=6, y=7) %>%
add_column(var=floor((x-5)/2)+1)
Renames the two data columns to x and y and adds a new id column identifying which pair of data columns are being handled. You will need to adapt this step to reflect your actual data.
lapply(...)
Performs the above steps on all pairs of data columns in the current data frame and returns the results as a list
bind_rows(...)
Converts the list of data frames to a single one.
Some comments:
Using column indexes to identify columns is fragile. I wouldn't do it. I'd use the column names themselves.
If your data column names are, for example, 20071_X then your data are not tidy because the column names contain information (here, the year and the quarter). This will likely cause you pain downstream. I'd recommend putting this right as soon as you can. By modifying my derivation of var, you can tidy your data as you extract it.
Please see this post for advice on how to write a good minimal working example.
This is a shortened version of my real df. I have a function (called: calc) which creates a new variable called 'total', for simplicity this adds up three variables: a, b, c. When I add a dataframe, to that function, that does not feature one variable (say c) so only has a & b, the function falls over. Is there a 'function' / simple way that counts the variables regardless if they are missing?
calc <- function(x) {x %>% mutate(total = a + b + c)}
data.2 has two columns a & b with many rows of values, but when running that in the function it cannot find c so does not calculate.
new.df <- calc(data.2)
Many thanks.
If you want to perform rowwise sum or mean they have na.rm argument which you can use to ignore NA values.
library(dplyr)
calc <- function(x) {x %>% mutate(total = rowSums(select(., a:c), na.rm = TRUE))}
In general case if you are not able to find a function which gives you an out-of-box solution you can replace NA values with 0 maybe and then perform the operation that you want to perform.
calc <- function(x) {
x %>%
mutate(across(a:c, tidyr::replace_na, 0),
total = a + b + c)
}
You can use rowwise() and c_across() with any_of() (or any other tidyselect function) from dplyr (>= 1.0.0).
library(dplyr)
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(any_of(c("a", "b", "c"))))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 -0.884 0.851 -0.0339
#> 2 -1.56 -0.464 -2.02
#> 3 -0.884 0.815 -0.0689
#> 4 -1.46 -0.259 -1.71
#> 5 0.211 -0.528 -0.317
#> 6 1.85 0.190 2.04
#> 7 -1.31 -0.921 -2.23
#> 8 0.450 0.394 0.845
#> 9 -1.14 0.428 -0.714
#> 10 -1.11 0.417 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0868 0.632 1.81 2.36
#> 2 0.568 -0.523 0.240 0.286
#> 3 -0.0325 0.377 -0.437 -0.0921
#> 4 0.660 0.456 1.28 2.39
#> 5 -0.123 1.75 -1.03 0.599
#> 6 0.641 1.39 0.902 2.93
#> 7 0.266 0.520 0.904 1.69
#> 8 -1.53 0.319 0.439 -0.776
#> 9 0.942 0.468 -1.69 -0.277
#> 10 0.254 -0.600 -0.196 -0.542
If you want to be able to generalize beyond those 3 variables you can use any tidyselect methodology.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(everything()))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 0.775 1.17 1.95
#> 2 -1.05 1.21 0.155
#> 3 2.07 -0.264 1.81
#> 4 1.11 0.793 1.90
#> 5 -0.700 -0.216 -0.916
#> 6 -1.04 -1.03 -2.07
#> 7 -0.525 1.60 1.07
#> 8 0.354 0.828 1.18
#> 9 0.126 0.110 0.236
#> 10 -0.0954 -0.603 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.616 0.767 0.0462 0.196
#> 2 -0.370 -0.538 -0.186 -1.09
#> 3 0.337 1.11 -0.700 0.751
#> 4 -0.993 -0.531 -0.984 -2.51
#> 5 0.0538 1.50 -0.0808 1.47
#> 6 -0.907 -1.54 -0.734 -3.18
#> 7 -1.65 -0.242 1.43 -0.455
#> 8 -0.166 0.447 -0.281 -0.000524
#> 9 0.0637 -0.0185 0.754 0.800
#> 10 1.81 -1.09 -2.15 -1.42
Created on 2020-09-10 by the reprex package (v0.3.0)
I'm simulating data with a fluctuating number of variables. As part of the situation, I am needing to calculate a model matrix with all possible combinations. See the following reprex for an example. I am able to get all two-interactions by specifying the formula as ~ .*.. However, this particular dataset has 3 variables (ndim <- 3). I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables that I need to calculate, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
ndim <- 3
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use use as.formula with paste in model_matrix:
model_matrix(data, as.formula(paste0("~ .^", ndim)))