map_dfr outputting a row rather than a column - r

This is similar to "purrr::map_dfr binds by columns, not rows as expected", but the solutions there aren't working for me. I have a dataframe like
beta_df <- structure(list(intercept = c(-2.75747056032685, -2.90831892599742,
-2.92478082251453, -2.99701559041538, -2.88885796048347, -3.09564193631675
), B1 = c(0.0898235360814854, 0.0291839369781567, 0.0881023522236231,
0.231703026085554, 0.0441573699433149, 0.258219673780526), B2 = c(-0.222367437619057,
0.770536384299238, 0.199648657850609, 0.0529038155448773, 0.00310458335580774,
0.132604387458483), B3 = c(1.26339268033385, 1.29883641278223,
0.949504940387809, 1.26904511447941, 0.863882674439083, 0.823907268679309
), B4 = c(2.13662994525526, 1.02340744740827, 0.959079691725652,
1.60672779812489, 1.19095838867883, -0.0693120654049908)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#> # A tibble: 6 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.76 0.0898 -0.222 1.26 2.14
#> 2 -2.91 0.0292 0.771 1.30 1.02
#> 3 -2.92 0.0881 0.200 0.950 0.959
#> 4 -3.00 0.232 0.0529 1.27 1.61
#> 5 -2.89 0.0442 0.00310 0.864 1.19
#> 6 -3.10 0.258 0.133 0.824 -0.0693
I'd like to turn this into a tibble with columns for the mean and the 0.025 and 0.975 quantiles. For the quantile function this works:
beta_df %>%
map_dfr(quantile,0.025)
#> # A tibble: 5 × 1
#> `2.5%`
#> <dbl>
#> 1 -3.08
#> 2 0.0311
#> 3 -0.194
#> 4 0.829
#> 5 0.0592
And this gets me both quantiles
bind_cols(beta_df %>%
map_dfr(quantile, 0.025),
beta_df %>%
map_dfr(quantile, 0.975))
#> # A tibble: 5 × 2
#> `2.5%` `97.5%`
#> <dbl> <dbl>
#> 1 -3.08 -2.77
#> 2 0.0311 0.255
#> 3 -0.194 0.699
#> 4 0.829 1.30
#> 5 0.0592 2.07
But for mean,
beta_df %>%
map_dfr(mean)
#> # A tibble: 1 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.93 0.124 0.156 1.08 1.14
This gives me a single wide row rather than a column. How can I turn the mean of each column of the original dataframe into a row of a single-column dataframe labelled mean?

The reason is that the output of quantile() is a named vector, whereas mean() returns a single unnamed value.
Let's create a custom function around mean() that outputs a named vector:
myMean <- function(x) {setNames(mean(x), nm = 'theMean')}
Applying that using map_dfr we get,
library(dplyr)
beta_df %>%
purrr::map_dfr(myMean)
# A tibble: 5 x 1
theMean
<dbl>
1 -2.93
2 0.124
3 0.156
4 1.08
5 1.14
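Building on the same idea, a sketch of one way to get the mean and both quantiles together in a single tibble (the helper name summarise_col is just illustrative) is to have the function return a named vector of all three summaries:
summarise_col <- function(x) c(mean = mean(x), quantile(x, c(0.025, 0.975)))
beta_df %>%
  purrr::map_dfr(summarise_col)
# one row per original column, with columns `mean`, `2.5%` and `97.5%`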

Related

Using dplyr to calculate geomean in a row wise fashion

I'd like to calculate the geometric mean using each row across three columns. I found solutions to calculate it from the values in one column (example), but not from a row.
Here's a simplified example:
data <- structure(list(fs_id = structure(1:8, levels = c("CON1", "NC",
"water", "SCR1", "FAN1_1", "CON2", "SCR2", "FAN1_2"), class = "factor"),
twodct_ATP5B = c(1.06960527260684, 0.00241424406360917, NA,
0.953100847649869, 0.404512354245938, 0.934924336678708,
1.32283164360403, 0.194667767059346), twodct_EIF4A2 = c(1.07741209897215,
NA, NA, 1.01873805854745, 0.467988708062081, 0.928149963188649,
1.31762036152893, 0.33377442013251), twodct_GAPDH = c(1.04388739915294,
0.000156497290441042, NA, 0.972431569982792, 0.547030142788418,
0.957957726869246, 0.942311505534324, 0.337842927620691)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
The table looks like this:
> data
# A tibble: 8 × 4
fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH
<fct> <dbl> <dbl> <dbl>
1 CON1 1.07 1.08 1.04
2 NC 0.00241 NA 0.000156
3 water NA NA NA
4 SCR1 0.953 1.02 0.972
5 FAN1_1 0.405 0.468 0.547
6 CON2 0.935 0.928 0.958
7 SCR2 1.32 1.32 0.942
8 FAN1_2 0.195 0.334 0.338
I want to get the row-wise geometric mean of the columns twodct_ATP5B, twodct_EIF4A2 and twodct_GAPDH.
I've had a crack at it like this, but it doesn't seem to work:
data %>%
rowwise() %>%
dplyr::mutate(geomean = exp(mean(log(select(., c("twodct_ATP5B", "twodct_EIF4A2", "twodct_GAPDH")))))) %>%
ungroup()
This is a good time to use c_across() within rowwise():
data %>%
rowwise() %>%
dplyr::mutate(geomean = exp(mean(log(c_across(c(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH)))))) %>%
ungroup()
# # A tibble: 8 × 5
# fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH geomean
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 CON1 1.07 1.08 1.04 1.06
# 2 NC 0.00241 NA 0.000156 NA
# 3 water NA NA NA NA
# 4 SCR1 0.953 1.02 0.972 0.981
# 5 FAN1_1 0.405 0.468 0.547 0.470
# 6 CON2 0.935 0.928 0.958 0.940
# 7 SCR2 1.32 1.32 0.942 1.18
# 8 FAN1_2 0.195 0.334 0.338 0.280
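If you would rather compute the geometric mean over the non-missing values only (so the NC row is not NA), a sketch of the same rowwise call with na.rm = TRUE inside mean() is:
data %>%
  rowwise() %>%
  dplyr::mutate(geomean = exp(mean(log(c_across(c(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH))), na.rm = TRUE))) %>%
  ungroup()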
You can vectorize the computation with rowMeans(). Optionally, set na.rm = TRUE in rowMeans() to omit missing values.
data %>%
mutate(geomean = exp(rowMeans(log(pick(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH)))))
# # A tibble: 8 × 5
# fs_id twodct_ATP5B twodct_EIF4A2 twodct_GAPDH geomean
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 CON1 1.07 1.08 1.04 1.06
# 2 NC 0.00241 NA 0.000156 NA
# 3 water NA NA NA NA
# 4 SCR1 0.953 1.02 0.972 0.981
# 5 FAN1_1 0.405 0.468 0.547 0.470
# 6 CON2 0.935 0.928 0.958 0.940
# 7 SCR2 1.32 1.32 0.942 1.18
# 8 FAN1_2 0.195 0.334 0.338 0.280
Note: pick() is a new function as of dplyr v1.1.0. If you have not updated, replace it with across() or select(). pick(a, b, c) is equivalent to
across(c(a, b, c)) (the c() is necessary)
select(., a, b, c) (the dot is necessary)
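For example, on dplyr versions before 1.1.0 a sketch of the same vectorized call with across() in place of pick() would be:
data %>%
  mutate(geomean = exp(rowMeans(log(across(c(twodct_ATP5B, twodct_EIF4A2, twodct_GAPDH))))))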

How do I combine many tibbles by a simple code?

I have pop_1910, ..., pop_2000. Each tibble has the following structure. I want to combine these tibbles into one tibble. I know I can use bind_rows for that: pop_1910 %>% bind_rows(pop_1920) %>% bind_rows(pop_1930). But it is a little tedious. Are there more efficient ways to combine many data frames?
> pop_2000
# A tibble: 3,143 x 3
fips year pop
<chr> <dbl> <dbl>
1 01001 2000 33364
2 01003 2000 112162
3 01005 2000 23042
4 01007 2000 15432
5 01009 2000 40165
6 01011 2000 9142
7 01013 2000 16798
8 01015 2000 90175
9 01017 2000 29086
10 01019 2000 19470
If you have them inside a list, you can use reduce() to bind all in one move.
library(tidyverse)
my_df_list <- map(1:4, ~tibble(x = rnorm(5), y = rnorm(5)))
my_df_list
#> [[1]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 1.99 1.19
#> 2 0.273 0.208
#> 3 1.12 1.18
#> 4 0.00855 -0.593
#> 5 0.502 -0.926
#>
#> [[2]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 0.570 -0.709
#> 2 0.599 -0.408
#> 3 -0.687 1.38
#> 4 0.375 1.53
#> 5 0.0394 1.90
#>
#> [[3]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 -0.576 1.64
#> 2 0.147 -0.0384
#> 3 0.904 0.164
#> 4 -1.16 -1.02
#> 5 -0.678 1.32
#>
#> [[4]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 -0.849 -0.445
#> 2 -0.786 -0.991
#> 3 1.17 -1.00
#> 4 0.222 1.65
#> 5 -0.656 -0.808
reduce(my_df_list, bind_rows)
#> # A tibble: 20 x 2
#> x y
#> <dbl> <dbl>
#> 1 1.99 1.19
#> 2 0.273 0.208
#> 3 1.12 1.18
#> 4 0.00855 -0.593
#> 5 0.502 -0.926
#> 6 0.570 -0.709
#> 7 0.599 -0.408
#> 8 -0.687 1.38
#> 9 0.375 1.53
#> 10 0.0394 1.90
#> 11 -0.576 1.64
#> 12 0.147 -0.0384
#> 13 0.904 0.164
#> 14 -1.16 -1.02
#> 15 -0.678 1.32
#> 16 -0.849 -0.445
#> 17 -0.786 -0.991
#> 18 1.17 -1.00
#> 19 0.222 1.65
#> 20 -0.656 -0.808
Created on 2021-06-07 by the reprex package (v2.0.0)
You can also simply use map_dfr:
purrr::map_dfr(my_df_list, ~ .x)
This will give you a single data frame bound by rows.
Or in base R:
do.call(rbind, my_df_list)
Even easier is piping your list to dplyr::bind_rows(), e.g.
library(dplyr)
my_df_list %>% bind_rows()
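If the tibbles only exist as separate objects in your environment rather than in a list, a sketch of collecting and binding them (assuming the decennial pop_1910, pop_1920, ..., pop_2000 naming pattern) is:
library(dplyr)
pop_all <- mget(paste0("pop_", seq(1910, 2000, by = 10))) %>%
  bind_rows()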

Running Many R Regressions

I have data such as:
data = data.frame(Y1 = runif(10000),
                  E1 = runif(10000),
                  E2 = runif(10000),
                  E3 = runif(10000),
                  AQ = runif(10000),
                  WE = runif(10000),
                  SZ = runif(10000),
                  PO = runif(10000),
                  LL = runif(10000),
                  SCHOOL = sample(1:2, size = 10000, replace = TRUE),
                  CLASS = sample(1:4, size = 10000, replace = TRUE))
My goal is to fit these simple regression models:
Y1 = Ei + AQ
Y1 = Ei + AQ + WE + SZ
Y1 = Ei + AQ + WE + SZ + PO + LL
for every combination of SCHOOL and CLASS; this gives 3 * 2 * 4 = 24 models.
From all the models I just wish to save the coefficients, p-values and confidence intervals for all the independent variables.
You can pass regression models to lm() as character strings, so you just need to loop through your variable names and update the model for each run:
results = list()
models = list()
model = "Y1 ~ "
for (i in 2:length(names(data))) {
  model = paste(model, '+', names(data)[i])
  models[i] = model
  results[i] = list(lm(model, data = data))
}
This code extends the regression equation at each iteration and stores the model equation and the regression results in lists.
EDIT: Sorry, I misread the initial post and proposed regression models that were not what was asked for.
The code below computes the three different regression models for all E-variables and for all classes and schools individually:
counter = 1
models = c()
results = list()
for (s in unique(data$SCHOOL)) {
  for (c in unique(data$CLASS)) {
    subset = data[data$SCHOOL == s & data$CLASS == c, ]
    for (ei in 1:3) {
      for (dv in c("AQ", "AQ + WE + SZ", "AQ + WE + SZ + PO + LL")) {
        model = paste("Y1 ~ E", ei, " + ", dv, sep = "")
        models[counter] = paste(model, "for SCHOOL=", s, " and CLASS=", c)
        r = lm(model, data = subset)
        results[counter] = list(r)
        counter = counter + 1
      }
    }
  }
}
The third model in this loop would be:
> models[[3]]
[1] "Y1 ~ E1 + AQ + WE + SZ + PO + LL for SCHOOL= 2 and CLASS= 3"
> summary(results[[3]])
Call:
lm(formula = model, data = subset)
Residuals:
Min 1Q Median 3Q Max
-0.53038 -0.24119 -0.00201 0.24940 0.54257
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.445346 0.024633 18.079 < 2e-16 ***
E1 0.035352 0.019676 1.797 0.072504 .
AQ 0.017344 0.019958 0.869 0.384919
WE -0.002737 0.019820 -0.138 0.890174
SZ 0.067423 0.020079 3.358 0.000797 ***
PO -0.029509 0.019897 -1.483 0.138188
LL 0.018326 0.019483 0.941 0.346988
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2863 on 2493 degrees of freedom
Multiple R-squared: 0.007309, Adjusted R-squared: 0.00492
F-statistic: 3.059 on 6 and 2493 DF, p-value: 0.005518
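Since the goal is only the coefficients, p-values and confidence intervals, a sketch of pulling those out of the results list built by the loop above (coef_tables is just an illustrative name) is:
coef_tables <- lapply(results, function(r)
  cbind(summary(r)$coefficients[, c("Estimate", "Pr(>|t|)")], confint(r)))
names(coef_tables) <- models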
Here is one option with the tidyverse:
library(dplyr) # >= 1.0.0
library(tidyr)
library(purrr)
library(broom)
lst1 <- list("AQ", c("AQ", "WE", "SZ"),
c("AQ", "WE", "SZ", "PO", "LL"))
nm1 <- grep("^E\\d+$", names(data), value = TRUE)
fmlst <- do.call(c, lapply(lst1, function(vec)
lapply(nm1, function(nm) reformulate(c(nm, vec), response = 'Y1'))))
data %>%
nest_by(SCHOOL, CLASS) %>%
summarise(lmmodels = map(fmlst, ~ lm(.x, data = data)),
tidyout = map(lmmodels, tidy))
# A tibble: 36 x 4
# Groups: SCHOOL, CLASS [4]
# SCHOOL CLASS lmmodels tidyout
# <int> <int> <list> <list>
# 1 1 2 <lm> <tibble [3 × 5]>
# 2 1 2 <lm> <tibble [3 × 5]>
# 3 1 2 <lm> <tibble [3 × 5]>
# 4 1 2 <lm> <tibble [5 × 5]>
# 5 1 2 <lm> <tibble [5 × 5]>
# 6 1 2 <lm> <tibble [5 × 5]>
# 7 1 2 <lm> <tibble [7 × 5]>
# 8 1 2 <lm> <tibble [7 × 5]>
# 9 1 2 <lm> <tibble [7 × 5]>
#10 1 4 <lm> <tibble [3 × 5]>
# … with 26 more rows
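If the result above is assigned to an object, say fit_tbl (a hypothetical name), a sketch of flattening the per-model coefficient tables into one data frame is:
fit_tbl %>%
  ungroup() %>%
  select(-lmmodels) %>%
  tidyr::unnest(tidyout)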
I would solve it like this:
library(tidyverse)
data <- data.frame(
  Y1 = runif(10000),
  E1 = runif(10000),
  E2 = runif(10000),
  E3 = runif(10000),
  AQ = runif(10000),
  WE = runif(10000),
  SZ = runif(10000),
  PO = runif(10000),
  LL = runif(10000),
  SCHOOL = sample(1:2, size = 10000, replace = TRUE),
  CLASS = sample(1:4, size = 10000, replace = TRUE)
)
xx <- data %>%
  nest_by(SCHOOL, CLASS) %>%
  mutate(
    model1 = list(lm(Y1 ~ E1 + AQ, data = data)),
    model2 = list(lm(Y1 ~ E1 + AQ + WE + SZ, data = data)),
    model3 = list(lm(Y1 ~ E1 + AQ + WE + SZ + PO + LL, data = data))
  ) %>%
  mutate(across(contains("model"), .fns = ~ list(broom::tidy(.x))))
xx$model1
#> [[1]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.511 0.0215 23.8 5.04e-103
#> 2 E1 -0.0188 0.0277 -0.680 4.97e- 1
#> 3 AQ -0.0152 0.0279 -0.546 5.85e- 1
#>
#> [[2]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.496 0.0209 23.7 2.05e-103
#> 2 E1 0.0239 0.0273 0.875 3.81e- 1
#> 3 AQ -0.0162 0.0274 -0.591 5.54e- 1
#>
#> [[3]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.489 0.0217 22.6 9.05e-95
#> 2 E1 -0.00647 0.0288 -0.225 8.22e- 1
#> 3 AQ 0.0315 0.0290 1.09 2.76e- 1
#>
#> [[4]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.521 0.0221 23.6 1.21e-102
#> 2 E1 -0.0264 0.0278 -0.949 3.43e- 1
#> 3 AQ -0.00902 0.0287 -0.315 7.53e- 1
#>
#> [[5]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.521 0.0213 24.5 1.49e-108
#> 2 E1 0.00735 0.0282 0.260 7.95e- 1
#> 3 AQ -0.0257 0.0280 -0.917 3.59e- 1
#>
#> [[6]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.490 0.0218 22.5 3.92e-94
#> 2 E1 0.0359 0.0285 1.26 2.09e- 1
#> 3 AQ -0.00878 0.0288 -0.305 7.61e- 1
#>
#> [[7]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.529 0.0217 24.4 3.04e-107
#> 2 E1 -0.0498 0.0287 -1.74 8.28e- 2
#> 3 AQ -0.0341 0.0289 -1.18 2.38e- 1
#>
#> [[8]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.532 0.0229 23.3 5.57e-99
#> 2 E1 -0.0760 0.0296 -2.56 1.04e- 2
#> 3 AQ -0.00109 0.0286 -0.0382 9.70e- 1
xx$model2
#> [[1]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.507 0.0293 17.3 5.37e-60
#> 2 E1 -0.0183 0.0277 -0.660 5.09e- 1
#> 3 AQ -0.0143 0.0279 -0.514 6.07e- 1
#> 4 WE -0.0288 0.0283 -1.02 3.09e- 1
#> 5 SZ 0.0378 0.0287 1.32 1.89e- 1
#>
#> [[2]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.504 0.0288 17.5 8.12e-62
#> 2 E1 0.0238 0.0273 0.871 3.84e- 1
#> 3 AQ -0.0164 0.0274 -0.599 5.49e- 1
#> 4 WE -0.0135 0.0275 -0.489 6.25e- 1
#> 5 SZ -0.00165 0.0280 -0.0589 9.53e- 1
#>
#> [[3]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.486 0.0308 15.8 3.03e-51
#> 2 E1 -0.00598 0.0288 -0.208 8.36e- 1
#> 3 AQ 0.0316 0.0290 1.09 2.77e- 1
#> 4 WE 0.0135 0.0287 0.469 6.39e- 1
#> 5 SZ -0.00838 0.0286 -0.293 7.69e- 1
#>
#> [[4]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.516 0.0299 17.3 4.36e-60
#> 2 E1 -0.0255 0.0279 -0.916 3.60e- 1
#> 3 AQ -0.00908 0.0287 -0.317 7.52e- 1
#> 4 WE -0.00593 0.0278 -0.213 8.31e- 1
#> 5 SZ 0.0157 0.0280 0.559 5.76e- 1
#>
#> [[5]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.512 0.0293 17.5 1.93e-61
#> 2 E1 0.00714 0.0282 0.253 8.00e- 1
#> 3 AQ -0.0261 0.0280 -0.933 3.51e- 1
#> 4 WE 0.0464 0.0282 1.64 1.01e- 1
#> 5 SZ -0.0283 0.0283 -1.00 3.16e- 1
#>
#> [[6]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.500 0.0299 16.7 1.73e-56
#> 2 E1 0.0356 0.0286 1.25 2.13e- 1
#> 3 AQ -0.00939 0.0289 -0.325 7.45e- 1
#> 4 WE -0.0184 0.0290 -0.633 5.27e- 1
#> 5 SZ -0.000915 0.0291 -0.0314 9.75e- 1
#>
#> [[7]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.494 0.0307 16.1 6.12e-53
#> 2 E1 -0.0497 0.0287 -1.73 8.35e- 2
#> 3 AQ -0.0297 0.0290 -1.02 3.06e- 1
#> 4 WE 0.0210 0.0295 0.714 4.76e- 1
#> 5 SZ 0.0452 0.0287 1.58 1.15e- 1
#>
#> [[8]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.516 0.0306 16.9 2.41e-57
#> 2 E1 -0.0760 0.0296 -2.57 1.04e- 2
#> 3 AQ -0.00122 0.0286 -0.0425 9.66e- 1
#> 4 WE 0.0396 0.0289 1.37 1.70e- 1
#> 5 SZ -0.00644 0.0292 -0.220 8.26e- 1
xx$model3
#> [[1]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.515 0.0355 14.5 3.63e-44
#> 2 E1 -0.0184 0.0278 -0.662 5.08e- 1
#> 3 AQ -0.0143 0.0279 -0.513 6.08e- 1
#> 4 WE -0.0286 0.0283 -1.01 3.12e- 1
#> 5 SZ 0.0374 0.0288 1.30 1.94e- 1
#> 6 PO -0.00520 0.0282 -0.185 8.53e- 1
#> 7 LL -0.0117 0.0280 -0.419 6.75e- 1
#>
#> [[2]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.493 0.0348 14.1 2.24e-42
#> 2 E1 0.0244 0.0273 0.893 3.72e- 1
#> 3 AQ -0.0174 0.0274 -0.635 5.26e- 1
#> 4 WE -0.0124 0.0275 -0.452 6.52e- 1
#> 5 SZ -0.00223 0.0280 -0.0795 9.37e- 1
#> 6 PO 0.0317 0.0275 1.15 2.50e- 1
#> 7 LL -0.0103 0.0272 -0.380 7.04e- 1
#>
#> [[3]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.469 0.0366 12.8 2.04e-35
#> 2 E1 -0.00503 0.0288 -0.175 8.61e- 1
#> 3 AQ 0.0302 0.0290 1.04 2.98e- 1
#> 4 WE 0.0132 0.0287 0.460 6.46e- 1
#> 5 SZ -0.00696 0.0286 -0.244 8.08e- 1
#> 6 PO -0.0194 0.0281 -0.691 4.89e- 1
#> 7 LL 0.0547 0.0281 1.95 5.18e- 2
#>
#> [[4]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.525 0.0372 14.1 4.15e-42
#> 2 E1 -0.0258 0.0279 -0.923 3.56e- 1
#> 3 AQ -0.0101 0.0288 -0.353 7.24e- 1
#> 4 WE -0.00707 0.0278 -0.254 7.99e- 1
#> 5 SZ 0.0155 0.0280 0.552 5.81e- 1
#> 6 PO 0.0171 0.0279 0.613 5.40e- 1
#> 7 LL -0.0327 0.0282 -1.16 2.47e- 1
#>
#> [[5]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.501 0.0366 13.7 5.28e-40
#> 2 E1 0.00872 0.0282 0.309 7.57e- 1
#> 3 AQ -0.0249 0.0280 -0.888 3.75e- 1
#> 4 WE 0.0455 0.0283 1.61 1.08e- 1
#> 5 SZ -0.0258 0.0283 -0.913 3.61e- 1
#> 6 PO -0.0287 0.0281 -1.02 3.07e- 1
#> 7 LL 0.0456 0.0284 1.60 1.09e- 1
#>
#> [[6]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.514 0.0363 14.2 2.57e-42
#> 2 E1 0.0361 0.0285 1.26 2.06e- 1
#> 3 AQ -0.00899 0.0288 -0.312 7.55e- 1
#> 4 WE -0.0167 0.0290 -0.576 5.65e- 1
#> 5 SZ 0.000892 0.0291 0.0307 9.76e- 1
#> 6 PO -0.0644 0.0288 -2.24 2.56e- 2
#> 7 LL 0.0325 0.0291 1.12 2.64e- 1
#>
#> [[7]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.484 0.0366 13.2 1.74e-37
#> 2 E1 -0.0508 0.0288 -1.76 7.85e- 2
#> 3 AQ -0.0302 0.0291 -1.04 3.00e- 1
#> 4 WE 0.0214 0.0295 0.725 4.68e- 1
#> 5 SZ 0.0459 0.0287 1.60 1.10e- 1
#> 6 PO 0.0203 0.0287 0.708 4.79e- 1
#> 7 LL 0.000378 0.0288 0.0131 9.90e- 1
#>
#> [[8]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.503 0.0376 13.4 3.27e-38
#> 2 E1 -0.0761 0.0297 -2.57 1.04e- 2
#> 3 AQ -0.00115 0.0286 -0.0400 9.68e- 1
#> 4 WE 0.0400 0.0289 1.38 1.67e- 1
#> 5 SZ -0.00594 0.0293 -0.203 8.39e- 1
#> 6 PO 0.0208 0.0288 0.721 4.71e- 1
#> 7 LL 0.00232 0.0295 0.0786 9.37e- 1
Created on 2020-08-13 by the reprex package (v0.3.0)
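Since the question also asks for confidence intervals, note that broom::tidy() on an lm object accepts conf.int, so in either tidyverse answer above you could swap in, for example,
broom::tidy(.x, conf.int = TRUE, conf.level = 0.95)
which adds conf.low and conf.high columns alongside the estimates and p-values.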

Calculate all possible interactions in model_matrix

I'm simulating data with a fluctuating number of variables. As part of the simulation, I need to calculate a model matrix with all possible combinations. See the following reprex for an example. I am able to get all two-way interactions by specifying the formula as ~ .*. . However, this particular dataset has 3 variables (ndim <- 3), and I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables that I need to handle, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way to define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
ndim <- 3
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use as.formula() with paste() in model_matrix():
model_matrix(data, as.formula(paste0("~ .^", ndim)))
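For example, with four simulated dimensions the same call generalizes without changes (a sketch reusing the rmvnorm/as_tibble setup from the question):
ndim <- 4
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
  as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, as.formula(paste0("~ .^", ndim)))
# now includes all two-, three- and four-way interaction columns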

dplyr unquoting does not work with filter function

Maybe I am missing something, but I can't seem to make dplyr's unquoting operator work with the filter function. It works with select, but not with filter...
Example
library(dplyr)
set.seed(1234)
A = matrix(rnorm(100), nrow = 10, ncol = 10)
colnames(A) <- paste("var", seq(1:10), sep = "")
varname_test <- "var2"
A <- as_tibble(A)
select(A, !!varname_test) #this works as expected
# this does NOT give me only the rows where var2
# is positive
(result1 <- filter(A, !!varname_test > 0))
# This is how the result 1 should look like
(result2 <- filter(A, var2 > 0))
# result1 is not equal to result2
I would appreciate any help!
I would suggest the following:
library(dplyr)
filter_at(A, vars(starts_with(varname_test)), any_vars(. > 0))
A tibble: 3 x 10
var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -2.35 0.0645 0.460 -0.501 -0.281 -1.01 -0.670 0.648 -0.174 1.00
2 0.429 0.959 -0.694 -1.63 -0.994 -0.162 -0.00760 2.07 0.850 -0.496
3 -0.890 2.42 -0.936 -0.466 -0.497 -1.16 0.336 -0.317 -1.19 2.12
Here, my solution (1g) uses filter_ and conditions built up with paste.
Of course, 1a is a perfectly fine solution (as was provided by joran and aosmith in the comments).
I thought this might be a good place to use curly curly but I couldn't get it to work (maybe not applicable?)
I also thought: what if we wanted to filter by multiple variables? This is where you see 2g working below (while 2a does not work anymore).
Other issues: filter_ is now deprecated, and I'm not sure what the correct syntax would be here. Will be asking this in a question.
library(tidyverse)
set.seed(1234)
A <- matrix(rnorm(30),nrow = 10, ncol = 3) %>% as_tibble() %>% set_names(paste("var", seq(1:3), sep = ""))
varnames_1 <- c("var2")
(expected_result_1 <- filter(A, var2 > 0))
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1a <- filter(A,!!ensym(varnames_1) > 0)) # works (thanks joran and aosmith)
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1b <- filter_(A, varnames_1 > 0)) # filter_ not doing what I thought it might
#> Warning: filter_() is deprecated.
#> Please use filter() instead
#>
#> The 'programming' vignette or the tidyeval book can help you
#> to program with filter() : https://tidyeval.tidyverse.org
#> This warning is displayed once per session.
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1c <- filter(A, {{varnames_1}} > 0)) # curly curly not doing what I thought it might
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1d <- filter(A, {{varnames_1 > 0}})) # curly curly not doing what I thought it might
#> `arg` must be a symbol
conditions_1 <- paste(varnames_1, "> 0")
(answer_1e <- filter(A, conditions_1)) # does not work
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1f <- filter(A, {{conditions_1}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1g <- filter_(A, conditions_1)) # works
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
# what if we wanted to filter multiple variables?
varnames_2 <- c("var2", "var3")
(expected_result_2 <- filter(A, var2 > 0 & var3 > 0))
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
(answer_2a <- filter(A,!!ensym(varnames_2) > 0)) # does not work
#> Only strings can be converted to symbols
conditions_2 <- paste(paste(varnames_2, "> 0"), collapse = " & ")
(answer_2f <- filter(A, {{conditions_2}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_2g <- filter_(A, conditions_2)) # works
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
Created on 2019-08-28 by the reprex package (v0.3.0)
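For reference, on recent dplyr versions the deprecated filter_()/filter_at() calls above can be written with the .data pronoun or if_all() instead, e.g. (a sketch):
library(dplyr)
filter(A, .data[[varnames_1]] > 0)                # one variable name held in a string
filter(A, if_all(all_of(varnames_2), ~ .x > 0))   # several variables, all must be > 0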
