renaming column names with dplyr using tidyselect functions - r

I am trying to rename a few columns using dplyr::rename and tidyselect helpers to do so using some patterns.
How can I get this to work?
library(tidyverse)
# tidy output from broom (using development version)
(df <- broom::tidy(stats::oneway.test(formula = wt ~ cyl, data = mtcars)))
#> # A tibble: 1 x 5
#> num.df den.df statistic p.value method
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming equ~
# renaming
df %>%
dplyr::rename(
.data = .,
parameter1 = dplyr::matches("^num"),
parameter2 = dplyr::matches("^denom")
)
#> Error: Column positions must be scalar
Created on 2020-01-12 by the reprex package (v0.3.0.9001)

Your code works fine for me; however, here are some shorter alternatives that you can try:
library(tidyverse)
# tidy output from broom (using development version)
(df <- broom::tidy(stats::oneway.test(formula = wt ~ cyl, data = mtcars)))
#> # A tibble: 1 x 5
#> num.df den.df statistic p.value method
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming equ~
# renaming
df %>%
rename(parameter1 = matches("^num"),
parameter2 = matches("^denom"))
# # A tibble: 1 x 5
# parameter1 parameter2 statistic p.value method
# <dbl> <dbl> <dbl> <dbl> <chr>
# 1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming..
df %>%
rename(parameter1 = contains("num"),
parameter2 = contains("denom"))
# # A tibble: 1 x 5
# parameter1 parameter2 statistic p.value method
# <dbl> <dbl> <dbl> <dbl> <chr>
# 1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming..
df %>%
rename(parameter1 = starts_with("num"),
parameter2 = starts_with("denom"))
# # A tibble: 1 x 5
# parameter1 parameter2 statistic p.value method
# <dbl> <dbl> <dbl> <dbl> <chr>
# 1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming..

We can also rename from a named vector
library(dplyr)
library(stringr)
df %>%
rename(!!!set_names(names(df)[1:2], str_c('parameter', 1:2)))
# A tibble: 1 x 5
# parameter1 parameter2 statistic p.value method
# <dbl> <dbl> <dbl> <dbl> <chr>
#1 2 19.0 20.2 0.0000196 One-way analysis of means (not assuming equal variances)

Related

Adjust p.value for each term from list of ANOVA results/tidy DF's

I have a dataset with 2 independent variables and thousands of dependent variables. I've performed multiple two-way ANOVA tests and now I have a list containing the result for each dependent variable. Let's say that the list looks like this (example data):
> l
$a
# A tibble: 2 x 6
term df sumsq meansq statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 63.2 31.6 119. 1.67e-31
2 Residuals 147 39.0 0.265 NA NA
$b
# A tibble: 2 x 6
term df sumsq meansq statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 437. 219. 1180. 2.86e-91
2 Residuals 147 27.2 0.185 NA NA
Now I would like to use the p.adjust method for each term. So what I want to do is to retrieve p.value for Species, ..., Residuals from all dataframes in this list, then use the p.adjust on a vector of p.values from specific term and add each adjusted p.value to respective dataframe (to new column in respective term). Is there any way to do this in a simple (tidyverse?) way? Key here is to use the p.adjust method.
I've managed to find an answer to this, although the method is not a "tidyverse" way. Let the data look like this:
> a = aov(Sepal.Length ~ Species, data = iris)
> b = aov(Petal.Length ~ Species, data = iris)
> l = list(a = broom::tidy(a), b = broom::tidy(b))
> n_terms = nrow(l[[1]])
> n_terms
[1] 2
> for(i in seq_along(l)){
+ l[[i]]$q.value = 0
+ }
> l
$a
# A tibble: 2 x 7
term df sumsq meansq statistic p.value q.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 63.2 31.6 119. 1.67e-31 0
2 Residuals 147 39.0 0.265 NA NA 0
$b
# A tibble: 2 x 7
term df sumsq meansq statistic p.value q.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 437. 219. 1180. 2.86e-91 0
2 Residuals 147 27.2 0.185 NA NA 0
Then we can create a for loop over each term. In the loop we retrieve each p.value for the given term using purrr::map_dbl. After that we adjust the vector of p-values with p.adjust using the desired method. The next step is to loop over every old q.value for the given term and set it to the previously calculated adjusted value.
> for(term in 1:n_terms){
+ p.vals = purrr::map_dbl(l, ~.x[term, ]$p.value)
+ adjusted = as.vector(p.adjust(p.vals, method = "BY"))
+ for(i in seq_along(adjusted)){
+ l[[i]]$q.value[term] = adjusted[i]
+ }
+ }
> l
$a
# A tibble: 2 x 7
term df sumsq meansq statistic p.value q.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 63.2 31.6 119. 1.67e-31 2.50e-31
2 Residuals 147 39.0 0.265 NA NA NA
$b
# A tibble: 2 x 7
term df sumsq meansq statistic p.value q.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Species 2 437. 219. 1180. 2.86e-91 8.57e-91
2 Residuals 147 27.2 0.185 NA NA NA

for loop in r -looping through vector

I am trying to loop through all the cols in my df and run a prop test on each of them.
library(gss)
To run on just one variable I can use--
infer::prop_test(gss,
college ~ sex,
order = c("female", "male"))
But now I want to run this for each variable in my df like this:
cols <- gss %>% select(-sex) %>% names(.)
for (i in cols){
# print(i)
prop_test(gss,
i~sex)
}
But this loop does not recognize the i;
Error: The response variable `i` cannot be found in this dataframe.
Any suggestions please??
We need to create the formula. Either use reformulate
library(gss)
library(infer)
out <- vector('list', length(cols))
names(out) <- cols
for(i in cols) {
out[[i]] <- prop_test(gss, reformulate("sex", response = i))
}
-output
> out
$college
# A tibble: 1 × 6
statistic chisq_df p_value alternative lower_ci upper_ci
<dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 0.0000204 1 0.996 two.sided -0.0917 0.101
$partyid
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 12.9 3 0.00484
$class
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 2.54 3 0.467
$finrela
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 9.11 5 0.105
or paste
for(i in cols) {
prop_test(gss, as.formula(paste0(i, " ~ sex")))
}
data
library(dplyr)
data(gss)
cols <- gss %>%
select(where(is.factor), -sex, -income) %>%
names(.)

T-test with column number instead of column name

I am trying to perform a series of T-tests using RStatix's t_test(), where the dependent variable is the same in every test and the grouping variable changes. I am doing these tests inside a loop, so I would like to select the grouping variable with the column number instead of the column name. I have tried to do this with colnames(dataframe)[[columnnumber]], but I get the following error: "Can't extract columns that don't exist". How can I select the grouping variable with the column number instead of the column name?
Below is a minimal reproducible example with a fictitious dataframe; the test works correctly when the grouping variable's name (gender) is indicated, but not when the column number is indicated instead.
library(tidyverse)
library(rstatix)
dat<-data.frame(gender=rep(c("Male", "Female"), 1000),
age=rep(c("Young","Young", "Old", "Old"),500),
tot= round(runif(2000, min=0, max=1),0))
dat %>% t_test(tot ~ gender,detailed=T) ##Works
dat %>% t_test(tot ~ colnames(dat)[[1]],detailed=T) ##Doesn't work
colnames(dat)[1] is a string. t_test requires a formula object, so you need to convert the string to a formula before passing it to t_test. This can be done using reformulate or as.formula.
library(rstatix)
dat %>% t_test(reformulate(colnames(dat)[1], 'tot'),detailed=T)
# A tibble: 1 x 15
# estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic
#* <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl>
#1 0.011 0.505 0.494 tot Female Male 1000 1000 0.492
# … with 6 more variables: p <dbl>, df <dbl>, conf.low <dbl>,
# conf.high <dbl>, method <chr>, alternative <chr>
If we want to construct the formula the tidyverse way, we can build it within an expr
library(rstatix)
dat %>%
t_test(formula = eval(rlang::expr(tot ~ !! rlang::sym(names(.)[1]))),
detailed = TRUE)
# A tibble: 1 x 15
# estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p df conf.low conf.high method alternative
#* <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#1 -0.02 0.497 0.517 tot Female Male 1000 1000 -0.894 0.371 1998. -0.0639 0.0239 T-test two.sided
NOTE: the values differ between the two outputs because the data was constructed without calling set.seed (the tot column is drawn with runif)

Why does broom::tidy occasionally return a wrong type of 'estimate' with speedglm?

It is documented that broom::tidy can tidy a speedglm object: https://broom.tidyverse.org/reference/tidy.speedglm.html. In the following example, calling broom::tidy on a speedglm object returns some columns as 'fct' rather than 'dbl'. I guess it happens when one or more values in the column are printed in scientific notation. I have tried changing options(digits = #, scipen = #) with no luck. Any suggestions would be appreciated.
library(broom)
library(speedglm)
#> Loading required package: Matrix
#> Loading required package: MASS
library(chest) # get example data (diab_df)
m1 <- glm(Diabetes ~ Sex + Married, family = binomial(), data = diab_df)
tidy(m1) # works fine
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -0.803 0.157 -5.13 0.000000287
#> 2 Sex 0.0958 0.0925 1.04 0.300
#> 3 Married -0.0000364 0.0908 -0.000401 1.000
m2 <- speedglm(Diabetes ~ Sex + Married, family = binomial(), data = diab_df)
tidy(m2) # returns <fct> for values with scientific notation: `estimate` and `p.value`
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <fct> <dbl> <dbl> <fct>
#> 1 (Intercept) -8.034e-01 0.157 -5.13 2.87e-07
#> 2 Sex " 9.578e-02" 0.0925 1.04 3.00e-01
#> 3 Married -3.640e-05 0.0908 -0.0004 1.00e+00
Created on 2019-12-18 by the reprex package (v0.3.0)

Run a aov test through a tibble in a tidy way

I want to run a linear regression on a data frame using the same dependent variable. A similar question was solved here. The problem is that aov function to implement ANOVA doesn't accept x and y as arguments (as far as I know). Is there a way to implement the analysis in a tidy way? So far I've tried something like:
library(tidyverse)
iris %>%
as_tibble() %>%
select(Sepal.Length, Species) %>%
mutate(foo_a = as_factor(sample(c("a", "b", "c"), nrow(.), replace = T)),
foo_b = as_factor(sample(c("d", "e", "f"), nrow(.), replace = T))) %>%
map(~aov(Sepal.Length ~ .x, data = .))
Created on 2019-02-12 by the reprex package (v0.2.1)
The desired output is three analyses: Sepal.Length and Species, Sepal.Length and foo_a, and finally Sepal.Length and foo_b. Is it possible, or am I totally wrong?
One approach is to make this into a long-shaped data frame, group by the independent variable of interest, and use the "many models" approach. I usually prefer something like this over trying to do tidyeval across multiple columns—it just gives me a clearer sense of what's going on.
To save space, I'm working with iris_foo, which is your data as you created it up through the 2 mutate lines. Putting it into a long format gives you a key of the names of those three columns that will be used as independent variables in each of the aov calls.
library(tidyverse)
iris_foo %>%
gather(key, value, -Sepal.Length)
#> # A tibble: 450 x 3
#> Sepal.Length key value
#> <dbl> <chr> <chr>
#> 1 5.1 Species setosa
#> 2 4.9 Species setosa
#> 3 4.7 Species setosa
#> 4 4.6 Species setosa
#> 5 5 Species setosa
#> 6 5.4 Species setosa
#> 7 4.6 Species setosa
#> 8 5 Species setosa
#> 9 4.4 Species setosa
#> 10 4.9 Species setosa
#> # … with 440 more rows
From there, nest by key and create a new list-column of ANOVA models. This will be a list of aov objects. For simplicity with getting your models back out, you can drop the data column.
aov_models <- iris_foo %>%
gather(key, value, -Sepal.Length) %>%
group_by(key) %>%
nest() %>%
mutate(model = map(data, ~aov(Sepal.Length ~ value, data = .))) %>%
select(-data)
aov_models
#> # A tibble: 3 x 2
#> key model
#> <chr> <list>
#> 1 Species <S3: aov>
#> 2 foo_a <S3: aov>
#> 3 foo_b <S3: aov>
From there, you can work with the models however you like. They're accessible in the list aov_models$model. Printed, they look how you'd expect. For example, the first model:
aov_models$model[[1]]
#> Call:
#> aov(formula = Sepal.Length ~ value, data = .)
#>
#> Terms:
#> value Residuals
#> Sum of Squares 63.21213 38.95620
#> Deg. of Freedom 2 147
#>
#> Residual standard error: 0.5147894
#> Estimated effects may be unbalanced
To see all the models, call aov_models$model %>% map(print). You might also want to use broom functions, such as broom::tidy or broom::glance, depending on how you need to present the models.
aov_models$model %>%
map(broom::tidy)
#> [[1]]
#> # A tibble: 2 x 6
#> term df sumsq meansq statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 value 2 63.2 31.6 119. 1.67e-31
#> 2 Residuals 147 39.0 0.265 NA NA
#>
#> [[2]]
#> # A tibble: 2 x 6
#> term df sumsq meansq statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 value 2 0.281 0.141 0.203 0.817
#> 2 Residuals 147 102. 0.693 NA NA
#>
#> [[3]]
#> # A tibble: 2 x 6
#> term df sumsq meansq statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 value 2 0.756 0.378 0.548 0.579
#> 2 Residuals 147 101. 0.690 NA NA
Or, to tidy all the models into a single data frame that keeps the key column, you could do:
aov_models %>%
mutate(model_tidy = map(model, broom::tidy)) %>%
unnest(model_tidy)

Resources