Adding column names to vars() inside a dplyr function - r

I have a function that can be used for summarizing a variable based on some user-defined groups, making use of dplyr:
library(tidyverse)
get_var_summary <- function(.data, .target_var, .group_vars = vars()) {
.target_var = enquo(.target_var)
return(
.data %>%
filter(!is.na(!! .target_var)) %>%
group_by_at(.vars = .group_vars) %>%
summarize(
mean = mean(!! .target_var),
sd = sd(!! .target_var),
ci = qnorm(0.975) * sd(!! .target_var) / sqrt(n()),
median = median(!! .target_var),
n = n()
) %>%
mutate(
sd = ifelse(is.na(sd), Inf, sd),
ci = ifelse(is.na(ci), Inf, ci)
) %>%
ungroup()
)
}
mtcars %>%
get_var_summary(wt, .group_vars = vars(cyl))
Returns:
# A tibble: 3 x 6
cyl mean sd ci median n
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 4. 2.29 0.570 0.337 2.20 11
2 6. 3.12 0.356 0.264 3.22 7
3 8. 4.00 0.759 0.398 3.76 14
Now, in order to be able to easily repeat the .group_vars, but occasionally supply another grouping var in addition, I would like to define another function that calls get_var_summary, but with one additional column added to .group_vars:
get_var_summary_by_another <- function(.data, .extra_var, .target_var, .group_vars = vars()) {
# how do I add .extra_var to .group_vars?
}
How can I do that?

The idea is to first splice the .group_vars with !!!, and add the .extra_var to a new vars() call:
get_var_summary_by_another <- function(.data, .extra_var, .target_var, .group_vars = vars()) {
.extra_var = enquo(.extra_var)
.target_var = enquo(.target_var)
.group_vars = vars(!!! .group_vars, !! .extra_var)
return(
.data %>% get_var_summary(
!! .target_var,
.group_vars
)
)
}
mtcars %>%
get_var_summary_by_another(gear, .target_var = wt, .group_vars = vars(cyl))
Returns:
# A tibble: 8 x 7
cyl gear mean sd ci median n
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 4. 3. 2.46 Inf Inf 2.46 1
2 4. 4. 2.38 0.601 0.416 2.26 8
3 4. 5. 1.83 0.443 0.614 1.83 2
4 6. 3. 3.34 0.173 0.240 3.34 2
5 6. 4. 3.09 0.413 0.405 3.16 4
6 6. 5. 2.77 Inf Inf 2.77 1
7 8. 3. 4.10 0.768 0.435 3.81 12
8 8. 5. 3.37 0.283 0.392 3.37 2

You only need to create one function to accomplish your goal of using an arbitrary number of grouping variables to summarize on. You can rewrite the original function using a combination of dplyr::group_by(), dplyr::across(), and curly curly embracing {{. This works with dplyr version 1.0.0 and greater.
I've edited the original example and code for clarity.
library(tidyverse)
var_summary <- function(.data, target, group = NULL) {
.data %>%
filter(!is.na({{ target }})) %>%
group_by(across({{ group }})) %>%
summarize(
"mean_{{target}}" := mean({{ target }}),
sd := sd({{ target }}),
ci := qnorm(0.975) * sd({{ target }}) / sqrt(n()),
"median_{{target}}" := median({{ target }}),
"n_{{target}}" := n()
) %>%
mutate(
sd := if_else(is.na(sd), Inf, sd),
ci := if_else(is.na(ci), Inf, ci)
) %>%
rename("sd_{{target}}" := sd, "ci_{{target}}" := ci)
}
var_summary(mtcars, target = wt)
#> # A tibble: 1 x 5
#> mean_wt sd_wt ci_wt median_wt n_wt
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 3.22 0.978 0.339 3.32 32
var_summary(mtcars, target = wt, group = cyl)
#> # A tibble: 3 x 6
#> cyl mean_wt sd_wt ci_wt median_wt n_wt
#> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 4 2.29 0.570 0.337 2.2 11
#> 2 6 3.12 0.356 0.264 3.22 7
#> 3 8 4.00 0.759 0.398 3.76 14
var_summary(mtcars, target = wt, group = c(cyl, gear))
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
#> # A tibble: 8 x 7
#> # Groups: cyl [3]
#> cyl gear mean_wt sd_wt ci_wt median_wt n_wt
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 4 3 2.46 Inf Inf 2.46 1
#> 2 4 4 2.38 0.601 0.416 2.26 8
#> 3 4 5 1.83 0.443 0.614 1.83 2
#> 4 6 3 3.34 0.173 0.240 3.34 2
#> 5 6 4 3.09 0.413 0.405 3.16 4
#> 6 6 5 2.77 Inf Inf 2.77 1
#> 7 8 3 4.10 0.768 0.435 3.81 12
#> 8 8 5 3.37 0.283 0.392 3.37 2
Created on 2021-09-06 by the reprex package (v2.0.0)

Related

Rolling window with slide_dbl() on grouped data

This is an extension to following question: Rolling window slider::slide() with grouped data
I want to mutate a column of my grouped tibble with slide_dbl(), i.e. applying slide_dbl() on all groups, but only within them, not across them.
When running the solution of linked question I receive following error message:
Error: Problem with `mutate()` input `rollreg`.
x Inapplicable method for 'mutate_' applied to object of class "c('double', 'numeric')".
My tibble has following structure:
tibble [450,343 x 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ company: num [1:450343] 1 1 1 1 1 ...
$ date: Date[1:450343], format: "2011-11-30" "2011-12-31" "2012-01-31" "2012-02-29" ...
$ result: num [1:450343] NA NA NA 12.5981 -2.9023 ...
- attr(*, "groups")= tibble [3,339 x 2] (S3: tbl_df/tbl/data.frame)
..$ company: num [1:3339] 1 2 3 4 5 ...
..$ .rows : list<int> [1:3339]
To complete, this is the code I ran according to the linked solution:
testtest <- data %>%
group_by(company) %>% nest() %>%
mutate(rollreg = map(data, ~ .x %>% mutate(result_2 = slide_dbl(.x = .$result, .f = ~prod(1+.)-1, .before = 11, .after = -1, complete=TRUE)))) %>%
select(-data) %>% unnest(rollreg)
Here, above mentioned error message occurs. I guess it's because of the data structure. Yet, I can't figure any solution (also not with similar functions like group_map() or group_modify()). Can anyone help? Thanks in advance!
An option is group_split by the grouping column (in the example, using 'case', loop over the list of datasets with map, create new column in mutate by applying the slide_dbl
library(dplyr)
library(tidyr)
library(purrr)
data %>%
group_split(case) %>%
map_dfr(~ .x %>%
mutate(out = slide_dbl(r1, .f = ~ prod(1 + .x) - 1,
.before = 5, .after = -1, complete = TRUE)))
-output
# A tibble: 30 x 6
# t case r1 r2 r3 out
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 a -0.294 -0.164 1.33 0
# 2 2 a 0.761 1.01 0.115 -0.294
# 3 3 a -0.781 -0.499 0.290 0.243
# 4 4 a -0.0732 -0.110 0.289 -0.728
# 5 5 a -0.528 0.707 0.181 -0.748
# 6 6 a -1.35 -0.411 -1.47 -0.881
# 7 7 a -0.397 -1.28 0.172 -1.06
# 8 8 a 1.68 0.956 -2.81 -1.02
# 9 9 a -0.0167 -0.0727 -1.08 -1.24
#10 10 a 1.25 -0.326 1.61 -1.26
## … with 20 more rows
Or if we need to use the nest_by, it creates an attribute rowwise, so, it is better to ungroup before applying
out1 <- data %>%
select(-t) %>%
nest_by(case) %>%
ungroup %>%
mutate(data = map(data, ~ .x %>%
mutate(out = slide_dbl(r1, .f = ~ prod(1 + .x) - 1,
.before = 5, .after = -1, complete = TRUE))))
-output
out1
# A tibble: 3 x 2
# case data
# <chr> <list>
#1 a <tibble [10 × 4]>
#2 b <tibble [10 × 4]>
#3 c <tibble [10 × 4]>
Now, we unnest the structure
out1 %>%
unnest(data)
# A tibble: 30 x 5
# case r1 r2 r3 out
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 a -0.294 -0.164 1.33 0
# 2 a 0.761 1.01 0.115 -0.294
# 3 a -0.781 -0.499 0.290 0.243
# 4 a -0.0732 -0.110 0.289 -0.728
# 5 a -0.528 0.707 0.181 -0.748
# 6 a -1.35 -0.411 -1.47 -0.881
# 7 a -0.397 -1.28 0.172 -1.06
# 8 a 1.68 0.956 -2.81 -1.02
# 9 a -0.0167 -0.0727 -1.08 -1.24
#10 a 1.25 -0.326 1.61 -1.26
# … with 20 more rows
data
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
i also got a question regarding the slide_Dbl function. I would like to check out other rollingregressions. My data is already fixed with an 8 weak week, but if i would like to look at for example 16 or 24 weeks, should i change the (before= ) from 8 to 16? The reason why i am asking is that i dont have the original dataset, but its already fixed with 8 weeks, so if i add the (before= ) with an additional 8 will it be 16?
new8 <- new%>%mutate( across(
where(is.double),
function(x) slide_dbl(x, mean, na.rm = TRUE, .before = 8L) %>% lag()))
Or should i put
new16 <- new%>%mutate(across(
where(is.double),
function(x) slide_dbl(x, mean, na.rm = TRUE, .before = 16L) %>% lag()))

summarize across -- is it order dependent?

I came across something weird with dplyr and across, or at least something I do not understand.
If we use the across function to compute the mean and standard error of the mean across multiple columns, I am tempted to use the following command:
mtcars %>% group_by(gear) %>% select(mpg,cyl) %>%
summarize(across(everything(), ~mean(.x, na.rm = TRUE), .names = "{col}"),
across(everything(), ~sd(.x, na.rm=T)/sqrt(sum(!is.na(.x))), .names="se_{col}")) %>% head()
Which results in
gear mpg cyl se_mpg se_cyl
<dbl> <dbl> <dbl> <dbl> <dbl>
1 3 16.1 7.47 NA NA
2 4 24.5 4.67 NA NA
3 5 21.4 6 NA NA
However, if I switch the order of the individual across commands, I get the following:
mtcars %>% group_by(gear) %>% select(mpg,cyl) %>%
summarize(across(everything(), ~sd(.x, na.rm=T)/sqrt(sum(!is.na(.x))), .names="se_{col}"),
across(everything(), ~mean(.x, na.rm = TRUE), .names = "{col}")) %>% head()
# A tibble: 3 x 5
gear se_mpg se_cyl mpg cyl
<dbl> <dbl> <dbl> <dbl> <dbl>
1 3 0.871 0.307 16.1 7.47
2 4 1.52 0.284 24.5 4.67
3 5 2.98 0.894 21.4 6
Why is this the case? Does it have something to do with my usage of everything()? In my situation I'd like the mean and the standard error of the mean calculated across every variable in my dataset.
I have no idea why summarize behaves like that, it's probably due to an underlying interaction of the two across functions (although it seems weird to me). Anyway, I suggest you to write a single across statement and use a list of lambda functions as suggested by the across documentation.
In this way it doesn't matter if the mean or the standard deviation is specified as first function, you will get no NAs.
mtcars %>%
group_by(gear) %>%
select(mpg, cyl) %>%
summarize(across(everything(), list(
mean = ~mean(.x, na.rm = TRUE),
se = ~sd(.x, na.rm = TRUE)/sqrt(sum(!is.na(.x)))
), .names = "{fn}_{col}"))
# A tibble: 3 x 5
# gear mean_mpg se_mpg mean_cyl se_cyl
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 3 16.1 0.871 7.47 0.307
# 2 4 24.5 1.52 4.67 0.284
# 3 5 21.4 2.98 6 0.894
mtcars %>%
group_by(gear) %>%
select(mpg, cyl) %>%
summarize(across(everything(), list(
se = ~sd(.x, na.rm = TRUE)/sqrt(sum(!is.na(.x))),
mean = ~mean(.x, na.rm = TRUE)
), .names = "{fn}_{col}"))
# A tibble: 3 x 5
# gear se_mpg mean_mpg se_cyl mean_cyl
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 3 0.871 16.1 0.307 7.47
# 2 4 1.52 24.5 0.284 4.67
# 3 5 2.98 21.4 0.894 6

r studio: simulate my code 1000 times and pick the things which p value<0.05

Here is my original code:
x = rbinom(1000,1,0.5)
z = log(1.3)*x
pr = 1/(1+exp(-z))
y = rbinom(1000,1,pr)
k=glm(y~x,family="binomial")$coef
t=exp(k)
How can I simulate it 1000 times and pick the one with a p-value<0.05?
This is a perfect application for the tidyverse and it's list columns. Please see explanation in the inline comments.
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum)) %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)",
p.value < 0.05)
sim
#> # A tibble: 545 x 6
#> id term estimate std.error statistic p.value
#> <int> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 .x 0.301 0.127 2.37 0.0176
#> 2 7 .x 0.263 0.127 2.06 0.0392
#> 3 8 .x 0.293 0.127 2.31 0.0211
#> 4 11 .x 0.377 0.128 2.96 0.00312
#> 5 12 .x 0.265 0.127 2.08 0.0373
#> 6 13 .x 0.366 0.127 2.88 0.00403
#> 7 14 .x 0.461 0.128 3.61 0.000305
#> 8 17 .x 0.274 0.127 2.16 0.0309
#> 9 18 .x 0.394 0.127 3.09 0.00200
#> 10 19 .x 0.371 0.127 2.92 0.00354
#> # … with 535 more rows
Created on 2020-05-18 by the reprex package (v0.3.0)
I won't do this for you, but these are the steps you'll probably want to go through:
Write your code as a function that returns the value you're interested in (presumably t)
Use something like replicate to run this function many times and record all the answers
Use something like quantile to extract the percentile you're interested in

R How to use curly curly with filter or filter_?

I was answering this question where commenters suggested !!ensym, and I thought this might be a good place to use curly curly {{ but I couldn't get it to work (maybe not applicable?).
How might I do this filter operation, without using filter_, eval/parse, or quote-unquote? Would ~ help?
My solution (1g) uses filter_ and conditions built up with paste.
1a works (but could it use {{ }} somehow?)
And what if we wanted to filter by multiple variables? This is where you see 2g working below (while 2a does not work anymore).
library(tidyverse)
set.seed(1234)
A <- matrix(rnorm(30),nrow = 10, ncol = 3) %>% as_tibble() %>% set_names(paste("var", seq(1:3), sep = ""))
varnames_1 <- c("var2")
(expected_result_1 <- filter(A, var2 > 0))
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1a <- filter(A,!!ensym(varnames_1) > 0)) # works (thanks joran and aosmith)
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1b <- filter_(A, varnames_1 > 0)) # filter_ not doing what I thought it might
#> Warning: filter_() is deprecated.
#> Please use filter() instead
#>
#> The 'programming' vignette or the tidyeval book can help you
#> to program with filter() : https://tidyeval.tidyverse.org
#> This warning is displayed once per session.
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1c <- filter(A, {{varnames_1}} > 0)) # curly curly not doing what I thought it might
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1d <- filter(A, {{varnames_1 > 0}})) # curly curly not doing what I thought it might
#> `arg` must be a symbol
conditions_1 <- paste(varnames_1, "> 0")
(answer_1e <- filter(A, conditions_1)) # does not work
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1f <- filter(A, {{conditions_1}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1g <- filter_(A, conditions_1)) # works
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
# what if we wanted to filter multiple variables?
varnames_2 <- c("var2", "var3")
(expected_result_2 <- filter(A, var2 > 0 & var3 > 0))
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
(answer_2a <- filter(A,!!ensym(varnames_2) > 0)) # does not work
#> Only strings can be converted to symbols
conditions_2 <- paste(paste(varnames_2, "> 0"), collapse = " & ")
(answer_2f <- filter(A, {{conditions_2}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_2g <- filter_(A, conditions_2)) # works
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
Created on 2019-08-28 by the reprex package (v0.3.0)
{{ only works within functions, with function arguments. Same for ensym() and other operators starting with en by the way.
If you're not in a function and you have variable names as strings, you need !!sym(). The sym() part transforms the variable name to a code object (a symbol), and the !! part insert it in place.
As Lionel points out, curly-curly works inside functions. To use it with filter, you thus have to wrap the call inside a function.
f <- function(.df, v) {
filter(.df, {{ v }} > 0)
}
# Curly-curly provides automatic NSE support
f( A, var2 )
# # A tibble: 3 x 3
# var1 var2 var3
# <dbl> <dbl> <dbl>
# 1 -2.35 0.0645 0.460
# 2 0.429 0.959 -0.694
# 3 -0.890 2.42 -0.936
# Strings have to be first converted to symbols
f( A, !!sym("var3") )
# # A tibble: 3 x 3
# var1 var2 var3
# <dbl> <dbl> <dbl>
# 1 -1.21 -0.477 0.134
# 2 -2.35 0.0645 0.460
# 3 -0.575 -0.511 0.575
Curly-curly is meant to reference a single argument. You can extend it to work with multiple variables through sequential application with the help of purrr::reduce. (Don't forget to convert your strings into actual variable names first!):
syms(varnames_2) %>% reduce(f, .init=A)
# # A tibble: 1 x 3
# var1 var2 var3
# <dbl> <dbl> <dbl>
# 1 -2.35 0.0645 0.460
If the paste(paste(varnames_2, "> 0"), collapse = " & ") is the main question. You have to build the filter arguments.
library(tidyverse)
library(rlang)
set.seed(1234)
A <- matrix(rnorm(30),nrow = 10, ncol = 3) %>% as_tibble() %>% set_names(paste("var", seq(1:3), sep = ""))
# with variables as arguments
filter_gt0 <- function(d, ...) {
conds <- ensyms(...)
conds <- map(conds, ~quo(!!.x > 0))
d %>%
filter(!!!conds)
}
A %>%
filter_gt0(var2, var3)
# # A tibble: 1 x 3
# var1 var2 var3
# <dbl> <dbl> <dbl>
# 1 -2.35 0.0645 0.460
# or with variables as input
conds <- quos(var2, var3)
filter_gt0_2 <- function(d, conds) {
conds <- map(conds, ~quo(!!.x > 0))
d %>%
filter(!!!conds)
}
A %>%
filter_gt0_2(conds)
# # A tibble: 1 x 3
# var1 var2 var3
# <dbl> <dbl> <dbl>
# 1 -2.35 0.0645 0.460

Get median of group means without breaking piped workflow (or joining back temporary tibble)

My data have a grouping variable group, and I would like to find the median of the group means of x so that I can flag groups that have group means of x higher than the median group mean of x.
This calculation is easy if I save the group means to a tibble temp, compare x_mean to median(x_mean), and merge back temp.
library(tidyverse)
set.seed(2001)
tb <- tibble(group = c(1, 2, rep(3, 3))) %>%
mutate(x = runif(n()) + ifelse(group %in% 1:2, 1, 0))
tb
#> # A tibble: 5 x 2
#> group x
#> <dbl> <dbl>
#> 1 1 1.76
#> 2 2 1.61
#> 3 3 0.218
#> 4 3 0.229
#> 5 3 0.153
temp <- tb %>%
group_by(group) %>%
summarize(x_mean = mean(x)) %>%
ungroup() %>%
mutate(x_hi = (x_mean > median(x_mean)))
temp
#> # A tibble: 3 x 3
#> group x_mean x_hi
#> <dbl> <dbl> <lgl>
#> 1 1 1.76 TRUE
#> 2 2 1.61 FALSE
#> 3 3 0.200 FALSE
tb <- inner_join(tb, temp)
#> Joining, by = "group"
Here is the desired output. It may seem odd that 4/5 observations are below the median, but this is possible since my group counts are not equal.
tb
#> # A tibble: 5 x 4
#> group x x_mean x_hi
#> <dbl> <dbl> <dbl> <lgl>
#> 1 1 1.76 1.76 TRUE
#> 2 2 1.61 1.61 FALSE
#> 3 3 0.218 0.200 FALSE
#> 4 3 0.229 0.200 FALSE
#> 5 3 0.153 0.200 FALSE
I would like to do this without breaking my piped workflow. The following attempt fails because my groups have different counts.
tb <- tb %>%
group_by(group) %>%
mutate(x_mean2 = mean(x)) %>%
ungroup() %>%
mutate(x_hi2 = (x_mean > median(x_mean)))
tb
#> # A tibble: 5 x 6
#> group x x_mean x_hi x_mean2 x_hi2
#> <dbl> <dbl> <dbl> <lgl> <dbl> <lgl>
#> 1 1 1.76 1.76 TRUE 1.76 TRUE
#> 2 2 1.61 1.61 FALSE 1.61 TRUE
#> 3 3 0.218 0.200 FALSE 0.200 FALSE
#> 4 3 0.229 0.200 FALSE 0.200 FALSE
#> 5 3 0.153 0.200 FALSE 0.200 FALSE
Is there a way to grab the median of the group means of x without breaking my piped workflow?
Created on 2019-07-29 by the reprex package (v0.3.0)
Just use unique:
library(dplyr)
tb %>%
group_by(group) %>%
mutate(x_mean = mean(x)) %>%
ungroup %>%
mutate(x_hi = x_mean > median(unique(x_mean)))
#> # A tibble: 5 x 4
#> group x x_mean x_hi
#> <dbl> <dbl> <dbl> <lgl>
#> 1 1 1.76 1.76 TRUE
#> 2 2 1.61 1.61 FALSE
#> 3 3 0.218 0.200 FALSE
#> 4 3 0.229 0.200 FALSE
#> 5 3 0.153 0.200 FALSE
M-M's answer works for the specific case, but I don't think it would be accurate if more than one group had the same mean scores.
tb %>%
group_by(group) %>%
mutate(x_mean = mean(x)) %>%
ungroup %>%
nest(-x_mean, -group) %>%
mutate(x_median = median(x_mean)) %>%
unnest %>%
mutate(x_hi = x_mean > x_median)

Resources