group_by and summaries with variable number of variables - r

Using the {{var}} notation the following code works.
The variables to be used for grouping and for summarizing van be given as parameters to my_summary
I would like to modify my_summary so that I can give a varying number of variables for both grouping and summarizing. Can this be done?
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
my_summary <- function(df_in,group_var,summary_var){
df_in |>
group_by({{group_var}}) |>
summarise_at(vars({{summary_var}}),mean)
}
my_summary(demo_df,gender,weight)

Another possible solution, allowing for multiple grouping variables:
library(tidyverse)
my_summary <- function(df_in, group_var,summary_var){
df_in %>%
group_by(!!!group_var) %>%
summarise(across({{summary_var}}, mean), .groups = "drop")
}
my_summary(demo_df, vars(age,gender), c(weight,size))
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Alternatively, without vars (that may be superseded):
library(tidyverse)
my_summary <- function(df_in, summary_var , ...){
summary_var <- enquos(summary_var)
group_var <- enquos(...)
df_in %>%
group_by(!!!group_var) %>%
summarise(across(!!!summary_var,mean), .groups = "drop")
}
my_summary(demo_df, c(weight, size), age, gender)
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.

Use summarise(across(.)).
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
my_summary <- function(df_in,group_var,summary_var){
df_in |>
group_by({{group_var}}) |>
summarise(across({{summary_var}}, mean))
}
my_summary(demo_df, gender, weight:size)
#> # A tibble: 2 × 3
#> gender weight size
#> <chr> <dbl> <dbl>
#> 1 f 67.9 163.
#> 2 m 72.0 161.
Created on 2022-06-09 by the reprex package (v2.0.1)

Related

Trying to isolate all columns (representing years) via conditional operator and renaming them in R

library(readr)
d <- read.csv("per_capita.csv")
rc <- d[,-2:-3]
df <- data.frame(rc)
draw <- df$X1994[df$Country.Name == "India"]
format(draw, scientific = F, big.marks = ",")
library(dplyr)
df %>%
filter(Country.Name == "India") %>%
select(names(.)[-1][readr::parse_integer(names(.)[-1] > 1994])
I tried this code and its giving me an error in the last line. Also, how should I rename these columns in the CSV file without using a dataframe?
The column names are: X1994, X1995..... and so on.
Thank You!
If you want to select columns that have numbers greater than a value, you could do this:
library(tidyverse)
#example
set.seed(24)
df <- tibble(country = rep(c("India", "Canada"), each = 3),
X1990 = runif(6),
X1991 = runif(6),
X1992 = runif(6))
df |>
filter(country == "India") |>
select(!!!vars(colnames(df)[-1][which(parse_number(colnames(df)[-1]) > 1990)]))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Although that is pretty complicated. It might be better to go long, filter, then go wide:
df |>
filter(country == "India") |>
mutate(id = row_number()) |>
pivot_longer(contains("X")) |>
mutate(name = parse_number(name))|>
filter(name > 1990) |>
pivot_wider(names_from = name, values_from = value)|>
select(-c(id, country))
#> # A tibble: 3 x 2
#> `1991` `1992`
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
We can see that this answer is pretty long and cumbersome. Maybe we stick in base R:
cols <- which(as.numeric(sub("^.*?(\\d+).*$", "\\1", colnames(df)[-1])) > 1990) +1
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Or actually, maybe we can make the tidyverse version cleaner if we just look for strings that have values higher than the target year:
all_years <- 1990:1995
df |>
filter(country == "India") |>
select(contains(paste0("X", all_years[all_years > 1990])))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Using the same logic, we can also do a partial string match with base R:
all_years <- 1990:1995
cols <- grepl(paste(all_years[all_years>1990], collapse = "|"), colnames(df))
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Hopefully one of these helps and strikes your fancy. Lots of options out there for whatever flavor your in the mood for.

simplifying tidyeval with multiple symbols

The following function behaves as desired: several variables can be passed to group_by without the need to put them into alist() or dplyr::vars:
mean_by_grp <- function(df, meanvar, grp) {
grouping <- enexpr(grp) %>%
expr_deparse %>%
str_split(",",simplify = T) %>% `[`(1,) %>%
map(str_remove,"c\\(") %>% map(str_remove,"\\)") %>% map(str_trim) %>%
unlist %>% syms
df %>%
group_by(!!!syms(grouping)) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
starwars %>% mean_by_grp(height, c(species, homeworld))
However, it is complicated. I need to turn c(var1,....varn) into a string, split it and turn it into a list of symbols so I can use with with syms.
Isn't there a much easier way to do this?
Of course, I could use ellipses instead of grp, but then I can only have one argument that passes multiple symbols to another function.
One option would be dplyr::across:
mean_by_grp <- function(df, meanvar, grp) {
df %>%
group_by(across({{ grp }})) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
library(dplyr)
starwars %>% mean_by_grp(height, species)
#> # A tibble: 38 × 2
#> species average_height
#> <chr> <dbl>
#> 1 Aleena 79
#> 2 Besalisk 198
#> 3 Cerean 198
#> 4 Chagrian 196
#> 5 Clawdite 168
#> 6 Droid 131.
#> 7 Dug 112
#> 8 Ewok 88
#> 9 Geonosian 183
#> 10 Gungan 209.
#> # … with 28 more rows
starwars %>% mean_by_grp(height, c(species, homeworld))
#> # A tibble: 58 × 3
#> species homeworld average_height
#> <chr> <chr> <dbl>
#> 1 Aleena Aleen Minor 79
#> 2 Besalisk Ojom 198
#> 3 Cerean Cerea 198
#> 4 Chagrian Champala 196
#> 5 Clawdite Zolan 168
#> 6 Droid Naboo 96
#> 7 Droid Tatooine 132
#> 8 Droid <NA> 148
#> 9 Dug Malastare 112
#> 10 Ewok Endor 88
#> # … with 48 more rows

Choosing the function in a selfmade function with tidyeval

So this example is basically from https://tidyeval.tidyverse.org/dplyr.html#patterns-for-single-arguments and it works just fine:
library(tidyverse)
group_mean <- function(df, group_var, summary_var){
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_mean")
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := mean(!!summary_var, na.rm = TRUE))
}
mtcars %>% group_mean(group_var = cyl, summary_var = disp)
#> # A tibble: 3 x 2
#> cyl disp_mean
#> <dbl> <dbl>
#> 1 4 105.
#> 2 6 183.
#> 3 8 353.
I would like to e.g. be able to choose median instead of mean sometimes and e.g. change the function name to group_stat().
You can do something like this. I'm not quite sure exactly how this works but I've seen this method used in the source code of library(purrr) for as_mapper():
https://github.com/tidyverse/purrr/blob/master/R/as_mapper.R
library(tidyverse)
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_closure(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, median)
#> # A tibble: 3 x 2
#> cyl disp_median
#> <dbl> <dbl>
#> 1 4 108
#> 2 6 168.
#> 3 8 350.
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, mean)
#> # A tibble: 3 x 2
#> cyl disp_mean
#> <dbl> <dbl>
#> 1 4 105.
#> 2 6 183.
#> 3 8 353.
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, max)
#> # A tibble: 3 x 2
#> cyl disp_max
#> <dbl> <dbl>
#> 1 4 147.
#> 2 6 258
#> 3 8 472
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, min)
#> # A tibble: 3 x 2
#> cyl disp_min
#> <dbl> <dbl>
#> 1 4 71.1
#> 2 6 145
#> 3 8 276.
Created on 2019-05-02 by the reprex package (v0.2.1)

Passing a list of arguments to a function with quasiquotation

I am trying to write a function in R that summarizes a data frame according to grouping variables. The grouping variables are given as a list and passed to group_by_at, and I would like to parametrize them.
What I am doing now is this:
library(tidyverse)
d = tribble(
~foo, ~bar, ~baz,
1, 2, 3,
1, 3, 5
4, 5, 6,
4, 5, 1
)
sum_fun <- function(df, group_vars, sum_var) {
sum_var = enquo(sum_var)
return(
df %>%
group_by_at(.vars = group_vars) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(group_vars = c("foo", "bar"), baz)
However, I would like to call the function like so:
d %>% sum_fun(group_vars = c(foo, bar), baz)
Which means the grouping vars should not be evaluated in the call, but in the function. How would I go about rewriting the function to enable that?
I have tried using enquo just like for the summary variable, and then replacing group_vars with !! group_vars, but it leads to this error:
Error in !group_vars : invalid argument type
Using group_by(!!!group_vars) yields:
Column `c(foo, bar)` must be length 2 (the number of rows) or one, not 4
What would be the proper way to rewrite the function?
I'd just use vars to do the quoting. Here is an example using mtcars dataset
library(tidyverse)
sum_fun <- function(.data, .summary_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(.group_vars) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun(mtcars, disp, .group_vars = vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
You can also replace .group_vars with ... (dot-dot-dot)
sum_fun2 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(...) %>% # Forward `...`
summarise(mean = mean(!!summary_var))
}
sum_fun2(mtcars, disp, vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
If you prefer to supply inputs as a list of columns, you will need to use enquos for the ...
sum_fun3 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
group_var <- enquos(...)
print(group_var)
.data %>%
group_by_at(group_var) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun3(mtcars, disp, c(cyl, am))
#> [[1]]
#> <quosure>
#> expr: ^c(cyl, am)
#> env: global
#>
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
Edit: append an .addi_var to .../.group_var.
sum_fun4 <- function(.data, .summary_var, .addi_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(c(.group_vars, .addi_var)) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun4(mtcars, disp, .addi_var = vars(gear), .group_vars = vars(cyl, am))
#> # A tibble: 10 x 4
#> # Groups: cyl, am [?]
#> cyl am gear mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 3 120.
#> 2 4 0 4 144.
#> 3 4 1 4 88.9
#> 4 4 1 5 108.
#> 5 6 0 3 242.
#> 6 6 0 4 168.
#> 7 6 1 4 160
#> 8 6 1 5 145
#> 9 8 0 3 358.
#> 10 8 1 5 326
group_by_at() can also take input as a character vector of column names
sum_fun5 <- function(.data, .summary_var, .addi_var, ...) {
summary_var <- enquo(.summary_var)
addi_var <- enquo(.addi_var)
group_var <- enquos(...)
### convert quosures to strings for `group_by_at`
all_group <- purrr::map_chr(c(addi_var, group_var), quo_name)
.data %>%
group_by_at(all_group) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun5(mtcars, disp, gear, cyl, am)
#> # A tibble: 10 x 4
#> # Groups: gear, cyl [?]
#> gear cyl am mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 4 0 120.
#> 2 3 6 0 242.
#> 3 3 8 0 358.
#> 4 4 4 0 144.
#> 5 4 4 1 88.9
#> 6 4 6 0 168.
#> 7 4 6 1 160
#> 8 5 4 1 108.
#> 9 5 6 1 145
#> 10 5 8 1 326
Created on 2018-10-09 by the reprex package (v0.2.1.9000)
You can rewrite the function using a combination of dplyr::group_by(), dplyr::across(), and curly curly embracing {{. This works with dplyr version 1.0.0 and greater.
I've edited the original example and code for clarity.
library(tidyverse)
my_data <- tribble(
~foo, ~bar, ~baz,
"A", "B", 3,
"A", "C", 5,
"D", "E", 6,
"D", "E", 1
)
sum_fun <- function(.data, group, sum_var) {
.data %>%
group_by(across({{ group }})) %>%
summarize("sum_{{sum_var}}" := sum({{ sum_var }}))
}
sum_fun(my_data, group = c(foo, bar), sum_var = baz)
#> `summarise()` has grouped output by 'foo'. You can override using the `.groups` argument.
#> # A tibble: 3 x 3
#> # Groups: foo [2]
#> foo bar sum_baz
#> <chr> <chr> <dbl>
#> 1 A B 3
#> 2 A C 5
#> 3 D E 7
Created on 2021-09-06 by the reprex package (v2.0.0)
You could make use of the ellipse .... Take the following example:
sum_fun <- function(df, sum_var, ...) {
sum_var <- substitute(sum_var)
grps <- substitute(list(...))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(baz, foo, bar)
We take the additional arguments and create a list out of them. Afterwards we use non-standard evaluation (substitute) to get the variable names and prevent R from evaluating them. Since group_by_at expects an object of type character or numeric, we simply convert the vector of names into a vector of characters and the function gets evaluated as we would expect.
> d %>% sum_fun(baz, foo, bar)
# A tibble: 3 x 3
# Groups: foo [?]
foo bar `sum(baz)`
<dbl> <dbl> <dbl>
1 1 2 3
2 1 3 5
3 4 5 7
If you do not want to supply grouping variables as any number of additional arguments, then you can of course use a named argument:
sum_fun <- function(df, sum_var, grps) {
sum_var <- enquo(sum_var)
grps <- as.list(substitute(grps))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
sum_fun(mtcars, sum_var = hp, grps = c(cyl, gear))
The reason why I use substitute is that it makes it easy to split the expression list(cyl, gear) in its components. There might be a way to use rlang but I have not digged into that package so far.

Remove list elements that are zero-row tibbles

I have a list of tibbles. I'm trying to filter on a column common to all tibbles, and then remove any tibbles that end up with zero rows (but are not technically empty since they have columns). It seems like purrr:::compact() is intended for this purpose, but I don't think I've got it quite right. Is there a better solution?
require(tidyverse)
#> Loading required package: tidyverse
mylst <- lst(cars1 = cars %>% as.tibble(), cars2 = cars %>% as.tibble() %>% mutate(speed = speed + 100))
#This produces a list with zero-row tibble elements:
mylst %>% map(function(x) filter(x, speed == 125))
#> $cars1
#> # A tibble: 0 x 2
#> # ... with 2 variables: speed <dbl>, dist <dbl>
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#This results in the same thing:
mylst %>% map(function(x) filter(x, speed == 125)) %>% compact()
#> $cars1
#> # A tibble: 0 x 2
#> # ... with 2 variables: speed <dbl>, dist <dbl>
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#Putting compact inside the map function reduces $cars1 to 0x0, but it's still there:
mylst %>% map(function(x) filter(x, speed == 125) %>% compact())
#> $cars1
#> # A tibble: 0 x 0
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#This finally drops the empty element, but seems clumsy.
mylst %>% map(function(x) filter(x, speed == 125) %>% compact()) %>% compact()
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
Created on 2018-04-06 by the reprex package (v0.2.0).
You are trying to use compact but this only filters out NULL elements. To filter out zero row elements, you can use discard:
mylst %>%
map(function(x) filter(x, speed == 125)) %>%
discard(function(x) nrow(x) == 0)
#$cars2
## A tibble: 1 x 2
# speed dist
# <dbl> <dbl>
#1 125. 85.

Resources