Using the {{var}} notation the following code works.
The variables to be used for grouping and for summarizing van be given as parameters to my_summary
I would like to modify my_summary so that I can give a varying number of variables for both grouping and summarizing. Can this be done?
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
my_summary <- function(df_in,group_var,summary_var){
df_in |>
group_by({{group_var}}) |>
summarise_at(vars({{summary_var}}),mean)
}
my_summary(demo_df,gender,weight)
Another possible solution, allowing for multiple grouping variables:
library(tidyverse)
my_summary <- function(df_in, group_var,summary_var){
df_in %>%
group_by(!!!group_var) %>%
summarise(across({{summary_var}}, mean), .groups = "drop")
}
my_summary(demo_df, vars(age,gender), c(weight,size))
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Alternatively, without vars (that may be superseded):
library(tidyverse)
my_summary <- function(df_in, summary_var , ...){
summary_var <- enquos(summary_var)
group_var <- enquos(...)
df_in %>%
group_by(!!!group_var) %>%
summarise(across(!!!summary_var,mean), .groups = "drop")
}
my_summary(demo_df, c(weight, size), age, gender)
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Use summarise(across(.)).
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
my_summary <- function(df_in,group_var,summary_var){
df_in |>
group_by({{group_var}}) |>
summarise(across({{summary_var}}, mean))
}
my_summary(demo_df, gender, weight:size)
#> # A tibble: 2 × 3
#> gender weight size
#> <chr> <dbl> <dbl>
#> 1 f 67.9 163.
#> 2 m 72.0 161.
Created on 2022-06-09 by the reprex package (v2.0.1)
Related
library(readr)
d <- read.csv("per_capita.csv")
rc <- d[,-2:-3]
df <- data.frame(rc)
draw <- df$X1994[df$Country.Name == "India"]
format(draw, scientific = F, big.marks = ",")
library(dplyr)
df %>%
filter(Country.Name == "India") %>%
select(names(.)[-1][readr::parse_integer(names(.)[-1] > 1994])
I tried this code and its giving me an error in the last line. Also, how should I rename these columns in the CSV file without using a dataframe?
The column names are: X1994, X1995..... and so on.
Thank You!
If you want to select columns that have numbers greater than a value, you could do this:
library(tidyverse)
#example
set.seed(24)
df <- tibble(country = rep(c("India", "Canada"), each = 3),
X1990 = runif(6),
X1991 = runif(6),
X1992 = runif(6))
df |>
filter(country == "India") |>
select(!!!vars(colnames(df)[-1][which(parse_number(colnames(df)[-1]) > 1990)]))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Although that is pretty complicated. It might be better to go long, filter, then go wide:
df |>
filter(country == "India") |>
mutate(id = row_number()) |>
pivot_longer(contains("X")) |>
mutate(name = parse_number(name))|>
filter(name > 1990) |>
pivot_wider(names_from = name, values_from = value)|>
select(-c(id, country))
#> # A tibble: 3 x 2
#> `1991` `1992`
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
We can see that this answer is pretty long and cumbersome. Maybe we stick in base R:
cols <- which(as.numeric(sub("^.*?(\\d+).*$", "\\1", colnames(df)[-1])) > 1990) +1
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Or actually, maybe we can make the tidyverse version cleaner if we just look for strings that have values higher than the target year:
all_years <- 1990:1995
df |>
filter(country == "India") |>
select(contains(paste0("X", all_years[all_years > 1990])))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Using the same logic, we can also do a partial string match with base R:
all_years <- 1990:1995
cols <- grepl(paste(all_years[all_years>1990], collapse = "|"), colnames(df))
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Hopefully one of these helps and strikes your fancy. Lots of options out there for whatever flavor your in the mood for.
The following function behaves as desired: several variables can be passed to group_by without the need to put them into alist() or dplyr::vars:
mean_by_grp <- function(df, meanvar, grp) {
grouping <- enexpr(grp) %>%
expr_deparse %>%
str_split(",",simplify = T) %>% `[`(1,) %>%
map(str_remove,"c\\(") %>% map(str_remove,"\\)") %>% map(str_trim) %>%
unlist %>% syms
df %>%
group_by(!!!syms(grouping)) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
starwars %>% mean_by_grp(height, c(species, homeworld))
However, it is complicated. I need to turn c(var1,....varn) into a string, split it and turn it into a list of symbols so I can use with with syms.
Isn't there a much easier way to do this?
Of course, I could use ellipses instead of grp, but then I can only have one argument that passes multiple symbols to another function.
One option would be dplyr::across:
mean_by_grp <- function(df, meanvar, grp) {
df %>%
group_by(across({{ grp }})) %>%
summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
.groups = 'drop')
}
library(dplyr)
starwars %>% mean_by_grp(height, species)
#> # A tibble: 38 × 2
#> species average_height
#> <chr> <dbl>
#> 1 Aleena 79
#> 2 Besalisk 198
#> 3 Cerean 198
#> 4 Chagrian 196
#> 5 Clawdite 168
#> 6 Droid 131.
#> 7 Dug 112
#> 8 Ewok 88
#> 9 Geonosian 183
#> 10 Gungan 209.
#> # … with 28 more rows
starwars %>% mean_by_grp(height, c(species, homeworld))
#> # A tibble: 58 × 3
#> species homeworld average_height
#> <chr> <chr> <dbl>
#> 1 Aleena Aleen Minor 79
#> 2 Besalisk Ojom 198
#> 3 Cerean Cerea 198
#> 4 Chagrian Champala 196
#> 5 Clawdite Zolan 168
#> 6 Droid Naboo 96
#> 7 Droid Tatooine 132
#> 8 Droid <NA> 148
#> 9 Dug Malastare 112
#> 10 Ewok Endor 88
#> # … with 48 more rows
So this example is basically from https://tidyeval.tidyverse.org/dplyr.html#patterns-for-single-arguments and it works just fine:
library(tidyverse)
group_mean <- function(df, group_var, summary_var){
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_mean")
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := mean(!!summary_var, na.rm = TRUE))
}
mtcars %>% group_mean(group_var = cyl, summary_var = disp)
#> # A tibble: 3 x 2
#> cyl disp_mean
#> <dbl> <dbl>
#> 1 4 105.
#> 2 6 183.
#> 3 8 353.
I would like to e.g. be able to choose median instead of mean sometimes and e.g. change the function name to group_stat().
You can do something like this. I'm not quite sure exactly how this works but I've seen this method used in the source code of library(purrr) for as_mapper():
https://github.com/tidyverse/purrr/blob/master/R/as_mapper.R
library(tidyverse)
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_closure(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, median)
#> # A tibble: 3 x 2
#> cyl disp_median
#> <dbl> <dbl>
#> 1 4 108
#> 2 6 168.
#> 3 8 350.
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, mean)
#> # A tibble: 3 x 2
#> cyl disp_mean
#> <dbl> <dbl>
#> 1 4 105.
#> 2 6 183.
#> 3 8 353.
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, max)
#> # A tibble: 3 x 2
#> cyl disp_max
#> <dbl> <dbl>
#> 1 4 147.
#> 2 6 258
#> 3 8 472
mtcars %>%
group_stat(group_var = cyl, summary_var = disp, min)
#> # A tibble: 3 x 2
#> cyl disp_min
#> <dbl> <dbl>
#> 1 4 71.1
#> 2 6 145
#> 3 8 276.
Created on 2019-05-02 by the reprex package (v0.2.1)
I am trying to write a function in R that summarizes a data frame according to grouping variables. The grouping variables are given as a list and passed to group_by_at, and I would like to parametrize them.
What I am doing now is this:
library(tidyverse)
d = tribble(
~foo, ~bar, ~baz,
1, 2, 3,
1, 3, 5
4, 5, 6,
4, 5, 1
)
sum_fun <- function(df, group_vars, sum_var) {
sum_var = enquo(sum_var)
return(
df %>%
group_by_at(.vars = group_vars) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(group_vars = c("foo", "bar"), baz)
However, I would like to call the function like so:
d %>% sum_fun(group_vars = c(foo, bar), baz)
Which means the grouping vars should not be evaluated in the call, but in the function. How would I go about rewriting the function to enable that?
I have tried using enquo just like for the summary variable, and then replacing group_vars with !! group_vars, but it leads to this error:
Error in !group_vars : invalid argument type
Using group_by(!!!group_vars) yields:
Column `c(foo, bar)` must be length 2 (the number of rows) or one, not 4
What would be the proper way to rewrite the function?
I'd just use vars to do the quoting. Here is an example using mtcars dataset
library(tidyverse)
sum_fun <- function(.data, .summary_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(.group_vars) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun(mtcars, disp, .group_vars = vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
You can also replace .group_vars with ... (dot-dot-dot)
sum_fun2 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(...) %>% # Forward `...`
summarise(mean = mean(!!summary_var))
}
sum_fun2(mtcars, disp, vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
If you prefer to supply inputs as a list of columns, you will need to use enquos for the ...
sum_fun3 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
group_var <- enquos(...)
print(group_var)
.data %>%
group_by_at(group_var) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun3(mtcars, disp, c(cyl, am))
#> [[1]]
#> <quosure>
#> expr: ^c(cyl, am)
#> env: global
#>
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
Edit: append an .addi_var to .../.group_var.
sum_fun4 <- function(.data, .summary_var, .addi_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(c(.group_vars, .addi_var)) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun4(mtcars, disp, .addi_var = vars(gear), .group_vars = vars(cyl, am))
#> # A tibble: 10 x 4
#> # Groups: cyl, am [?]
#> cyl am gear mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 3 120.
#> 2 4 0 4 144.
#> 3 4 1 4 88.9
#> 4 4 1 5 108.
#> 5 6 0 3 242.
#> 6 6 0 4 168.
#> 7 6 1 4 160
#> 8 6 1 5 145
#> 9 8 0 3 358.
#> 10 8 1 5 326
group_by_at() can also take input as a character vector of column names
sum_fun5 <- function(.data, .summary_var, .addi_var, ...) {
summary_var <- enquo(.summary_var)
addi_var <- enquo(.addi_var)
group_var <- enquos(...)
### convert quosures to strings for `group_by_at`
all_group <- purrr::map_chr(c(addi_var, group_var), quo_name)
.data %>%
group_by_at(all_group) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun5(mtcars, disp, gear, cyl, am)
#> # A tibble: 10 x 4
#> # Groups: gear, cyl [?]
#> gear cyl am mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 4 0 120.
#> 2 3 6 0 242.
#> 3 3 8 0 358.
#> 4 4 4 0 144.
#> 5 4 4 1 88.9
#> 6 4 6 0 168.
#> 7 4 6 1 160
#> 8 5 4 1 108.
#> 9 5 6 1 145
#> 10 5 8 1 326
Created on 2018-10-09 by the reprex package (v0.2.1.9000)
You can rewrite the function using a combination of dplyr::group_by(), dplyr::across(), and curly curly embracing {{. This works with dplyr version 1.0.0 and greater.
I've edited the original example and code for clarity.
library(tidyverse)
my_data <- tribble(
~foo, ~bar, ~baz,
"A", "B", 3,
"A", "C", 5,
"D", "E", 6,
"D", "E", 1
)
sum_fun <- function(.data, group, sum_var) {
.data %>%
group_by(across({{ group }})) %>%
summarize("sum_{{sum_var}}" := sum({{ sum_var }}))
}
sum_fun(my_data, group = c(foo, bar), sum_var = baz)
#> `summarise()` has grouped output by 'foo'. You can override using the `.groups` argument.
#> # A tibble: 3 x 3
#> # Groups: foo [2]
#> foo bar sum_baz
#> <chr> <chr> <dbl>
#> 1 A B 3
#> 2 A C 5
#> 3 D E 7
Created on 2021-09-06 by the reprex package (v2.0.0)
You could make use of the ellipse .... Take the following example:
sum_fun <- function(df, sum_var, ...) {
sum_var <- substitute(sum_var)
grps <- substitute(list(...))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(baz, foo, bar)
We take the additional arguments and create a list out of them. Afterwards we use non-standard evaluation (substitute) to get the variable names and prevent R from evaluating them. Since group_by_at expects an object of type character or numeric, we simply convert the vector of names into a vector of characters and the function gets evaluated as we would expect.
> d %>% sum_fun(baz, foo, bar)
# A tibble: 3 x 3
# Groups: foo [?]
foo bar `sum(baz)`
<dbl> <dbl> <dbl>
1 1 2 3
2 1 3 5
3 4 5 7
If you do not want to supply grouping variables as any number of additional arguments, then you can of course use a named argument:
sum_fun <- function(df, sum_var, grps) {
sum_var <- enquo(sum_var)
grps <- as.list(substitute(grps))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
sum_fun(mtcars, sum_var = hp, grps = c(cyl, gear))
The reason why I use substitute is that it makes it easy to split the expression list(cyl, gear) in its components. There might be a way to use rlang but I have not digged into that package so far.
I have a list of tibbles. I'm trying to filter on a column common to all tibbles, and then remove any tibbles that end up with zero rows (but are not technically empty since they have columns). It seems like purrr:::compact() is intended for this purpose, but I don't think I've got it quite right. Is there a better solution?
require(tidyverse)
#> Loading required package: tidyverse
mylst <- lst(cars1 = cars %>% as.tibble(), cars2 = cars %>% as.tibble() %>% mutate(speed = speed + 100))
#This produces a list with zero-row tibble elements:
mylst %>% map(function(x) filter(x, speed == 125))
#> $cars1
#> # A tibble: 0 x 2
#> # ... with 2 variables: speed <dbl>, dist <dbl>
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#This results in the same thing:
mylst %>% map(function(x) filter(x, speed == 125)) %>% compact()
#> $cars1
#> # A tibble: 0 x 2
#> # ... with 2 variables: speed <dbl>, dist <dbl>
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#Putting compact inside the map function reduces $cars1 to 0x0, but it's still there:
mylst %>% map(function(x) filter(x, speed == 125) %>% compact())
#> $cars1
#> # A tibble: 0 x 0
#>
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
#This finally drops the empty element, but seems clumsy.
mylst %>% map(function(x) filter(x, speed == 125) %>% compact()) %>% compact()
#> $cars2
#> # A tibble: 1 x 2
#> speed dist
#> <dbl> <dbl>
#> 1 125. 85.
Created on 2018-04-06 by the reprex package (v0.2.0).
You are trying to use compact but this only filters out NULL elements. To filter out zero row elements, you can use discard:
mylst %>%
map(function(x) filter(x, speed == 125)) %>%
discard(function(x) nrow(x) == 0)
#$cars2
## A tibble: 1 x 2
# speed dist
# <dbl> <dbl>
#1 125. 85.