If I want to make overscoping explicit, I can use the .data pronoun like this
library(dplyr)
cyl <- 3
transmute(as_tibble(mtcars), cyl_plus_one = .data$cyl + 1)
#> # A tibble: 32 x 1
#> cyl_plus_one
#> <dbl>
#> 1 7
#> 2 7
#> 3 5
#> 4 7
#> 5 9
#> 6 7
#> 7 9
#> 8 5
#> 9 5
#> 10 7
#> # ... with 22 more rows
However, what about the opposite, i.e. if I want to avoid overscoping explicitly? In the example below, I would like to add a new column that contains the value b (supplied via the function call, not the b in the data) plus 1, which does obviously not work the way it's stated now (because of overscoping).
library(dplyr)
add_one <- function(data, b) {
data %>%
mutate(a = b + 1)
}
data <- data_frame(
b = 999
)
add_one(data, 3)
#> # A tibble: 1 x 2
#> b a
#> <dbl> <dbl>
#> 1 999 1000
I also tried to create the new value outside the mutate() call, but then I still need to rely on new_val being not in the data.
library(dplyr)
add_one <- function(data, b) {
new_val <- b + 1
data %>%
mutate(a = new_val)
}
Just unquote with !! to look for a variable with that name above the data frame scope:
library(tidyverse)
add_one <- function(data, b) {
data %>% mutate(a = !!b + 1)
}
data <- data_frame(b = 999)
add_one(data, 3)
#> # A tibble: 1 x 2
#> b a
#> <dbl> <dbl>
#> 1 999 4.00
Related
I am trying to assign the vector output (i.e. greater than length 1) of a function to multiple columns in a single operation (or at least as concisely as possible).
Take the range() function for example which returns as output a numeric vector of length 2 denoting the minimum and maximum, respectively. Let's say I want to compute the range() per group and assign the output to two columns min and max.
My current approach is combining summarize followed by manually adding a key and then re-shaping to wide format:
library(magrittr)
# create data
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
df
#> # A tibble: 9 x 2
#> group x
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 a 8
#> 4 b 9
#> 5 b 14
#> 6 b 9
#> 7 c 11
#> 8 c 6
#> 9 c 12
# summarize gives two lines per group
range_df <- df %>%
dplyr::group_by(group) %>%
dplyr::summarize(range = range(x)) %>%
dplyr::ungroup()
range_df
#> # A tibble: 6 x 2
#> group range
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 b 9
#> 4 b 14
#> 5 c 6
#> 6 c 12
# add key and reshape
range_df %>%
dplyr::mutate(key = rep(c("min", "max"), 3)) %>%
tidyr::pivot_wider(names_from = key, values_from = range)
#> # A tibble: 3 x 3
#> group min max
#> <chr> <int> <int>
#> 1 a 8 12
#> 2 b 9 14
#> 3 c 6 12
Is there a more elegant / concise alternative to this?
Edit:
Ideally the alternative solution could handle an arbitrary number of outputs (e.g. if the function returns an output with length 3 then 3 variables should be created).
# Writw a small function that does the job:
library(tidyverse)
f <- function(x){
setNames(data.frame(t(range(x))), c('min', 'max'))
}
df %>%
summarise(across(x, f, .unpack = TRUE), .by=group)
#> # A tibble: 3 × 3
#> group x_min x_max
#> <chr> <int> <int>
#> 1 a 10 13
#> 2 b 7 10
#> 3 c 10 12
If you are using older version of dplyr
df %>%
group_by(group)%>%
summarise(across(x, f))%>%
unpack(x)
#> # A tibble: 3 × 3
#> group min max
#> <chr> <int> <int>
#> 1 a 6 9
#> 2 b 7 12
#> 3 c 6 10
Based on onyambu's answer, I build a small generic function for this. There probably will be some edge cases, where this will not work.
out2col <- function(x, fun, out_names = c(), add_args = list()) {
tmp <- do.call(what = fun, args = c(list(x), add_args))
out <- data.frame(t(tmp))
if (length(out_names) != 0) {
if (length(tmp) != length(out_names)) {
stop("provided names did not match the number of outputs")
}
out <- setNames(object = out, nm = out_names)
}
return(out)
}
Examples without any additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = range),
.by=group)
Output:
# A tibble: 3 × 3
group x_X1 x_X2
<chr> <int> <int>
1 a 7 10
2 b 11 14
3 c 9 14
Examples with additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = quantile,
out_names = c("min", "max", "Q25"),
add_args = list(probs = c(0, 1, 0.25))
),
.by=group)
Output:
# A tibble: 3 × 4
group x_min x_max x_Q25
<chr> <dbl> <dbl> <dbl>
1 a 7 10 7.5
2 b 11 14 11.5
3 c 9 14 10
set.seed(1)
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
function
g <- function(x){
data.frame(min = min(x), max = max(x))
}
calling g:
df %>%
group_by(group) %>%
summarise(across(x, g, .unpack = TRUE))
I have two tibbles with different number of columns. I want to filter df1 using a value from column b and I also want to filter df2 using a value from column b and also column c. Is it possible to do this using the same function?
I followed the list(...) procedure, but of course, I got an error since, in the first case there is no x[[2]].
library(dplyr)
df1 <- tibble(a = c(4,2,3,4),
b = c(8,6,7,8))
df2 <- tibble(a = c(1,2,3,4),
b = c(5,6,7,8),
c = c(1,5,3,7))
df1
#> # A tibble: 4 × 2
#> a b
#> <dbl> <dbl>
#> 1 4 8
#> 2 2 6
#> 3 3 7
#> 4 4 8
df2
#> # A tibble: 4 × 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 5 1
#> 2 2 6 5
#> 3 3 7 3
#> 4 4 8 7
createTable <- function(df, ...) {
x <- list(...)
tabl <- df %>%
filter(b < x[[1]], c < x[[2]])
return(tabl)
}
tabl1 <- createTable(df1, 8)
#> Error in `filter()`:
#> ! Problem while computing `..2 = c < x[[2]]`.
#> Caused by error in `x[[2]]`:
#> ! subscript out of bounds
tabl2 <- createTable(df2, 7, 5)
Created on 2022-07-27 by the reprex package (v2.0.1)
#Function that takes df1,group_vars as input and return df1 with seq columns as output
get_seq <- function(df1,group_vars) {
df1<-df1[ with( df1, do.call(order, mget(group_vars)) ), ]
df1<-df1 %>%
group_by(.dots=group_vars) %>%
mutate(seq=row_number())
return(df1)
}
Try using this function :
library(dplyr)
get_seq <- function(df1, group_vars) {
df1 %>%
arrange(across(all_of(group_vars))) %>%
group_by(across(all_of(group_vars))) %>%
mutate(seq=row_number())
}
You can call this function as :
df2 <- get_seq(df1, 'col1')
df2 <- get_seq(df1, c('col1', 'col2'))
It's really not clear what you're trying to do here. If you want to pass a variable number of column names to a function, sort the data frame according to these columns, then group_by the columns, then add a row number within each subgroup, you would do:
get_seq <- function(df1, ...)
{
group_vars <- enquos(...)
df1 %>%
arrange(!!!group_vars) %>%
group_by(!!!group_vars) %>%
mutate(seq = row_number())
}
So if we had a data frame like this:
df <- data.frame(a = rep(1:3, each = 4),
b = rep(LETTERS[4:1], each = 3),
c = rnorm(12))
We could do:
get_seq(df, a, b)
#> # A tibble: 12 x 4
#> # Groups: a, b [6]
#> a b c seq
#> <int> <fct> <dbl> <int>
#> 1 1 C 0.779 1
#> 2 1 D 0.318 1
#> 3 1 D -0.0710 2
#> 4 1 D 0.183 3
#> 5 2 B -0.351 1
#> 6 2 B 0.401 2
#> 7 2 C -1.26 1
#> 8 2 C 1.99 2
#> 9 3 A -0.0723 1
#> 10 3 A -0.602 2
#> 11 3 A 2.05 3
#> 12 3 B 2.13 1
I want to create a function based on dplyr that performs certain operations on subsets of data. The subsets are defined by values of one or more key columns in the dataset. When only one column is used to identify subsets, my code works fine:
set.seed(1)
df <- tibble(
g1 = c(1, 1, 2, 2, 2),
g2 = c(1, 2, 1, 2, 1),
a = sample(5)
)
group_key <- "g1"
aggregate <- function(df, by) {
df %>% group_by(!!sym(by)) %>% summarize(a = mean(a))
}
aggregate(df, by = group_key)
This works as expected and returns something like this:
# A tibble: 2 x 2
g1 a
<dbl> <dbl>
1 1 1.5
2 2 4
Unfortunately everything breaks down if I change group_key:
group_key <- c("g1", "g2")
aggregate(df, by = group_key)
I get an error: Only strings can be converted to symbols, which I think comes from rlang::sym(). Replacing it with syms() does not work since I get a list of names, on which group_by() chokes.
Any suggestions would be appreciated!
You need to use the unquote-splice operator !!!:
aggregate <- function(df, by) {
df %>% group_by(!!!syms(by)) %>% summarize(a = mean(a))
}
group_key <- c("g1", "g2")
aggregate(df, by = group_key)
## A tibble: 4 x 3
## Groups: g1 [2]
# g1 g2 a
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 1 2 4
#3 2 1 2.5
#4 2 2 5
Alternatively, you can use dplyr::group_by_at:
agg <- function(df, by) {
require(dplyr)
df %>% group_by_at(vars(one_of(by))) %>% summarize(a = mean(a))}
group_key <- "g1"
group_keys <- c("g1","g2")
agg(df, by = group_key)
#> # A tibble: 2 x 2
#> g1 a
#> <dbl> <dbl>
#> 1 1 2.5
#> 2 2 3.33
agg(df, by = group_keys)
#> # A tibble: 4 x 3
#> # Groups: g1 [2]
#> g1 g2 a
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 4
#> 3 2 1 2.5
#> 4 2 2 5
Update with dplyr 1.0.0
The new across() allows tidyselect functions like all_of which replaces the quote-unqote procedure of NSE. The code looks a bit simpler with that:
aggregate <- function(df, by) {
df %>%
group_by(across(all_of(by))) %>%
summarize(a = mean(a))
}
df %>% aggregate(group_key)
I am trying to write a function in R that summarizes a data frame according to grouping variables. The grouping variables are given as a list and passed to group_by_at, and I would like to parametrize them.
What I am doing now is this:
library(tidyverse)
d = tribble(
~foo, ~bar, ~baz,
1, 2, 3,
1, 3, 5
4, 5, 6,
4, 5, 1
)
sum_fun <- function(df, group_vars, sum_var) {
sum_var = enquo(sum_var)
return(
df %>%
group_by_at(.vars = group_vars) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(group_vars = c("foo", "bar"), baz)
However, I would like to call the function like so:
d %>% sum_fun(group_vars = c(foo, bar), baz)
Which means the grouping vars should not be evaluated in the call, but in the function. How would I go about rewriting the function to enable that?
I have tried using enquo just like for the summary variable, and then replacing group_vars with !! group_vars, but it leads to this error:
Error in !group_vars : invalid argument type
Using group_by(!!!group_vars) yields:
Column `c(foo, bar)` must be length 2 (the number of rows) or one, not 4
What would be the proper way to rewrite the function?
I'd just use vars to do the quoting. Here is an example using mtcars dataset
library(tidyverse)
sum_fun <- function(.data, .summary_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(.group_vars) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun(mtcars, disp, .group_vars = vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
You can also replace .group_vars with ... (dot-dot-dot)
sum_fun2 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(...) %>% # Forward `...`
summarise(mean = mean(!!summary_var))
}
sum_fun2(mtcars, disp, vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
If you prefer to supply inputs as a list of columns, you will need to use enquos for the ...
sum_fun3 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
group_var <- enquos(...)
print(group_var)
.data %>%
group_by_at(group_var) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun3(mtcars, disp, c(cyl, am))
#> [[1]]
#> <quosure>
#> expr: ^c(cyl, am)
#> env: global
#>
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
Edit: append an .addi_var to .../.group_var.
sum_fun4 <- function(.data, .summary_var, .addi_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(c(.group_vars, .addi_var)) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun4(mtcars, disp, .addi_var = vars(gear), .group_vars = vars(cyl, am))
#> # A tibble: 10 x 4
#> # Groups: cyl, am [?]
#> cyl am gear mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 3 120.
#> 2 4 0 4 144.
#> 3 4 1 4 88.9
#> 4 4 1 5 108.
#> 5 6 0 3 242.
#> 6 6 0 4 168.
#> 7 6 1 4 160
#> 8 6 1 5 145
#> 9 8 0 3 358.
#> 10 8 1 5 326
group_by_at() can also take input as a character vector of column names
sum_fun5 <- function(.data, .summary_var, .addi_var, ...) {
summary_var <- enquo(.summary_var)
addi_var <- enquo(.addi_var)
group_var <- enquos(...)
### convert quosures to strings for `group_by_at`
all_group <- purrr::map_chr(c(addi_var, group_var), quo_name)
.data %>%
group_by_at(all_group) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun5(mtcars, disp, gear, cyl, am)
#> # A tibble: 10 x 4
#> # Groups: gear, cyl [?]
#> gear cyl am mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 4 0 120.
#> 2 3 6 0 242.
#> 3 3 8 0 358.
#> 4 4 4 0 144.
#> 5 4 4 1 88.9
#> 6 4 6 0 168.
#> 7 4 6 1 160
#> 8 5 4 1 108.
#> 9 5 6 1 145
#> 10 5 8 1 326
Created on 2018-10-09 by the reprex package (v0.2.1.9000)
You can rewrite the function using a combination of dplyr::group_by(), dplyr::across(), and curly curly embracing {{. This works with dplyr version 1.0.0 and greater.
I've edited the original example and code for clarity.
library(tidyverse)
my_data <- tribble(
~foo, ~bar, ~baz,
"A", "B", 3,
"A", "C", 5,
"D", "E", 6,
"D", "E", 1
)
sum_fun <- function(.data, group, sum_var) {
.data %>%
group_by(across({{ group }})) %>%
summarize("sum_{{sum_var}}" := sum({{ sum_var }}))
}
sum_fun(my_data, group = c(foo, bar), sum_var = baz)
#> `summarise()` has grouped output by 'foo'. You can override using the `.groups` argument.
#> # A tibble: 3 x 3
#> # Groups: foo [2]
#> foo bar sum_baz
#> <chr> <chr> <dbl>
#> 1 A B 3
#> 2 A C 5
#> 3 D E 7
Created on 2021-09-06 by the reprex package (v2.0.0)
You could make use of the ellipse .... Take the following example:
sum_fun <- function(df, sum_var, ...) {
sum_var <- substitute(sum_var)
grps <- substitute(list(...))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(baz, foo, bar)
We take the additional arguments and create a list out of them. Afterwards we use non-standard evaluation (substitute) to get the variable names and prevent R from evaluating them. Since group_by_at expects an object of type character or numeric, we simply convert the vector of names into a vector of characters and the function gets evaluated as we would expect.
> d %>% sum_fun(baz, foo, bar)
# A tibble: 3 x 3
# Groups: foo [?]
foo bar `sum(baz)`
<dbl> <dbl> <dbl>
1 1 2 3
2 1 3 5
3 4 5 7
If you do not want to supply grouping variables as any number of additional arguments, then you can of course use a named argument:
sum_fun <- function(df, sum_var, grps) {
sum_var <- enquo(sum_var)
grps <- as.list(substitute(grps))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
sum_fun(mtcars, sum_var = hp, grps = c(cyl, gear))
The reason why I use substitute is that it makes it easy to split the expression list(cyl, gear) in its components. There might be a way to use rlang but I have not digged into that package so far.