Passing multiple columns from function's argument to group_by - r

Consider the following example:
library(tidyverse)
df <- tibble(
cat = rep(1:2, times = 4, each = 2),
loc = rep(c("a", "b"), each = 8),
value = rnorm(16)
)
df %>%
group_by(cat, loc) %>%
summarise(mean = mean(value), .groups = "drop")
# # A tibble: 4 x 3
# cat loc mean
# * <int> <chr> <dbl>
# 1 1 a -0.563
# 2 1 b -0.394
# 3 2 a 0.159
# 4 2 b 0.212
I would like to make a function of the last two lines that takes a group argument to pass multiple columns to group_by.
Here's a dummy function that computes the mean values by a group of columns as an example:
# Mean of `col_value` within each combination of the grouping columns.
# `group` is a character vector of column names (selected with all_of()), so
# callers must quote them; `col_value` is passed unquoted and embraced.
# The result is ungrouped because of `.groups = "drop"`.
group_mean <- function(data, col_value, group) {
data %>%
group_by(across(all_of(group))) %>%
summarise(mean = mean({{col_value}}), .groups = "drop")
}
group_mean(df, value, c("cat", "loc"))
# # A tibble: 4 x 3
# cat loc mean
# * <int> <chr> <dbl>
# 1 1 a -0.563
# 2 1 b -0.394
# 3 2 a 0.159
# 4 2 b 0.212
The function works but I would prefer a tidyselect/rlang approach to avoid quoting column names, like so:
group_mean(df, value, c(cat, loc))
# Error: Problem adding computed columns in `group_by()`.
# x Problem with `mutate()` input `..1`.
# x object 'loc' not found
# ℹ Input `..1` is `across(all_of(c(cat, loc)))`.
Enclosing group in {{}} works for a single column but not for multiple columns. How can I do that?

Consider passing the grouping columns through `...`. Converting them to symbols with ensyms() lets the caller supply each column either quoted or unquoted.
# Mean of `col_value` per group, with the grouping columns supplied through
# `...`. Each element of `...` may be a bare name or a string; ensyms()
# converts both to symbols before they are spliced into group_by().
# Returns an ungrouped tibble with one row per group and a `mean` column.
group_mean <- function(data, col_value, ...) {
  group_syms <- ensyms(...)
  grouped <- group_by(data, !!!group_syms)
  summarise(grouped, mean = mean({{ col_value }}), .groups = "drop")
}
-testing
> group_mean(df, value, cat, loc)
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320
> group_mean(df, value, 'cat', 'loc')
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320
If we are already using ... as other arguments, then an option is
# Mean of `col_value` per group, with the grouping columns given as a single
# tidyselect expression in `group`.
#
# data       A data frame or tibble.
# col_value  Unquoted column to average.
# group      A tidyselect specification: a bare name (cat), several bare
#            names (c(cat, loc)), strings (c("cat", "loc")), or a selection
#            helper such as starts_with("x").
#
# Returns an ungrouped tibble with one row per group and a `mean` column.
group_mean <- function(data, col_value, group) {
  data %>%
    # Embracing `group` inside across() hands the caller's expression to
    # tidyselect as written, so the call does not have to be taken apart
    # by hand (which breaks on anything other than a symbol or c(...)).
    group_by(across({{ group }})) %>%
    summarise(mean = mean({{ col_value }}), .groups = "drop")
}
-testing
> group_mean(df, value, c(cat, loc))
# A tibble: 4 x 3
cat loc mean
<int> <chr> <dbl>
1 1 a 0.327
2 1 b -0.291
3 2 a -0.382
4 2 b -0.320

Related

Creating a function in R that takes input as dataframe , orders it grouped columns and generates a sequence. New column isnt coming in DF1

# Takes a data frame `df1` and a character vector of column names
# `group_vars`; orders the rows by those columns, groups by them, and adds a
# within-group row counter `seq`. Returns the modified (grouped) data frame.
# NOTE(review): the result is only returned — the caller's `df1` stays
# unchanged unless the return value is assigned.
get_seq <- function(df1,group_vars) {
# Look up each named column inside df1 and pass them all to order().
df1<-df1[ with( df1, do.call(order, mget(group_vars)) ), ]
df1<-df1 %>%
group_by(.dots=group_vars) %>%
mutate(seq=row_number())
return(df1)
}
Try using this function :
library(dplyr)
# Sort `df1` by the columns named in `group_vars` (a character vector), then
# number the rows within each group in a new column `seq`.
# The returned tibble is still grouped by those columns.
get_seq <- function(df1, group_vars) {
  sorted <- arrange(df1, across(all_of(group_vars)))
  grouped <- group_by(sorted, across(all_of(group_vars)))
  mutate(grouped, seq = row_number())
}
You can call this function as :
df2 <- get_seq(df1, 'col1')
df2 <- get_seq(df1, c('col1', 'col2'))
It's really not clear what you're trying to do here. If you want to pass a variable number of column names to a function, sort the data frame according to these columns, then group_by the columns, then add a row number within each subgroup, you would do:
# Sort `df1` by the columns passed (unquoted) through `...`, group by the
# same columns, and add a within-group row counter `seq`.
# The returned tibble is still grouped by those columns.
get_seq <- function(df1, ...) {
  by_cols <- enquos(...)
  ordered <- arrange(df1, !!!by_cols)
  grouped <- group_by(ordered, !!!by_cols)
  mutate(grouped, seq = row_number())
}
So if we had a data frame like this:
df <- data.frame(a = rep(1:3, each = 4),
b = rep(LETTERS[4:1], each = 3),
c = rnorm(12))
We could do:
get_seq(df, a, b)
#> # A tibble: 12 x 4
#> # Groups: a, b [6]
#> a b c seq
#> <int> <fct> <dbl> <int>
#> 1 1 C 0.779 1
#> 2 1 D 0.318 1
#> 3 1 D -0.0710 2
#> 4 1 D 0.183 3
#> 5 2 B -0.351 1
#> 6 2 B 0.401 2
#> 7 2 C -1.26 1
#> 8 2 C 1.99 2
#> 9 3 A -0.0723 1
#> 10 3 A -0.602 2
#> 11 3 A 2.05 3
#> 12 3 B 2.13 1

Writing function that calculates rowwise mean for subset of columns and creates column name

I want to turn this line of code into a function:
mutate(var_avg = rowMeans(select(., starts_with("var"))))
It works in the pipe:
df <- read_csv("var_one,var_two,var_three
1,1,1
2,2,2
3,3,3")
df %>% mutate(var_avg = rowMeans(select(., starts_with("var"))))
># A tibble: 3 x 4
> var_one var_two var_three var_avg
> <dbl> <dbl> <dbl> <dbl>
>1 1 1 1 1
>2 2 2 2 2
>3 3 3 3 3
Here's my attempt (I'm new at writing functions):
# Intended to add a column "avg_<x>" holding the row-wise mean of all columns
# whose names start with the prefix `x`.
# NOTE(review): the function declares only `x`, so `df %>% colnameMeans("var")`
# supplies two arguments and fails with "unused argument"; no data argument is
# available for mutate()/select() to work on, and `<-` inside mutate() does
# not name the new column after the value of `columnname`.
colnameMeans <- function(x) {
columnname <- paste0("avg_",x)
mutate(columnname <- rowMeans(select(., starts_with(x))))
}
It doesn't work.
df %>% colnameMeans("var")
>Error in colnameMeans(., "var") : unused argument ("var")
I have a lot to learn about functions and I'm not sure where to start with fixing this. Any help would be much appreciated. Note that this is a simplified example. In my real data, I have several column prefixes and I want to calculate a row-wise mean for each one. EDIT: Being able to run the function for multiple prefixes at once would be a bonus.
To set the new column's name dynamically on the left-hand side, use := and unquote the string with !!. Using <- inside mutate() won't work: mutate() names new columns with =, and a bare name on the left-hand side of = is taken literally rather than evaluated. In addition, the function needs to take the data as an argument.
library(dplyr)
# Append a column named "avg_<x>" containing the row-wise mean of every
# column whose name starts with the prefix `x`.
# `.` is the data frame (first argument, so the function can be piped into);
# `x` is the prefix as a string.
colnameMeans <- function(., x) {
  avg_name <- paste0("avg_", x)
  mutate(
    .,
    # `!!avg_name :=` uses the string's value as the new column's name.
    !!avg_name := rowMeans(select(., starts_with(x)))
  )
}
df %>%
colnameMeans('var')
# A tibble: 3 x 4
# var_one var_two var_three avg_var
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1
#2 2 2 2 2
#3 3 3 3 3
If there are several prefixes, use map
library(purrr)
library(stringr)
# Return a one-column tibble named "avg_<x>" holding the row-wise mean of
# every column whose name starts with the prefix `x`.
# transmute() keeps only the new column, so results for several prefixes can
# be bound together column-wise afterwards.
colnameMeans <- function(., x) {
  avg_name <- paste0("avg_", x)
  transmute(
    .,
    # `!!avg_name :=` uses the string's value as the new column's name.
    !!avg_name := rowMeans(select(., starts_with(x)))
  )
}
map_dfc(c('var', 'alt'), ~ df1 %>%
colnameMeans(.x)) %>%
bind_cols(df1, .)
# A tibble: 3 x 8
# var_one var_two var_three alt_var_one alt_var_two alt_var_three avg_var avg_alt
#* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1 1 1 1 1
#2 2 2 2 2 2 2 2 2
#3 3 3 3 3 3 3 3 3
data
# Example data: the original `var_*` columns plus a copy of them renamed to
# `alt_var_*`, so that both the 'var' and 'alt' prefixes used above match
# columns (as in the printed output).
df1 <- bind_cols(
  df,
  df %>% rename_with(~ str_replace(.x, "^var_", "alt_var_"))
)

Generate a list of tibble from a tibble by using map and select

I want to generate a list of tibbles from one tibble with the following code.
tbl = tibble(id=1:10, a = rnorm(10), b = rnorm(10))
tbl_list = c("a", "b") %>% map(~ tbl %>% select(c("id", .)))
The output I want is
tbl_list
[[1]]
# A tibble: 2 x 2
id a
<int> <dbl>
1 1 -0.704
2 2 -0.917
[[2]]
# A tibble: 2 x 2
id a
<int> <dbl>
1 1 -0.704
2 2 -0.917
However, it shows the error message,
"c("id", .) must evaluate to column positions or names, not a list" ,
so it seems that . is treated as a list rather than as a character string.
Could you tell me how to avoid this error?
You can use .x to access the element
library(tidyverse)
c("a", "b") %>% map(~ tbl %>% select(c("id", .x)))
#[[1]]
# A tibble: 10 x 2
# id a
# <int> <dbl>
# 1 1 1.42
# 2 2 1.51
# 3 3 -0.385
#...
#[[2]]
# A tibble: 10 x 2
# id b
# <int> <dbl>
# 1 1 1.42
# 2 2 0.100
# 3 3 1.28
#....
You can also use `.`, but inside a `%>%` chain `.` refers to the object on the left-hand side of the pipe (tbl in this case), which is why the original code returns an error. One way to use `.` is
c("a", "b") %>% map(~select(tbl, c('id', .)))

dplyr: Handing over multiple variables to group_by in a function [duplicate]

This question already has an answer here:
How to pass multiple group_by arguments and a dynamic variable argument to a dplyr function
(1 answer)
Closed 3 years ago.
I have a function with dplyr::summarize. How can I hand over more than one variable to it?
Example:
# Groups `mydf` by the single column passed (unquoted) as `grp` and sums the
# column `x` within each group; returns the summarised tibble.
# NOTE(review): enquo() + !! forwards one expression to group_by(), which is
# why the c(grp1, grp2) call shown below does not work; the summed column `x`
# is hard-coded, and library(dplyr) is called on every invocation.
myfunction <- function(mydf, grp) {
library(dplyr)
# Capture the caller's unquoted grouping expression.
grp <- enquo(grp)
result <- mydf %>%
group_by(!! grp) %>%
summarise(sum = sum(x))
result
}
# works
myfunction(df, grp1)
# doesn't work
myfunction(df, c(grp1, grp2))
If we need to pass multiple grouping variables, pass them as a character vector and make use of group_by_at
# Sum `xvar` within the groups defined by `grp`.
# `grp` is a character vector of column names; `xvar` is an unquoted column.
# NOTE(review): group_by_at() is superseded in dplyr >= 1.0;
# group_by(across(all_of(grp))) is the current equivalent for string input.
myfunction <- function(mydf, grp, xvar) {
  grouped <- group_by_at(mydf, grp)
  summarise(grouped, sum = sum({{ xvar }}))
}
myfunction(mtcars, "am", mpg)
# A tibble: 2 x 2
# am sum
# <dbl> <dbl>
#1 0 326.
#2 1 317.
myfunction(mtcars, c("am", "gear"), mpg)
# A tibble: 4 x 3
# Groups: am [2]
# am gear sum
# <dbl> <dbl> <dbl>
#1 0 3 242.
#2 0 4 84.2
#3 1 4 210.
#4 1 5 107.
In case we want to pass the groups as shown in the OP's post, one way is to capture the argument with enexpr and splice it into group_by with !!!
# Sum `xvar` within the groups given by `grp`.
#
# mydf  A data frame or tibble.
# grp   Either a single unquoted grouping expression (am) or several wrapped
#       in c(), e.g. c(am, gear).
# xvar  Unquoted column to sum.
#
# Returns the summarised tibble (summarise() applies its default handling of
# the remaining grouping).
myfunction <- function(mydf, grp, xvar) {
  grp_expr <- rlang::enexpr(grp)
  # Only a c(...) call is unpacked into its arguments. Any other expression
  # (a bare symbol, or a call such as factor(am)) is kept whole, instead of
  # having its first element dropped as if it were c().
  grp_list <- if (rlang::is_call(grp_expr, "c")) {
    rlang::call_args(grp_expr)
  } else {
    list(grp_expr)
  }
  mydf %>%
    group_by(!!!grp_list) %>%
    summarise(sum = sum({{ xvar }}))
}
myfunction(mtcars, am, mpg)
# A tibble: 2 x 2
# am sum
# <dbl> <dbl>
#1 0 326.
#2 1 317.
myfunction(mtcars, c(am, gear), mpg)
# A tibble: 4 x 3
# Groups: am [2]
# am gear sum
# <dbl> <dbl> <dbl>
#1 0 3 242.
#2 0 4 84.2
#3 1 4 210.
#4 1 5 107.

Cannot use multi word variables in dplyr or am I missing something?

Why doesn't dplyr like this format of 'beta linalool' in my function as compared to beta.linalool?
It took me a few hours of troubleshooting to figure out what the problem was. Is there any way to use data where variables are labeled as more than one word or should I just move everything to the beta.linalool type format?
Everything I have learned has been from Programming with dplyr.
library(ggplot2)
library(readxl)
library(dplyr)
library(magrittr)
Data3<- read_excel("Desktop/Data3.xlsx")
Data3 %>% filter(Variety=="CS 420A"&`Red Blotch`=="-")%>% group_by(`Time Point`)%>%
summarise(m=mean(`beta linalool`),SD=sd(`beta linalool`))
# A tibble: 4 x 3
`Time Point` m SD
<chr> <dbl> <dbl>
1 End 0.00300 0.000117
2 Mid 0.00385 0.000353
3 Must 0.000254 0.00000633
4 Start 0.000785 0.000283
Now when I work it into a function:
# Intended to filter `df` by variety `v` and red-blotch status `rb`, group by
# `Time Point`, and return the mean and SD of the column given in `c`.
# NOTE(review): the call shown below passes 'beta linalool' as a string, so
# mean()/sd() receive the text rather than the column (see the warnings in
# the printed output); also, the pipe at the end of the summarise_() line has
# nothing following it before the closing brace.
cwine<-function(df,v,rb,c){
# Capture the caller's expression for the column to summarise.
c<-enquo(c)
df %>% filter(Variety==v&`Red Blotch`==rb)%>%
group_by(`Time Point`) %>%
summarise_(m=mean(!!c),SD=sd(!!c)) %>%
}
cwine(Data3,"CS 420A","-",'beta linalool')
# A tibble: 4 x 3
`Time Point` m SD
<chr> <dbl> <dbl>
1 End NA NA
2 Mid NA NA
3 Must NA NA
4 Start NA NA
Warning messages:
1: In mean.default(~"beta linalool") :
argument is not numeric or logical: returning NA #this statement is repeated 4 more times
5: In var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
NAs introduced by coercion #this statement is repeated 4 more times
The problem lies in that beta linalool is typed in as 'beta linalool'. I figured this out by trying this methodology on the iris dataset and seeing that Petal.Length is not 'Petal Width':
# Filters `ds` to rows with Sepal.Length > x and Sepal.Width < y, groups by
# Species, and returns the mean and SD of the column captured from `c`.
# `c` is captured with enquo(), so it is meant to be a bare column name.
# NOTE(review): the example call that follows uses the name my_function2.
my_function<-function(ds,x,y,c){
c<-enquo(c)
ds %>%filter(Sepal.Length>x&Sepal.Width<y) %>%
group_by(Species) %>%
summarise(m=mean(!!c),SD=sd(!!c))
}
my_function2(iris,5,4,Petal.Length)
# A tibble: 3 x 3
Species m SD
<fct> <dbl> <dbl>
1 setosa 1.53 0.157
2 versicolor 4.32 0.423
3 virginica 5.57 0.536
In fact my function works fine on a different variable:
> cwine(Data2,"CS 420A","-",nerol)
# A tibble: 4 x 3
`Time Point` m SD
<chr> <dbl> <dbl>
1 End 0.000453 0.0000338
2 Mid 0.000659 0.0000660
3 Must 0.000560 0.0000234
4 Start 0.000927 0.0000224
Is dplyr just that sensitive or am I missing something?
One option would be convert it to symbol and evaluate it
library(tidyverse)
# Mean and SD of one column per `Time Point`, after filtering by variety and
# red-blotch status.
# `v` and `rb` are the values to match in `Variety` and `Red Blotch`;
# `c` is the column name as a string (it may contain spaces).
cwine <- function(df, v, rb, c) {
  # Turn the string into a symbol once so it can be unquoted as a column.
  target <- rlang::sym(c)
  kept <- filter(df, Variety == v & `Red Blotch` == rb)
  by_time <- group_by(kept, `Time Point`)
  summarise(by_time, m = mean(!!target), SD = sd(!!target))
}
cwine(Data3,"CS 420A","-",'beta linalool')
# A tibble: 2 x 3
# `Time Point` m SD
# <int> <dbl> <dbl>
#1 2 -2.11 2.23
#2 4 0.0171 NA
Alternatively, we can capture the argument as a quosure with enquo. This works when the variable name is passed in backquotes: normally a bare unquoted name is enough, but because this name contains a space, the backquotes are needed for it to be read as a single name.
# Mean and SD of one column per `Time Point`, after filtering by variety and
# red-blotch status.
# `v` and `rb` are the values to match in `Variety` and `Red Blotch`;
# `c` is the column itself, passed unquoted (use backquotes for names that
# contain spaces) and captured as a quosure.
cwine <- function(df, v, rb, c) {
  col_quo <- enquo(c)
  kept <- filter(df, Variety == v & `Red Blotch` == rb)
  by_time <- group_by(kept, `Time Point`)
  summarise(by_time, m = mean(!!col_quo), SD = sd(!!col_quo))
}
cwine(Data3,"CS 420A","-",`beta linalool`)
# A tibble: 2 x 3
# `Time Point` m SD
# <int> <dbl> <dbl>
#1 2 -2.11 2.23
#2 4 0.0171 NA
data
set.seed(24)
Data3 <- tibble(Variety = sample(c("CS 420A", "CS 410A"), 20, replace = TRUE),
`Red Blotch` = sample(c("-", "+"), 20, replace = TRUE),
`Time Point` = sample(1:4, 20, replace = TRUE),
`beta linalool` = rnorm(20))

Resources