How to combine function argument with group_by in R - r

I would like to use the group_by( ) function inside my custom function, with the column names passed to group_by supplied through my function's arguments.
See a hypothetical example of what my data would look like:
data <- data.frame(ind = rep(c("A", "B", "C"), 4),
gender = rep(c("F", "M"), each = 6),
value = sample(1:100, 12))
And this is the result I would like to have:
result <- data %>%
group_by(ind, gender) %>%
mutate(value = mean(value)) %>%
distinct()
This is how I was trying to make my function to work:
# Attempted helper: group `data` by the columns named in `set_group`, replace
# every column named in `variable` with its per-group mean, and keep the
# distinct rows.
# data:      a data frame
# set_group: character vector of grouping column names
# variable:  character vector of column names to average
myFunction <- function(data, set_group, variable){
result <- data %>%
# NOTE(review): get() looks up a single object by name, so this call does not
# group by every name in set_group.
group_by(get(set_group)) %>%
mutate(across(all_of(variable), ~ mean(.x, na.rm = TRUE))) %>%
distinct()
}
result3 <- myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
result3
I want to allow the user to supply as many set_group columns and as many variables as needed. I tried using the get( ), all_of( ) and mget( ) functions within group_by, but none of them worked.
Does anyone know how I can code this?
Thank you!

We could use across within group_by
# Replace each column named in `variable` with its mean (NAs dropped) within
# the groups formed by the columns named in `set_group`, then return the
# distinct, ungrouped rows.
# data:      a data frame
# set_group: character vector of grouping column names
# variable:  character vector of column names to average
myFunction <- function(data, set_group, variable) {
  grouped <- group_by(data, across(all_of(set_group)))
  averaged <- mutate(
    grouped,
    across(all_of(variable), function(col) mean(col, na.rm = TRUE))
  )
  distinct(ungroup(averaged))
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
Another option is to convert to symbols and evaluate (!!!)
# Same contract as the across() variant: per-group means of the `variable`
# columns, grouped by the columns named in `set_group`, distinct rows returned
# ungrouped. Here the grouping names are converted to symbols and spliced
# into group_by() with `!!!`.
myFunction <- function(data, set_group, variable) {
  group_syms <- rlang::syms(set_group)
  grouped <- group_by(data, !!!group_syms)
  averaged <- mutate(
    grouped,
    across(all_of(variable), function(col) mean(col, na.rm = TRUE))
  )
  distinct(ungroup(averaged))
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
NOTE: get is used when there is a single object, for multiple objects mget can be used. But, it is better to use tidyverse functions

Related

median doesn't work after group_by in r function

I wrote a r function to compute the median by group:
varA<-rep(c(1:2),times=30)
df1<-data.frame(varA)
df1$var1 <- sample(500:1000, length(df1$varA))
df1 <- df1 %>% mutate(outcome=ifelse(varA==1, "Yes", "No"))
# Attempted helper: median of column `var` and row count for each level of
# `group_var`, plus a "Total" group made from a relabelled copy of all rows.
# `var` and `group_var` are column names given as strings.
# Prints the summary table.
ctn_me<- function(df, var, group_var) {
# The grouping column is coerced to character so the "Total" label can be
# stored in it.
df[[group_var]]<-as.character(df[[group_var]])
# df[[var]]<-as.numeric(df[[var]])
tbl1<-df %>%
bind_rows(mutate(., !!group_var := 'Total')) %>%
dplyr::group_by(gpvar=.[[group_var]])%>%
dplyr::summarise(
# NOTE(review): `.` here is the whole piped data frame, not the current
# group, which matches the reported symptom of every group showing the
# overall median.
median=median(.[[var]], na.rm = TRUE),
N = n())
print(tbl1)
}
ctn_me(df1, "var1", "outcome")
It gave me results like this:
#### gpvar median N
#### <chr> <dbl> <int>
#### 1 No 734 30
#### 2 Total 734 60
#### 3 Yes 734 30
So it can count the number of rows within each group, but for the median, it returned the overall median instead by the group.
This gave me the results I wanted:
df1 %>% bind_rows(mutate(., outcome := 'Total')) %>%
dplyr::group_by(outcome)%>%
dplyr::summarise(
median=median(var1, na.rm = TRUE),
N = n())
# A tibble: 3 x 3
# outcome median N
# <chr> <dbl> <int>
# 1 No 713 30
# 2 Total 734 60
# 3 Yes 788. 30
I was trying to figure out what was wrong with my r function. Can anyone let me know? Thanks!
The docs state that you need to specifically reference ".data" within the summarise() function:
"When you have an env-variable that is a character vector, you need to
index into the .data pronoun with [[, like summarise(df, mean =
mean(.data[[var]]))."
In this case, you need to change .[[var]] to .data[[var]] inside summarise(), i.e.
library(tidyverse)
set.seed(123)
varA<-rep(c(1:2),times=30)
df1<-data.frame(varA)
df1$var1 <- sample(500:1000, length(df1$varA))
df1 <- df1 %>% mutate(outcome=ifelse(varA==1, "Yes", "No"))
# Median of column `var` and row count for each level of `group_var`, with an
# extra "Total" group built from a relabelled copy of every row.
# `var` and `group_var` are column names given as strings; `.data[[var]]`
# makes summarise() read the column within the current group.
ctn_me <- function(df, var, group_var) {
  relabelled <- mutate(df, !!group_var := "Total")
  stacked <- bind_rows(df, relabelled)
  stacked %>%
    group_by(gpvar = stacked[[group_var]]) %>%
    summarise(
      median_group = median(.data[[var]], na.rm = TRUE),
      N = n()
    )
}
ctn_me(df1, "var1", "outcome")
#> # A tibble: 3 × 3
#> gpvar median_group N
#> <chr> <dbl> <int>
#> 1 No 740. 30
#> 2 Total 754 60
#> 3 Yes 776. 30
Created on 2022-07-19 by the reprex package (v2.0.1)
Original answer:
If you use a different syntax inside the summarise() function it works as expected, so I think it's something to do with the summarise() function:
library(tidyverse)
set.seed(123)
varA<-rep(c(1:2),times=30)
df1<-data.frame(varA)
df1$var1 <- sample(500:1000, length(df1$varA))
df1 <- df1 %>% mutate(outcome=ifelse(varA==1, "Yes", "No"))
# Median of column `var` and row count for each level of `group_var`, plus a
# "Total" group. `var` is turned into a symbol and unquoted inside
# summarise(), so the median is computed per group.
ctn_me <- function(df, var, group_var) {
  value_col <- sym(var)
  with_total <- bind_rows(df, mutate(df, !!group_var := "Total"))
  grouped <- group_by(with_total, gpvar = with_total[[group_var]])
  summarise(
    grouped,
    median_group = median(!!value_col, na.rm = TRUE),
    N = n()
  )
}
ctn_me(df1, "var1", "outcome")
#> # A tibble: 3 × 3
#> gpvar median_group N
#> <chr> <dbl> <int>
#> 1 No 740. 30
#> 2 Total 754 60
#> 3 Yes 776. 30
Created on 2022-07-19 by the reprex package (v2.0.1)
Try this for non-standard evaluation.
# Median of column `var` and row count for each level of `group_var`, plus a
# "Total" group made from a relabelled copy of all rows.
# df:        a data frame
# var:       name of the column to summarise, as a string
# group_var: name of the grouping column, as a string
# Prints the summary table.
ctn_me <- function(df, var, group_var) {
  # Coerce the grouping column to character so the "Total" label fits in it.
  df[[group_var]] <- as.character(df[[group_var]])
  # df[[var]] <- as.numeric(df[[var]])
  tbl1 <- df %>%
    bind_rows(mutate(., !!group_var := "Total")) %>%
    # .data[[...]] refers to the column of the current (grouped) data, unlike
    # the magrittr placeholder `.`, which is the whole piped data frame.
    dplyr::group_by(.data[[group_var]]) %>%
    dplyr::summarise(
      median = median(.data[[var]], na.rm = TRUE),
      N = n()
    )
  print(tbl1)
}

Summaries based on one reference column compared to the other columns in dplyr

I want to get sums of a variable depending on the other variable's na or non-na values in R. A working example code is below:
library(dplyr)
df <- data.frame(A = c(1,2,3,NA,4),
B = c(NA,2,3,NA,5),
C = c(3,4,NA,NA,NA),
REF = c(10,20,30,40,50))
df.na <- df %>% mutate_at(vars(-REF),is.na)
sums <- matrix(0,2,3)
row.names(sums) <- c("NON-NA","NA")
colnames(sums) <- c("A","B","C")
for(i in 1:3){
sums[,i] <- df.na %>% group_by_at(i) %>% summarise(sum=sum(REF)) %>% select(sum) %>% unlist()
}
> sums
A B C
NON-NA 110 100 30
NA 40 50 120
For example, since only the 4th term in the A column is NA, the corresponding values in the sums object are 40 (NA) and 10+20+30+50 = 150-40 = 110 (NON-NA).
My question is how do I get this output without a for loop?
Here is a solution using the pivot_ functions from tidyr. The approach pivots to a longer form so that you can group by column name and whether the column value is NA.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = c("A", "B", "C")) %>%
mutate(isna = is.na(value)) %>%
group_by(name, isna) %>%
summarize(value = sum(REF)) %>%
pivot_wider()
isna A B C
<lgl> <dbl> <dbl> <dbl>
1 FALSE 110 100 30
2 TRUE 40 50 120
df <- data.frame(A = c(1,2,3,NA,4),
B = c(NA,2,3,NA,5),
C = c(3,4,NA,NA,NA),
REF = c(10,20,30,40,50))
library(tidyverse)
# For each of the first three columns, sum REF by whether that column is NA
# and name the result after the column (.y); the per-column tables are then
# joined on their shared `grp` column.
imap(.x = df[1:3],
     .f = ~ df %>%
       group_by(grp = is.na(.x)) %>%
       # TRUE rather than T: T is an ordinary variable that can be reassigned.
       summarise(!!.y := sum(REF, na.rm = TRUE))) %>%
  reduce(left_join)
#> Joining, by = "grp"
#> Joining, by = "grp"
#> # A tibble: 2 x 4
#> grp A B C
#> <lgl> <dbl> <dbl> <dbl>
#> 1 FALSE 110 100 30
#> 2 TRUE 40 50 120
Created on 2022-01-26 by the reprex package (v2.0.1)

How can I pass a column name to a is.na function via dot-dot-dot?

I am trying to write a function that takes the column name of a data frame as a ... argument and passes it to is.na within a dplyr::mutate function call. The following toy example illustrates the basic goal:
library(tidyverse)
df <- tribble(
~id, ~value,
"a", 1,
"a", 2,
NA, 3,
"b", 4
)
df %>%
group_by(id) %>%
mutate(avg = if_else(is.na(id), NA_real_, mean(value)))
The above yields the desired outcome, but not via a function.
The following function works, but does not populate the avg column with NAs as desired:
# Group `.data` by the columns passed through `...` and add `avg`, the
# per-group mean of the `value` column.
# This version does not set `avg` to NA for rows whose grouping value is NA.
my_fun1 <- function(.data, ...) {
.data %>%
group_by(...) %>%
mutate(avg = mean(value))
}
my_fun1(df, id)
The following function yields an error indicating "is.na() applied to non-(list or vector) of type 'closure'":
# Attempted variant: group `.data` by the columns passed through `...` and set
# `avg` to NA where the grouping column is NA, otherwise to the group mean of
# `value`.
my_fun2 <- function(.data, ...) {
.data %>%
group_by(...) %>%
# NOTE(review): passing `...` straight to is.na() is the call reported to
# fail with "is.na() applied to non-(list or vector) of type 'closure'".
mutate(avg = if_else(is.na(...), NA_real_, mean(value)))
}
my_fun2(df, id)
Why does this error result, and how can I avoid it?
If we have a fixed column, then passing unquoted argument is easy i.e. use curly-curly operator
# Group `.data` by the single unquoted column `grpcol` and add `avg`: NA where
# `grpcol` is NA, otherwise the group mean of `value` (NAs dropped).
# Returns the ungrouped result.
myfun <- function(.data, grpcol) {
  grouped <- group_by(.data, {{ grpcol }})
  with_avg <- mutate(
    grouped,
    avg = if_else(
      is.na({{ grpcol }}),
      NA_real_,
      mean(value, na.rm = TRUE)
    )
  )
  ungroup(with_avg)
}
-testing
> myfun(df, id)
# A tibble: 4 x 3
id value avg
<chr> <dbl> <dbl>
1 a 1 1.5
2 a 2 1.5
3 <NA> 3 NA
4 b 4 4
Or, if we want to use ... and the user may supply more than one column name, always select the first one
# Variant taking column names through `...`: only the first name supplied is
# used. Groups by that column and adds `avg`: NA where the column is NA,
# otherwise the group mean of `value` (NAs dropped). Returns the ungrouped
# result.
myfun <- function(.data, ...) {
  first_col <- rlang::ensyms(...)[[1]]
  grouped <- group_by(.data, !!first_col)
  with_avg <- mutate(
    grouped,
    avg = if_else(
      is.na(!!first_col),
      NA_real_,
      mean(value, na.rm = TRUE)
    )
  )
  ungroup(with_avg)
}
myfun(df, id)
# A tibble: 4 x 3
id value avg
<chr> <dbl> <dbl>
1 a 1 1.5
2 a 2 1.5
3 <NA> 3 NA
4 b 4 4

Getting summary by group and overall using tidyverse

I am trying to find a way to get summary stats such as means by group and overall in one step using dplyr
#Data set-up
sex <- sample(c("M", "F"), size=100, replace=TRUE)
age <- rnorm(n=100, mean=20 + 4*(sex=="F"), sd=0.1)
dsn <- data.frame(sex, age)
library("tidyverse")
#Using dplyr to get means by group and overall
mean_by_sex <- dsn %>%
group_by(sex) %>%
summarise(mean_age = mean(age))
mean_all <- dsn %>%
summarise(mean_age = mean(age)) %>%
add_column(sex = "All")
#combining the results by groups and overall
final_result <- rbind(mean_by_sex, mean_all)
final_result
#> # A tibble: 3 x 2
#> sex mean_age
#> <fct> <dbl>
#> 1 F 24.0
#> 2 M 20.0
#> 3 All 21.9
#This is the table I want but I wonder if is the only way to do this
Is there a way to do this in fewer steps using group_by_at, group_by_all, or a similar function from tidyverse and dplyr?
Any help would be greatly appreciated
One option could perhaps be:
dsn %>%
group_by(sex) %>%
summarise(mean_age = mean(age)) %>%
add_row(sex = "ALL", mean_age = mean(dsn$age))
sex mean_age
<fct> <dbl>
1 F 24.0
2 M 20.0
3 ALL 21.9
A little switching around can do it, too.
final_result <- dsn %>%
add_row(sex = "All", age = mean(age)) %>%
group_by(sex) %>%
summarise(mean_age = mean(age))
These answers are great if you have one variable to summarize by. What about two? I want to summarize across one but leave the other as is. The above solutions do not work in this case because the data frame still needs to be grouped.
#Data set up
set.seed(3243242)
dsn <- tibble(
obese = sample(c(TRUE, FALSE), size=100, replace = TRUE),
sex = sample(c("M", "F"), size=100, replace=TRUE),
age = rnorm(n=100, mean=20 + 4*(sex=="F"), sd=0.1)
)
library("tidyverse")
I restated the original problem using 2 group_by variables.
#Extend to 2 group_by variables?
df1 <- dsn %>%
group_by(sex, obese) %>%
summarise(mean_age = mean(age)) %>%
ungroup()
#Also across sex
df2 <- dsn %>%
group_by(obese) %>%
summarise(mean_age = mean(age)) %>%
ungroup()
#Final_result:
bind_rows(df1, df2)
Way to do this in one step? You can add mean with add_row() but not with a grouped df. Another option is to create a function that does all the things on the group dataset. If there are other things you want to do, say sort or create new variables, you can do it in the function. Then, you can apply the function to each grouped dataset. After combining via dplyr::bind_rows(), you can change the missing group variable to all via tidyr::replace_na().
#' Mean age for each group of a grouped tibble
#'
#' @param df_group A grouped tibble with an `age` column
#' @return The result of `summarize()`: one row per group, with `mean_age`
find_summary <- function(df_group){
df_group %>%
summarize(mean_age = mean(age)) # add other dplyr verbs here as needed, e.g. arrange() or mutate()
}
bind_rows(
find_summary(group_by(dsn, sex, obese)),
find_summary(group_by(dsn, obese))
) %>%
replace_na(list(sex = "all"))
sex obese mean_age
<chr> <lgl> <dbl>
1 F FALSE 24.0
2 F TRUE 24.0
3 M FALSE 20.0
4 M TRUE 20.0
5 all FALSE 21.7
6 all TRUE 22.3
You can extend the idea if you want a summary of all variables, by one variable, and by two variables.
bind_rows(
find_summary(group_by(dsn, sex, obese)),
find_summary(group_by(dsn, obese)),
find_summary(dsn)
) %>%
replace_na(list(sex = "all", obese = "all"))
sex obese mean_age
<chr> <chr> <dbl>
1 F FALSE 24.0
2 F TRUE 24.0
3 M FALSE 20.0
4 M TRUE 20.0
5 all FALSE 21.7
6 all TRUE 22.3
7 all all 22.0

dplyr pass NULL to group_by

This has probably been answered somewhere, but I cannot find the answer...Mark as a duplicate and downvote as you like, but someone please help me :)
Short question
How can I pass NULL to dplyr::group_by inside a function?
library(dplyr)
dt <- data.frame(a = sample(LETTERS[1:2], 100, replace = TRUE), b = sample(LETTERS[3:4], 100, replace = TRUE), value = rnorm(100,5,1))
# Group `dt` by a, b and c and return the mean of `value` per group.
# NOTE(review): the bare names a, b, c are handed straight to group_by(); with
# c = NULL the question reports the error "Column `c` is unknown".
f1 <- function(dt, a, b, c) {
dt %>% group_by(a, b, c) %>% summarise(mean = mean(value))
}
f1(dt, a = "a", b = "b", c = NULL)
# Error in grouped_df_impl(data, unname(vars), drop) :
# Column `c` is unknown
Long explanation
I am writing a function where "b" column can be given as NULL meaning that the function should ignore this column. If the "b" column is given as a character the function should use the column to summarize data. Like this:
# Mean of `value` per group, where the second grouping column is optional:
# when `b` is NULL, group by `a` only; otherwise group by `a` and `b`.
f2 <- function(dt, a, b) {
if(is.null(b)) {
dt %>% group_by(a) %>% summarize(mean = mean(value))
} else {
dt %>% group_by(a, b) %>% summarize(mean = mean(value))
}
}
The actual function is quite long and complicated, and uses dplyr pipes to make all the summarizing code shorter. I have multiple conditions leading to different outputs and summarizing alternatives, and therefore I have shortened the if else statements by grouping first and summarizing in a separate step:
# Two-step version: first choose the grouping (by `a` only when `b` is NULL,
# otherwise by `a` and `b`), then summarise `value`.
# type: "mean" returns the per-group mean; any other value returns the
#       per-group sum.
f3 <- function(dt, a, b, type = "mean") {
# Step 1: build the grouped data once so the summary step is not duplicated.
if(is.null(b)) {
tmp <- dt %>% group_by(a)
} else {
tmp <- dt %>% group_by(a, b)
}
# Step 2: apply the requested summary to the grouped data.
if(type == "mean") {
tmp %>% summarize(mean = mean(value))
} else {
tmp %>% summarise(sum = sum(value))
}
}
If it was possible to pass NULL to the group_by function, I could considerably shorten my code (NULL is supposed to be empty anyway and such passing works with many functions such as reshape2::melt from the same author).
I'm not sure if this covers all of your use cases, but a function using tidy evaluation (see the programming with dplyr vignette) would be more flexible in that you wouldn't have to worry about how many grouping variables there are and you could pass an arbitrary vector of functions to summarize by. Hopefully, this avoids the need to keep track of NULL columns or use ifelse to choose the summary function.
For example, in the code below, ... is any number of grouping columns, including no grouping columns at all. The type argument allows you to summarize by one or more arbitrary functions:
library(tidyverse)
library(rlang)
set.seed(2)
dt <- data.frame(a = sample(LETTERS[1:2], 100, replace = TRUE),
b = sample(LETTERS[3:4], 100, replace = TRUE),
value = rnorm(100,5,1))
# Summarise one column with any number of functions, by any number of groups.
#
# data:      a data frame
# value.var: unquoted name of the column to summarise
# ...:       unquoted grouping columns; may be empty for an overall summary
# type:      character vector of function names (e.g. "mean", or the name of
#            a user-defined function visible from where f1() is called)
#
# Returns one row per group with one column per function, named
# "<function>_<column>", e.g. mean_value.
f1 <- function(data, value.var, ..., type = "mean") {
  # Capture the calling environment up front so user-defined functions named
  # in `type` can be found there.
  caller <- parent.frame()
  groups <- enquos(...)
  value.var <- enquo(value.var)
  # Resolve each name to a function and keep the name for the output columns.
  # across() with a named list replaces funs(), deprecated since dplyr 0.8.0.
  summary_fns <- lapply(type, get, envir = caller, mode = "function")
  names(summary_fns) <- type
  data %>%
    group_by(!!!groups) %>%
    summarise(across(!!value.var, summary_fns, .names = "{.fn}_{.col}"))
}
f1(dt, value, a, b)
a b mean_value
<fct> <fct> <dbl>
1 A C 5.01
2 A D 5.05
3 B C 4.95
4 B D 5.13
f1(dt, value)
mean_value
<dbl>
1 5.03
# Demo summary function: cosine of the first three elements of `x`, rounded to
# one decimal place and joined with "/" into a single string.
weird_func <- function(x) {
  rounded_cos <- round(cos(x), 1)
  first_three <- rounded_cos[seq_len(3)]
  paste(first_three, collapse = "/")
}
f1(dt, value, a, b, type=c("mean", "min", "median", "max", "weird_func"))
a b mean_value min_value median_value max_value weird_func_value
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr>
1 A C 5.01 3.26 5.07 7.08 1/-0.1/1
2 A D 5.05 2.90 5.33 6.36 -0.4/0.9/0
3 B C 4.95 3.66 4.73 7.11 0.5/-0.5/0.7
4 B D 5.13 2.98 5.46 7.05 0/0.7/0.7
f1(mtcars, mpg, cyl, type=c("mean", "median"))
cyl mean_mpg median_mpg
<dbl> <dbl> <dbl>
1 4 26.7 26
2 6 19.7 19.7
3 8 15.1 15.2
I think you need to first convert it from NULL to NA, like this (as from your answers you just need to pass the value through without involving it in the calculations)
library(dplyr)
dt <- data.frame(a = sample(LETTERS[1:2], 100, replace = TRUE), b = sample(LETTERS[3:4], 100, replace = TRUE), value = rnorm(100,5,1))
# Group `dt` by a, b and c and return the mean of `value` per group, tolerating
# c = NULL by first creating a column `c` filled with NA.
# NOTE(review): inside group_by() the bare names a and b presumably resolve to
# columns of `dt` with those names rather than to the string arguments -
# confirm against the intended call.
f1 <- function(dt, a, b, c) {
dt %>%
# An empty `c` (e.g. NULL) becomes NA so that group_by() has a column to use.
mutate(c = ifelse(is_empty(c)==TRUE,NA,c)) %>%
group_by(a, b,c) %>%
summarise(mean = mean(value))
}
f1(dt, a = "a", b = "b",c=NULL)
Results:
# A tibble: 4 x 4
# Groups: a, b [?]
a b c mean
<fct> <fct> <lgl> <dbl>
1 A C NA 5.27
2 A D NA 5.18
3 B C NA 5.27
4 B D NA 5.49

Resources