Perform operations on column names within a user defined function - r

I recently understood how to access column names inside a user defined function: How to access a column name in a user defined function with dplyr?
However, now I also need to access the column names within the operations that are being carried out. For example I would like to do this:
samp_df <- tibble(var1 = c('a', 'b', 'c'),
var_in_df = c(3,7,9))
# Question's attempt: add mean and sd columns for `variable`, plus a third
# column meant to hold their sum, naming each result after the column passed in.
# df:       data frame / tibble to extend.
# variable: unquoted column name, forwarded with {{ }}.
calculateSummaries <- function(df, variable){
df <- df %>%
mutate("mean_of_{{variable}}" := mean({{variable}}),
"sd_of_{{variable}}" := sd({{variable}}),
# The right-hand side below is two plain string literals joined with `+`,
# not references to the two columns created just above.
"sd_plus_mean_of_{{variable}}" := ("mean_of_{{variable}}" + "sd_of_{{variable}}")
)
# The assignment to `df` above is the last expression in the function body.
}
df_result <- calculateSummaries(samp_df, var_in_df)
Of course I could do:
"sd_plus_mean_of_{{variable}}" := mean({{variable}}) + sd({{variable}})
But in practice, with the real data this won't be practical.
Does anyone know how to do this?

This case is indeed a little bit tricky; I think we have to construct the names first and then use !! sym() to evaluate the strings as objects.
library(dplyr)
samp_df <- tibble(var1 = c('a', 'b', 'c'),
var_in_df = c(3,7,9))
# Add `mean_of_<col>`, `sd_of_<col>` and `sd_plus_mean_of_<col>` columns.
#
# df:       data frame / tibble to extend.
# variable: unquoted column of `df` to summarise.
# Returns `df` with the three additional columns.
calculateSummaries <- function(df, variable) {
  # Label the captured argument the same way the "{{variable}}" glue syntax
  # does on the left-hand side, so the names built here always match the
  # columns created below. deparse(substitute()) breaks as soon as `variable`
  # is forwarded from another function with {{ }} or is a long expression.
  var_nm <- as_label(enquo(variable))
  mean_var_nm <- paste0("mean_of_", var_nm)
  sd_var_nm <- paste0("sd_of_", var_nm)

  df %>%
    mutate(
      "mean_of_{{variable}}" := mean({{ variable }}),
      "sd_of_{{variable}}" := sd({{ variable }}),
      # Turn the name strings into symbols so they resolve to the new columns.
      "sd_plus_mean_of_{{variable}}" := !!sym(mean_var_nm) + !!sym(sd_var_nm)
    )
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
An alternative way is using across(), but we still have to construct the variable names.
# across()-based variant: creates the mean and sd columns for `variable`, then
# uses across() over that same column to add their sum.
# df:       data frame / tibble to extend.
# variable: unquoted column name, forwarded with {{ }}.
# The sum column is named `<col>_sd_plus_mean_of` (see the printed output).
calculateSummaries <- function(df, variable){
df %>%
mutate("mean_of_{{variable}}" := mean({{variable}}),
"sd_of_{{variable}}" := sd({{variable}}),
across(c({{ variable }}),
# cur_column() gives the name of the column across() is currently handling;
# the two helper columns created above are fetched by that name with get().
list(sd_plus_mean_of = ~ get(paste0("mean_of_", cur_column())) + get(paste0("sd_of_", cur_column())))
)
)
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df var_in_df_sd_plus_mean_of
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
Here is a final way inspired by Lionel Henry's answer to this question. We can use rlang::englue() to construct names and use those names with the .data[[...]] pronoun.
# Add mean, sd and mean-plus-sd columns for one column of `df`.
#
# df:       data frame / tibble to extend.
# variable: unquoted column of `df`.
# Returns `df` with `mean_of_<col>`, `sd_of_<col>` and
# `sd_plus_mean_of_<col>` appended.
calculateSummaries <- function(df, variable) {
  # Build the helper column names as strings up front; they are needed on the
  # right-hand side, where glue syntax is not interpolated.
  sd_var_nm <- rlang::englue("sd_of_{{ variable }}")
  mean_var_nm <- rlang::englue("mean_of_{{ variable }}")

  mutate(
    df,
    "mean_of_{{ variable }}" := mean({{ variable }}),
    "sd_of_{{ variable }}" := sd({{ variable }}),
    # .data[[name]] looks the freshly created columns up by string name.
    "sd_plus_mean_of_{{ variable }}" :=
      .data[[mean_var_nm]] + .data[[sd_var_nm]]
  )
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
Created on 2022-10-13 by the reprex package (v2.0.1)

According to this tidyverse blog post glue strings are only supported as result names, which IMHO means only on the LHS.
Besides the options offered by @TimTeaFan, another option would be to use across to compute all desired values and name the columns using the .names argument:
library(dplyr)
# Compute mean, sd and mean + sd of one column in a single across() call.
#
# df:       data frame / tibble to extend.
# variable: unquoted column of `df`.
# Returns `df` with columns named `<fn>_of_<col>` appended.
calculateSummaries1 <- function(df, variable) {
  # Named list: each name becomes the `{.fn}` part of the output column name.
  summary_fns <- list(
    mean = mean,
    sd = sd,
    sd_plus_mean = ~ mean(.x) + sd(.x)
  )

  mutate(
    df,
    across({{ variable }}, .fns = summary_fns, .names = "{.fn}_of_{.col}")
  )
}
calculateSummaries1(samp_df, var_in_df)
#> # A tibble: 3 × 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
And a second option would be to use some helper variable names for the mean and the sd, which avoids using glue syntax on the RHS but requires an additional rename step:
# Add mean, sd and mean + sd columns using temporary helper columns that are
# renamed to their final, variable-specific names afterwards.
#
# df:       data frame / tibble to extend.
# variable: unquoted column of `df`.
# Returns `df` with `mean_of_<col>`, `sd_of_<col>` and
# `sd_plus_mean_of_<col>` appended.
calculateSummaries2 <- function(df, variable) {
  df %>%
    mutate(
      # Dot-prefixed helper names: plain `mean` / `sd` would overwrite any
      # existing columns of that name in `df` before being renamed away.
      .mean_tmp = mean({{ variable }}),
      .sd_tmp = sd({{ variable }}),
      "sd_plus_mean_of_{{variable}}" := .mean_tmp + .sd_tmp
    ) %>%
    rename(
      "mean_of_{{variable}}" := .mean_tmp,
      "sd_of_{{variable}}" := .sd_tmp
    )
}
calculateSummaries2(samp_df, var_in_df)
#> # A tibble: 3 × 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39

Related

Create parameterized summaries of a column

I have a tibble and I want create several summaries of the same column, specifically the first, second and third quartiles.
To do it, I create a named list of functions and that works fine.
library("tidyverse")
set.seed(1234)
df <- tibble(x = rnorm(100))
df %>%
summarise(
across(x,
list(
Q1 = ~ quantile(., 1 / 4),
Q2 = ~ quantile(., 2 / 4),
Q3 = ~ quantile(., 3 / 4)
),
.names = "{.fn}"
)
)
#> # A tibble: 1 × 3
#> Q1 Q2 Q3
#> <dbl> <dbl> <dbl>
#> 1 -0.895 -0.385 0.471
Can I achieve this by specifying the list of probabilities to pass to quantile? So that I save myself typing and more importantly avoid hard-coding the arguments to pass to the aggregating function.
The following doesn't work because it creates one row per probability rather than one column.
df %>%
summarise(
across(x, quantile, 1:3 / 4)
)
#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 -0.895
#> 2 -0.385
#> 3 0.471
You're almost there:
df <- tibble(x = rnorm(100))
df %>%
summarise(
across(x,
map(1:3, ~partial(quantile, probs=./4)),
.names = "Q{.fn}"
)
)
# A tibble: 1 x 3
Q1 Q2 Q3
<dbl> <dbl> <dbl>
1 -0.579 0.0815 0.475
If you define the quantiles like this:
Q <- c(0.25, 0.5, 0.75)
Then the following code will produce columns of the appropriate quantiles with sensible labels:
# One summary function per probability in `Q`, labelled Q_<percent>.
# Each closure captures its own probability, which avoids editing the parsed
# formula by position.
quantile_fns <- setNames(
  lapply(Q, function(p) {
    force(p)
    function(v) quantile(v, probs = p)
  }),
  paste0("Q_", round(100 * Q))
)

df %>%
  summarise(
    across(x, quantile_fns, .names = "{.fn}")
  )
#> # A tibble: 1 x 3
#> Q_25 Q_50 Q_75
#> <dbl> <dbl> <dbl>
#> 1 -0.895 -0.385 0.471
Created on 2022-06-29 by the reprex package (v2.0.1)

group_by and summaries with variable number of variables

Using the {{var}} notation the following code works.
The variables to be used for grouping and for summarizing can be given as parameters to my_summary.
I would like to modify my_summary so that I can give a varying number of variables for both grouping and summarizing. Can this be done?
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
# Group `df_in` by one column and take the mean of another.
# df_in:       input data frame / tibble.
# group_var:   unquoted grouping column, forwarded with {{ }}.
# summary_var: unquoted column to average, forwarded with {{ }} inside vars().
my_summary <- function(df_in,group_var,summary_var){
df_in |>
group_by({{group_var}}) |>
summarise_at(vars({{summary_var}}),mean)
}
my_summary(demo_df,gender,weight)
Another possible solution, allowing for multiple grouping variables:
library(tidyverse)
# Mean of the selected columns within each combination of grouping columns.
#
# df_in:       input data frame / tibble.
# group_var:   grouping columns wrapped in vars(), e.g. vars(age, gender).
# summary_var: tidyselect expression for the columns to average.
# Returns an ungrouped tibble with one row per group.
my_summary <- function(df_in, group_var, summary_var) {
  # !!! splices the list produced by vars() into separate grouping columns.
  grouped <- group_by(df_in, !!!group_var)
  summarise(grouped, across({{ summary_var }}, mean), .groups = "drop")
}
my_summary(demo_df, vars(age,gender), c(weight,size))
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Alternatively, without vars (that may be superseded):
library(tidyverse)
# Mean of the selected columns within each combination of grouping columns.
#
# df_in:       input data frame / tibble.
# summary_var: tidyselect expression for the columns to average,
#              e.g. c(weight, size).
# ...:         unquoted grouping columns, forwarded to group_by().
# Returns an ungrouped tibble with one row per group.
my_summary <- function(df_in, summary_var, ...) {
  df_in %>%
    # Dots can be passed straight through; no enquos()/!!! round trip needed.
    group_by(...) %>%
    # A single argument is embraced with {{ }} rather than captured with
    # enquos() and spliced into across().
    summarise(across({{ summary_var }}, mean), .groups = "drop")
}
my_summary(demo_df, c(weight, size), age, gender)
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Use summarise(across(.)).
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
tibble(age=as.integer(rep(c(10,20),each=10)),
gender=rep(c("f","m"),10),
weight=rnorm(20,70,7),
size=rnorm(20,160,15))
# Mean of the selected columns within each level of one grouping column.
#
# df_in:       input data frame / tibble.
# group_var:   unquoted grouping column.
# summary_var: tidyselect expression for the columns to average,
#              e.g. weight:size.
my_summary <- function(df_in, group_var, summary_var) {
  by_group <- group_by(df_in, {{ group_var }})
  summarise(by_group, across({{ summary_var }}, mean))
}
my_summary(demo_df, gender, weight:size)
#> # A tibble: 2 × 3
#> gender weight size
#> <chr> <dbl> <dbl>
#> 1 f 67.9 163.
#> 2 m 72.0 161.
Created on 2022-06-09 by the reprex package (v2.0.1)

Loop to make a basic table for many variables by condition

I am running an experiment where participants are randomly assigned to one of two conditions, and then I collect data on several variables. Here is an example of my code:
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
For each variable (var1, var2, etc.) I would like a little table with the count, mean, and standard deviation. This code creates the kind of table that I want:
# Count, mean and standard deviation of var1 within each condition.
# Grouping by the bare column name (instead of `df$condition`) keeps the
# grouping column named `condition` in the result.
group_by(df, condition) %>%
  summarise(
    count = n(),
    mean = mean(var1),
    sd = sd(var1)
  )
But because I have many variables, I would like to use some kind of loop (or "lapply"?) to create all these tables at once. It would also be great if each table could show the name of the variable. Thanks!
You can just use summarise on all the variables, i.e.
library(dplyr)
group_by(df, condition) %>%
summarise(across(everything(), ~ c(count = n(), mean = mean(.), sd = sd(.))))
`summarise()` has grouped output by 'condition'. You can override using the `.groups` argument.
# A tibble: 6 x 5
# Groups: condition [2]
condition var1 var2 var3 var4
<fct> <dbl> <dbl> <dbl> <dbl>
1 Digital 5 5 5 5
2 Digital 5.8 2.2 3.8 3
3 Digital 1.10 1.64 3.03 1.22
4 Physical 5 5 5 5
5 Physical 5.6 4.6 4.6 4.6
6 Physical 1.14 1.82 2.41 1.34
You can control the output structure by changing object in the formula, i.e.
group_by(df, condition) %>%
summarise(across(everything(), ~ data.frame(count = n(), mean = mean(.), sd = sd(.))))
# A tibble: 2 x 5
condition var1$count $mean $sd var2$count $mean $sd var3$count $mean $sd var4$count $mean $sd
<fct> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 Physical 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
We could still do it with summarise using a list:
library(dplyr)
# Count, mean and sd of every `var*` column per condition; result columns are
# named <column>_<function>.
df %>%
  group_by(condition) %>%
  summarise(across(
    starts_with("var"),
    # Spell the argument out as `.fns`, and set na.rm inside each lambda
    # rather than passing it through across()'s `...`.
    .fns = list(
      n = ~ n(),
      mean = ~ mean(.x, na.rm = TRUE),
      sd = ~ sd(.x, na.rm = TRUE)
    )
  ))
condition var1_n var1_mean var1_sd var2_n var2_mean var2_sd var3_n var3_mean var3_sd var4_n var4_mean var4_sd
<dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 -1 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 1 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
# Print one count/mean/sd table per measured variable, split by condition.
# names(df)[-1] drops the leading `condition` column and, unlike
# 2:length(names(df)), yields an empty vector when there is nothing to loop over.
for (var in names(df)[-1]) {
  tab <- df %>%
    group_by(condition) %>%
    summarise(
      count = n(),
      # .data[[var]] looks the column up by the string held in `var`, so no
      # select()/rename() step with an external character vector is needed.
      mean = mean(.data[[var]]),
      sd = sd(.data[[var]])
    )
  print(var)
  print(tab)
}
gives
[1] "var1"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10
2 Physical 5 5.6 1.14
[1] "var2"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 2.2 1.64
2 Physical 5 4.6 1.82
[1] "var3"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3.8 3.03
2 Physical 5 4.6 2.41
[1] "var4"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3 1.22
2 Physical 5 4.6 1.34
>
Rather than lapply, the function of choice is aggregate, a close relative to the *apply family at least. Put in a custom function f.
# Summary used by aggregate(): number of values, mean and standard deviation
# of `x`, returned as a named numeric vector (n, mu, sd).
f <- function(x) {
  c(n = length(x), mu = mean(x), sd = sd(x))
}
aggregate(. ~ condition, df, f)
# condition var1.n var1.mu var1.sd var2.n var2.mu var2.sd var3.n var3.mu var3.sd var4.n var4.mu var4.sd
# 1 Digital 5.000000 5.800000 1.095445 5.000000 2.200000 1.643168 5.000000 3.800000 3.033150 5.000000 3.000000 1.224745
# 2 Physical 5.000000 5.600000 1.140175 5.000000 4.600000 1.816590 5.000000 4.600000 2.408319 5.000000 4.600000 1.341641
If you want to aggregate on a specific set of variables (e.g. assembled with grep), use list notation instead.
aggregate(df[grep('^var', names(df))], df['condition'], f)
You can use gtsummary here if you need to present the results.
Example one below will make one table with all of your variables. Example two will split each variable into its own table (if you need them to be separate).
library(gtsummary)
#example one:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) "))
#example two:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) ")) %>%
tbl_split(variables = c(var1, var2,var3,var4))

conditionally mutating column values using `dplyr`

I am using WRS2 to carry out robust pairwise comparisons. But one problem is that it removes the group level names from the output dataframes and saves it in a different object.
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# comparisons
x$comp
#> Group Group psihat ci.lower ci.upper p.value
#> [1,] 1 2 -1.0 -3.440879 1.44087853 0.25984505
#> [2,] 1 3 -2.8 -5.536161 -0.06383861 0.04914871
#> [3,] 2 3 -1.8 -4.536161 0.93616139 0.17288911
# vector with group level names
x$fnames
#> [1] "placebo" "low" "high"
I can convert it to a tibble:
# converting to tibble
suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 -1 -3.44 1.44 0.260
#> 2 1 3 -2.8 -5.54 -0.0638 0.0491
#> 3 2 3 -1.8 -4.54 0.936 0.173
I would then like to replace the group column numeric values with actual names included in fnames (so map fnames[1] -> 1, fnames[2] -> 2, and so on).
So the final dataframe should look something like the following-
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
In this case, it was easy to just copy-paste the three values, but I want to have a generalizable approach where no matter the number of levels, it works. How can I do this using dplyr?
Using a named vector to match with tidyverse. This matches by value rather than by position, i.e. it would still work if the values in the 'Group' columns are not a consecutive sequence, or are character.
library(dplyr)
# Lookup table: names are the group codes ("1", "2", ...), values the labels.
# Built once here instead of once per matched column.
group_lookup <- setNames(x$fnames, seq_along(x$fnames))

as_tibble(x$comp, .name_repair = "unique") %>%
  mutate(across(
    starts_with("Group"),
    # Index by the code converted to character so matching is by name;
    # unname() drops the codes that would otherwise stay attached as names.
    ~ unname(group_lookup[as.character(.x)])
  ))
Does this fulfil your needs:
library(dplyr)

# Labels indexed by group number; named `group_names` so base::names() is not
# shadowed.
group_names <- c("A", "B", "C")
df <- data.frame(group = c(1, 2, 3))
# Replace each numeric group code with the label at that position.
df %>% mutate(group = group_names[group])
group
1 A
2 B
3 C
Here's an approach using the recode function, with the recoding vector built programmatically from the data:
# Setup
set.seed(123)
library(WRS2)
library(tidyverse)
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# Create recoding vector: names are the group codes, values the level labels.
# seq_along() stays correct (empty) when there are no levels, unlike 1:length().
recode.vec <- x$fnames %>% set_names(seq_along(x$fnames))
# Recode columns: splice the named vector into recode() as old = new pairs
x.comp <- x$comp %>%
  as_tibble(.name_repair = make.unique) %>%
  mutate(across(starts_with("Group"), ~ recode(.x, !!!recode.vec)))
Output:
x.comp
#> # A tibble: 3 x 6
#> Group Group.1 psihat ci.lower ci.upper p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
Try this tidyverse approach, formatting the data to long after extracting the objects as tibbles. You can use left_join() to get your groups as you want. Here is the code to get something close to what you want:
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
#Transform to tibble
df1 <- suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
# Extract labels, keyed by their position in x$fnames
df2 <- tibble(treat = x$fnames) %>% mutate(value = row_number())
# Reshape df1 to long, attach the labels by group code, reshape back to wide
df1 <- df1 %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = c(group1, group2), names_to = "group") %>%
  # Explicit key: the numeric group code produced by pivot_longer() is matched
  # against the position stored in df2.
  left_join(df2, by = "value") %>%
  select(-value) %>%
  pivot_wider(names_from = group, values_from = treat) %>%
  select(-id)
Output:
# A tibble: 3 x 6
psihat ci.lower ci.upper p.value group1 group2
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 -1 -3.44 1.44 0.260 placebo low
2 -2.8 -5.54 -0.0638 0.0491 placebo high
3 -1.8 -4.54 0.936 0.173 low high

Summarize variables beside

I am looking for a solution for my problem. I just can solve it with manually rearranging.
Example code:
library(dplyr)
set.seed(1)
Data <- data.frame(
W = sample(1:10),
X = sample(1:10),
Y = sample(c("yes", "no"), 10, replace = TRUE),
Z = sample(c("cat", "dog"), 10, replace = TRUE)
)
#
summarized <- Data %>% group_by(Z) %>% summarise_if(is.numeric,funs(mean,median),na.rm=T)
print(Data)
I want the output to look like below, with each function applied to the first col, then each function applied to the second col, and so on. My code does it vice versa.
Of course I could rearrange the cols but that is not what Data Science is about. I have hundreds of cols and want to apply multiple functions.
This is what I want:
summarized <- summarized[,c(1,2,4,3,5)] #best solution yet
Is there any argument I am missing? I bet there is an easy solution or an other function does the job.
Guys, thx in advance!
One option would be to post-process with adequate select_helpers
library(dplyr)
summarized %>%
select(Z, starts_with('W'), everything())
# A tibble: 2 x 5
# Z W_mean W_median X_mean X_median
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 cat 5.25 5.5 3.75 3.5
#2 dog 5.67 5.5 6.67 7
If there are 100s of columns, one approach is to get the substring of the column names, and order
library(stringr)
summarized %>%
select(Z, order(str_remove(names(.), "_.*")))
# A tibble: 2 x 5
# Z W_mean W_median X_mean X_median
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 cat 5.25 5.5 3.75 3.5
#2 dog 5.67 5.5 6.67 7
You can use starts_with() to select the columns, instead of by number.
library(dplyr)
set.seed(1)
Data <- data.frame(
W = sample(1:10),
X = sample(1:10),
Y = sample(c("yes", "no"), 10, replace = TRUE),
Z = sample(c("cat", "dog"), 10, replace = TRUE)
)
# Mean and median of every numeric column per level of Z, with the summary
# columns of each variable kept next to each other.
summarized <-
  Data %>%
  group_by(Z) %>%
  # across() with a named list replaces summarise_if() + funs();
  # na.rm is set inside each lambda.
  summarise(across(
    where(is.numeric),
    list(
      mean = ~ mean(.x, na.rm = TRUE),
      median = ~ median(.x, na.rm = TRUE)
    )
  )) %>%
  select(Z, starts_with("W_"), starts_with("X_"))
summarized
#> # A tibble: 2 x 5
#> Z W_mean W_median X_mean X_median
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 cat 5.25 5.5 3.75 3.5
#> 2 dog 5.67 5.5 6.67 7
Created on 2019-12-09 by the reprex package (v0.3.0)

Resources