group by multiple variables without intersection - r

I want to group_by multiple columns wihout intersection.
I am looking for the output below without having to replicate the code for both variables.
library(dplyr)
> mtcars %>%
+ group_by(cyl) %>%
+ summarise(mean(disp))
# A tibble: 3 × 2
cyl `mean(disp)`
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
>
> mtcars %>%
+ group_by(am) %>%
+ summarise(mean(disp))
# A tibble: 2 × 2
am `mean(disp)`
<dbl> <dbl>
1 0 290.
2 1 144.
I am not looking for the code below since this gives the intersection between the variables:
> mtcars %>%
+ group_by(cyl, am) %>%
+ summarise(mean(disp))
# A tibble: 6 × 3
# Groups: cyl [3]
cyl am `mean(disp)`
<dbl> <dbl> <dbl>
1 4 0 136.
2 4 1 93.6
3 6 0 205.
4 6 1 155
5 8 0 358.
6 8 1 326
Thanks a lot!

An alternative would be a custom function:
my_func <- function(df, group){
df %>%
group_by({{group}}) %>%
summarise(mean_disp = mean(disp))
}
my_func(mtcars, cyl)
my_func(mtcars, am)
cyl mean_disp
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
> my_func(mtcars, am)
# A tibble: 2 × 2
am mean_disp
<dbl> <dbl>
1 0 290.
2 1 144.

Something like this?
library(tidyverse)
c("cyl", "am") %>%
map(~ mtcars %>%
group_by(!!sym(.x)) %>%
summarise(result = mean(disp)))
[[1]]
# A tibble: 3 x 2
cyl result
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
[[2]]
# A tibble: 2 x 2
am result
<dbl> <dbl>
1 0 290.
2 1 144.

Related

curly curly operator doesnt work with map() in R

library(tidyverse)
mean_by <- function(data,by,conti){
data %>% group_by({{by}}) %>% summarise(mean=mean({{conti}})) %>%
print() %>%
ggplot(aes(x={{by}},y=mean))+geom_col()
}
map(mtcars %>% select_if(is.numeric),~mean_by(mtcars,cyl,.))
# Not quite the same
mean_by(mtcars,cyl,carb)
I was toying around with the curly curly operator in R (just learned about it!) and then when iterating using map it seemd like the grouping isnt working very well, and I cant get my hands around the problem. What am I doing wrong?
Btw, When trying the explicit pmap way, I couldnt get around using the cyl variable in a clever way
pmap(mtcars %>% select_if(is.numeric),mean_by,..1=mtcars,..2=cyl,..3=.)
Error in pmap():
i In index: 1.
Caused by error in withCallingHandlers():
! object 'cyl' not found
Run rlang::last_error() to see where the error occurred.
It is expecting the column names and not the values - here, the select_if returns a subset of columns that are numeric. We may need the names to loop which would be a string, thus it is better to convert to symbol and evaluate (!!)
library(dplyr)
library(purrr)
mean_by <- function(data,by,conti){
by_sym <- rlang::ensym(by)
conti <- rlang::ensym(conti)
data %>% group_by(!! by_sym) %>%
summarise(mean=mean(!!conti)) %>%
print() %>%
ggplot(aes(x= !!by_sym,y=mean))+geom_col()
}
map(mtcars %>%
select_if(is.numeric) %>%
names,~mean_by(mtcars,cyl, !!.x))
-output (graphs removed)
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4
2 6 6
3 8 8
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 82.6
2 6 122.
3 8 209.
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4.07
2 6 3.59
3 8 3.23
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 2.29
2 6 3.12
3 8 4.00
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 19.1
2 6 18.0
3 8 16.8
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 0.909
2 6 0.571
3 8 0
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 0.727
2 6 0.429
3 8 0.143
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 4.09
2 6 3.86
3 8 3.29
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 1.55
2 6 3.43
3 8 3.5
I've not seen the tilde syntax with map, but if you change that it seems to work.
map(mtcars %>% select_if(is.numeric), mean_by, data=mtcars, by=cyl)
Side note, you don't need that print() statement in mean_by.
mean_by <- function(data,by,conti){
data %>% group_by({{by}}) %>% summarise(mean=mean({{conti}})) %>%
ggplot(aes(x={{by}},y=mean))+geom_col()
}

How can I take the first input of ...?

I want to create a function where the first input of create_df gets turned into the number 1 using tidyeval. Here it should be only the cyl column.
How do I pull the first "input" of ...?
library(dplyr, quietly = T)
create_df <- function(...){
var <- enquos(...)
first <- as_label(quos(...))
mtcars %>%
group_by(!!!var) %>%
summarise(mean = mean(mpg)) %>%
mutate(!!first := 1)
}
create_df(cyl, am)
#> `summarise()` regrouping output by 'cyl' (override with `.groups` argument)
#> # A tibble: 6 x 4
#> # Groups: cyl [3]
#> cyl am mean `<quos>`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 22.9 1
#> 2 4 1 28.1 1
#> 3 6 0 19.1 1
#> 4 6 1 20.6 1
#> 5 8 0 15.0 1
#> 6 8 1 15.4 1
Created on 2020-07-01 by the reprex package (v0.3.0)
You can pull the first variable out of the dots by simply taking the first element out of vars (with credit to Lionel Henry for pointing this out).
create_df <- function(...){
var <- enquos(...)
first <- as_label(var[[1]])
mtcars %>%
group_by(!!!var) %>%
summarise(mean = mean(mpg)) %>%
mutate(!!first := 1)
}
create_df(cyl, am)
#> `summarise()` regrouping output by 'cyl' (override with `.groups` argument)
#> # A tibble: 6 x 3
#> # Groups: cyl [1]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 1 0 22.9
#> 2 1 1 28.1
#> 3 1 0 19.1
#> 4 1 1 20.6
#> 5 1 0 15.0
#> 6 1 1 15.4

creating variables for proportions in each category, over multiple variables

I want to create a data frame with columns for the proportion of observations in each category, much like this:
library(tidyverse)
mtcars %>%
group_by(am) %>%
summarise(gear3 = sum(gear == 3)/n(),
gear4 = sum(gear == 4)/n(),
gear5 = sum(gear == 5)/n(),
cyl4 = sum(cyl == 4)/n(),
cyl6 = sum(cyl == 6)/n(),
cyl8 = sum(cyl == 8)/n())
# # A tibble: 2 x 7
# am gear3 gear4 gear5 cyl4 cyl6 cyl8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0 0.789 0.211 0 0.158 0.211 0.632
# 2 1 0 0.615 0.385 0.615 0.231 0.154
I am looking for way to this without manually naming the new summary variables?
There seems to be a few questions, such as here, related to creating a proportions for single variables, which i could replicate for each variable, pivot and and then combine but it will become tedious in my application - i am trying to build the data frame for many variables
mtcars %>%
group_by(am, gear) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
# # A tibble: 4 x 4
# # Groups: am [2]
# am gear n freq
# <dbl> <dbl> <int> <dbl>
# 1 0 3 15 0.789
# 2 0 4 4 0.211
# 3 1 4 8 0.615
# 4 1 5 5 0.385
mtcars %>%
group_by(am, cyl) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
# # A tibble: 6 x 4
# # Groups: am [2]
# am cyl n freq
# <dbl> <dbl> <int> <dbl>
# 1 0 4 3 0.158
# 2 0 6 4 0.211
# 3 0 8 12 0.632
# 4 1 4 8 0.615
# 5 1 6 3 0.231
# 6 1 8 2 0.154
Here is one solution:
library(dplyr)
freqPairs <- function(df, first, second){
pairs <- as.list(data.frame(t(expand.grid(first, second))))
res <- lapply(pairs, function(z) df %>%
group_by(!!sym(z[1]), !!sym(z[2])) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
{colnames(.)[1:2] = c("Var1", "Var2"); .} %>%
ungroup())
setNames(res, unlist(lapply(pairs, paste, collapse="_vs_")))
}
bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
#> # A tibble: 10 x 5
#> comparison Var1 Var2 n freq
#> <chr> <dbl> <dbl> <int> <dbl>
#> 1 am_vs_cyl 0 4 3 0.158
#> 2 am_vs_cyl 0 6 4 0.211
#> 3 am_vs_cyl 0 8 12 0.632
#> 4 am_vs_cyl 1 4 8 0.615
#> 5 am_vs_cyl 1 6 3 0.231
#> 6 am_vs_cyl 1 8 2 0.154
#> 7 am_vs_gear 0 3 15 0.789
#> 8 am_vs_gear 0 4 4 0.211
#> 9 am_vs_gear 1 4 8 0.615
#> 10 am_vs_gear 1 5 5 0.385
Created on 2020-05-13 by the reprex package (v0.3.0)
You can always recover the names of Var1 and Var2 from the comparison column, e.g. by splitting that string. Example:
library(data.table)
res <- bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
data.table(res)[, c("Variable1", "Variable2") := tstrsplit(comparison, "_vs_")][]
#> comparison Var1 Var2 n freq Variable1 Variable2
#> 1: am_vs_cyl 0 4 3 0.1578947 am cyl
#> 2: am_vs_cyl 0 6 4 0.2105263 am cyl
#> 3: am_vs_cyl 0 8 12 0.6315789 am cyl
#> 4: am_vs_cyl 1 4 8 0.6153846 am cyl
#> 5: am_vs_cyl 1 6 3 0.2307692 am cyl
#> 6: am_vs_cyl 1 8 2 0.1538462 am cyl
#> 7: am_vs_gear 0 3 15 0.7894737 am gear
#> 8: am_vs_gear 0 4 4 0.2105263 am gear
#> 9: am_vs_gear 1 4 8 0.6153846 am gear
#> 10: am_vs_gear 1 5 5 0.3846154 am gear
Note:
If you really want all possible pairs in both orders, you could use something like:
pairs <- c(combn(colnames(mtcars), 2, simplify=FALSE),
lapply(combn(colnames(mtcars), 2, simplify=FALSE), rev))
Figured out a way using map() in purrr
First, a function to calculate a named vector of proportions
prop <- function(v){
n <- match.call() %>%
as.character() %>%
.[2] %>%
str_extract(pattern = "(?<=\\$)(.*)")
table(v) %>%
`/`(sum(.)) %>%
as.matrix() %>%
t() %>%
as_tibble() %>%
set_names(paste0(n, colnames(.)))
}
prop(v = mtcars$gear)
# # A tibble: 1 x 3
# gear3 gear4 gear5
# <dbl> <dbl> <dbl>
# 1 0.469 0.375 0.156
Then using map() to apply the function to each group, one variable at a time
mtcars %>%
group_nest(am) %>%
mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
p_cyl = map(.x = data, .f = ~prop(.x$cyl))) %>%
unnest(c(p_gear, p_cyl)) %>%
select(-data)
# # A tibble: 2 x 7
# am gear3 gear4 gear5 cyl4 cyl6 cyl8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0 0.789 0.211 NA 0.158 0.211 0.632
# 2 1 NA 0.615 0.385 0.615 0.231 0.154
A further example, including replacing NA with zeros
mtcars %>%
group_nest(carb) %>%
mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
p_cyl = map(.x = data, .f = ~prop(.x$cyl)),
p_vs = map(.x = data, .f = ~prop(.x$vs))) %>%
unnest(c(p_gear, p_cyl, p_vs)) %>%
select(-data) %>%
mutate_all(~ifelse(is.na(.), 0, .))
# # A tibble: 6 x 9
# carb gear3 gear4 gear5 cyl4 cyl6 cyl8 vs1 vs0
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0.429 0.571 0 0.714 0.286 0 1 0
# 2 2 0.4 0.4 0.2 0.6 0 0.4 0.5 0.5
# 3 3 1 0 0 0 0 1 0 1
# 4 4 0.5 0.4 0.1 0 0.4 0.6 0.2 0.8
# 5 6 0 0 1 0 1 0 0 1
# 6 8 0 0 1 0 0 1 0 1

Standard Deviation coming up NA when using summarise() function

I am trying to calculate descriptive statistics for the birthweight data set (birthwt) found in RStudio. However, I'm only interested in a few variables: age, ftv, ptl and lwt.
This is the code I have so far:
library(MASS)
library(dplyr)
data("birthwt")
grouped <- group_by(birthwt, age, ftv, ptl, lwt)
summarise(grouped,
mean = mean(bwt),
median = median(bwt),
SD = sd(bwt))
It gives me a pretty-printed table but only a limited number of the SD is filled and the rest say NA. I just can't work out why or how to fix it!
I stumbled here for another reason and also for me, the answer comes from the docs:
# BEWARE: reusing variables may lead to unexpected results
mtcars %>%
group_by(cyl) %>%
summarise(disp = mean(disp), sd = sd(disp))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 3 x 3
#> cyl disp sd
#> <dbl> <dbl> <dbl>
#> 1 4 105. NA
#> 2 6 183. NA
#> 3 8 353. NA
So, in case someone has the same reason as me, instead of reusing a variable, create new ones:
mtcars %>%
group_by(cyl) %>%
summarise(
disp_mean = mean(disp),
disp_sd = sd(disp)
)
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 3 x 3
cyl disp_mean disp_sd
<dbl> <dbl> <dbl>
1 4 105. 26.9
2 6 183. 41.6
3 8 353. 67.8
The number of rows for some of the groups are 1.
grouped %>%
summarise(n = n())
# A tibble: 179 x 5
# Groups: age, ftv, ptl [?]
# age ftv ptl lwt n
# <int> <int> <int> <int> <int>
# 1 14 0 0 135 1
# 2 14 0 1 101 1
# 3 14 2 0 100 1
# 4 15 0 0 98 1
# 5 15 0 0 110 1
# 6 15 0 0 115 1
# 7 16 0 0 110 1
# 8 16 0 0 112 1
# 9 16 0 0 135 2
#10 16 1 0 95 1
According to ?sd,
The standard deviation of a length-one vector is NA.
This results in NA values for the sd where there is only one element

Iterative summary by column pairs using purrr map

I have a large dataset from which I wish to obtain summary estimates (mean, medians, counts, etc) of one column when grouped by two other columns.
Trying really hard to work out how to do this using purrr - hopefully to get this workflow to click for future projects... but very stuck.
As a reproducible example, this works for grouping by am and vs, and estimating summary values of mpg
library(tidyverse)
library(rlang)
mtcars %>%
group_by(am, vs) %>%
summarise(mean_mpg = mean(mpg),
median_mpg = median(mpg),
count = n())
However, to extend this example, say I wanted to group for am and vs; then am and gear; then am and carb. Intuitively, this seems to be something map should handle.
group_vars <- c("vs", "gear", "carb")
group_syms <- rlang::syms(group_vars)
sym_am <- rlang::sym("am")
mtcars %>%
map_df(~group_by(!!sym_am, !!!group_syms) %>%
summarise(mean_mpg = mean(mpg),
summarise(median_mpg = median(mpg),
summarise(count = n())
)
#Error in !sym_am : invalid argument type
We could use the map2 from purrr to use multiple symbols as arguments and then evaluate it within the group_by and summarise the output
library(tidyverse)
map2_df(list(sym_am), group_syms, ~ mtcars %>%
group_by(!!.x, !!.y) %>%
summarise(mean_mgp = mean(mpg), median_mpg = median(mpg),count = n()))
Here's one approach
library(tidyverse)
variable_grp <- c("vs", "gear", "carb")
constant_grp <- c("am")
group_vars <- lapply(variable_grp, function(i) c(constant_grp, i))
map(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()))
This will produce a list of the summary statistics for each group. The issue with using map_df with your problem is that your column names for each group are different (1st group: am, vs ; 2nd group: am, gear ...). Therefore, you need to rename the variable_column if you're using map_df
map_df(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()) %>%
setNames(c("am", "variable_column", "mean_mpg", "median_mpg", "count")))
# A tibble: 17 x 5
# Groups: am [2]
# am variable_column mean_mpg median_mpg count
# <dbl> <dbl> <dbl> <dbl> <int>
# 1 0 0 15.05000 15.20 12
# 2 0 1 20.74286 21.40 7
# 3 1 0 19.75000 20.35 6
# 4 1 1 28.37143 30.40 7
# 5 0 3 16.10667 15.50 15
# 6 0 4 21.05000 21.00 4
# 7 1 4 26.27500 25.05 8
# 8 1 5 21.38000 19.70 5
# 9 0 1 20.33333 21.40 3
# 10 0 2 19.30000 18.95 6
# 11 0 3 16.30000 16.40 3
# 12 0 4 14.30000 14.30 7
# 13 1 1 29.10000 29.85 4
# 14 1 2 27.05000 28.20 4
# 15 1 4 19.26667 21.00 3
# 16 1 6 19.70000 19.70 1
# 17 1 8 15.00000 15.00 1
You can save the variable_column name using the .id argument of map_df and a post-map_df mutate
map_df(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()) %>%
setNames(c("am", "variable_column", "mean_mpg", "median_mpg", "count")),
.id="variable_col_name") %>%
mutate(variable_col_name = variable_grp[as.numeric(variable_col_name)])
# A tibble: 17 x 6
# Groups: am [2]
# variable_col_name am variable_column mean_mpg median_mpg count
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
# 1 vs 0 0 15.05000 15.20 12
# 2 vs 0 1 20.74286 21.40 7
# 3 vs 1 0 19.75000 20.35 6
# 4 vs 1 1 28.37143 30.40 7
# 5 gear 0 3 16.10667 15.50 15
# 6 gear 0 4 21.05000 21.00 4
# 7 gear 1 4 26.27500 25.05 8
# 8 gear 1 5 21.38000 19.70 5
# 9 carb 0 1 20.33333 21.40 3
# 10 carb 0 2 19.30000 18.95 6
# 11 carb 0 3 16.30000 16.40 3
# 12 carb 0 4 14.30000 14.30 7
# 13 carb 1 1 29.10000 29.85 4
# 14 carb 1 2 27.05000 28.20 4
# 15 carb 1 4 19.26667 21.00 3
# 16 carb 1 6 19.70000 19.70 1
# 17 carb 1 8 15.00000 15.00 1

Resources