I want to get dplyr::spread for multiple columns using purrr::mapinto list of data.frames. Wonder how to achieve the desired result?
library(tidyverse)
mtcars %>%
dplyr::group_by(gear, carb) %>%
dplyr::summarise_at(
.vars = names(.)[1:9]
, .funs = c("mean")
) %>%
dplyr::select(gear, carb, mpg) %>%
tidyr::spread(key = "gear", value = mpg)
# A tibble: 6 x 4
carb `3` `4` `5`
<dbl> <dbl> <dbl> <dbl>
1 1 20.3 29.1 NA
2 2 17.2 24.8 28.2
3 3 16.3 NA NA
4 4 12.6 19.8 15.8
5 6 NA NA 19.7
6 8 NA NA 15
mtcars %>%
dplyr::group_by(gear, carb) %>%
dplyr::summarise_at(
.vars = names(.)[1:9]
, .funs = c("mean")
) %>%
dplyr::select(gear, carb, disp) %>%
tidyr::spread(key = "gear", value = disp)
# A tibble: 6 x 4
carb `3` `4` `5`
<dbl> <dbl> <dbl> <dbl>
1 1 201. 84.2 NA
2 2 346. 121. 108.
3 3 276. NA NA
4 4 416. 164. 351
5 6 NA NA 145
6 8 NA NA 301
Now I want to perform both process with a single command using purrr::map. Wonder how this can be achieved.
mtcars %>%
dplyr::group_by(gear, carb) %>%
dplyr::summarise_at(
.vars = names(.)[1:9]
, .funs = c("mean")
) %>%
dplyr::select(gear, carb, mpg, disp) %>%
purrr::map(.f = ~ tidyr::spread(data = mtcars, key = "gear", value = .x))
So the key thing here is that the list you want to map over is actually the column names, not the columns or the dataframe itself. Here is a rough and ready approach that does what you want, though it is very brittle (the data frame and grouping columns are all hard-coded into the function). You might look into the programming with dplyr vignette if you need to do anything more fancy.
library(tidyverse)
to_spread <- mtcars %>%
group_by(gear, carb) %>%
summarise_all(mean)
map(
.x = colnames(to_spread)[3:11],
.f = function(col) {
to_spread %>%
select(gear, carb, col) %>%
spread(gear, col)
}
) %>%
set_names(colnames(to_spread)[3:11]) %>%
head(3)
#> $mpg
#> # A tibble: 6 x 4
#> carb `3` `4` `5`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 20.3 29.1 NA
#> 2 2 17.2 24.8 28.2
#> 3 3 16.3 NA NA
#> 4 4 12.6 19.8 15.8
#> 5 6 NA NA 19.7
#> 6 8 NA NA 15
#>
#> $cyl
#> # A tibble: 6 x 4
#> carb `3` `4` `5`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 5.33 4 NA
#> 2 2 8 4 4
#> 3 3 8 NA NA
#> 4 4 8 6 8
#> 5 6 NA NA 6
#> 6 8 NA NA 8
#>
#> $disp
#> # A tibble: 6 x 4
#> carb `3` `4` `5`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 201. 84.2 NA
#> 2 2 346. 121. 108.
#> 3 3 276. NA NA
#> 4 4 416. 164. 351
#> 5 6 NA NA 145
#> 6 8 NA NA 301
Created on 2018-06-22 by the reprex package (v0.2.0).
Related
I want to group_by multiple columns wihout intersection.
I am looking for the output below without having to replicate the code for both variables.
library(dplyr)
> mtcars %>%
+ group_by(cyl) %>%
+ summarise(mean(disp))
# A tibble: 3 × 2
cyl `mean(disp)`
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
>
> mtcars %>%
+ group_by(am) %>%
+ summarise(mean(disp))
# A tibble: 2 × 2
am `mean(disp)`
<dbl> <dbl>
1 0 290.
2 1 144.
I am not looking for the code below since this gives the intersection between the variables:
> mtcars %>%
+ group_by(cyl, am) %>%
+ summarise(mean(disp))
# A tibble: 6 × 3
# Groups: cyl [3]
cyl am `mean(disp)`
<dbl> <dbl> <dbl>
1 4 0 136.
2 4 1 93.6
3 6 0 205.
4 6 1 155
5 8 0 358.
6 8 1 326
Thanks a lot!
An alternative would be a custom function:
my_func <- function(df, group){
df %>%
group_by({{group}}) %>%
summarise(mean_disp = mean(disp))
}
my_func(mtcars, cyl)
my_func(mtcars, am)
cyl mean_disp
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
> my_func(mtcars, am)
# A tibble: 2 × 2
am mean_disp
<dbl> <dbl>
1 0 290.
2 1 144.
Something like this?
library(tidyverse)
c("cyl", "am") %>%
map(~ mtcars %>%
group_by(!!sym(.x)) %>%
summarise(result = mean(disp)))
[[1]]
# A tibble: 3 x 2
cyl result
<dbl> <dbl>
1 4 105.
2 6 183.
3 8 353.
[[2]]
# A tibble: 2 x 2
am result
<dbl> <dbl>
1 0 290.
2 1 144.
Somewhat hard to define this question without sounding like lots of similar questions!
I have a function for which I want one of the parameters to be a function name, that will be passed to dplyr::summarise, e.g. "mean" or "sum":
data(mtcars)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := mean(!!sym(zColquo))) # mean should be zFun, user-defined
ungroup
}
(this groups by gear and cyl, then returns, per group, count and mean(disp))
Per my note, I'd like 'mean' to be dynamic, performing the function defined by zFun, but I can't for the life of me work out how to do it! Thanks in advance for any advice.
You can use match.fun to make the function dynamic. I also removed zColquo as it's not needed.
library(dplyr)
library(rlang)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>%
summarise(Count = n(),
!!zCol := match.fun(zFun)(!!sym(zCol))) %>%
ungroup
return(cellSummaries)
}
You can then check output
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
We can use get
library(dplyr)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := get(zFun)(!!sym(zCol))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
In addition, we could remove the sym evaluation in group_by and in summarise if we wrap with across
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
x %>%
group_by(across(c(gear, groupcol))) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
across(zCol, ~ get(zFun)(.))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
I want to create a data frame with columns for the proportion of observations in each category, much like this:
library(tidyverse)
mtcars %>%
group_by(am) %>%
summarise(gear3 = sum(gear == 3)/n(),
gear4 = sum(gear == 4)/n(),
gear5 = sum(gear == 5)/n(),
cyl4 = sum(cyl == 4)/n(),
cyl6 = sum(cyl == 6)/n(),
cyl8 = sum(cyl == 8)/n())
# # A tibble: 2 x 7
# am gear3 gear4 gear5 cyl4 cyl6 cyl8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0 0.789 0.211 0 0.158 0.211 0.632
# 2 1 0 0.615 0.385 0.615 0.231 0.154
I am looking for way to this without manually naming the new summary variables?
There seems to be a few questions, such as here, related to creating a proportions for single variables, which i could replicate for each variable, pivot and and then combine but it will become tedious in my application - i am trying to build the data frame for many variables
mtcars %>%
group_by(am, gear) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
# # A tibble: 4 x 4
# # Groups: am [2]
# am gear n freq
# <dbl> <dbl> <int> <dbl>
# 1 0 3 15 0.789
# 2 0 4 4 0.211
# 3 1 4 8 0.615
# 4 1 5 5 0.385
mtcars %>%
group_by(am, cyl) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
# # A tibble: 6 x 4
# # Groups: am [2]
# am cyl n freq
# <dbl> <dbl> <int> <dbl>
# 1 0 4 3 0.158
# 2 0 6 4 0.211
# 3 0 8 12 0.632
# 4 1 4 8 0.615
# 5 1 6 3 0.231
# 6 1 8 2 0.154
Here is one solution:
library(dplyr)
freqPairs <- function(df, first, second){
pairs <- as.list(data.frame(t(expand.grid(first, second))))
res <- lapply(pairs, function(z) df %>%
group_by(!!sym(z[1]), !!sym(z[2])) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
{colnames(.)[1:2] = c("Var1", "Var2"); .} %>%
ungroup())
setNames(res, unlist(lapply(pairs, paste, collapse="_vs_")))
}
bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
#> # A tibble: 10 x 5
#> comparison Var1 Var2 n freq
#> <chr> <dbl> <dbl> <int> <dbl>
#> 1 am_vs_cyl 0 4 3 0.158
#> 2 am_vs_cyl 0 6 4 0.211
#> 3 am_vs_cyl 0 8 12 0.632
#> 4 am_vs_cyl 1 4 8 0.615
#> 5 am_vs_cyl 1 6 3 0.231
#> 6 am_vs_cyl 1 8 2 0.154
#> 7 am_vs_gear 0 3 15 0.789
#> 8 am_vs_gear 0 4 4 0.211
#> 9 am_vs_gear 1 4 8 0.615
#> 10 am_vs_gear 1 5 5 0.385
Created on 2020-05-13 by the reprex package (v0.3.0)
You can always recover the names of Var1 and Var2 from the comparison column, e.g. by splitting that string. Example:
library(data.table)
res <- bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
data.table(res)[, c("Variable1", "Variable2") := tstrsplit(comparison, "_vs_")][]
#> comparison Var1 Var2 n freq Variable1 Variable2
#> 1: am_vs_cyl 0 4 3 0.1578947 am cyl
#> 2: am_vs_cyl 0 6 4 0.2105263 am cyl
#> 3: am_vs_cyl 0 8 12 0.6315789 am cyl
#> 4: am_vs_cyl 1 4 8 0.6153846 am cyl
#> 5: am_vs_cyl 1 6 3 0.2307692 am cyl
#> 6: am_vs_cyl 1 8 2 0.1538462 am cyl
#> 7: am_vs_gear 0 3 15 0.7894737 am gear
#> 8: am_vs_gear 0 4 4 0.2105263 am gear
#> 9: am_vs_gear 1 4 8 0.6153846 am gear
#> 10: am_vs_gear 1 5 5 0.3846154 am gear
Note:
If you really want all possible pairs in both orders, you could use something like:
pairs <- c(combn(colnames(mtcars), 2, simplify=FALSE),
lapply(combn(colnames(mtcars), 2, simplify=FALSE), rev))
Figured out a way using map() in purrr
First, a function to calculate a named vector of proportions
prop <- function(v){
n <- match.call() %>%
as.character() %>%
.[2] %>%
str_extract(pattern = "(?<=\\$)(.*)")
table(v) %>%
`/`(sum(.)) %>%
as.matrix() %>%
t() %>%
as_tibble() %>%
set_names(paste0(n, colnames(.)))
}
prop(v = mtcars$gear)
# # A tibble: 1 x 3
# gear3 gear4 gear5
# <dbl> <dbl> <dbl>
# 1 0.469 0.375 0.156
Then using map() to apply the function to each group, one variable at a time
mtcars %>%
group_nest(am) %>%
mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
p_cyl = map(.x = data, .f = ~prop(.x$cyl))) %>%
unnest(c(p_gear, p_cyl)) %>%
select(-data)
# # A tibble: 2 x 7
# am gear3 gear4 gear5 cyl4 cyl6 cyl8
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0 0.789 0.211 NA 0.158 0.211 0.632
# 2 1 NA 0.615 0.385 0.615 0.231 0.154
A further example, including replacing NA with zeros
mtcars %>%
group_nest(carb) %>%
mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
p_cyl = map(.x = data, .f = ~prop(.x$cyl)),
p_vs = map(.x = data, .f = ~prop(.x$vs))) %>%
unnest(c(p_gear, p_cyl, p_vs)) %>%
select(-data) %>%
mutate_all(~ifelse(is.na(.), 0, .))
# # A tibble: 6 x 9
# carb gear3 gear4 gear5 cyl4 cyl6 cyl8 vs1 vs0
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0.429 0.571 0 0.714 0.286 0 1 0
# 2 2 0.4 0.4 0.2 0.6 0 0.4 0.5 0.5
# 3 3 1 0 0 0 0 1 0 1
# 4 4 0.5 0.4 0.1 0 0.4 0.6 0.2 0.8
# 5 6 0 0 1 0 1 0 0 1
# 6 8 0 0 1 0 0 1 0 1
I'm trying to arrange values in decreasing order within a exact group in a nested dataframe. My input data looks like this. I've got two grouping variables (group1 and group2) and three values (i.e. id, value2, value3).
library(tidyverse)
set.seed(1234)
df <- tibble(group1 = c(rep(LETTERS[1:3], 4)),
group2 = c(rep(0, 6), rep(2, 6)),
value2 = rnorm(12, 20, sd = 10),
value3 = rnorm(12, 20, sd = 50)) %>%
group_by(group1) %>%
mutate(id = c(1:4)) %>%
ungroup()
I decided to group them by group1 and group2 and then nest():
df_nested <- df %>%
group_by(group1, group2) %>%
nest()
# A tibble: 6 x 3
# Groups: group1, group2 [6]
group1 group2 data
<chr> <dbl> <list>
1 A 0 <tibble [2 x 3]>
2 B 0 <tibble [2 x 3]>
3 C 0 <tibble [2 x 3]>
4 A 2 <tibble [2 x 3]>
5 B 2 <tibble [2 x 3]>
6 C 2 <tibble [2 x 3]>
Perfect. Now I need to sort only those data which group2 is equal to 2 by id. However I'm receiving a following error:
df_nested %>%
mutate(data = map2_df(.x = data, .y = group2,
~ifelse(.y == 2, arrange(-.x$id),
.x)))
Error: Argument 1 must have names
You could do :
library(dplyr)
library(purrr)
df_nested$data <- map2(df_nested$data, df_nested$group2,~if(.y == 2)
arrange(.x, -.x$id) else .x)
So data where group2 is not equal to 2 is not sorted
df_nested$data[[1]]
# A tibble: 2 x 3
# value2 value3 id
# <dbl> <dbl> <int>
#1 13.1 -89.0 1
#2 9.76 -3.29 2
and where group2 is 2 is sorted.
df_nested$data[[4]]
# A tibble: 2 x 3
#value2 value3 id
# <dbl> <dbl> <int>
#1 15.0 -28.4 4
#2 31.0 -22.8 3
If you want to combine them do :
map2_df(df_nested$data, df_nested$group2,~if(.y == 2) arrange(.x, -.x$id) else .x)
I would suggest creating an additional variable id_ which will be equal to the original id variable when group2 == 2 and NA otherwise. This way if we use it in sorting it'll make no effect when group2 != 2.
df %>%
mutate(id_ = if_else(group2 == 2, id, NA_integer_)) %>%
arrange(group1, group2, -id_)
#> # A tibble: 12 x 6
#> group1 group2 value2 value3 id id_
#> <chr> <dbl> <dbl> <dbl> <int> <int>
#> 1 A 0 17.6 50.2 1 NA
#> 2 A 0 33.8 -14.4 2 NA
#> 3 A 2 23.1 22.6 4 4
#> 4 A 2 13.7 50.2 3 3
#> 5 B 0 15.4 49.9 1 NA
#> 6 B 0 16.2 63.7 2 NA
#> 7 B 2 41.7 -2.90 4 4
#> 8 B 2 16.6 46.7 3 3
#> 9 C 0 19.9 -64.3 1 NA
#> 10 C 0 19.9 59.7 2 NA
#> 11 C 2 34.1 48.5 4 4
#> 12 C 2 32.3 23.1 3 3
Then if needed we can group and nest the result.
I want to create a function that takes a grouping argument. Which can be a single or multiple variables. I want it to look like this:
wanted <- function(data, groups, other_params){
data %>% group_by( {{groups}} ) %>% count()
}
This work only when a single group is given but breaks when there are multiple groups. I know it's possible to use the following with ellipsis ... (But I want the syntax groups = something):
not_wanted <- function(data, ..., other_params){
data %>% group_by( ... ) %>% count()
}
Here is the entire code:
library(dplyr)
library(magrittr)
iris$group2 <- rep(1:5, 30)
wanted <- function(data, groups, other_params){
data %>% group_by( {{groups}} ) %>% count()
}
not_wanted <- function(data, ..., other_params){
data %>% group_by( ... ) %>% count()
}
# works
wanted(iris, groups = Species )
not_wanted(iris, Species, group2)
# doesn't work
wanted(iris, groups = vars(Species, group2) )
wanted(iris, groups = c(Species, group2) )
wanted(iris, groups = vars("Species", "group2") )
# Error: Column `vars(Species, group2)` must be length 150 (the number of rows) or one, not 2
You guys are over complicating things, this works just fine:
library(tidyverse)
wanted <- function(data, groups){
data %>% count(!!!groups)
}
mtcars %>% wanted(groups = vars(mpg,disp,hp))
# A tibble: 31 x 4
mpg disp hp n
<dbl> <dbl> <dbl> <int>
1 10.4 460 215 1
2 10.4 472 205 1
3 13.3 350 245 1
4 14.3 360 245 1
5 14.7 440 230 1
6 15 301 335 1
7 15.2 276. 180 1
8 15.2 304 150 1
9 15.5 318 150 1
10 15.8 351 264 1
# … with 21 more rows
The triple bang operator and parse_quos from the rlang package will do the trick. For more info, see e.g. https://stackoverflow.com/a/49941635/6086135
library(dplyr)
library(magrittr)
iris$group2 <- rep(1:5, 30)
vec <- c("Species", "group2")
wanted <- function(data, groups){
data %>% count(!!!rlang::parse_quos(groups, rlang::current_env()))
}
wanted(iris, vec)
#> # A tibble: 15 x 3
#> Species group2 n
#> <fct> <int> <int>
#> 1 setosa 1 10
#> 2 setosa 2 10
#> 3 setosa 3 10
#> 4 setosa 4 10
#> 5 setosa 5 10
#> 6 versicolor 1 10
#> 7 versicolor 2 10
#> 8 versicolor 3 10
#> 9 versicolor 4 10
#> 10 versicolor 5 10
#> 11 virginica 1 10
#> 12 virginica 2 10
#> 13 virginica 3 10
#> 14 virginica 4 10
#> 15 virginica 5 10
Created on 2020-01-06 by the reprex package (v0.3.0)
Here is another option to avoid quotations in the function call. I admit its not very pretty though.
library(tidyverse)
wanted <- function(data, groups){
grouping <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
data %>% group_by_at(grouping) %>% count()
}
iris$group2 <- rep(1:5, 30)
wanted(iris, groups = c(Species, group2) )
#> # A tibble: 15 x 3
#> # Groups: Species, group2 [15]
#> Species group2 n
#> <fct> <int> <int>
#> 1 setosa 1 10
#> 2 setosa 2 10
#> 3 setosa 3 10
#> 4 setosa 4 10
#> 5 setosa 5 10
#> 6 versicolor 1 10
#> 7 versicolor 2 10
#> 8 versicolor 3 10
#> 9 versicolor 4 10
#> 10 versicolor 5 10
#> 11 virginica 1 10
#> 12 virginica 2 10
#> 13 virginica 3 10
#> 14 virginica 4 10
#> 15 virginica 5 10