Spread multiple columns in a function - r

Often I need to spread multiple value columns, as in this question. But I do it often enough that I'd like to be able to write a function that does this.
For example, given the data:
set.seed(42)
dat <- data_frame(id = rep(1:2,each = 2),
grp = rep(letters[1:2],times = 2),
avg = rnorm(4),
sd = runif(4))
> dat
# A tibble: 4 x 4
id grp avg sd
<int> <chr> <dbl> <dbl>
1 1 a 1.3709584 0.6569923
2 1 b -0.5646982 0.7050648
3 2 a 0.3631284 0.4577418
4 2 b 0.6328626 0.7191123
I'd like to create a function that returns something like:
# A tibble: 2 x 5
id a_avg b_avg a_sd b_sd
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
How can I do that?

We'll return to the answer provided in the question linked to, but for the moment let's start with a more naive approach.
One idea would be to spread each value column individually, and then join the results, i.e.
library(dplyr)
library(tidyr)
library(tibble)
dat_avg <- dat %>%
select(-sd) %>%
spread(key = grp,value = avg) %>%
rename(a_avg = a,
b_avg = b)
dat_sd <- dat %>%
select(-avg) %>%
spread(key = grp,value = sd) %>%
rename(a_sd = a,
b_sd = b)
> full_join(dat_avg,
dat_sd,
by = 'id')
# A tibble: 2 x 5
id a_avg b_avg a_sd b_sd
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
(I used a full_join just in case we run into situations where not all combinations of the join columns appear in all of them.)
Let's start with a function that works like spread but allows you to pass the key and value columns as characters:
spread_chr <- function(data, key_col, value_cols, fill = NA,
convert = FALSE,drop = TRUE,sep = NULL){
n_val <- length(value_cols)
result <- vector(mode = "list", length = n_val)
id_cols <- setdiff(names(data), c(key_col,value_cols))
for (i in seq_along(result)){
result[[i]] <- spread(data = data[,c(id_cols,key_col,value_cols[i]),drop = FALSE],
key = !!key_col,
value = !!value_cols[i],
fill = fill,
convert = convert,
drop = drop,
sep = paste0(sep,value_cols[i],sep))
}
result %>%
purrr::reduce(.f = full_join, by = id_cols)
}
> dat %>%
spread_chr(key_col = "grp",
value_cols = c("avg","sd"),
sep = "_")
# A tibble: 2 x 5
id grp_avg_a grp_avg_b grp_sd_a grp_sd_b
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
The key ideas here are to unquote the arguments key_col and value_cols[i] using the !! operator, and using the sep argument in spread to control the resulting value column names.
If we wanted to convert this function to accept unquoted arguments for the key and value columns, we could modify it like so:
spread_nq <- function(data, key_col,..., fill = NA,
convert = FALSE, drop = TRUE, sep = NULL){
val_quos <- rlang::quos(...)
key_quo <- rlang::enquo(key_col)
value_cols <- unname(tidyselect::vars_select(names(data),!!!val_quos))
key_col <- unname(tidyselect::vars_select(names(data),!!key_quo))
n_val <- length(value_cols)
result <- vector(mode = "list",length = n_val)
id_cols <- setdiff(names(data),c(key_col,value_cols))
for (i in seq_along(result)){
result[[i]] <- spread(data = data[,c(id_cols,key_col,value_cols[i]),drop = FALSE],
key = !!key_col,
value = !!value_cols[i],
fill = fill,
convert = convert,
drop = drop,
sep = paste0(sep,value_cols[i],sep))
}
result %>%
purrr::reduce(.f = full_join,by = id_cols)
}
> dat %>%
spread_nq(key_col = grp,avg,sd,sep = "_")
# A tibble: 2 x 5
id grp_avg_a grp_avg_b grp_sd_a grp_sd_b
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 -0.5646982 0.6569923 0.7050648
2 2 0.3631284 0.6328626 0.4577418 0.7191123
The change here is that we capture the unquoted arguments with rlang::quos and rlang::enquo and then simply convert them back to characters using tidyselect::vars_select.
Returning to the solution in the linked question that uses a sequence of gather, unite and spread, we can use what we've learned to make a function like this:
spread_nt <- function(data,key_col,...,fill = NA,
convert = TRUE,drop = TRUE,sep = "_"){
key_quo <- rlang::enquo(key_col)
val_quos <- rlang::quos(...)
value_cols <- unname(tidyselect::vars_select(names(data),!!!val_quos))
key_col <- unname(tidyselect::vars_select(names(data),!!key_quo))
data %>%
gather(key = ..var..,value = ..val..,!!!val_quos) %>%
unite(col = ..grp..,c(key_col,"..var.."),sep = sep) %>%
spread(key = ..grp..,value = ..val..,fill = fill,
convert = convert,drop = drop,sep = NULL)
}
> dat %>%
spread_nt(key_col = grp,avg,sd,sep = "_")
# A tibble: 2 x 5
id a_avg a_sd b_avg b_sd
* <int> <dbl> <dbl> <dbl> <dbl>
1 1 1.3709584 0.6569923 -0.5646982 0.7050648
2 2 0.3631284 0.4577418 0.6328626 0.7191123
This relies on the same techniques from rlang from the last example. We're using some unusual names like ..var.. for our intermediate variables in order to reduce the chances of name collisions with existing columns in our data frame.
Also, we're using the sep argument in unite to control the resulting column names, so in this case when we spread we force sep = NULL.

Spreading operations can also be done by unnesting a properly reformated table, here's an alternative using tidyverse :
# helper function that returns an horizontal one lined named tibble wrapped into a list
lhframe <- function(x,nms) list(setNames(as_tibble(t(x)),nms))
dat %>% group_by(id) %>%
summarize(avg = lhframe(avg,grp),
sd = lhframe(sd,grp)) %>%
unnest(.sep="_")
# # A tibble: 2 x 5
# id avg_a avg_b sd_a sd_b
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 -1.7631631 0.4600974 0.7595443 0.5664884
# 2 2 -0.6399949 0.4554501 0.8496897 0.1894739
Unfortunately the following doesn't work:
dat %>% group_by(id) %>%
summarize_at(vars(avg,sd),lhframe,grp) %>%
unnest(.sep="_")

Since tidyr version 1.0.0
tidyr::pivot_wider(data = dat, id_cols = id, names_from = grp, values_from = avg:sd)
# # A tibble: 2 x 5
# id avg_a avg_b sd_a sd_b
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.37 -0.565 0.657 0.705
# 2 2 0.363 0.633 0.458 0.719

Related

How to use map_at properly in a nested list?

I have a nested list of lists that include data tables at the bottom level. My goal is to map a function to a single table to transform the data. I'm currently trying to use purrr:map_depth and purrr::map_at in conjunction to create the plot. The reason I need to use map_at or map_if is the plot function I'm using takes different arguments depending on the table.
example below
library(data.table)
library(purrr)
example = list(
group1 = list(
all = data.table(
x = 1:10,
y = 10:1),
not_all = data.table(
x2 = 11:20,
y2 = 20:11)
),
group2 = list(
all = data.table(
x = 1:10,
y = 10:1),
not_all = data.table(
x2 = 11:20,
y2 = 20:11)
),
group3 = list(
all = data.table(
x = 1:10,
y = 10:1),
not_all = data.table(
x2 = 11:20,
y2 = 20:11)
),
group4 = list(
all = data.table(
x = 1:10,
y = 10:1),
not_all = data.table(
x2 = 11:20,
y2 = 20:11)
)
)
I'm planning to use highcharter::data_to_boxplot as the mapping function.
So far I've been unable to extract a single table to map to and don't have a solid grasp of the purrr syntax yet.
map_depth(example, 2, map_at(., "all", data_to_boxplot, variable = x))
# Error: character indexing requires a named object
map_depth(example, 2, ~map_at(., "all", data_to_boxplot, variable = x))
# this prints out the entire list
# would like to try something like this too but can't figure out the piping correctly
map_depth(example, 2) %>%
map_at(., "all", data_to_boxplot, variable = x)
# Error in as_mapper(.f, ...) : argument ".f" is missing, with no default
Any help would be greatly appreciated!
I think you need a nested map-map_at-construct:
library(data.table)
library(highcharter)
library(purrr)
example %>%
map(~.x %>%
map_at("all", data_to_boxplot, variable = x))
This returns
$group1
$group1$all
# A tibble: 1 x 4
name data id type
<lgl> <list> <lgl> <chr>
1 NA <list [1]> NA boxplot
$group1$not_all
x2 y2
1: 11 20
2: 12 19
3: 13 18
4: 14 17
5: 15 16
6: 16 15
7: 17 14
8: 18 13
9: 19 12
10: 20 11
$group2
$group2$all
# A tibble: 1 x 4
name data id type
<lgl> <list> <lgl> <chr>
1 NA <list [1]> NA boxplot
$group3
$group3$all
# A tibble: 1 x 4
name data id type
<lgl> <list> <lgl> <chr>
1 NA <list [1]> NA boxplot
$group4
$group4$all
# A tibble: 1 x 4
name data id type
<lgl> <list> <lgl> <chr>
1 NA <list [1]> NA boxplot
How does this work?
example is a list of lists. map applies a function to each element of this list. These elements are also lists.
The custom function used in map is just another map-function, which is applied to an object named all (here at "level 2").
This is equivalent to
example %>%
map_depth(1,
~.x %>%
map_at("all", data_to_boxplot, variable = x),
)
Take a look at ?map_depth:
map_depth(.x, .depth, .f, ..., .ragged = FALSE)
with .depth defined as "Level of .x to map on. Use a negative value to count up from the lowest level of the list."
map_depth(x, 0, fun) is equivalent to fun(x).
map_depth(x, 1, fun) is equivalent to x <- map(x, fun).
map_depth(x, 2, fun) is equivalent to x <- map(x, ~ map(., fun)).

Create NA's based on another dataframe without long data

I have a tibble with the explicit "id" and colnames I need to convert to NA's. Is there anyway I can create the NA's without making my df a long dataset? I considered using the new rows_update function, but I'm not sure if this is correct because I only want certain columns to be NA.
library(dplyr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
# desired output:
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 1 NA
#> 2 2 1 2
#> 3 3 NA 2
Created on 2020-07-03 by the reprex package (v0.3.0)
This definitely isn't the most elegant solution, but it gets the output you want.
library(dplyr)
library(purrr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
map2(to_na$x, to_na$col, #Pass through these two objects in parallel
function(xval_to_missing, col) df %>% #Two objects above matched by position here.
mutate_at(col, #mutate_at the specified cols
~if_else(x == xval_to_missing, NA_real_, .) #if x == xval_to_missing, make NA, else keep as is.
) %>%
select(x, col) #keep x and the modified column.
) %>% #end of map2
reduce(left_join, by = "x") %>% #merge within the above list, by x.
relocate(x, y, z) #Keep your ordering
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 1 2
3 3 NA 2
We can use row/column indexing to assign the values to NA in base R
df <- as.data.frame(df)
df[cbind(to_na$x, match(to_na$col, names(df)))] <- NA
df
# x y z
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
If we want to use rows_update
library(dplyr)
library(tidyr)
library(purrr)
lst1 <- to_na %>%
mutate(new = NA_real_) %>%
split(seq_len(nrow(.))) %>%
map(~ .x %>%
pivot_wider(names_from = col, values_from = new))
for(i in seq_along(lst1)) df <- rows_update(df, lst1[[i]])
df
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 2 1 2
#3 3 NA 2

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
foo <- function(df, name) {
a <- paste0(name, "_a")
b <- paste0(name, "_b")
return(
mutate(
df,
!!name := get(a) * get(b)
)
)
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.
We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)
After adopting #akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two dataframes), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from #user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
. <- function(x) x
for (i in colNames) {
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
if (length(lhs) == 0) newCol <- i
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating #user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
invisible(lapply(c("dplyr", "tidyr", "rlang", "wrapr", "microbenchmark"),
require, character.only = TRUE))
})
polymutate <- function(df, formula) {
form <- rlang::f_rhs(formula)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('.value', 'grp'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = eval(form)) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df, .)
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10
This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.

Aggregating if each observation can belong to multiple groups

I want to aggregate Date by group. However, each observation can belong to several groups (e.g. observation 1 belongs to group A and B). I could not find a nice way to achieve this with data.table. Currently I created for each of the possible groups a logical variable which takes the value TRUE if the observation belongs to that group. I am looking for a better way to do this than presented below. I would also like to know how I could achieve this with the tidyverse.
library(data.table)
# Data
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.table(time = time, x = rnorm(20), groupA = sample(TF, size = 20, replace = TRUE),
groupB = sample(TF, size = 20, replace = TRUE),
groupC = sample(TF, size = 20, replace = TRUE))
# This should be nicer and less repetitive
df[groupA == TRUE, .(A = sum(x)), by = time][
df[groupB == TRUE, .(B = sum(x)), by = time], on = "time"][
df[groupC == TRUE, .(C = sum(x)), by = time], on = "time"]
# desired output
time A B C
1: 1 NA 0.9432955 0.1331984
2: 2 1.2257538 0.2427420 0.1882493
3: 3 -0.1992284 -0.1992284 1.9016244
4: 4 0.5327774 0.9438362 0.9276459
Here is a solution with data.table:
df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# > df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# time groupA groupB groupC
# 1: 1 0.0000000 0.9432955 0.1331984
# 2: 2 1.2257538 0.2427420 0.1882493
# 3: 3 -0.1992284 -0.1992284 1.9016244
# 4: 4 0.5327774 0.9438362 0.9276459
or (thx to #chinsoon12 for the comment) more programmatically:
df[, lapply(.SD*x, sum), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
If you want the result in the long format you can do:
df[, colSums(.SD*x), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
### with indicator for the group:
df[, .(colSums(.SD*x), c("A","B","C")), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
I think it's easier here to work in long format. First I gather the observations to long format, then keep only the values where the observation belongs to the corresponding group. Then I remove the logical column, and rename the groups to single letters. Then I aggregate across groups and time (summarise in dplyr).
Finally I spread back to wide format.
library(dplyr)
library(tidyr)
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.frame(time = time, x = rnorm(20), groupA = sample(TF, size = 20, replace = TRUE),
groupB = sample(TF, size = 20, replace = TRUE),
groupC = sample(TF, size = 20, replace = TRUE))
df %>%
gather(group, belongs, groupA:groupC) %>%
filter(belongs) %>%
select(-belongs) %>%
mutate(group = gsub("group", "", group)) %>%
group_by(time, group) %>%
summarise(x = sum(x)) %>%
spread(group, x)
Output
# A tibble: 4 x 4
# Groups: time [4]
time A B C
<int> <dbl> <dbl> <dbl>
1 1 NA 0.943 0.133
2 2 1.23 0.243 0.188
3 3 -0.199 -0.199 1.90
4 4 0.533 0.944 0.928
An option can be using tidyr and dplyr packages in combination with data.table. Try to work on data in long format and then change it to wide format.
library(dplyr)
library(tidyr)
melt(df, id.vars = c("time", "x")) %>%
filter(value) %>%
group_by(time, variable) %>%
summarise(sum = sum(x)) %>%
spread(variable, sum)
# # A tibble: 4 x 4
# # Groups: time [4]
# time groupA groupB groupC
# * <int> <dbl> <dbl> <dbl>
# 1 1 NA 0.943 0.133
# 2 2 1.23 0.243 0.188
# 3 3 - 0.199 -0.199 1.90
# 4 4 0.533 0.944 0.928

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Resources