I want to have two lists of grouping variables. let's say list1 = c("var2","var3","var4") and list2 = c("var2","var3")
dta = data.frame(var1 = c(1:8),
var2 = c(rep("AA",4),rep("BB",4)),
var3 = rep(c("C","D"),4),
var4 = c(1,1,0,0,0,0,1,1))
dta %>% group_by(var2,var3,var4) %>% summarise(mv1 = mean(var1)) %>%
group_by(var2,var3) %>% summarise(mv1_2 = mean(mv1))
How can I create a function like this
sample_fun = function(dta, list1, list2){
dta %>% group_by(list1) %>% summarise(mv1 = mean(var1)) %>%
group_by(list2) %>% summarise(mv1_2 = mean(mv1))
}
Here are two ways to do this -
Pure dplyr solution using across :
library(dplyr)
library(rlang)
sample_fun = function(dta, list1, list2){
dta %>%
group_by(across(all_of(list1))) %>%
summarise(mv1 = mean(var1)) %>%
ungroup %>%
group_by(across(all_of(list2))) %>%
summarise(mv1_2 = mean(mv1))
}
sample_fun(dta, list1, list2)
# var2 var3 mv1_2
# <chr> <chr> <dbl>
#1 AA C 2
#2 AA D 3
#3 BB C 6
#4 BB D 7
Using non-standard evaluation with syms :
sample_fun = function(dta, list1, list2){
dta %>%
group_by(!!!syms(list1)) %>%
summarise(mv1 = mean(var1)) %>%
ungroup %>%
group_by(!!!syms(all_of(list2))) %>%
summarise(mv1_2 = mean(mv1))
}
sample_fun(dta, list1, list2)
# var2 var3 mv1_2
# <chr> <chr> <dbl>
#1 AA C 2
#2 AA D 3
#3 BB C 6
#4 BB D 7
Related
I'm (once more) stuck with flattening nested lists.
I have this tibble with some list-columns (originating from a JSON format).
library(tidyr)
library(dplyr)
df = tibble(id = c(1, 2, 3),
branch = list(NULL, list(colA = 'abc', colB = 'mno'),
list(list(colA = 'def', colB = 'uvw'),
list(colA = 'ghi', colB = 'xyz'))))
I want to unnest_wider column 'branch'. That works with rows 1 and 2:
df %>%
slice(1:2) %>%
unnest_wider(branch)
However, row 3 consists of a list of lists which I have to unnest_longer first:
bind_rows(
df %>% slice(1,2),
df %>% slice(3) %>% unnest_longer(branch)) %>%
unnest_wider(branch)
above code gives the desired output, but I'm looking for a generic solution like:
If an element of column 'branch' is of type 'unnamed list' (indicating that there is a list of lists) then unnest_longer. Afterwards apply unnest_wider to the whole column 'branch'
Any help appreciated!
First convert the leaves to data frames and then unnest it.
library(dplyr)
library(tidyr)
leaf2df <- function(x) {
if (length(names(x))) as.data.frame(x)
else if (is.list(x)) lapply(x, leaf2df)
}
df %>%
rowwise %>%
mutate(branch = list(bind_rows(leaf2df(branch)))) %>%
ungroup %>%
unnest(branch, keep_empty = TRUE)
giving:
# A tibble: 4 × 3
id colA colB
<dbl> <chr> <chr>
1 1 <NA> <NA>
2 2 abc mno
3 3 def uvw
4 3 ghi xyz
Because leaf2df is recursive as long as all leaves in any row have the same parent it should continue to work. For example, below we have made the list in the last row one level deeper and it still works.
df <- tibble(id = c(1, 2, 3),
branch = list(NULL, list(colA = 'abc', colB = 'mno'),
list(list(list(colA = 'def', colB = 'uvw'),
(list(colA = 'ghi', colB = 'xyz'))))))
A little bit convoluted but here's a possible solution:
Iterate through the rows of your df
Determine if it's a named list by checking names(df$branch[[index]])
If unnamed --> slice + unnest; if named --> slice
Finally, unnest_wider()
library(tidyr)
library(dplyr)
library(purrr)
map_df(1:nrow(df), function(x) {
if (is.null(names(df$branch[[x]]))) {
df %>% slice(x) %>% unnest_longer(branch)
} else {
df %>% slice(x)
}
}) %>%
unnest_wider(branch)
Which returns:
# A tibble: 4 × 3
id colA colB
<dbl> <chr> <chr>
1 1 NA NA
2 2 abc mno
3 3 def uvw
4 3 ghi xyz
library(tidyverse)
df <- tibble(
id = c(1, 2, 3),
branch = list(
NULL, list(colA = "abc", colB = "mno"),
list(
list(colA = "def", colB = "uvw"),
list(colA = "ghi", colB = "xyz")
)
)
)
unnester <- function(x, grp) {
if (grp) {
x <- x |> unnest_longer(branch)
}
unnest_wider(x, branch)
}
df |>
rowwise() |>
mutate(grp = length(names(unlist(branch))) > 2) |>
ungroup() |>
split(~grp) |>
imap_dfr(~ unnester(.x, .y)) |>
select(-grp)
The following approach first modifies the list so that all leafs are located at the same list level, after which we can unnest all rows as needed:
library(tidyr)
library(purrr)
library(dplyr)
mutate(df, branch = map(
.x = branch,
.f = ~if(is.list(.x[[1]])) .x else list(.x)
)) |>
unnest_longer(branch) |>
unnest_wider(branch)
#> # A tibble: 4 × 3
#> id colA colB
#> <dbl> <chr> <chr>
#> 1 1 <NA> <NA>
#> 2 2 abc mno
#> 3 3 def uvw
#> 4 3 ghi xyz
I'm looking for an automated way of converting this:
dat = tribble(
~a, ~b, ~c
, 'x', 1, 'y'
, 'y', 2, NA
, 'q', 4, NA
, 'z', 3, 'q'
)
to:
tribble(
~a, ~b, ~d
, 'x', 1, 2
, 'z', 3, 4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)
dat %>%
inner_join(dat %>% select(-c) %>% rename(d = 'b'),
by = c('c' = 'a'))
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
merge(dat, dat[-3], by.x = 'c', by.y = 'a')
We create the 'd' with lead of 'b' and filter out the NA rows of 'c' and remove the c column with select
library(dplyr)
dat %>%
mutate(d = lead(b)) %>%
filter(!is.na(c)) %>%
select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
dat %>%
mutate(d = replace(lead(b), is.na(c), NA), c = NULL) %>%
na.omit
Or with fill
library(tidyr)
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
setDT(dat)[, d := shift(b, type = 'lead')][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both the solutions are simple and doesn't require any joins. Besides, it gives the expected output in the OP's post
Or using match from base R
cbind(na.omit(dat), d = with(dat, b[match(c, a, nomatch = 0)]))[, -3]
# a b d
#1 x 1 2
#2 z 3 4
Given this dataframe:
library(dplyr)
df.ex <- tibble(id = c(rep(1, 4), rep(2, 4), rep(3, 4)),
var1 = c('a','a','b','b','a','a','a','a','b','b','b','b'))
I would like to create a new variable var2 based upon the presence of b in var1 which is grouped by the id column. Thus each id, can then only contain one type of value in the output column. This is the hoped for outcome:
df.ex.outcome <- tibble(id = c(rep(1, 4), rep(2, 4), rep(3, 4)),
var1 = c('a','a','b','b','a','a','a','a','b','b','b','b'),
var2 = c(rep('foo', 4), rep('bar', 4), rep('foo', 4)))
I thought that using group_by would solve this, however it doesn't appear to work, like so:
df.ex <- df.ex %>% group_by(id) %>% mutate(var2 = if_else(var1 %in% 'b', 'foo','bar'))
Does anyone have any ideas on how to do this?
We can wrap with any
df.ex %>%
group_by(id) %>%
mutate(var2 = case_when(any(var1 == "b")~ "foo", TRUE ~ "bar"))
# A tibble: 12 x 3
# Groups: id [3]
# id var1 var2
# <dbl> <chr> <chr>
# 1 1 a foo
# 2 1 a foo
# 3 1 b foo
# 4 1 b foo
# 5 2 a bar
# 6 2 a bar
# 7 2 a bar
# 8 2 a bar
# 9 3 b foo
#10 3 b foo
#11 3 b foo
#12 3 b foo
Or reverse the arguments for %in%
df.ex %>%
group_by(id) %>%
mutate(var2 = case_when("b" %in% var1 ~ "foo", TRUE ~ "bar"))
Or using if_else
df.ex %>%
group_by(id) %>%
mutate(var2 = if_else('b' %in% var1, 'foo','bar'))
so that there will a single TRUE/FALSE output from %in%, which we can also use with if/else
df.ex %>%
group_by(id) %>%
mutate(var2 = if("b" %in% var1) "foo" else "bar")
I would like to deduplicate my data, keeping the row that has the most frequent appearances. If there is a tie in rows, I don't care which gets returned—the first in alphabetical or numeric order is fine. I would like to do this by group of id and var.
MRE:
df <- data.frame(
id = rep("a", 8),
var = c(rep("b", 4), rep("c", 4)),
val = c("d", "d", "d", "e", "f", "f", "g", "g")
)
> df
id var val
1 a b d
2 a b d
3 a b d
4 a b e
5 a c f
6 a c f
7 a c g
8 a c g
Should be:
id var val
1 a b d
2 a c f
I'm working with large datasets and tidyverse pipe chains, so a dplyr solution would be preferable.
Use table and which.max to extract the mode:
df %>%
group_by(id, var) %>%
summarise(val = {t <- table(val); names(t)[which.max(t)] })
# A tibble: 2 x 3
# Groups: id [?]
# id var val
# <fct> <fct> <chr>
#1 a b d
#2 a c f
Another way to do this in base R: Create a three way contingency table directly, and then find the max column along the third axis:
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
# var
#id b c
# a "d" "f"
Convert this to a data frame:
as.data.frame.table(
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
)
# id var Freq
#1 a b d
#2 a c f
Using dplyr:
library(dplyr)
df %>%
group_by(id, var, val) %>%
summarise(n = n()) %>%
group_by(id, var) %>%
arrange(-n) %>%
slice(1) %>%
ungroup() %>%
select(-n)
# # A tibble: 2 x 3
# id var val
# <fct> <fct> <fct>
# 1 a b d
# 2 a c f
One option could be using table and max as:
library(dplyr)
df %>% group_by(id, var) %>%
filter(table(val) == max(table(val))) %>%
slice(1)
# # A tibble: 2 x 3
# # Groups: id, var [2]
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c g
NOTE: a c g is case of tie. Per OP any record can be returned in case of tie.
I doubt this is any faster, but another option is
df %>%
group_by(id, var) %>%
filter(row_number() == rle(as.character(val))$lengths %>%
{sum(.[1:which.max(.)])})
A dplyr solution using count:
library(dplyr)
df %>%
count(id,var,val,sort = T) %>%
group_by(id,var) %>%
summarize_at("val",head,1)
# # A tibble: 2 x 3
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c f
or maybe more idiomatic but longer:
df %>%
count(id,var,val,sort = T) %>%
group_by(id,var) %>%
slice(1) %>%
select(-n) %>%
ungroup
Or with tally for same output with slightly different syntax:
df %>%
group_by(id,var,val) %>%
tally(sort = T) %>%
slice(1) %>%
select(-n) %>%
ungroup
and a base solution :
df2 <- aggregate(x ~ .,cbind(df,x=1),sum)
aggregate(val ~ id+var, df2[order(-df2$x),],head,1)
# id var val
# 1 a b d
# 2 a c f
Here is my try:
library(dplyr)
df %>%
group_by(id,var,val) %>%
mutate(n=n()) %>%
arrange(desc(n)) %>%
group_by(id,var) %>%
filter(row_number()==1) %>%
select(-n)
`
Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6