Mixing unnest_longer and unnest_wider in R

I'm (once more) stuck with flattening nested lists.
I have this tibble with some list-columns (originating from JSON):
library(tidyr)
library(dplyr)
df <- tibble(id = c(1, 2, 3),
             branch = list(NULL,
                           list(colA = 'abc', colB = 'mno'),
                           list(list(colA = 'def', colB = 'uvw'),
                                list(colA = 'ghi', colB = 'xyz'))))
I want to unnest_wider column 'branch'. That works with rows 1 and 2:
df %>%
  slice(1:2) %>%
  unnest_wider(branch)
However, row 3 consists of a list of lists, which I have to unnest_longer first:
bind_rows(
  df %>% slice(1, 2),
  df %>% slice(3) %>% unnest_longer(branch)
) %>%
  unnest_wider(branch)
The above code gives the desired output, but I'm looking for a generic solution along these lines: if an element of column 'branch' is an unnamed list (indicating a list of lists), unnest_longer it first; afterwards apply unnest_wider to the whole column 'branch'.
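For reference, the condition I have in mind could be tested roughly like this (a sketch; NULL is not a list, so row 1 comes out FALSE):

needs_longer <- function(x) is.list(x) && is.null(names(x))
sapply(df$branch, needs_longer)
# [1] FALSE FALSE  TRUE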
Any help appreciated!

First convert the leaves to data frames and then unnest the result.
library(dplyr)
library(tidyr)
leaf2df <- function(x) {
  if (length(names(x))) as.data.frame(x)   # named leaf: one-row data frame
  else if (is.list(x)) lapply(x, leaf2df)  # unnamed list: recurse into it
}
df %>%
  rowwise() %>%
  mutate(branch = list(bind_rows(leaf2df(branch)))) %>%
  ungroup() %>%
  unnest(branch, keep_empty = TRUE)
giving:
# A tibble: 4 × 3
     id colA  colB
  <dbl> <chr> <chr>
1     1 <NA>  <NA>
2     2 abc   mno
3     3 def   uvw
4     3 ghi   xyz
Because leaf2df is recursive, it should continue to work as long as all leaves in any row have the same parent. For example, below we have made the list in the last row one level deeper and it still works.
df <- tibble(id = c(1, 2, 3),
             branch = list(NULL,
                           list(colA = 'abc', colB = 'mno'),
                           list(list(list(colA = 'def', colB = 'uvw'),
                                     list(colA = 'ghi', colB = 'xyz')))))
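As a quick check, re-running the same pipeline on this deeper df gives the same four-row result as before (leaf2df recurses past the extra level and bind_rows flattens what it returns):

df %>%
  rowwise() %>%
  mutate(branch = list(bind_rows(leaf2df(branch)))) %>%
  ungroup() %>%
  unnest(branch, keep_empty = TRUE)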

A little bit convoluted but here's a possible solution:
1. Iterate through the rows of your df.
2. Determine whether it's a named list by checking names(df$branch[[index]]).
3. If unnamed --> slice + unnest_longer; if named --> slice.
4. Finally, unnest_wider().
library(tidyr)
library(dplyr)
library(purrr)
map_df(1:nrow(df), function(x) {
  if (is.null(names(df$branch[[x]]))) {
    df %>% slice(x) %>% unnest_longer(branch)
  } else {
    df %>% slice(x)
  }
}) %>%
  unnest_wider(branch)
Which returns:
# A tibble: 4 × 3
     id colA  colB
  <dbl> <chr> <chr>
1     1 NA    NA
2     2 abc   mno
3     3 def   uvw
4     3 ghi   xyz

library(tidyverse)
df <- tibble(
  id = c(1, 2, 3),
  branch = list(
    NULL, list(colA = "abc", colB = "mno"),
    list(
      list(colA = "def", colB = "uvw"),
      list(colA = "ghi", colB = "xyz")
    )
  )
)
unnester <- function(x, grp) {
  if (grp) {
    x <- x |> unnest_longer(branch)
  }
  unnest_wider(x, branch)
}
df |>
  rowwise() |>
  mutate(grp = length(names(unlist(branch))) > 2) |>
  ungroup() |>
  split(~grp) |>
  imap_dfr(~ unnester(.x, .y)) |>
  select(-grp)
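The grp flag is a heuristic: unlist() flattens each element, and a single named row contributes at most two names (colA, colB), while an unnamed list of rows contributes more. A sketch of what the test sees per row:

sapply(df$branch, function(b) length(names(unlist(b))))
# [1] 0 2 4   -- only row 3 exceeds 2, so only it gets unnest_longer()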

The following approach first modifies the list so that all leaves sit at the same level, after which we can unnest all rows the same way. (Note that .x[[1]] on a NULL element is itself NULL, so the NULL row also gets wrapped and survives as NAs after unnesting.)
library(tidyr)
library(purrr)
library(dplyr)
mutate(df, branch = map(
  .x = branch,
  .f = ~ if (is.list(.x[[1]])) .x else list(.x)
)) |>
  unnest_longer(branch) |>
  unnest_wider(branch)
#> # A tibble: 4 × 3
#>      id colA  colB
#>   <dbl> <chr> <chr>
#> 1     1 <NA>  <NA>
#> 2     2 abc   mno
#> 3     3 def   uvw
#> 4     3 ghi   xyz

Related

How to use dplyr mutate to create new columns from inputting one column into a function that returns a list?

I know the title is a mouthful. I have a function that returns a list. I would like to use dplyr mutate to put each value in the column through the function and put the items in the list returned into new columns.
My example:
library(dplyr)
my_df <- data_frame(filename = c("file1", "file2", "file3", "file4"),
                    digits_only = c("10101010", "11011011", "10011000", "11111111"))
# (data_frame() is deprecated; tibble() is the current equivalent)
compress_it_list <- function(txt) {
  len.raw <- sum(nchar(txt))
  len.xz <- length(memCompress(txt, "x"))
  len.gz <- length(memCompress(txt, "g"))
  len.bz2 <- length(memCompress(txt, "b"))
  return(list("len_raw" = len.raw,
              "len_xz" = len.xz,
              "len_gz" = len.gz,
              "len_bz2" = len.bz2,
              "min_compression" = min(c(len.raw, len.xz, len.gz, len.bz2))))
}
I could make the function return a dataframe, but I think I'd have the same problem.
compress_it_df <- function(txt) {
  len.raw <- sum(nchar(txt))
  len.xz <- length(memCompress(txt, "x"))
  len.gz <- length(memCompress(txt, "g"))
  len.bz2 <- length(memCompress(txt, "b"))
  return(data_frame("len_raw" = len.raw,
                    "len_xz" = len.xz,
                    "len_gz" = len.gz,
                    "len_bz2" = len.bz2,
                    "min_compression" = min(c(len.raw, len.xz, len.gz, len.bz2))))
}
I was trying to figure out something in the vein of
new_df <- my_df %>%
  mutate_at(.vars = digits_only, .funs = compress_it_list)
Here, we have the option of unnest_wider
library(dplyr)
library(tidyr)
library(purrr)
my_df %>%
  mutate(new = map(digits_only, compress_it_list)) %>%
  unnest_wider(c(new))
# A tibble: 4 x 7
#  filename digits_only len_raw len_xz len_gz len_bz2 min_compression
#  <chr>    <chr>         <int>  <int>  <int>   <int>           <int>
#1 file1    10101010          8     60     12      39               8
#2 file2    11011011          8     60     13      39               8
#3 file3    10011000          8     60     16      39               8
#4 file4    11111111          8     64     11      39               8
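Since the question also defines compress_it_df, the same pattern should work with the data-frame version and plain unnest(), which unpacks a list of one-row data frames into columns (a sketch):

my_df %>%
  mutate(new = map(digits_only, compress_it_df)) %>%
  unnest(new)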

Compact a data frame by removing some of the NA cells?

If I have a data frame as follows:
data.frame(
  cola = c(3, NA, NA),
  colb = c(1, NA, NA),
  colc = c(NA, 6, NA),
  cold = c(NA, 7, NA),
  cole = c(NA, 3, NA),
  colf = c(NA, NA, 9),
  colg = c(NA, NA, 8)
)
How can I shift all fields to the left, eliminating NA values as needed, as here:
data.frame(
  cola = c(3, 6, 9),
  colb = c(1, 7, 8),
  colc = c(NA, 3, NA)
)
Thanks for any help.
We loop through the rows, remove the NA elements with na.omit, and then append NA at the end based on the maximum length in the list:
lst <- apply(df1, 1, na.omit)
out <- as.data.frame(do.call(rbind, lapply(lst, `length<-`, max(lengths(lst)))))
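If the automatic column names are not what you want, a final renaming step can reuse the original names positionally (a sketch, mirroring the setNames() trick in the data.table answer below):

names(out) <- names(df1)[seq_along(out)]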
Another option could be:
library(tidyverse)
df %>%
  rownames_to_column() %>%
  gather(measure, value, -rowname) %>%
  group_by(rowname) %>%
  na.omit() %>%
  mutate(measure = paste0("col", row_number())) %>%
  spread(measure, value) %>%
  ungroup() %>%
  select(-rowname)
#   col1 col2 col3
# 1    3    1   NA
# 2    6    7    3
# 3    9    8   NA
You can transpose, drop NAs, transpose back:
library(magrittr)
library(data.table)
DF %>% transpose %>% lapply(na.omit) %>% transpose %>%
  data.frame %>% setNames(names(DF)[seq_along(.)])
  cola colb colc
1    3    1   NA
2    6    7    3
3    9    8   NA

Remove duplicates, keeping most frequent row

I would like to deduplicate my data, keeping the row that has the most frequent appearances. If there is a tie in rows, I don't care which gets returned—the first in alphabetical or numeric order is fine. I would like to do this by group of id and var.
MRE:
df <- data.frame(
  id = rep("a", 8),
  var = c(rep("b", 4), rep("c", 4)),
  val = c("d", "d", "d", "e", "f", "f", "g", "g")
)
> df
  id var val
1  a   b   d
2  a   b   d
3  a   b   d
4  a   b   e
5  a   c   f
6  a   c   f
7  a   c   g
8  a   c   g
Should be:
  id var val
1  a   b   d
2  a   c   f
I'm working with large datasets and tidyverse pipe chains, so a dplyr solution would be preferable.
Use table and which.max to extract the mode:
df %>%
  group_by(id, var) %>%
  summarise(val = {t <- table(val); names(t)[which.max(t)]})
# A tibble: 2 x 3
# Groups: id [?]
#  id    var   val
#  <fct> <fct> <chr>
#1 a     b     d
#2 a     c     f
Another way to do this in base R: create a three-way contingency table directly, then collapse the val dimension by taking the name of the maximum count:
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
#    var
# id b   c
#  a "d" "f"
Convert this to a data frame:
as.data.frame.table(
apply(table(df), c(1, 2), function(v) names(v)[which.max(v)])
)
#  id var Freq
#1  a   b    d
#2  a   c    f
Using dplyr:
library(dplyr)
df %>%
  group_by(id, var, val) %>%
  summarise(n = n()) %>%
  group_by(id, var) %>%
  arrange(-n) %>%
  slice(1) %>%
  ungroup() %>%
  select(-n)
# # A tibble: 2 x 3
#   id    var   val
#   <fct> <fct> <fct>
# 1 a     b     d
# 2 a     c     f
One option could be using table and max as:
library(dplyr)
df %>%
  group_by(id, var) %>%
  filter(table(val) == max(table(val))) %>%
  slice(1)
# # A tibble: 2 x 3
# # Groups: id, var [2]
#   id     var    val
#   <fctr> <fctr> <fctr>
# 1 a      b      d
# 2 a      c      g
NOTE: the a c g row reflects a tie; per the OP, any record can be returned in case of a tie.
I doubt this is any faster, but another option is
df %>%
  group_by(id, var) %>%
  # rle() measures runs of equal (contiguous) values, which.max() picks the
  # longest run, and summing the lengths up to it gives the row index of that
  # run's last element, so exactly one row of the most frequent value is kept
  filter(row_number() == rle(as.character(val))$lengths %>%
           {sum(.[1:which.max(.)])})
A dplyr solution using count:
library(dplyr)
df %>%
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  summarize_at("val", head, 1)
# # A tibble: 2 x 3
#   id     var    val
#   <fctr> <fctr> <fctr>
# 1 a      b      d
# 2 a      c      f
or maybe more idiomatic but longer:
df %>%
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup
Or with tally for same output with slightly different syntax:
df %>%
  group_by(id, var, val) %>%
  tally(sort = TRUE) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup
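In more recent dplyr (a sketch assuming dplyr >= 1.0), slice_head() expresses the same take-the-first-row-per-group step:

df %>%
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  slice_head(n = 1) %>%
  select(-n) %>%
  ungroup()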
and a base solution:
df2 <- aggregate(x ~ ., cbind(df, x = 1), sum)
aggregate(val ~ id + var, df2[order(-df2$x), ], head, 1)
#   id var val
# 1  a   b   d
# 2  a   c   f
Here is my try:
library(dplyr)
df %>%
  group_by(id, var, val) %>%
  mutate(n = n()) %>%
  arrange(desc(n)) %>%
  group_by(id, var) %>%
  filter(row_number() == 1) %>%
  select(-n)

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame(var1 = rnorm(100),
                            var2 = letters[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var3 = LETTERS[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var4 = month.abb[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor()))
I would like to group myData to eventually find summary data for all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
                list(groupNames),
                seq_along(groupNames),
                simplify = FALSE) %>%
  unlist(recursive = FALSE)
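This leaves myGroups as a list of seven character vectors, one per combination (a sketch of its shape):

str(myGroups)
# List of 7
#  $ : chr "var2"
#  $ : chr "var3"
#  $ : chr "var4"
#  $ : chr [1:2] "var2" "var3"
#  $ : chr [1:2] "var2" "var4"
#  $ : chr [1:2] "var3" "var4"
#  $ : chr [1:3] "var2" "var3" "var4"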
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)) {
  assign(myGroups[i] %>%
           unlist() %>%
           paste0(collapse = "") %>%
           paste0("Data"),
         myData %>%
           group_by_(lapply(myGroups[[i]], as.symbol)) %>%
           summarise(n = length(var1),
                     avgVar2 = var2 %>% mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dplyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convoluted, and there's probably a way to simplify or fancy it up with do(), but it works. Using your myData and myGroups,
results <- lapply(myGroups, FUN = function(x) {
  do.call(what = group_by_, args = c(list(myData), x)) %>%
    summarise(n = length(var1),
              avgVar1 = mean(var1))
})
> results[[1]]
Source: local data frame [3 x 3]
  var2  n     avgVar1
1    a 31  0.38929738
2    b 31 -0.07451717
3    c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
  var2 var3  n    avgVar1
1    a    A 11 -0.1159160
2    a    B 11  0.5663312
3    a    C  9  0.7904056
4    b    A  7  0.0856384
5    b    B 13  0.1309756
6    b    C 11 -0.4192895
7    c    A 15 -0.2783099
8    c    B 10 -0.1110877
9    c    C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of @Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame(var1 = rnorm(100),
                            var2 = letters[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var3 = LETTERS[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor(),
                            var4 = month.abb[1:3] %>%
                              sample(100, replace = TRUE) %>%
                              factor()))
Function combSummarise
combSummarise <- function(data, variables = ..., summarise = ...) {
  # Get all different combinations of selected variables (credit to @Michael)
  myGroups <- lapply(seq_along(variables), function(x) {
    combn(c(variables), x, simplify = FALSE)
  }) %>%
    unlist(recursive = FALSE)
  # Group by selected variables (credit to @konvas)
  df <- eval(parse(text = paste("lapply(myGroups, function(x){
    dplyr::group_by_(data, .dots=x) %>%
      dplyr::summarize_( \"", paste(summarise, collapse = "\",\""), "\")})"))) %>%
    do.call(plyr::rbind.fill, .)
  groupNames <- c(myGroups[[length(myGroups)]])
  newNames <- names(df)[!(names(df) %in% groupNames)]
  df <- cbind(df[, groupNames], df[, newNames])
  names(df) <- c(groupNames, newNames)
  df
}
Call of combSummarise
combSummarise(myData, var = c("var2", "var3", "var4"),
              summarise = c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise(myData, var = c("var2", "var4"),
              summarise = c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise(myData, var = c("var2", "var4"),
              summarise = c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr-style function that runs summarise for all combinations of grouping variables.
summarise_combo <- function(data, ...) {
  groupVars <- group_vars(data) %>% map(as.name)
  groupCombos <- map(0:length(groupVars), ~ combn(groupVars, ., simplify = FALSE)) %>%
    unlist(recursive = FALSE)
  results <- groupCombos %>%
    map(function(x) {data %>% group_by(!!!x) %>% summarise(...)}) %>%
    bind_rows()
  results %>% select(!!!groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a combined column is a simple way:
library(tidyverse)
df <- tibble(
  a = c(1, 1, 2, 2, 1, 1, 2, 2),
  b = c(3, 4, 3, 4, 3, 4, 3, 4),
  val = c(1, 2, 3, 4, 5, 6, 7, 8)
)
print(df)  # output1
df_2 <- unite(df, 'combined_header', a, b, sep = '_', remove = FALSE)  # remove = FALSE keeps the existing columns
print(df_2)  # output2
df_2 %>%
  group_by(combined_header) %>%
  summarize(avg_val = mean(val)) %>%
  print()  # output3
# avg 1_3 = mean(1, 5) = 3; avg 1_4 = mean(2, 6) = 4
Output:
output1
      a     b   val
  <dbl> <dbl> <dbl>
1     1     3     1
2     1     4     2
3     2     3     3
4     2     4     4
5     1     3     5
6     1     4     6
7     2     3     7
8     2     4     8
output2
  combined_header     a     b   val
  <chr>           <dbl> <dbl> <dbl>
1 1_3                 1     3     1
2 1_4                 1     4     2
3 2_3                 2     3     3
4 2_4                 2     4     4
5 1_3                 1     3     5
6 1_4                 1     4     6
7 2_3                 2     3     7
8 2_4                 2     4     8
output3
  combined_header avg_val
  <chr>             <dbl>
1 1_3                   3
2 1_4                   4
3 2_3                   5
4 2_4                   6

Summarize all group values and a conditional subset in the same call

I'll illustrate my question with an example.
Sample data:
df <- data.frame(ID = c(1, 1, 2, 2, 3, 5),
                 A = c("foo", "bar", "foo", "foo", "bar", "bar"),
                 B = c(1, 5, 7, 23, 54, 202))
df
  ID   A   B
1  1 foo   1
2  1 bar   5
3  2 foo   7
4  2 foo  23
5  3 bar  54
6  5 bar 202
What I want to do is to summarize, by ID, the sum of B and the sum of B when A is "foo". I can do this in a couple of steps like:
require(magrittr)
require(dplyr)
df1 <- df %>%
  group_by(ID) %>%
  summarize(sumB = sum(B))
df2 <- df %>%
  filter(A == "foo") %>%
  group_by(ID) %>%
  summarize(sumBfoo = sum(B))
left_join(df1, df2)
  ID sumB sumBfoo
1  1    6       1
2  2   30      30
3  3   54      NA
4  5  202      NA
However, I'm looking for a more elegant/faster way, as I'm dealing with 10gb+ of out-of-memory data in sqlite.
require(sqldf)
my_db <- src_sqlite("my_db.sqlite3", create = TRUE)
df_sqlite <- copy_to(my_db, df)
I thought of using mutate to define a new Bfoo column:
df_sqlite %>%
  mutate(Bfoo = ifelse(A == "foo", B, 0))
Unfortunately, this doesn't work on the database end of things.
Error in sqliteExecStatement(conn, statement, ...) :
RS-DBI driver: (error in statement: no such function: IFELSE)
You can do both sums in a single dplyr statement:
df1 <- df %>%
  group_by(ID) %>%
  summarize(sumB = sum(B),
            sumBfoo = sum(B[A == "foo"]))
And here is a data.table version:
library(data.table)
dt <- setDT(df)
dt1 <- dt[, .(sumB = sum(B),
              sumBfoo = sum(B[A == "foo"])),
          by = ID]
dt1
   ID sumB sumBfoo
1:  1    6       1
2:  2   30      30
3:  3   54       0
4:  5  202       0
Writing up @hadley's comment as an answer (an inline if ... else translates to SQL's CASE WHEN, which is presumably why it works on the database where ifelse() does not):
df_sqlite %>%
  group_by(ID) %>%
  mutate(Bfoo = if (A == "foo") B else 0) %>%
  summarize(sumB = sum(B),
            sumBfoo = sum(Bfoo)) %>%
  collect
If you want to do counting instead of summarizing, then the answer is somewhat different. The change in code is small, especially in the conditional counting part.
df1 <- df %>%
  group_by(ID) %>%
  summarize(countB = n(),
            countBfoo = sum(A == "foo"))
df1
Source: local data frame [4 x 3]
  ID countB countBfoo
1  1      2         1
2  2      2         2
3  3      1         0
4  5      1         0
If you wanted to count the rows instead of summing them, can you pass a condition to the function?
df1 <- df %>%
  group_by(ID) %>%
  summarize(RowCountB = n(),
            RowCountBfoo = n(A == "foo"))
I get an error both with n() and nrow().
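That fails because n() takes no arguments. A conditional row count is usually written as the sum of a logical vector instead (a sketch, mirroring the countBfoo line above):

df1 <- df %>%
  group_by(ID) %>%
  summarize(RowCountB = n(),
            RowCountBfoo = sum(A == "foo"))  # TRUE counts as 1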
