How to filter nested data - r

How can I filter a nested dataset (make sure the nest is the exact same as some reference vector or tibble)?
library(tidyverse)
rev_vec <- c("apple", "pear", "banana")
df <- tibble(
ID= rep(1:3, each =3),
fruits = c("apple", "pear", "banana",
"Pineapple", "Pineapple", "orange",
"lime", "pear", NA))
df_vec <- df %>%
group_by(ID) %>%
summarise(fruits = list(unique(fruits)))
## This does not work
df_vec %>%
filter(fruits == rev_vec)
## This does not work
df_vec %>%
filter(unlist(fruits) == rev_vec)
## This does not work
df_vec %>%
filter(all(unlist(fruits[[1]]) ==rev_vec))
Basically, I just need to know which ID (in this case 1) matches the reference vector
expected outcome
Only ID 1 matches the rev vec.
df_vec %>%
filter(....)
# A tibble: 1 x 2
ID fruits
<int> <list>
1 1 <chr [3]>

df_vec %>%
filter(map_lgl(fruits, ~setequal(., rev_vec)))
# A tibble: 1 x 2
ID fruits
<int> <list>
1 1 <chr [3]>

Not sure how you want the output structured, but here is an idea
library(dplyr)
df %>%
group_by(ID) %>%
mutate(new = sum(fruits %in% rev_vec) == n())
# A tibble: 9 x 3
# Groups: ID [3]
ID fruits new
<int> <chr> <lgl>
1 1 apple TRUE
2 1 pear TRUE
3 1 banana TRUE
4 2 Pineapple FALSE
5 2 Pineapple FALSE
6 2 orange FALSE
7 3 lime FALSE
8 3 pear FALSE
9 3 NA FALSE
Another output,
df %>%
group_by(ID) %>%
mutate(new = sum(fruits %in% rev_vec) == n()) %>%
filter(new) %>%
nest()
# A tibble: 1 x 2
# Groups: ID [1]
ID data
<int> <list>
1 1 <tibble [3 x 2]>

Perhaps you could try using identical to see if the fruits for each ID are exactly identical to the reference vector.
library(tidyverse)
df %>%
group_by(ID) %>%
filter(identical(fruits, rev_vec))
Output
ID fruits
<int> <chr>
1 1 apple
2 1 pear
3 1 banana

Related

How to filter out groups empty for 1 column in Tidyverse

tibble(
A = c("A","A","B","B"),
x = c(NA,NA,NA,1),
y = c(1,2,3,4),
) %>% group_by(A) -> df
desired output:
tibble(
A = c("B","B"),
x = c(NA,1)
y = c(3,4),
)
I want to find all groups for which all elements of x and x only are all NA, then remove those groups. "B" is filtered in because it has at least 1 non NA element.
I tried:
df %>%
filter(all(!is.na(x)))
but it seems that filters out if it finds at least 1 NA; I need the correct word, which is not all.
This will remove groups of column A if all elements of x are NA:
library(dplyr)
df %>%
group_by(A) %>%
filter(! all(is.na(x)))
# A tibble: 2 × 3
# Groups: A [1]
# A x y
# <chr> <dbl> <dbl>
#1 B NA 3
#2 B 1 4
Note that group "A" was removed because both cells in the column x are not defined.
We can use any with complete.cases
library(dplyr)
df %>%
group_by(A) %>%
filter(any(complete.cases(x))) %>%
ungroup
-output
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4
In the devel version of dplyr, we could use .by in filter thus we don't need to group_by/ungroup
df %>%
filter(any(complete.cases(x)), .by = 'A')
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4

summarise based on multiple groups in R dplyr

I have a large data frame that looks like this
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
Created on 2022-04-13 by the reprex package (v2.0.1)
For each id, I want to keep the fruit with the maximum counts and then I want to add the sum_counts of unique fruits per id in another column.
I want my data to look like this:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
This is what I have tried so far and I do not know why I fail miserably
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
Here's a dplyr approach.
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id) %>%
mutate(fruits = paste0(unique(fruit), collapse = ", "),
sum_counts = sum(unique(counts))) %>%
filter(counts == max(counts)) %>%
distinct() %>%
rename("central_fruit" = "fruit") %>%
select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
This should work:
df |>
group_by(id) |>
distinct(fruit, .keep_all = TRUE) |>
mutate(
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
If you want to order the fruits column then I wouldn't store fruits as a character vector, but as a list of factors.
And another dplyr approach but preserving the fruits order (central_fruit is first in fruits):
df %>%
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
This returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20

match data frames based on multiple columns in R

I have two huge datasets that look like this.
there is one fruit from df2, PEACH, which is missing for any reason from df1.
I want to add in df1 the fruits that are missing.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
any help or advice are highly appreciated
Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruit, ","))))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
You may get the dataset in a long format by splitting on comma fruits and counts variable, do a full_join with df2, adjust the NA values and for each central_fruit collapse the values.
library(dplyr)
library(tidyr)
df1 %>%
separate_rows(fruits, counts, convert = TRUE) %>%
full_join(df2, by = c('fruits' = 'fruit')) %>%
transmute(central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
fruits = ifelse(is.na(counts.x), NA, fruits),
counts = coalesce(counts.x, counts.y)) %>%
group_by(central_fruit) %>%
summarise(across(.fns = toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000

How to paste strings between tibble's character column and rows in a nested list-column

I have a tibble with one character column and one list-column that nests dataframes. I want to collapse the dataframes in the list-column (using dplyr::bind_rows()) and append the respective value from the character column for each row.
Example
library(tibble)
my_tibble <-
tibble(category = c("color", "shape"),
items = list(tibble(item = c("red", "blue"), value = c(1, 2)),
tibble(item = c("square", "triangle"), value = c(1, 2))
))
> my_tibble
## # A tibble: 2 x 2
## category items
## <chr> <list>
## 1 color <tibble [2 x 2]>
## 2 shape <tibble [2 x 2]>
I know how to collapse the entire items column:
library(dplyr)
my_tibble %>%
pull(items) %>%
bind_rows()
## # A tibble: 4 x 2
## item value
## <chr> <dbl>
## 1 red 1
## 2 blue 2
## 3 square 1
## 4 triangle 2
But what I'm trying to achieve is to paste the values from the category column of my_tibble to get:
desired output
## # A tibble: 4 x 2
## item value
## <chr> <dbl>
## 1 color_red 1
## 2 color_blue 2
## 3 shape_square 1
## 4 shape_triangle 2
How can I do this?
UPDATE
I think that tidyr::unnest_longer() brings me closer to the target:
library(tidyr)
my_tibble %>%
unnest_longer(items)
# A tibble: 4 x 2
category items$item $value
<chr> <chr> <dbl>
1 color red 1
2 color blue 2
3 shape square 1
4 shape triangle 2
But not sure how to progress. Trying to append with tidyr::unite() fails:
my_tibble %>%
unnest_longer(items) %>%
unite("category", `items$item`)
Error: Can't subset columns that don't exist.
x Column items$item doesn't exist.
unnest() returns an output that's easier to work with than unnest_longer():
library(tidyr)
my_tibble %>%
unnest(items) %>%
unite(col = item, category, item)
## # A tibble: 4 x 2
## item value
## <chr> <dbl>
## 1 color_red 1
## 2 color_blue 2
## 3 shape_square 1
## 4 shape_triangle 2
It's not the nicer way, but it works. Try this:
library(dlpyr)
my_tibble %>%
group_by(category) %>%
group_modify(~data.frame(.$items)) %>%
ungroup() %>%
mutate(item=paste(category,item,sep="_")) %>%
select(-category)

how to "spread" a list-column?

Consider this simple example
mydf <- data_frame(regular_col = c(1,2),
normal_col = c('a','b'),
weird_col = list(list('hakuna', 'matata'),
list('squash', 'banana')))
> mydf
# A tibble: 2 x 3
regular_col normal_col weird_col
<dbl> <chr> <list>
1 1 a <list [2]>
2 2 b <list [2]>
I would like to extract the elements of weird_col (programmatically, the number of elements may change) so that each element is placed on a different column. That is, I expect the following output
> data_frame(regular_col = c(1,2),
+ normal_col = c('a','b'),
+ weirdo_one = c('hakuna', 'squash'),
+ weirdo_two = c('matata', 'banana'))
# A tibble: 2 x 4
regular_col normal_col weirdo_one weirdo_two
<dbl> <chr> <chr> <chr>
1 1 a hakuna matata
2 2 b squash banana
However, I am unable to do so in simple terms. For instance, using the classic unnest fails here, as it expands the dataframe instead of placing each element of the list in a different column.
> mydf %>% unnest(weird_col)
# A tibble: 4 x 3
regular_col normal_col weird_col
<dbl> <chr> <list>
1 1 a <chr [1]>
2 1 a <chr [1]>
3 2 b <chr [1]>
4 2 b <chr [1]>
Is there any solution in the tidyverse for that?
You can extract the values from the output of unnest, process a little to make your column names, and then spread back out. Note that I use flatten_chr because of your depth-one list-column, but if it is nested you can use flatten and spread works just as well on list-cols.
library(tidyverse)
#> Warning: package 'dplyr' was built under R version 3.5.1
mydf <- data_frame(
regular_col = c(1, 2),
normal_col = c("a", "b"),
weird_col = list(
list("hakuna", "matata"),
list("squash", "banana")
)
)
mydf %>%
unnest(weird_col) %>%
group_by(regular_col, normal_col) %>%
mutate(
weird_col = flatten_chr(weird_col),
weird_colname = str_c("weirdo_", row_number())
) %>% # or just as.character
spread(weird_colname, weird_col)
#> # A tibble: 2 x 4
#> # Groups: regular_col, normal_col [2]
#> regular_col normal_col weirdo_1 weirdo_2
#> <dbl> <chr> <chr> <chr>
#> 1 1 a hakuna matata
#> 2 2 b squash banana
Created on 2018-08-12 by the reprex package (v0.2.0).
unnest develops lists and vectors vertically, and one row data frames horizontally. So what we can do is change your lists into data frames (with adequate column names) and unnest afterwards.
mydf %>% mutate(weird_col = map(weird_col,~ as_data_frame(
setNames(.,paste0("weirdo_",1:length(.)))
))) %>%
unnest
# # A tibble: 2 x 4
# regular_col normal_col weirdo_1 weirdo_2
# <dbl> <chr> <chr> <chr>
# 1 1 a hakuna matata
# 2 2 b squash banana

Resources