summarise based on multiple groups in R dplyr - r

I have a large data frame that looks like this
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
Created on 2022-04-13 by the reprex package (v2.0.1)
For each id, I want to keep the fruit with the maximum counts and then I want to add the sum_counts of unique fruits per id in another column.
I want my data to look like this:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
This is what I have tried so far and I do not know why I fail miserably
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)

Here's a dplyr approach.
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
# For each id: list the distinct fruits, total the counts over distinct fruits,
# and keep the row(s) holding the maximum count as the central fruit.
df %>%
  group_by(id) %>%
  mutate(
    fruits = paste0(unique(fruit), collapse = ", "),
    # Count each fruit once (its first occurrence). sum(unique(counts)) would
    # wrongly merge two different fruits that happen to share the same count.
    sum_counts = sum(counts[!duplicated(fruit)])
  ) %>%
  # Ties on the maximum keep every tied row.
  filter(counts == max(counts)) %>%
  # Drop the repeated rows of a fruit that appears more than once.
  distinct() %>%
  rename("central_fruit" = "fruit") %>%
  select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)

This should work:
# Per id: keep one row per fruit, total those counts, list the fruits, and
# keep the row(s) with the highest count as the central fruit.
df |>
group_by(id) |>
# Grouped distinct(): one row per fruit within each id; .keep_all retains the
# other columns from the first occurrence.
distinct(fruit, .keep_all = TRUE) |>
mutate(
# TRUE on the row(s) whose count equals the maximum of the id group.
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
# Drop the helper columns and rename fruit to central_fruit in one step.
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
If you want to order the fruits column then I wouldn't store fruits as a character vector, but as a list of factors.

And another dplyr approach but preserving the fruits order (central_fruit is first in fruits):
# Per id: total the counts of the de-duplicated rows, list the fruits in
# descending count order, and keep the top row as the central fruit.
df %>%
# Remove rows that are duplicated across all columns.
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
# Highest count first within each id, so the central fruit leads the list.
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
# First row of each id group, i.e. the fruit with the largest count.
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
This returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20

Related

collapse list column elements to a string [duplicate]

This question already has answers here:
Collapse / concatenate / aggregate a column to a single comma separated string within each group
(6 answers)
Closed 8 months ago.
I would like to collapse a subcategory into a comma-delimited string for display.
This is what I have.
library(tibble)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
eats
#> # A tibble: 5 x 2
#> food variety
#> <chr> <chr>
#> 1 fruit cherry
#> 2 fruit apple
#> 3 fruit peach
#> 4 nut cashew
#> 5 nut almonds
This is what I want:
eats2 <- tribble(
~food,~varieties,
"fruit","cherry, apple, peach",
"nut","cashew, almond"
)
eats2
#> # A tibble: 2 x 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almond
This is what I've tried:
eats %>%
nest(data=variety) %>%
mutate(data = paste(data,collapse = ""))
#> # A tibble: 2 x 2
#> food data
#> <chr> <chr>
#> 1 fruit "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
#> 2 nut "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
Nope.
eats %>%
nest(data=variety) %>%
map(~.x %>% mutate(varieties=paste(data,collapse = "")))
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "character"
Created on 2022-07-01 by the reprex package (v2.0.1)
Also nope. What is the right way to do this? Thanks.
You can use the following code using summarise and toString:
library(tibble)
library(dplyr)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
eats %>%
group_by(food) %>%
summarise(varieties = toString(variety)) %>%
ungroup()
#> # A tibble: 2 × 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almonds
Created on 2022-07-01 by the reprex package (v2.0.1)
Like @Quinten's solution, just using paste instead of toString:
library(dplyr)
eats %>%
group_by(food) %>%
summarise(varieties = paste(variety, collapse = ", ")) %>%
ungroup()
food varieties
<chr> <chr>
1 fruit cherry, apple, peach
2 nut cashew, almonds

How to filter nested data

How can I filter a nested dataset (make sure the nest is the exact same as some reference vector or tibble)?
library(tidyverse)
rev_vec <- c("apple", "pear", "banana")
df <- tibble(
ID= rep(1:3, each =3),
fruits = c("apple", "pear", "banana",
"Pineapple", "Pineapple", "orange",
"lime", "pear", NA))
df_vec <- df %>%
group_by(ID) %>%
summarise(fruits = list(unique(fruits)))
## This does not work
df_vec %>%
filter(fruits == rev_vec)
## This does not work
df_vec %>%
filter(unlist(fruits) == rev_vec)
## This does not work
df_vec %>%
filter(all(unlist(fruits[[1]]) ==rev_vec))
Basically, I just need to know which ID (in this case 1) matches the reference vector
expected outcome
Only ID 1 matches the rev vec.
df_vec %>%
filter(....)
# A tibble: 1 x 2
ID fruits
<int> <list>
1 1 <chr [3]>
df_vec %>%
filter(map_lgl(fruits, ~setequal(., rev_vec)))
# A tibble: 1 x 2
ID fruits
<int> <list>
1 1 <chr [3]>
Not sure how you want the output structured, but here is an idea
library(dplyr)
# Flag every row of an ID whose set of fruits equals the reference set.
# setequal() checks both directions; sum(fruits %in% rev_vec) == n() only
# verified that the group was a subset of rev_vec, so an ID holding nothing
# but "apple" would have been flagged as a match too.
df %>%
  group_by(ID) %>%
  mutate(new = setequal(fruits, rev_vec))
# A tibble: 9 x 3
# Groups: ID [3]
ID fruits new
<int> <chr> <lgl>
1 1 apple TRUE
2 1 pear TRUE
3 1 banana TRUE
4 2 Pineapple FALSE
5 2 Pineapple FALSE
6 2 orange FALSE
7 3 lime FALSE
8 3 pear FALSE
9 3 NA FALSE
Another output,
# Keep only the IDs whose set of fruits equals the reference set and nest
# their rows. setequal() is used because a plain `%in%` count only proves the
# group is a subset of rev_vec, not that every reference fruit is present.
df %>%
  group_by(ID) %>%
  mutate(new = setequal(fruits, rev_vec)) %>%
  filter(new) %>%
  nest()
# A tibble: 1 x 2
# Groups: ID [1]
ID data
<int> <list>
1 1 <tibble [3 x 2]>
Perhaps you could try using identical to see if the fruits for each ID are exactly identical to the reference vector.
library(tidyverse)
df %>%
group_by(ID) %>%
filter(identical(fruits, rev_vec))
Output
ID fruits
<int> <chr>
1 1 apple
2 1 pear
3 1 banana

match data frames based on multiple columns in R

I have two huge datasets that look like this.
There is one fruit in df2, PEACH, which for some reason is missing from df1.
I want to add in df1 the fruits that are missing.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
Any help or advice is highly appreciated.
Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
# Convert both tibbles to data.tables by reference.
setDT(df1)
setDT(df2)

# Every fruit already listed in df1. The column name is spelled out in full:
# `df1$fruit` only resolved to `fruits` through `$` partial matching.
known_fruits <- unlist(strsplit(df1$fruits, ","))

# Join df1 onto every row of df2, fall back to df2's count where df1 has none,
# then keep df1's central fruits plus the df2 fruits that df1 never lists.
df1[df2, on = .(central_fruit = fruit)
][, `:=`(counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, known_fruits))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
You may reshape the data into long format by splitting the fruits and counts columns on commas, do a full_join with df2, adjust the NA values, and then collapse the values for each central_fruit.
library(dplyr)
library(tidyr)
# One row per (fruit, count) pair, joined with df2 so that fruits missing from
# df1 appear, then collapsed back to one row per central_fruit.
df1 %>%
  separate_rows(fruits, counts, convert = TRUE) %>%
  full_join(df2, by = c('fruits' = 'fruit')) %>%
  transmute(
    # Rows that exist only in df2 have no central_fruit: use the fruit itself.
    central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
    # Such rows had no fruit list in df1, so leave fruits missing.
    fruits = ifelse(is.na(counts.x), NA, fruits),
    counts = coalesce(counts.x, counts.y)
  ) %>%
  group_by(central_fruit) %>%
  # Name the columns explicitly: calling across() without `.cols` is
  # deprecated in dplyr 1.1.0.
  summarise(across(everything(), toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000

Select the row with the maximum value in each group based on multiple columns in R dplyr

My data frame looks like this one
library(tidyverse)
df1 <- tibble(col1= c("apple","apple","banana","banana"),
col2 = c("appl","aple","banan","bananb"),
count_col1=c(1,1,4,4), count_col2=c(3,4,1,1))
df1
#> # A tibble: 4 × 4
#> col1 col2 count_col1 count_col2
#> <chr> <chr> <dbl> <dbl>
#> 1 apple appl 1 3
#> 2 apple aple 1 4
#> 3 banana banan 4 1
#> 4 banana bananb 4 1
Created on 2022-02-17 by the reprex package (v2.0.1)
After grouping by col1, I want to select the row(s) that have the maximum value across count_col1 and count_col2.
I want my data to look like this
col1 col2 count_col1 count_col2
apple aple 1 4
banana banan 4 1
banana bananb 4 1
for one column you can write something
df1 %>%
slice(which.max(count_col1))
but not for two
We may get the row-wise max of the 'count' columns with pmax, group by 'col1', and filter the rows where 'Max' equals the maximum of the 'Max' column within the group.
library(dplyr)
# Keep, for every col1 group, the row(s) whose larger count column reaches the
# group's maximum.
df1 %>%
  group_by(col1) %>%
  # pmax() is element-wise, so the helper column is the same whether it is
  # computed before or after grouping.
  mutate(Max = pmax(count_col1, count_col2)) %>%
  filter(Max == max(Max)) %>%
  ungroup() %>%
  select(-Max)
-output
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1
We may also use slice_max
library(purrr)
# Within each col1 group keep the row(s) whose largest count column value is
# the group maximum; slice_max() keeps ties by default.
df1 %>%
  group_by(col1) %>%
  # do.call() spreads the selected count columns into pmax() for a row-wise
  # maximum. It replaces purrr::invoke(), which is deprecated in purrr 1.0.0.
  slice_max(do.call(pmax, across(starts_with("count")))) %>%
  ungroup()
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1

How to replace string in a tibble?

I want to replace apple with frui and pear with bord
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
Any idea?
A tidyverse option using str_replace_all.
library(tidyverse)
mutate(df, word = str_replace_all(word, c('apple' = 'frui', 'pear' = 'bord')))
# # A tibble: 6 x 2
# word i
# <chr> <int>
# 1 frui 1
# 2 frui 2
# 3 frui 3
# 4 banana 4
# 5 bord 5
# 6 bord 6
We could use recode
library(dplyr)
df <- df %>%
mutate(word = recode(word, apple = 'frui', pear = 'bord'))
-output
df
# A tibble: 6 × 2
word i
<chr> <int>
1 frui 1
2 frui 2
3 frui 3
4 banana 4
5 bord 5
6 bord 6
For the sake of completeness, here is another tidyverse option using case_when.
library(tidyverse)
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
df %>%
  mutate(word = case_when(
    # case_when() uses the first condition that matches, so the specific
    # replacements must come before the catch-all that keeps other words.
    word == "apple" ~ "frui",
    word == "pear" ~ "bord",
    TRUE ~ word
  ))
#> # A tibble: 6 x 2
#> word i
#> <chr> <int>
#> 1 frui 1
#> 2 frui 2
#> 3 frui 3
#> 4 banana 4
#> 5 bord 5
#> 6 bord 6
Created on 2021-11-25 by the reprex package (v0.3.0)

Resources