match data frames based on multiple columns in R - r

I have two huge datasets that look like this.
There is one fruit in df2, PEACH, that is missing from df1 for some reason.
I want to add the missing fruits to df1.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
Any help or advice is highly appreciated.

Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
# Convert both tibbles to data.tables by reference.
setDT(df1)
setDT(df2)

# Every fruit already listed in one of df1's comma-separated `fruits` entries.
# The column is spelled out in full: `df1$fruit` only resolved to `fruits`
# through `$` partial matching.
known_fruits <- unlist(strsplit(df1$fruits, ","))

# 1. Join: one row per df2 fruit, carrying df1's columns where the fruit is a
#    central fruit (df2's numeric counts arrive as `i.counts`).
# 2. Fill missing character counts from df2 and drop the helper column.
# 3. Keep df1's central fruits plus the df2 fruits not known to df1 at all.
df1[df2, on = .(central_fruit = fruit)
][, `:=`(counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, known_fruits))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)

You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
# Flatten every comma-separated `fruits` entry of df1 into one character vector.
present <- unlist(str_split(df1$fruits, ","))

# Rows of df2 whose fruit is not listed anywhere in df1, renamed and retyped to
# line up with df1's columns, stacked on top of df1.
bind_rows(
  df2 |>
    filter(!(fruit %in% present)) |>
    rename(central_fruit = fruit) |>
    mutate(counts = as.character(counts)),
  df1
)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless

You can reshape the data into long format by splitting the fruits and counts columns on commas, do a full_join with df2, fill in the NA values, and then collapse the values for each central_fruit.
library(dplyr)
library(tidyr)
df1 %>%
  # One row per (fruit, count) pair; convert = TRUE re-types counts as numbers.
  separate_rows(fruits, counts, convert = TRUE) %>%
  # Bring in df2 so that fruits absent from df1 show up as extra rows.
  full_join(df2, by = c('fruits' = 'fruit')) %>%
  transmute(
    # A fruit that exists only in df2 becomes its own central fruit.
    central_fruit = coalesce(central_fruit, fruits),
    # Rows that came only from df2 have no variant list.
    fruits = ifelse(is.na(counts.x), NA, fruits),
    counts = coalesce(counts.x, counts.y)
  ) %>%
  group_by(central_fruit) %>%
  # across() without .cols is deprecated, so the columns are named explicitly.
  # Note that toString() renders a missing value as the string "NA".
  summarise(across(everything(), toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000

Related

Convert any element that does not start with a specific string to NA

I have a large data frame that looks like df2.
I want to convert any element across columns code1, code2 that does not start with
AT to NA.
library(tidyverse)
df2 <- tibble(type=c("Jeep", "4x4", "convertible"),
code1=c("ATG1",NA, "ATG2"),
code2=c("random", "ATG3", "xyz"))
df2
#> # A tibble: 3 × 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 random
#> 2 4x4 <NA> ATG3
#> 3 convertible ATG2 xyz
Created on 2022-09-29 with reprex v2.0.2
I want my data to look like this
#> type code1 code2
#>
#> 1 Jeep ATG1 NA
#> 2 4x4 NA ATG3
#> 3 convertible ATG2 NA
You could do
# Keep a code only when its first two characters are "AT"; everything else
# becomes NA (an NA input stays NA because the comparison itself is NA).
df2 %>%
mutate(across(code1:code2, ~ifelse(substr(.x, 1, 2) == 'AT', .x, NA)))
#> # A tibble: 3 x 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 NA
#> 2 4x4 NA ATG3
#> 3 convertible ATG2 NA
With replace and grepl:
# Blank out every value in the code* columns that does not match ^AT.
# grepl() returns FALSE for NA input, so existing NAs are simply set to NA again.
df2 %>%
mutate(across(starts_with("code"), ~ replace(.x, !grepl("^AT", .x), NA)))
Using case_when
library(dplyr)
library(stringr)
# case_when() with a single condition and no fallback: values matching ^AT are
# kept, every other value (including NA) falls through to NA.
df2 %>%
mutate(across(starts_with('code'), ~ case_when(str_detect(.x, '^AT')~ .x)))
-output
# A tibble: 3 × 3
type code1 code2
<chr> <chr> <chr>
1 Jeep ATG1 <NA>
2 4x4 <NA> ATG3
3 convertible ATG2 <NA>

collapse list column elements to a string [duplicate]

This question already has answers here:
Collapse / concatenate / aggregate a column to a single comma separated string within each group
(6 answers)
Closed 8 months ago.
I would like to collapse a subcategory into a comma-delimited string for display.
This is what I have.
library(tibble)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
eats
#> # A tibble: 5 x 2
#> food variety
#> <chr> <chr>
#> 1 fruit cherry
#> 2 fruit apple
#> 3 fruit peach
#> 4 nut cashew
#> 5 nut almonds
This is what I want:
eats2 <- tribble(
~food,~varieties,
"fruit","cherry, apple, peach",
"nut","cashew, almond"
)
eats2
#> # A tibble: 2 x 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almond
This is what I've tried:
eats %>%
nest(data=variety) %>%
mutate(data = paste(data,collapse = ""))
#> # A tibble: 2 x 2
#> food data
#> <chr> <chr>
#> 1 fruit "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
#> 2 nut "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
Nope.
eats %>%
nest(data=variety) %>%
map(~.x %>% mutate(varieties=paste(data,collapse = "")))
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "character"
Created on 2022-07-01 by the reprex package (v2.0.1)
Also nope. What is the right way to do this? Thanks.
You can use the following code using summarise and toString:
library(tibble)
library(dplyr)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
# Collapse each food's varieties into a single comma-separated string.
eats %>%
group_by(food) %>%
# toString() joins the values of the group with ", ".
summarise(varieties = toString(variety)) %>%
ungroup()
#> # A tibble: 2 × 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almonds
Created on 2022-07-01 by the reprex package (v2.0.1)
Like @Quinten's solution, just using paste instead of toString:
library(dplyr)
eats %>%
group_by(food) %>%
summarise(varieties = paste(variety, collapse = ", ")) %>%
ungroup()
food varieties
<chr> <chr>
1 fruit cherry, apple, peach
2 nut cashew, almonds

summarise based on multiple groups in R dplyr

I have a large data frame that looks like this
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
Created on 2022-04-13 by the reprex package (v2.0.1)
For each id, I want to keep the fruit with the maximum counts and then I want to add the sum_counts of unique fruits per id in another column.
I want my data to look like this:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
This is what I have tried so far and I do not know why I fail miserably
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
Here's a dplyr approach.
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
  group_by(id) %>%
  mutate(
    # All distinct fruits seen for this id, in order of first appearance.
    fruits = paste0(unique(fruit), collapse = ", "),
    # Count each distinct fruit once (its first row). sum(unique(counts)) would
    # also drop a *different* fruit that happens to share the same count.
    sum_counts = sum(counts[!duplicated(fruit)])
  ) %>%
  # Keep the row(s) with the highest count per id ...
  filter(counts == max(counts)) %>%
  # ... and collapse exact duplicates of that row.
  distinct() %>%
  rename("central_fruit" = "fruit") %>%
  select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
This should work:
df |>
group_by(id) |>
# Keep the first row of each fruit within an id so duplicates are counted once.
distinct(fruit, .keep_all = TRUE) |>
mutate(
# Flag the row(s) holding the group's highest count.
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
# Drop the helper columns and rename fruit to central_fruit in one step.
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
If you want to order the fruits column then I wouldn't store fruits as a character vector, but as a list of factors.
And another dplyr approach but preserving the fruits order (central_fruit is first in fruits):
df %>%
# Remove fully duplicated rows so a repeated (id, counts, fruit) is counted once.
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
# Highest count first within each id, so the central fruit leads the list.
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
# After sorting, the first row per id is the fruit with the maximum count.
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
This returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20

Select the row with the maximum value in each group based on multiple columns in R dplyr

My data frame looks like this one
library(tidyverse)
df1 <- tibble(col1= c("apple","apple","banana","banana"),
col2 = c("appl","aple","banan","bananb"),
count_col1=c(1,1,4,4), count_col2=c(3,4,1,1))
df1
#> # A tibble: 4 × 4
#> col1 col2 count_col1 count_col2
#> <chr> <chr> <dbl> <dbl>
#> 1 apple appl 1 3
#> 2 apple aple 1 4
#> 3 banana banan 4 1
#> 4 banana bananb 4 1
Created on 2022-02-17 by the reprex package (v2.0.1)
After grouping by col1, I want to select the row(s) that hold the maximum value across count_col1 and count_col2.
I want my data to look like this
col1 col2 count_col1 count_col2
apple aple 1 4
banana banan 4 1
banana bananb 4 1
for one column you can write something
df1 %>%
slice(which.max(count_col1))
but not for two
We can compute the row-wise maximum of the 'count' columns with pmax, then, grouped by 'col1', filter the rows whose 'Max' value equals the group's maximum.
library(dplyr)
df1 %>%
  group_by(col1) %>%
  # pmax() works element-wise, so the helper column is the per-row maximum of
  # the two count columns regardless of grouping.
  mutate(Max = pmax(count_col1, count_col2)) %>%
  # Keep every row that reaches its group's largest per-row maximum.
  filter(Max == max(Max)) %>%
  ungroup() %>%
  select(-Max)
-output
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1
We may also use slice_max
library(purrr)
df1 %>%
  group_by(col1) %>%
  # Row-wise maximum over all count* columns, passed to pmax() as separate
  # arguments. Base do.call() replaces the deprecated purrr::invoke().
  # slice_max() keeps tied rows by default.
  slice_max(do.call(pmax, across(starts_with("count")))) %>%
  ungroup()
# A tibble: 3 × 4
col1 col2 count_col1 count_col2
<chr> <chr> <dbl> <dbl>
1 apple aple 1 4
2 banana banan 4 1
3 banana bananb 4 1

In R, how do I compare for pattern and mismatched rows from two columns with a regex, row-by row?

Using the code below I managed to get the matched rows, but how can I get the mismatched rows?
ABData <- data.frame(a = c(1,2,3,4,5),b = c("London", "Oxford", "Berlin","Hamburg", "Oslo"),c = c("Hello London","No London","asdBerlin","No Match","OsLondonlohama"))
match<- ABData %>% rowwise() %>% filter(grepl(b,c))
Match Result:
a b c
1 1 London Hello London
2 3 Berlin asdBerlin
Along with the matched rows, I want the mismatched rows as well.
Please help me get the mismatched rows.
Thanks in advance.
I think this could help:
library(tidyverse)
ABData <- data.frame(a = c(1,2,3,4,5),
b = c("London", "Oxford", "Berlin","Hamburg", "Oslo"),
c = c("Hello London","No London","asdBerlin","No Match","OsLondonlohama"))
# Rows where the pattern in `b` is found in `c`.
match <- ABData %>%
  # Under rowwise(), `b` is a single pattern tested against that row's `c`.
  rowwise() %>%
  # Plain filter() replaces the superseded filter_at()/all_vars() pair.
  filter(grepl(b, c))
match
#> Source: local data frame [2 x 3]
#> Groups: <by row>
#>
#> # A tibble: 2 x 3
#> a b c
#> <dbl> <chr> <chr>
#> 1 1 London Hello London
#> 2 3 Berlin asdBerlin
# Rows where the pattern in `b` is NOT found in `c`.
no_match <- ABData %>%
  # Under rowwise(), `b` is a single pattern tested against that row's `c`.
  rowwise() %>%
  # Plain filter() replaces the superseded filter_at()/all_vars() pair.
  filter(!grepl(b, c))
no_match
#> Source: local data frame [3 x 3]
#> Groups: <by row>
#>
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <chr> <chr>
#> 1 2 Oxford No London
#> 2 4 Hamburg No Match
#> 3 5 Oslo OsLondonlohama
Created on 2020-06-03 by the reprex package (v0.3.0)
You can use str_detect from stringr which is vectorized over string as well as pattern so that you don't have to use rowwise.
subset(ABData, !stringr::str_detect(c, b))
# a b c
#2 2 Oxford No London
#4 4 Hamburg No Match
#5 5 Oslo OsLondonlohama
If you want to use it with dplyr :
library(dplyr)
ABData %>% filter(!stringr::str_detect(c, b))

Resources