Merge data frames based on custom condition - string comparison - r

I'd like to merge rows of two data frames - df1 and df2 using column A:
#df1
# First example table: key column A, three random integer-valued columns
# (B, C, D) and a comma-separated token string in E.
A <- rep(c("ab", "bc", "cd"), times = c(2, 3, 1))
B <- floor(runif(6, 0, 10))
C <- floor(runif(6, 0, 10))
D <- floor(runif(6, 0, 10))
E <- c("a, b, c", "a, d, e", "a, g, h", "d, e, f", "a, d, f", "f, j")
df1 <- data.frame(A, B, C, D, E)
df1
A B C D E
1 ab 5 4 3 a, b, c
2 ab 9 4 0 a, d, e
3 bc 4 4 9 a, g, h
4 bc 5 5 6 d, e, f
5 bc 1 6 6 a, d, f
6 cd 1 2 0 f, j
#df2
# Second example table: one row per key, a random integer-valued column B and
# the tokens (column E) that must all be present in df1 for a match.
A <- c("ab", "bc", "cd")
B <- floor(runif(3, 0, 10))
E <- c("a, d", "d, f", "n, m")
df2 <- data.frame(A, B, E)
df2
A B E
1 ab 4 a, d
2 bc 7 d, f
3 cd 1 n, m
I can do simply:
# Plain left join on the key column; every df1 row is kept.
df3 <- merge(df1, df2, by = "A", all.x = TRUE)
However, there is a condition for merging. Namely, I'd like to merge rows from df2 into df1 only when all substrings (column E) from df2 are present in the corresponding df1 row, so the output should look like this:
df3
A B C D E A.y B.y E.y
1 ab 5 4 3 a, b, c NA NA NA
2 ab 9 4 0 a, d, e ab 4 a, d
3 bc 4 4 9 a, g, h NA NA NA
4 bc 5 5 6 d, e, f bc 7 d, f
5 bc 1 6 6 a, d, f bc 7 d, f
6 cd 1 2 0 f, j NA NA NA
I know there's an option using %in% regarding vector comparison. However I have strings, should I first do some strsplit and unlist and then perform the comparison?

This is pretty messy but should do what you're looking for:
First, expand rows for both E values, then group by the key column to check if any values from RHS E are in LHS E. Then filter based on the lookup table.
library(tidyverse)

# Left-join df2 onto df1 by the key column, then blank out the df2 columns on
# every merged row whose df1 tokens do not contain ALL of the df2 tokens.
# The check is made per merged row with all(); testing any() per key `A` and
# filtering would keep non-matching rows and drop unmatched keys entirely.
df3 <- merge(x = df1, y = df2, by = "A", all.x = TRUE)

# Tag each merged row so the token check is evaluated per row, not per key.
df3$row_id <- seq_len(nrow(df3))

# Expand both token strings to one token per row, then keep the ids of the
# rows where every token of E.y occurs among the tokens of E.x. Rows without
# a df2 partner (E.y is NA) never match.
check_rows <- df3 %>%
  separate_rows(E.y, sep = ",") %>%
  separate_rows(E.x, sep = ",") %>%
  mutate(E.x = trimws(E.x),
         E.y = trimws(E.y)) %>%
  group_by(row_id) %>%
  summarise(check = all(!is.na(E.y) & E.y %in% E.x)) %>%
  filter(check)

# Rows that fail the check keep their df1 values but get NA on the df2 side.
df3 <- df3 %>%
  mutate(matched = row_id %in% check_rows$row_id,
         B.y = replace(B.y, !matched, NA),
         E.y = replace(E.y, !matched, NA)) %>%
  select(-row_id, -matched)

Related

Extract rows where value appears in any of multiple columns

Let's say I have two data.frames
# One row per player. `header = TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
name_df <- read.table(text = "player_name
a
b
c
d
e
f
g", header = TRUE)
# One row per game with its winner and loser. `header = TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
game_df <- read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = TRUE)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
# Expected result for players a and b: one row per (player, game) in which the
# player is either winner or loser. `header = TRUE` replaces the reassignable
# shorthand `T`.
final_df <- read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = TRUE)
We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
# Loop over the (self-named) player names; for each one keep the games where
# that player is the winner or the loser, and record the name in `player_name`.
map_dfr(
  setNames(name_df$player_name, name_df$player_name),
  function(player) {
    filter(game_df, winner_name %in% player | loser_name %in% player)
  },
  .id = "player_name"
)
Or if there are many columns, use if_any
# Same idea as the two-column filter, but if_any() scales to many columns:
# a game is kept when any of the listed columns contains the current player.
map_dfr(
  setNames(name_df$player_name, name_df$player_name),
  function(nm1) {
    filter(game_df, if_any(c(winner_name, loser_name), ~ . %in% nm1))
  },
  .id = "player_name"
)
Dedicated to our teacher and mentor, dear @akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
# Build one row per (player, game): each game row is first tagged with its
# winner as `player_name`, then a copy tagged with the loser is appended.
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
# the existing row represents the winner's view of the game
mutate(player_name = winner_name) %>%
# NOTE(review): group_split() is called on a rowwise data frame here; confirm
# how the installed dplyr version treats the `game_id` argument in that case.
group_split(game_id) %>%
# append a second row with the same game data but the loser as player_name
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
# sort by player and move the player column to the front
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
This can be directly expressed in SQL:
# Left-join every player to the games in which they appear; the OR in the join
# condition lets the single key match either the winner or the loser column.
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")
Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
# Glue winner and loser into one "winner, loser" string (keeping the original
# columns), then split that string back into one row per player.
game_df %>%
  unite(Player_name, winner_name, loser_name, remove = FALSE, sep = ", ") %>%
  # split on the exact separator used by unite(); the default pattern would
  # also break names that contain spaces or punctuation
  separate_rows(Player_name, sep = ", ") %>%
  relocate(Player_name) %>%
  arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
A Base R approach :
# For each player, take the games where they are winner or loser and prepend
# the player's name as a `playername` column; then stack the per-player pieces
# and reset the row names.
result <- do.call(rbind, lapply(name_df$player_name, function(x) {
  cbind(playername = x,
        subset(game_df, winner_name == x | loser_name == x))
}))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...

tidyverse: all permutations of categories

Here's a problem: I have all possible combinations of M elements from a set of N elements (N choose M). Each combination has a value assigned.
An example for N = 5 and M = 3:
library(tidyverse)
# All 3-element combinations of the first five letters, one combination per
# row, with columns named id1..id3 and a random value attached to each row.
df <- letters[1:5] %>%
  combn(m = 3) %>%
  t() %>%
  # seq_along() stays correct even if the name vector were empty
  as_tibble(.name_repair = function(x) paste0("id", seq_along(x)))
df$val <- runif(nrow(df))
Which gives a set of 10 combinations:
# A tibble: 10 x 4
id1 id2 id3 val
<chr> <chr> <chr> <dbl>
1 a b c 0.713
2 a b d 0.314
3 a b e 0.831
4 a c d 0.555
5 a c e 0.915
6 a d e 0.954
7 b c d 0.131
8 b c e 0.0583
9 b d e 0.533
10 c d e 0.857
Now I would like to add the combinations such that the results represents selection of M elements without replacement (N!/(N-M)!), but conserving the values for each set of M elements.
So, staying with the example, the result should contain 5·4·3 = 60 rows. For the example, I can do it with a "manual" permutation of the columns:
# Add the missing orderings: start from the original combinations (column
# order 1, 2, 3) and append one copy of df per remaining permutation of the
# three id columns. Each copy swaps columns through a temporary `tmp` column,
# which is dropped again, so `val` stays attached to the same set of letters.
df_perm <- df %>% bind_rows(
# column order 1, 3, 2: swap id2 and id3
df %>% mutate( tmp = id2, id2 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# column order 2, 1, 3: swap id1 and id2
df %>% mutate( tmp = id1, id1 = id2, id2 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# column order 2, 3, 1: rotate the three columns to the left
df %>% mutate( tmp = id1, id1 = id2, id2 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# column order 3, 1, 2: rotate the three columns to the right
df %>% mutate( tmp = id2, id2 = id1, id1 = id3, id3 = tmp ) %>%
select( -tmp )
) %>% bind_rows(
# column order 3, 2, 1: swap id1 and id3
df %>% mutate( tmp = id3, id3 = id1, id1 = tmp ) %>%
select( -tmp )
)
However, this becomes unfeasible quickly for M>3.
What would be a more elegant way to achieve the same result?
As I read your question, it essentially seems that you have assigned a value to each possible combination of size M from a set of size N. You would then like to map the value for each combination to its permutations.
For example, if the combination (a, b, d) has a value of 0.4, then you would like (a, b, d), (a, d, b), (b, a, d), (b, d, a), (d, b, a) and (d, a, b) to all have a value of 0.4.
First, get all possible permutations of the vector 1:M, where M is the number of elements per combination as defined above:
# Number of id columns per combination, and every ordering of 1..M
# (one permutation per row of the matrix).
M <- 3
perm_mat <- gtools::permutations(n = M, r = M)
Then permute the columns of the df as per the above permutations:
# For every permutation of the M id columns, reorder those columns (keeping
# the value column, at position M + 1, last), restore the original column
# names, and stack the results row-wise.
# seq_len() avoids the 1:0 trap if perm_mat ever has no rows.
perm_df <- purrr::map_df(seq_len(nrow(perm_mat)), function(i) {
  df_curr <- df[, c(perm_mat[i, ], M + 1)]
  colnames(df_curr) <- colnames(df)
  df_curr
})
This produces the following output (first twenty rows):
V1 V2 V3 val
<chr> <chr> <chr> <dbl>
1 a b c 0.0682
2 a b d 0.735
3 a b e 0.0336
4 a c d 0.965
5 a c e 0.889
6 a d e 0.796
7 b c d 0.792
8 b c e 0.508
9 b d e 0.606
10 c d e 0.623
11 a c b 0.0682
12 a d b 0.735
13 a e b 0.0336
14 a d c 0.965
15 a e c 0.889
16 a e d 0.796
17 b d c 0.792
18 b e c 0.508
19 b e d 0.606
20 c e d 0.623
Note that the numbers in the values column are different to the original post simply because I used a different seed before running runif.

Group values in rows according into similar columns

I had a column with multiple values inside it..
Like...
ColumnX1
A,D,C,B,F,E,G
F,A,B,E,G,C
C,D,G,F,A,T
I split the data with
# Split the comma-separated species string into a fixed 21 columns
# (missing positions are filled with empty strings by str_split_fixed()).
Species_Data2 <- data.frame(
  str_split_fixed(
    string = Species_Data$Other.Anopheline.species,
    pattern = ",",
    n = 21
  )
)
But I got the values as below:
I have dataframe like:-
X1 X2 X3 X4 X5 X6 X7
A D C B F E G
F A B E G NA C
C D G F A T NA
I wanted to make a dataframe like:
X1 X2 X3 X4 X5 X6 X7 X8
A B C D E F G NA
A B C NA E F G NA
A NA C D NA F G T
and then....
I want to make the columns names as row values:-
Colnames
'A' 'B' 'C' 'D' 'E' 'F' 'G' 'T'
A B C D E F G NA
A B C NA E F G NA
A NA C D NA F G T
Tried to create sorting...but does not work that great... :(..
Comes up with O values though....
If I understand correctly, the OP wants to rearrange the data so that there is a separate column for each letter. If a letter is present in a row, then the letter appears in the appropriate column/row of the reshaped data. NA indicates that a letter is missing in a row. In addition, the letter columns should be arranged in alphabetical order.
1. dplyr/tidyr approach
If we start with the data.frame resulting from OP's call to stringr::str_split_fixed(), we need to reshape the split data from wide to long format, remove empty entries, order rows so that columns appear in letter order, and reshape to wide format again. For reshaping, a row id is required. To achieve the desired output, pivot_wider() has to be called with the names_from = value parameter:
library(dplyr)
library(tidyr)
# Split into fixed columns, go long (one letter per row), drop the padding,
# sort so the new columns come out alphabetically, then go wide again with one
# column per letter.
as.data.frame(stringr::str_split_fixed(DF$ColumnX1, ",", 21)) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(-rn) %>%
  filter(value != "") %>%
  arrange(as.character(value)) %>%
  # id_cols is named explicitly: passing it by position is deprecated in tidyr
  pivot_wider(id_cols = rn, names_from = value)
rn A B C D E F G T
<int> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 1 A B C D E F G NA
2 2 A B C NA E F G NA
3 3 A NA C D NA F G T
2. data.table approach
If we start from the unsplit original data, there is a much more concise variant which uses data.table's dcast() for reshaping:
library(data.table)
# Split each row's string into one letter per row (grouped by row index), then
# reshape wide so that every letter gets its own column. The grouping column
# appears as `nrow` in the printed result, which is what the dcast() formula
# refers to.
setDT(DF)[, stringr::str_split(ColumnX1, ","), by = 1:nrow(DF)][, dcast(.SD, nrow ~ V1)]
nrow A B C D E F G T
1: 1 A B C D E F G <NA>
2: 2 A B C <NA> E F G <NA>
3: 3 A <NA> C D <NA> F G T
If required, the additional row id column can be removed in both approaches.
Data
# Sample input: one comma-separated string of letters per row.
DF <- data.frame(
  ColumnX1 = c(
    "A,D,C,B,F,E,G",
    "F,A,B,E,G,C",
    "C,D,G,F,A,T"
  )
)
EDIT: Duplicate values
In a comment, the OP has disclosed that the production dataset contains duplicate values.
In case of duplicate values, dcast() uses the length() function by default to aggregate the data.
With a modified dataset DF2 which contains duplicate values in rows 1 and 2, the original data.table approach returns:
library(data.table)
# Group by DF2's own row index. Using nrow(DF) here only worked because DF and
# DF2 happen to have the same number of rows. With duplicates present, dcast()
# falls back to counting occurrences.
setDT(DF2)[, stringr::str_split(ColumnX1, ","), by = 1:nrow(DF2)][, dcast(.SD, nrow ~ V1)]
nrow A B C D E F G T
1: 1 1 1 2 1 1 1 1 0
2: 2 1 1 1 0 1 2 1 0
3: 3 1 0 1 1 0 1 1 1
Here, the number of duplicate letters is shown.
The expected behaviour can be restored by removing the duplicate values before reshaping by using unique():
# Group by DF2's own row index (not nrow(DF)), and drop repeated
# (row, letter) pairs with unique() so dcast() has nothing to aggregate.
setDT(DF2)[, stringr::str_split(ColumnX1, ","), by = 1:nrow(DF2)][
  , dcast(unique(.SD), nrow ~ V1)]
nrow A B C D E F G T
1: 1 A B C D E F G <NA>
2: 2 A B C <NA> E F G <NA>
3: 3 A <NA> C D <NA> F G T
Also the dplyr/tidyr approach needs to be modified by specifying an appropriate aggregation function in the call to pivot_wider():
library(dplyr)
library(tidyr)
# Same wide -> long -> wide reshape as for DF, but duplicated letters within a
# row are collapsed with unique() while pivoting.
as.data.frame(stringr::str_split_fixed(DF2$ColumnX1, ",", 21)) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(-rn) %>%
  filter(value != "") %>%
  arrange(as.character(value)) %>%
  # id_cols is named explicitly: passing it by position is deprecated in tidyr
  pivot_wider(id_cols = rn, names_from = value,
              values_fn = list(value = unique))
Data with duplicate values
# Sample input with duplicates: "C" repeats in row 1 and "F" in row 2.
DF2 <- data.frame(
  ColumnX1 = c(
    "A,D,C,B,F,E,G,C",
    "F,A,B,E,G,C,F",
    "C,D,G,F,A,T"
  )
)

r create new data frame that matches in rows elements grouped by another column

I want to create a new data frame from the df one below. In the new data frame (df2), each element in df$name is placed in the first column and matched in its row with other element of df$name grouped by df$group.
# Six names split evenly over two groups: A-C in group "a", D-F in group "b".
df <- data.frame(
  group = rep(c("a", "b"), each = 3),
  name = LETTERS[1:6]
)
> df
group name
1 a A
2 a B
3 a C
4 b D
5 b E
6 b F
In this example, "A", "B", and "C" in df$name belong to "a" in df$group, and I want to put them in the same row in a new data frame. The desired output looks like this:
> df2
V1 V2
1 A B
2 A C
3 B A
4 B C
5 C A
6 C B
7 D E
8 D F
9 E D
10 E F
11 F D
12 F E
We could do this in base R with merge
# Self-merge on `group` to pair every name with every name of the same group,
# drop the self-pairs and the key column, then tidy the names and row labels.
out <- subset(merge(df, df, by = "group"),
              name.x != name.y,
              select = -group)
names(out) <- c("V1", "V2")
row.names(out) <- NULL
out
# V1 V2
#1 A B
#2 A C
#3 B A
#4 B C
#5 C A
#6 C B
#7 D E
#8 D F
#9 E D
#10 E F
#11 F D
#12 F E
In my opinion, this is a case for a self-join. Using dplyr, a solution can be:
library(dplyr)
# Join df to itself within each group, discard pairs of a name with itself,
# and keep only the two name columns under their new labels.
df %>%
  inner_join(df, by = "group") %>%
  filter(name.x != name.y) %>%
  select(V1 = name.x, V2 = name.y)
# V1 V2
# 1 A B
# 2 A C
# 3 B A
# 4 B C
# 5 C A
# 6 C B
# 7 D E
# 8 D F
# 9 E D
# 10 E F
# 11 F D
# 12 F E
df <- data.frame(
  group = rep(letters[1:2], each = 3),
  name = LETTERS[1:6]
)
library(tidyverse)
df %>%
  group_by(group) %>%
  # per group: a nested table holding every (V1, V2) combination of its names
  summarise(v = list(expand.grid(V1 = name, V2 = name))) %>%
  # only the nested combinations are needed from here on
  select(v) %>%
  # flatten the nested tables into ordinary rows
  unnest(v) %>%
  # remove pairs of a name with itself
  filter(V1 != V2)
# # A tibble: 12 x 2
# V1 V2
# <fct> <fct>
# 1 B A
# 2 C A
# 3 A B
# 4 C B
# 5 A C
# 6 B C
# 7 E D
# 8 F D
# 9 D E
# 10 F E
# 11 D F
# 12 E F

how to subset in r for this particular condition?

df1 and df2 have columns a,b. I want to subset data from df1 such that each entry in df1$a along with df1$b is in df2$a along with df2$b.
df1
a b c
1 m df1
2 f df1
3 f df1
4 m df1
5 f df1
6 m df1
df2
a b c
1 m df2
3 f df2
4 f df2
5 m df2
6 f df2
7 m df2
desired output
df
a b c
1 m df1
3 f df1
i am using :
# Each %in% test looks at one column on its own, so a row passes whenever its
# `a` occurs anywhere in df2$a and its `b` occurs anywhere in df2$b; the
# (a, b) pair as a whole is never compared.
df <- subset(df1,(df1$a%in%df2$a & df1$b%in%df2$b))
but this is giving results similar to
df <-subset(df1,df1$a%in%df2$a)
You can use package dplyr:
library(dplyr)
# Rows that occur, with identical values in every column, in both tables.
df1 %>%
  intersect(df2)
# a b
#1 1 m
#2 3 f
Edit for the new data.frames with c column:
you can use function semi_join (also from dplyr):
# Keep the df1 rows whose (a, b) pair has a partner in df2; no df2 columns
# are added to the result.
df1 %>%
  semi_join(df2, by = c("a", "b"))
# a b c
#1 1 m df1
#2 3 f df1
Other option, in base R:
you can paste your a and b variables to subset your data.frame:
# Build a combined "a b" key for each table and keep the df1 rows whose key
# also occurs in df2.
df1[with(df1, paste(a, b)) %in% with(df2, paste(a, b)), ]
# a b
#1 1 m
#3 3 f
and with the new data.frames:
# a b c
# 1 1 m df1
# 3 3 f df1
Or you could do
# Stack both tables; any row flagged by duplicated() is a repeat of a row
# that appeared earlier in the stack.
Res <- rbind(df1, df2)
Res[which(duplicated(Res)), ]
# a b
# 7 1 m
# 8 3 f
Edit1: Per the edit, here's a similar data.table solution
library(data.table)
# Stack df1 on top of df2 so that df1 rows come first.
Res <- rbind(df1, df2)
# Compare on (a, b) only. fromLast = TRUE flags the earlier occurrence of each
# repeated pair, which here is the df1 row (see the `c` column in the output).
setDT(Res)[duplicated(Res, by = c("a", "b"), fromLast = TRUE)]
# a b c
# 1: 1 m df1
# 2: 3 f df1
Edit2: I see that @CathG opened a join battlefront, so here's how we do it with data.table
# Convert both tables to data.tables (by reference) and key them on (a, b).
setkey(setDT(df1), a, b) ; setkey(setDT(df2), a, b)
# Keyed join of df2 into df1; nomatch = 0 drops the df2 rows that have no
# partner in df1, and df2's `c` column comes back as `i.c`.
df1[df2, nomatch = 0]
# a b c i.c
# 1: 1 m df1 df2
# 2: 3 f df1 df2

Resources