Correspondance between values in two df R - r

I have two df to confrontate. my first df is "sum"
> head(sum)
File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C
6: 7LD1_CM ARG 408 C LEU 131 M
The second one is "mut"
> head(mut)
RefAA MutAA LineagesCount
1 VAL 3 GLY 3 1
2 LEU 5 PHE 5 2
3 LEU 8 VAL 8 1
4 SER 13 ILE 13 2
5 LEU 18 PHE 18 5
6 THR 20 ILE 20 1
I have to check if in sum$res1 and sum$res2 there are values equal to mut$refAA. If it's so, I need to add the whole row of mut$refAA near to sum$res1 or sum$res2.
here an example:
File_pdb Res1 Chain1 Res2 Chain2 RefAA MutAA LineagesCount
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C THR 20 ILE 20 1
6: 7LD1_CM ARG 408 C LEU 131 M
How I can do this? I was trying something using merge and join functions but I'm not so experienced so I need to practice more. Can someone help me? Thank you!

I had to fix the data a bit, to easily import the data. Then you can try a tidyverse
library(tidyverse)
SUM %>%
mutate(index = 1:n()) %>%
pivot_longer(c(Res1, Res2)) %>%
left_join(mutate(MUT, value=RefAA), by = "value") %>%
group_by(index) %>%
fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value, values_fn = toString) %>%
mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
# A tibble: 6 x 10
File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1 Res2 which_Res
<chr> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 7LD1_CM M C 1 NA NA NA GLN81 ASN501 NA
2 7LD1_CM M C 2 NA NA NA TYR128 PHE377 NA
3 7LD1_CM M C 3 NA NA NA ILE78 SER375 NA
4 7LD1_CM M C 4 NA NA NA ASN76 ALA372 NA
5 7LD1_CM M C 5 THR20 ILE20 1 THR20 TYR369 Res1
6 7LD1_CM C M 6 NA NA NA ARG408 LEU131 NA
The data
SUM <- read.table(text = " File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN81 M ASN501 C
2: 7LD1_CM TYR128 M PHE377 C
3: 7LD1_CM ILE78 M SER375 C
4: 7LD1_CM ASN76 M ALA372 C
5: 7LD1_CM THR20 M TYR369 C
6: 7LD1_CM ARG408 C LEU131 M")
SUM
MUT <- read.table(text = " RefAA MutAA LineagesCount
1 VAL3 GLY3 1
2 LEU5 PHE5 2
3 LEU8 VAL8 1
4 SER13 ILE13 2
5 LEU18 PHE18 5
6 THR20 ILE20 1")

Hope this would help
do.call(
dplyr::coalesce,
lapply(
c("Res1", "Res2"),
function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
)
)
which gives
Res1 File_pdb Chain1 Res2 Chain2 MutAA LineagesCount
1 ARG408 7LD1_CM C LEU131 M <NA> NA
2 ASN76 7LD1_CM M ALA372 C <NA> NA
3 GLN81 7LD1_CM M ASN501 C <NA> NA
4 ILE78 7LD1_CM M SER375 C <NA> NA
5 THR20 7LD1_CM M TYR369 C ILE20 1
6 TYR128 7LD1_CM M PHE377 C <NA> NA
Data
> dput(SUM)
structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
"7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
"ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
"M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
"TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:"))
> dput(MUT)
structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
"THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18",
"ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Related

data.table roll "nearest" left join for single best match (rest to NA)

I have two data.tables with different number of rows. I would like to left join by matching on a single column so that first dt dt1 keeps all rows. Only best nearest values from second dt2 should be joined.
Minimal data:
library(data.table)
set.seed(42)
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
What I tried:
dt2[dt1, roll = "nearest", on = "timestamp"]
#> timestamp timestamp2 other3 other4 id timestamp1 other1 other2
#> 1: 0.4353018 1.005341 x 333 a 0.4353018 1 11
#> 2: 0.8938755 1.005341 x 333 b 0.8938755 2 12
#> 3: 0.9053410 1.005341 x 333 c 0.9053410 3 13
#> 4: 0.9372859 1.005341 x 333 d 0.9372859 4 14
#> 5: 1.3631284 1.463128 y 444 e 1.3631284 5 15
#> 6: 1.4042683 1.463128 y 444 f 1.4042683 6 16
#> 7: 1.6328626 1.463128 y 444 g 1.6328626 7 17
#> 8: 2.3709584 2.470958 z 555 h 2.3709584 8 18
#> 9: 2.5115220 2.470958 z 555 i 2.5115220 9 19
#> 10: 3.0184237 2.470958 z 555 j 3.0184237 10 20
I am failing to understand how roll="nearest" works. I see that it indeed matches the nearest values, but it does it with all of them. I would like to merge only those 3 rows from dt2 that have the absolute nearest values.
Using joins based on tolerance (max_dist) would also give more than three matches, but in this case I found the value of max_dist that gives the best nearest for this tiny example data.
Desired output:
library(fuzzyjoin)
fuzzyjoin::difference_left_join(as.data.frame(dt1), as.data.frame(dt2), by = "timestamp", max_dist = 0.09)
#> id timestamp.x timestamp1 other1 other2 timestamp.y timestamp2 other3 other4
#> 1 a 0.4353018 0.4353018 1 11 NA NA <NA> NA
#> 2 b 0.8938755 0.8938755 2 12 NA NA <NA> NA
#> 3 c 0.9053410 0.9053410 3 13 NA NA <NA> NA
#> 4 d 0.9372859 0.9372859 4 14 1.005341 1.005341 x 333
#> 5 e 1.3631284 1.3631284 5 15 NA NA <NA> NA
#> 6 f 1.4042683 1.4042683 6 16 1.463128 1.463128 y 444
#> 7 g 1.6328626 1.6328626 7 17 NA NA <NA> NA
#> 8 h 2.3709584 2.3709584 8 18 NA NA <NA> NA
#> 9 i 2.5115220 2.5115220 9 19 2.470958 2.470958 z 555
#> 10 j 3.0184237 3.0184237 10 20 NA NA <NA> NA
Created on 2022-08-25 with reprex v2.0.2
You can try a proper left update join and assign the desired variables from dt2 explicitely
library(data.table)
set.seed(42)
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
# left join: leading table on the left
dt1[dt2,
roll = "nearest",
on = "timestamp",
# assign desired values explicitely
`:=`(other3 = i.other3,
other4 = i.other4)]
dt1[]
#> id timestamp timestamp1 other1 other2 other3 other4
#> 1: a 0.4353018 0.4353018 1 11 <NA> NA
#> 2: b 0.8938755 0.8938755 2 12 <NA> NA
#> 3: c 0.9053410 0.9053410 3 13 <NA> NA
#> 4: d 0.9372859 0.9372859 4 14 x 333
#> 5: e 1.3631284 1.3631284 5 15 <NA> NA
#> 6: f 1.4042683 1.4042683 6 16 y 444
#> 7: g 1.6328626 1.6328626 7 17 <NA> NA
#> 8: h 2.3709584 2.3709584 8 18 <NA> NA
#> 9: i 2.5115220 2.5115220 9 19 z 555
#> 10: j 3.0184237 3.0184237 10 20 <NA> NA

Anti_join between df1 and df2 but how to change all mismatch in df2 to NA

Below are my two dataframes, df1 and df2
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))
id text response
1 632592651 asdf y
2 633322173 cat y
3 634703802 dog y
4 634927873 mouse n
5 635812953 elephant n
6 636004739 goose y
7 636101211 rat y
8 636157799 mice n
9 636263106 kitty n
10 636752420 kitten y
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))
id text volume
1 632592651 asdf_xyz 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 636809222 tiger_xyz 6
12 2004722036 lion 345
13 2004894388 leopard 23
14 2005045755 ostrich 2
15 2005535472 kangaroo 4778
16 2005630542 platypus 234
17 2005788781 fish 8675
18 2005809679 reptile 3459
19 2005838317 mammals 8
20 2005866692 amphibians_xyz 9
How do I change the non-matching items from row id1:20 of df2 to NA (i.e. all of them as no matching with df1) and the column 'text' (i.e. asdf_xyz) of id1 to NA?
I have tried
library(dplyr)
df3 <- df2 %>%
anti_join(df1, by=c("id"))
id text volume
1 636809222 tiger_xyz 6
2 2004722036 lion 345
3 2004894388 leopard 23
4 2005045755 ostrich 2
5 2005535472 kangaroo 4778
6 2005630542 platypus 234
7 2005788781 fish 8675
8 2005809679 reptile 3459
9 2005838317 mammals 8
10 2005866692 amphibians_xyz 9
df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA
(Doing this one by one because I couldn't find solution how to change the entire value of the dataframe to NA)
id text volume
1 <NA> <NA> <NA>
2 <NA> <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 <NA> <NA> <NA>
6 <NA> <NA> <NA>
7 <NA> <NA> <NA>
8 <NA> <NA> <NA>
9 <NA> <NA> <NA>
10 <NA> <NA> <NA>
and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)
inner_join(x = df1,
y = df2,
by = "id") %>%
mutate_if(is.factor, as.character) %>%
mutate(text = ifelse(test = text.x != text.y,
yes = NA,
no = text.x)) %>%
select(id, text, response, volume)
id text response volume
1 632592651 <NA> y 1234
2 633322173 cat y 432
3 634703802 dog y 324
4 634927873 mouse n 333
5 635812953 elephant n 2223
6 636004739 goose y 412346
7 636101211 rat y 7456
8 636157799 mice n 3456
9 636263106 kitty n 2345
10 636752420 kitten y 2345
but not sure how to replace df2 with df3 and df4. The desired output is shown below:
id text volume
1 632592651 NA 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 NA NA NA
12 NA NA NA
13 NA NA NA
14 NA NA NA
15 NA NA NA
16 NA NA NA
17 NA NA NA
18 NA NA NA
19 NA NA NA
20 NA NA NA
Can someone help please?
If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?
Part 2:
For the second part of my request, I would like to create another dataframes from joined_df which appears only in df1 (call it found_in_df1). Example of output:
found_in_df1:
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?
One potential solution for dealing with conflicts is to use the powerjoin package, e.g.
library(dplyr)
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
response=c("y","y","y","n","n","y","y","n","n","y"))
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))
expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
text = c(NA, "cat", "dog", "mouse", "elephant", "goose",
"rat", "mice", "kitty", "kitten",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
volume = c(1234, 432, 324, 333, 2223, 412346, 7456,
3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
library(powerjoin)
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x))
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
final_df
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
#> 11 <NA> <NA> NA
#> 12 <NA> <NA> NA
#> 13 <NA> <NA> NA
#> 14 <NA> <NA> NA
#> 15 <NA> <NA> NA
#> 16 <NA> <NA> NA
#> 17 <NA> <NA> NA
#> 18 <NA> <NA> NA
#> 19 <NA> <NA> NA
#> 20 <NA> <NA> NA
all_equal(final_df, expected_outcome)
#> [1] TRUE
# Part 2
found_in_df1 <- power_left_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x)) %>%
select(id, text, volume)
found_in_df1
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
Created on 2022-07-02 by the reprex package (v2.0.1)
Edit
Per the comment below from the creator of the powerjoin package (Mr. Mudskipper): these operations are vectorised, so you don't need to perform the command 'rowwise', i.e. you can remove "rw" to simplify and gain performance. There is no practical difference between including and excluding "rw" with df1 and df2, but if we use larger dataframes you can see a clear increase in performance, e.g.
library(dplyr)
library(powerjoin)
# define functions
power_full_join_func_rowwise <- function(df1, df2) {
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x))
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
return(final_df)
}
power_full_join_func_not_rowwise <- function(df1, df2) {
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = ~ifelse(.x != .y,
NA_integer_,
.x))
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
return(final_df)
}
library(microbenchmark)
library(purrr)
library(ggplot2)
# make larger dfs (copy df1 and df2 X100)
df3 <- map_dfr(seq_len(100), ~ df1)
df4 <- map_dfr(seq_len(100), ~ df2)
# benchmark performance on the larger dataframes
res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
res
#> Unit: milliseconds
#> expr min lq mean
#> power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
#> power_full_join_func_not_rowwise(df3, df4) 71.85757 77.25344 90.36191
#> median uq max neval cld
#> 446.41715 472.47817 587.3301 100 b
#> 81.18239 93.95103 191.1248 100 a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Is the result the same?
all_equal(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
#> [1] TRUE
Created on 2022-11-24 by the reprex package (v2.0.1)
data.table version using an !antijoin, and overwriting := all columns/rows returned in df2 with an NA (recycled list .(NA) to all columns).
Then looping over all the common variables and overwriting any values which don't match by id:
library(data.table)
setDT(df1)
setDT(df2)
df2[!df1, on=.(id), names(df2) := .(NA)]
idvars <- "id"
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
for (i in compvars) {
df2[!df1, on=c(idvars,i), (i) := NA]
}
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
#11: <NA> <NA> <NA>
#12: <NA> <NA> <NA>
#13: <NA> <NA> <NA>
#14: <NA> <NA> <NA>
#15: <NA> <NA> <NA>
#16: <NA> <NA> <NA>
#17: <NA> <NA> <NA>
#18: <NA> <NA> <NA>
#19: <NA> <NA> <NA>
#20: <NA> <NA> <NA>

How to combine two data.tables based on multiple criteria in R?

I have two data.tables, which I want to combine based on if a date in one table is in the given time range in the other table. In dt1 I have exit dates and I want to check in dt2 which values were valid at the exit date for each ID.
dt1 <- data.table (ID = 1:10,
exit = c("31/12/2010", "01/01/2021", "30/09/2010", "31/12/2015", "30/09/2010","31/10/2018", "01/02/2016", "01/05/2015", "01/09/2013", "01/01/2016"))
dt2 <- data.table (ID = c(1,2,2,2,3,5,6,6,7,8,8,9,10),
valid_from = c("01/01/2010", "01/01/2012", "01/01/2013", "01/12/2017", "01/05/2010", "01/04/2010", "01/05/2014", "01/11/2016", "01/01/2016", "15/04/2013", "01/01/2015", "15/02/2010", "01/04/2012"),
valid_until = c("01/01/2021", "31/12/2012", "30/11/2017", "01/01/2021", "01/01/2021", "01/01/2021", "31/10/2016", "01/01/2021", "01/01/2021", "31/12/2014", "01/05/2015", "01/01/2013", "01/01/2021"),
text1 = c("a", "a", "b", "c", "b", "b", "c", "a", "a", "b", "a", "c", "a"),
text2 = c("I", "I", "II", "I", "III", "I", "II", "III", "I", "II", "II", "I", "III" ))
ID exit
1: 1 31/12/2010
2: 2 01/01/2021
3: 3 30/09/2010
4: 4 31/12/2015
5: 5 30/09/2010
6: 6 31/10/2018
7: 7 01/02/2016
8: 8 01/05/2015
9: 9 01/09/2013
10: 10 01/01/2016
ID valid_from valid_until text1 text2
1: 1 01/01/2010 01/01/2021 a I
2: 2 01/01/2012 31/12/2012 a I
3: 2 01/01/2013 30/11/2017 b II
4: 2 01/12/2017 01/01/2021 c I
5: 3 01/05/2010 01/01/2021 b III
6: 5 01/04/2010 01/01/2021 b I
7: 6 01/05/2014 31/10/2016 c II
8: 6 01/11/2016 01/01/2021 a III
9: 7 01/01/2016 01/01/2021 a I
10: 8 15/04/2013 31/12/2014 b II
11: 8 01/01/2015 01/05/2015 a II
12: 9 15/02/2010 01/01/2013 c I
13: 10 01/04/2012 01/01/2021 a III
As a result I would like to return in dt1 the valid values to the exit dates.
If an ID is not found in dt2 (would be the case for ID 4 in the sample data), it should return NA.
ID exit text1 text2
1: 1 31/12/2010 a I
2: 2 01/01/2021 c I
3: 3 30/09/2010 b III
4: 4 31/12/2015 <NA> <NA>
5: 5 30/09/2010 b I
6: 6 31/10/2018 a III
7: 7 01/02/2016 a I
8: 8 01/05/2015 a II
9: 9 01/09/2013 c I
10: 10 01/01/2016 a III
Could anyone help me solve this?
As the input is a data.table, consider using data.table methods which are fast
library(data.table)
# // convert the date columns to `Date` class
dt1[, exit := as.IDate(exit, '%d/%m/%Y')]
dt2[, c('valid_from', 'valid_until') := .(as.IDate(valid_from, '%d/%m/%Y'),
as.IDate(valid_until, '%d/%m/%Y'))]
# // do a non-equi join
dt1[dt2, c('text1', 'text2') := .(i.text1, i.text2),
on = .(ID, exit >= valid_from, exit <= valid_until)]
-output
> dt1
ID exit text1 text2
1: 1 2010-12-31 a I
2: 2 2021-01-01 c I
3: 3 2010-09-30 b III
4: 4 2015-12-31 <NA> <NA>
5: 5 2010-09-30 b I
6: 6 2018-10-31 a III
7: 7 2016-02-01 a I
8: 8 2015-05-01 a II
9: 9 2013-09-01 <NA> <NA>
10: 10 2016-01-01 a III
Here is a dplyr solution, that was created with the help of #akrun: see here dates: Not yet implemented NAbounds=TRUE for this non-numeric and non-character type
library(dplyr)
libray(lubridate)
df1 <- left_join(dt1, dt2, by="ID") %>%
mutate(across(c(exit, valid_from, valid_until), dmy)) %>%
rowwise() %>%
mutate(match= +(dplyr::between(exit, valid_from, valid_until))) %>%
group_by(ID) %>%
filter(match==max(match) | is.na(match)) %>%
select(ID, exit, text1, text2) %>%
ungroup()
output:
ID exit text1 text2
<dbl> <date> <chr> <chr>
1 1 2010-12-31 a I
2 2 2021-01-01 c I
3 3 2010-09-30 b III
4 4 2015-12-31 NA NA
5 5 2010-09-30 b I
6 6 2018-10-31 a III
7 7 2016-02-01 a I
8 8 2015-05-01 a II
9 9 2013-09-01 c I
10 10 2016-01-01 a III
You may use fuzzyjoin after changing the dates to Date class.
library(fuzzyjoin)
library(dplyr)
dt1 %>%
mutate(exit = as.Date(exit, '%d/%m/%Y')) %>%
fuzzy_left_join(dt2 %>%
mutate(across(starts_with('valid'), as.Date, '%d/%m/%Y')),
by = c('ID', 'exit' = 'valid_from', 'exit' = 'valid_until'),
match_fun = c(`==`, `>=`, `<=`)) %>%
select(ID = ID.x, exit, text1, text2)
# ID exit text1 text2
#1 1 2010-12-31 a I
#2 2 2021-01-01 c I
#3 3 2010-09-30 b III
#4 4 2015-12-31 <NA> <NA>
#5 5 2010-09-30 b I
#6 6 2018-10-31 a III
#7 7 2016-02-01 a I
#8 8 2015-05-01 a II
#9 9 2013-09-01 <NA> <NA>
#10 10 2016-01-01 a III

join information of two dataframes R

I'd like to join two dataframes with R
here the first one
resno resid elety eleno
1 ILE C 3
1 ILE O 4
2 VAL C 11
2 VAL O 12
3 GLY C 18
3 GLY O 19
the second one
C.O dist
12-18 3.112819
27-37 2.982788
51-63 3.185184
52-62 2.771583
63-69 3.157737
70-80 2.956738
so let's explain what i need. Looking at the second dataframe, i have distance ("dist") between points 12-18, corresponding to "eleno" in the first dataframe. for this 2 points I have also "resno" that is what i'm interesting in, because i'd like to obtain something like this
resno resid elety eleno rescoup dist
1 ILE C 3 - -
1 ILE O 4 - -
2 VAL C 11 - -
2 VAL O 12 2-3 3.112819
3 GLY C 18 2-3 3.112819
3 GLY O 19 - -
how can I do? is this possible with R?
thanks!
You could first create a long dataframe from df2 where every number from C.O is a unique row.
library(dplyr)
library(tidyr)
df3 <- df2 %>%
separate(C.O, c('col1', 'col2'), sep = '-', convert = TRUE) %>%
mutate(eleno = purrr::map2(col1, col2, seq), .before = 1,
row = row_number()) %>%
select(-col1, -col2) %>%
unnest(eleno)
df3
# A tibble: 60 x 3
# eleno row dist
# <int> <int> <dbl>
# 1 12 1 3.11
# 2 13 1 3.11
# 3 14 1 3.11
# 4 15 1 3.11
# 5 16 1 3.11
# 6 17 1 3.11
# 7 18 1 3.11
# 8 27 2 2.98
# 9 28 2 2.98
#10 29 2 2.98
# … with 50 more rows
Join this dataframe with df1 and paste resno values to create rescoup.
df1 %>%
left_join(df3, by = 'eleno') %>%
group_by(row) %>%
mutate(rescoup = paste(resno, collapse = '-'),
rescoup = replace(rescoup, is.na(dist), NA)) %>%
ungroup() %>%
select(-row)
# resno resid elety eleno dist rescoup
# <int> <chr> <chr> <int> <dbl> <chr>
#1 1 ILE C 3 NA NA
#2 1 ILE O 4 NA NA
#3 2 VAL C 11 NA NA
#4 2 VAL O 12 3.11 2-3
#5 3 GLY C 18 3.11 2-3
#6 3 GLY O 19 NA NA
Using data.table, split then merge:
library(data.table)
merge(d1,
d2[, lapply(.SD, function(x) unlist(tstrsplit(x, "-", fixed = TRUE,
type.convert = TRUE)))],
by.x = "eleno", by.y = "C.O", all.x = TRUE)
# eleno resno resid elety dist
# 1: 3 1 ILE C NA
# 2: 4 1 ILE O NA
# 3: 11 2 VAL C NA
# 4: 12 2 VAL O 3.112819
# 5: 18 3 GLY C 3.112819
# 6: 19 3 GLY O NA
Example data:
d1 <- fread("resno resid elety eleno
1 ILE C 3
1 ILE O 4
2 VAL C 11
2 VAL O 12
3 GLY C 18
3 GLY O 19")
d2 <- fread("C.O dist
12-18 3.112819
27-37 2.982788
51-63 3.185184
52-62 2.771583
63-69 3.157737
70-80 2.956738")

How to summarizing nested groups in R

In a data frame like data below:
library(tidyverse)
ID <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y","Z", "a","b","c","d")
State <- rep(c("FL", "GA", "SC", "NC", "VA", "GA"), each = 5)
Location <- rep(c("alpha", "beta", "gamma"), each = 10)
Var3 <- rep(c("Bravo", "Charlie", "Delta", "Echo"), times = c(7,8,10,5))
Sex <- rep(c("M","F","M"), times = 10)
data <- data.frame(ID, State, Location, Var3, Sex)
I want to return a data frame, or a list of several data frames, that summarize each way the data can be grouped. I want to see how many individual IDs are in each State, Location, and Var3, how many M and F are in each State, Location, and Var3, how many Locations are in each State, ect... what is the best way to achieve this.
We can use count
library(dplyr)
data %>%
count(State, Location, Var3, Sex)
Also, to get rollup/cube way of hierarchial counts,
library(data.table)
rollup(as.data.table(data), j = .N, by = c("State","Location","Var3", "Sex"))
# State Location Var3 Sex N
# 1: FL alpha Bravo M 3
# 2: FL alpha Bravo F 2
# 3: GA alpha Bravo M 2
# 4: GA alpha Charlie F 1
# 5: GA alpha Charlie M 2
# 6: SC beta Charlie F 2
# 7: SC beta Charlie M 3
# 8: NC beta Delta M 3
# 9: NC beta Delta F 2
#10: VA gamma Delta M 4
#11: VA gamma Delta F 1
#12: GA gamma Echo F 2
#13: GA gamma Echo M 3
#14: FL alpha Bravo <NA> 5
#15: GA alpha Bravo <NA> 2
#16: GA alpha Charlie <NA> 3
#17: SC beta Charlie <NA> 5
#18: NC beta Delta <NA> 5
#19: VA gamma Delta <NA> 5
#20: GA gamma Echo <NA> 5
#21: FL alpha <NA> <NA> 5
#22: GA alpha <NA> <NA> 5
#23: SC beta <NA> <NA> 5
#24: NC beta <NA> <NA> 5
#25: VA gamma <NA> <NA> 5
#26: GA gamma <NA> <NA> 5
#27: FL <NA> <NA> <NA> 5
#28: GA <NA> <NA> <NA> 10
#29: SC <NA> <NA> <NA> 5
#30: NC <NA> <NA> <NA> 5
#31: VA <NA> <NA> <NA> 5
#32: <NA> <NA> <NA> <NA> 30
# State Location Var3 Sex N
Or use cube
cube(as.data.table(data), j = .N, by = c("State","Location","Var3", "Sex"))
#. State Location Var3 Sex N
# 1: FL alpha Bravo M 3
# 2: FL alpha Bravo F 2
# 3: GA alpha Bravo M 2
# 4: GA alpha Charlie F 1
# 5: GA alpha Charlie M 2
# ---
#111: <NA> <NA> Delta <NA> 10
#112: <NA> <NA> Echo <NA> 5
#113: <NA> <NA> <NA> M 20
#114: <NA> <NA> <NA> F 10
#115: <NA> <NA> <NA> <NA> 30
One dplyr and purrr solution to group by all possible combinations of column names could be:
map2(list(colnames(data)),
1:ncol(data),
combn, simplify = FALSE) %>%
flatten() %>%
map(~ data %>%
group_by_at(.x) %>%
tally())
In this case, there are 31 possible combinations of column names, so it returns 31 lists. The first three lists:
[[1]]
# A tibble: 30 x 2
ID n
<fct> <int>
1 a 1
2 A 1
3 b 1
4 B 1
5 c 1
6 C 1
7 d 1
8 D 1
9 E 1
10 F 1
# … with 20 more rows
[[2]]
# A tibble: 5 x 2
State n
<fct> <int>
1 FL 5
2 GA 10
3 NC 5
4 SC 5
5 VA 5
[[3]]
# A tibble: 3 x 2
Location n
<fct> <int>
1 alpha 10
2 beta 10
3 gamma 10

Resources