Merge data based on nearest date in R

How do I left_join two data frames based on the nearest date? I currently have the script written so that it joins by the exact date, but I would prefer to join by nearest date in case there is not an exact match.
This is what I currently have:
MASTER_DATABASE <- left_join(ptnamesMID, CTDB, by = c("LAST_NAME", "FIRST_NAME", "Measure_date" = "VISIT_DATE"))

The rolling joins in data.table have a parameter roll = "nearest" which probably does what the OP expects.
Unfortunately, the OP has not provided sample data, so I had to make up my own.
Create sample datasets
set.seed(123L)
dates <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "1 day")
ptnamesMID <- data.frame(ID = 1:10, Measure_date = sample(dates, 10L))
CTDB <- data.frame(ID = sample.int(10, 30L, TRUE), VISIT_DATE = sample(dates, 30L, TRUE))
Here, ID is used in place of LAST_NAME and FIRST_NAME for simplification. ptnamesMID consists of 10 rows while CTDB has 30 rows of randomly sampled visit dates.
Rolling join
library(data.table)
# coerce to data.table and append join columns to preserve the original columns
setDT(CTDB)[, join_date := VISIT_DATE]
setDT(ptnamesMID)[, join_date := Measure_date]
# rolling join
CTDB[ptnamesMID, on = .(ID, join_date), roll = "nearest"]
ID VISIT_DATE join_date Measure_date
1: 1 2017-06-20 2017-04-15 2017-04-15
2: 2 2017-05-17 2017-10-14 2017-10-14
3: 3 2017-06-10 2017-05-29 2017-05-29
4: 4 2017-10-17 2017-11-16 2017-11-16
5: 5 2017-06-13 2017-12-06 2017-12-06
6: 6 2017-02-16 2017-01-17 2017-01-17
7: 7 2017-07-24 2017-07-09 2017-07-09
8: 8 2017-10-23 2017-12-28 2017-12-28
9: 9 2017-02-20 2017-07-16 2017-07-16
10: 10 2017-08-31 2017-06-12 2017-06-12
In data.table syntax, CTDB[ptnamesMID, ...] is equivalent to a left join of ptnamesMID with CTDB, i.e., all rows of ptnamesMID are kept in the result set.
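Applied to the OP's original columns, the same idea would look roughly like this. This is only a sketch, since the real data were not posted; LAST_NAME, FIRST_NAME, VISIT_DATE and Measure_date are taken from the question:
library(data.table)
# a sketch assuming the column names from the question
setDT(CTDB)[, join_date := VISIT_DATE]
setDT(ptnamesMID)[, join_date := Measure_date]
# left join of ptnamesMID with CTDB on name plus nearest date
MASTER_DATABASE <- CTDB[ptnamesMID,
                        on = .(LAST_NAME, FIRST_NAME, join_date),
                        roll = "nearest"]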

Without an example it's hard to address your exact use case, but I'd try out the fuzzyjoin package by David Robinson:
https://cran.r-project.org/web/packages/fuzzyjoin/fuzzyjoin.pdf
Here is the package's example for the interval_*_join functions:
library(fuzzyjoin)

if (requireNamespace("IRanges", quietly = TRUE)) {
  x1 <- data.frame(id1 = 1:3, start = c(1, 5, 10), end = c(3, 7, 15))
  x2 <- data.frame(id2 = 1:3, start = c(2, 4, 16), end = c(4, 8, 20))
  interval_inner_join(x1, x2)
  # Allow them to be separated by a gap with a maximum:
  interval_inner_join(x1, x2, maxgap = 1)  # let 1 join with 2
  interval_inner_join(x1, x2, maxgap = 20) # everything joins each other
  # Require that they overlap by more than a particular amount
  interval_inner_join(x1, x2, minoverlap = 3)
  # other types of joins:
  interval_full_join(x1, x2)
  interval_left_join(x1, x2)
  interval_right_join(x1, x2)
  interval_semi_join(x1, x2)
  interval_anti_join(x1, x2)
}
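For nearest-date matching specifically, fuzzyjoin's fuzzy_left_join can mix an exact match on the identifier columns with a tolerance on the dates. The sketch below reuses the sample data from the rolling-join answer above; the 30-day cap and the days_apart tie-breaking column are my own assumptions, not part of the question:
library(dplyr)
library(fuzzyjoin)
# a sketch: exact match on ID, dates within 30 days, then keep the closest visit
joined <- fuzzy_left_join(
  ptnamesMID, CTDB,
  by = c("ID" = "ID", "Measure_date" = "VISIT_DATE"),
  match_fun = list(`==`, function(a, b) abs(a - b) <= 30)
) %>%
  mutate(days_apart = abs(as.numeric(Measure_date - VISIT_DATE))) %>%
  group_by(ID.x, Measure_date) %>%
  slice_min(days_apart, n = 1, with_ties = FALSE) %>%
  ungroup()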

Related

R data.table cross-join by three variables

I'm trying to cross join a data.table by three variables (group, id, and date). The R code below accomplishes exactly what I want to do, i.e., each id within each group is expanded to include all of the dates_wanted. But is there a way to do the same thing more efficiently using the excellent data.table package?
library(data.table)
data <- data.table(
group = c(rep("A", 10), rep("B", 10)),
id = c(rep("frank", 5), rep("tony", 5), rep("arthur", 5), rep("edward", 5)),
date = seq(as.IDate("2020-01-01"), as.IDate("2020-01-20"), by = "day")
)
data
dates_wanted <- seq(as.IDate("2020-01-01"), as.IDate("2020-01-31"), by = "day")
names_A <- data[group == "A"][["id"]]
names_B <- data[group == "B"][["id"]]
names_A <- CJ(group = "A", id = names_A, date = dates_wanted, unique = TRUE)
names_B <- CJ(group = "B", id = names_B, date = dates_wanted, unique = TRUE)
alldates <- rbind(names_A, names_B)
alldates
data[alldates, on = .(group, id, date)]
You can also do this:
data[, .(date=dates_wanted), .(group,id)]
Output:
group id date
1: A frank 2020-01-01
2: A frank 2020-01-02
3: A frank 2020-01-03
4: A frank 2020-01-04
5: A frank 2020-01-05
---
120: B edward 2020-01-27
121: B edward 2020-01-28
122: B edward 2020-01-29
123: B edward 2020-01-30
124: B edward 2020-01-31
We can use do.call with CJ on the id and date columns, grouped by group:
out <- data[, do.call(CJ, c(.(id = id, date = dates_wanted),
unique = TRUE)), group]
... checking:
> dim(out)
[1] 124 3
> out0 <- data[alldates, on = .(group, id, date)]
> dim(out0)
[1] 124 3
> all.equal(out, out0)
[1] TRUE
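For comparison, the same expansion can be written outside data.table with dplyr and tidyr; this is only a sketch of an alternative, not a claim about relative speed:
library(dplyr)
library(tidyr)
# a sketch: keep only observed (group, id) pairs and expand each to all dates
out_tidy <- data %>%
  as_tibble() %>%
  expand(nesting(group, id), date = dates_wanted) %>%
  arrange(group, id, date)
dim(out_tidy)  # expected: 124 rows (4 ids x 31 dates), 3 columns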

Merge R data frame or data table and overwrite values of multiple columns

How do you merge two data tables (or data frames) in R, keeping the non-NA values from each matching column? The question "Merge data frames and overwrite values" provides a solution if each individual column is specified explicitly (as far as I can tell, at least). But I have over 40 common columns between the two data tables, and it is somewhat random which of the two has an NA versus a valid value. So writing ifelse statements for 40 columns seems inefficient.
Below is a simple example, where I'd like to join (merge) the two data.tables by the id and date columns:
dt_1 <- data.table::data.table(id = "abc",
                               date = "2018-01-01",
                               a = 3,
                               b = NA_real_,
                               c = 4,
                               d = 6,
                               e = NA_real_)
setkey(dt_1, id, date)
> dt_1
id date a b c d e
1: abc 2018-01-01 3 NA 4 6 NA
dt_2 <- data.table::data.table(id = "abc",
                               date = "2018-01-01",
                               a = 3,
                               b = 5,
                               c = NA_real_,
                               d = 6,
                               e = NA_real_)
setkey(dt_2, id, date)
> dt_2
id date a b c d e
1: abc 2018-01-01 3 5 NA 6 NA
Here is my desired output:
> dt_out
id date a b c d e
1: abc 2018-01-01 3 5 4 6 NA
I've also tried the dplyr::anti_join solution from "left_join two data frames and overwrite" without success.
I'd probably put the data in long form and drop dupes:
k = key(dt_1)
DTList = list(dt_1, dt_2)
DTLong = rbindlist(lapply(DTList, function(x) melt(x, id=k)))
setorder(DTLong, na.last = TRUE)
unique(DTLong, by=c(k, "variable"))
id date variable value
1: abc 2018-01-01 a 3
2: abc 2018-01-01 b 5
3: abc 2018-01-01 c 4
4: abc 2018-01-01 d 6
5: abc 2018-01-01 e NA
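To get back to the OP's wide dt_out, the deduplicated long table can be cast back to wide form; a small sketch continuing from the code above:
# cast the deduplicated long table back to the wide layout the OP asked for
dt_long <- unique(DTLong, by = c(k, "variable"))
dt_out  <- dcast(dt_long, id + date ~ variable, value.var = "value")
dt_out
#     id       date a b c d  e
# 1: abc 2018-01-01 3 5 4 6 NA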
You can do this by using dplyr::coalesce, which will return the first non-missing value from vectors.
(EDIT: you can use dplyr::coalesce directly on the data frames as well, so there is no need to create the function below. I've left it here for completeness, as a record of the original answer.)
Credit where it's due: this code is mostly from this blog post; it builds a function that takes two data frames and does what you need (preferring values from the x data frame when both are present).
coalesce_join <- function(x, y, by,
                          suffix = c(".x", ".y"),
                          join = dplyr::full_join, ...) {
  joined <- join(x, y, by = by, suffix = suffix, ...)
  # names of desired output
  cols <- union(names(x), names(y))
  # columns that came out suffixed, i.e. present in both x and y
  to_coalesce <- names(joined)[!names(joined) %in% cols]
  suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
  # remove suffixes and deduplicate
  to_coalesce <- unique(substr(
    to_coalesce,
    1,
    nchar(to_coalesce) - nchar(suffix_used)
  ))
  # coalesce each conflicting pair, preferring the value from x
  coalesced <- purrr::map_dfc(to_coalesce, ~ dplyr::coalesce(
    joined[[paste0(.x, suffix[1])]],
    joined[[paste0(.x, suffix[2])]]
  ))
  names(coalesced) <- to_coalesce
  dplyr::bind_cols(joined, coalesced)[cols]
}
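A usage sketch with the OP's tables, coercing to plain data frames first since the final [cols] subsetting in the function assumes data.frame-style column selection rather than data.table's i-argument semantics:
# a sketch: should reproduce the desired dt_out (a = 3, b = 5, c = 4, d = 6, e = NA)
coalesce_join(as.data.frame(dt_1), as.data.frame(dt_2), by = c("id", "date"))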
We can use {powerjoin}, do a left join and deal with the conflicts using coalesce_xy() (which is pretty much dplyr::coalesce()).
library(powerjoin)
power_left_join(dt_1, dt_2, by = "id", conflict = coalesce_xy)
# id date a b c d e
# 1 abc 2018-01-01 3 5 4 6 NA
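If you would rather keep date as a join key instead of letting it be handled as a coalesced conflict column, the same call with both keys should give an identical result (a sketch):
power_left_join(dt_1, dt_2, by = c("id", "date"), conflict = coalesce_xy)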

How to make a new column in a dataframe, based on ranges within factors from another, in R

This is a slightly complex issue I am trying to solve in R (R-Studio, R version 3.3.1).
I have two data frames (DF_A, DF_B). DF_A is structured like this:
Filename Timestamp
A 11
A 12
A 17
B 18
B 22
B 23
C 24
C 28
C 30
And, DF_B like this:
Timestamp
11
12
13
14
15
16
17
18
19
...
30
And I'd like to be able to move the filename from DF_A to DF_B, based on the range of values seen in each Filename factor from DF_A. So:
Timestamp Filename
11 A
12 A
13 A
14 A
...
18 B
19 B
...
24 C
I was considering getting the min and max timestamp of each factor in DF_A and then appending the Filename to the rows of DF_B that fall within that range. Thus far, I have managed to get the min and max using a solution I found, which turns the data frame into a data.table and gets the min/max for each factor:
DT_A <- as.data.table(DF_A)
DT_A[, .SD[which.min(Timestamp)], by = Filename]
DT_A[, .SD[which.max(Timestamp)], by = Filename]
Alas, this is as far as I have gotten. I am not sure how I would apply this to DF_B. The solution can be pretty open here. Curious to see the different solutions. Any help is greatly appreciated. Thanks!
# import the necessary package
library(data.table)
# create lookup data table
DT_A <- data.table(
Filename = rep(c("A", "B", "C"), each = 3),
Timestamp = c(11, 12, 17, 18, 22, 23, 24, 28, 30)
)
# form data table to be labelled
DT_B <- data.table(
Timestamp = 11:30
)
# get the minimum and maximum timestamp for each filename
DT_limits <- DT_A[ ,
.(from = min(Timestamp, na.rm = T),
to = max(Timestamp, na.rm = T)),
by = Filename]
## apply a fast overlap
DT_B[ , dummy:= Timestamp]
setkey(DT_limits, from, to)
DT_final <- foverlaps(
DT_B,
DT_limits,
by.x = c("Timestamp", "dummy"),
nomatch = 0L
)[ , c("from", "to", "dummy") := NULL]
DT_final
# Filename Timestamp
# 1: A 11
# 2: A 12
# 3: A 13
# 4: A 14
# ...
# 8: B 18
# 9: B 19
# ...
# 14: C 24
# ...
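On data.table 1.9.8 or later, a non-equi update join gives the same labelling without foverlaps; a sketch reusing DT_limits from above (DT_B2 is just a fresh copy of the table to be labelled):
# a sketch: non-equi update join instead of foverlaps
DT_B2 <- data.table(Timestamp = 11:30)
DT_B2[DT_limits, on = .(Timestamp >= from, Timestamp <= to),
      Filename := i.Filename]
DT_B2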

Using "by-argument" in "outer" data.table to filter "inner" data.table

I still have some problems understanding the data.table notation. Could anyone explain why the following is not working?
I'm trying to classify dates into groups using cut. The breaks to use are stored in another data.table and depend on the by argument of the outer data data.table:
data <- data.table(A = c(1, 1, 1, 2, 2, 2),
DATE = as.POSIXct(c("01-01-2012", "30-05-2015", "01-01-2020", "30-06-2012", "30-06-2013", "01-01-1999"), format = "%d-%m-%Y"))
breaks <- data.table(B = c(1, 1, 2, 2),
BREAKPOINT = as.POSIXct(c("01-01-2015", "01-01-2016", "30-06-2012", "30-06-2013"), format = "%d-%m-%Y"))
data[, bucket := cut(DATE, breaks[B == A, BREAKPOINT], ordered_result = T), by = A]
I can get the desired result doing
# expected
data[A == 1, bucket := cut(DATE, breaks[B == 1, BREAKPOINT], ordered_result = T)]
data[A == 2, bucket := cut(DATE, breaks[B == 2, BREAKPOINT], ordered_result = T)]
data
# A DATE bucket
# 1: 1 2012-01-01 NA
# 2: 1 2015-05-30 2015-01-01
# 3: 1 2020-01-01 NA
# 4: 2 2012-06-30 2012-06-30
# 5: 2 2013-06-30 NA
# 6: 2 1999-01-01 NA
Thanks,
Michael
The problem is that cut produces factors and those are not being handled correctly in the data.table by operation (this is a bug and should be reported - the factor levels should be handled the same way they are handled in rbind.data.table or rbindlist). An easy fix to your original expression is to convert to character:
data[, bucket := as.character(cut(DATE, breaks[B == A, BREAKPOINT], ordered_result = T))
, by = A]
# A DATE bucket
#1: 1 2012-01-01 NA
#2: 1 2015-05-30 2015-01-01
#3: 1 2020-01-01 NA
#4: 2 2012-06-30 2012-06-30
#5: 2 2013-06-30 NA
#6: 2 1999-01-01 NA
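An alternative sketch that avoids cut() (and therefore the factor issue) is a non-equi update join against an intervals table built from the breakpoints. The left-closed, right-open boundaries below mirror the behaviour visible in the output above, and ivals/bucket2 are made-up names:
# a sketch: one (lower, upper) interval per consecutive pair of breakpoints
ivals <- breaks[order(B, BREAKPOINT),
                .(lower = head(BREAKPOINT, -1L), upper = tail(BREAKPOINT, -1L)),
                by = B]
# label each DATE with the left endpoint of the interval it falls in
data[ivals, on = .(A == B, DATE >= lower, DATE < upper),
     bucket2 := format(i.lower, "%Y-%m-%d")]
data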

data.table aggregation with rolling subset on date

I have a set of data along these lines
d1 <- data.frame(
cat1 = sample(c('a', 'b', 'c'), 100, replace = TRUE),
date = rep(Sys.Date() - sample(1:100)),
val = rnorm(100, 50, 5)
)
require(data.table)
d2 <- data.table(d1)
I can get a daily sum without problem
d2[ , list(.N, sum(val)), by = c("cat1", "date")]
I want to get a sum over 2 days (and then 7 days)
This works:
d.list <- sort(unique(d2$date))
o.list <- list()
for(i in seq_along(d.list)){
o.list[[i]] <- d2[d2$date >= d.list[i] - 1 & d2$date <= d.list[i], list(.N, sum(val), max(date)), by = c("cat1")]
}
do.call(rbind, o.list)
But it slows down on a bigger data set and doesn't seem to be the best use of data.table.
Is there a more efficient way?
This is a bit faster:
First we join for exact matches and obtain the last index (in case of multiple matches):
setkey(d2, cat1, date)
tmp1 = d2[unique(d2, by=key(d2)), which=TRUE, mult="last", allow.cartesian=TRUE]
Then we construct a copy of d2 and shift date to date-1 by reference, and perform a join with roll = -Inf, i.e., next observation carried backward: if there's no exact match, the next available value is filled in.
d3 = copy(d2)[, date := date-1]
setkey(d3, cat1, date)
tmp2 = d2[unique(d3, by=key(d2)), roll=-Inf, which=TRUE, allow.cartesian=TRUE]
From here, we put together the indices:
idx1 = tmp1-tmp2+1L
idx2 = data.table:::vecseq(tmp2, idx1, sum(idx1))
Subset d2 from idx2 and generate unique ids from idx1:
ans1 = d2[idx2][, grp := rep(seq_along(idx1), idx1)]
Finally aggregate by grp and get the desired result:
ans1 = ans1[, list(cat1=cat1[1L], date=date[.N],
N = .N, val=sum(val)), by=grp][, grp:=NULL]
> head(ans1, 10L)
# cat1 date N val
# 1: a 2014-01-20 1 47.69178
# 2: a 2014-01-25 1 52.01006
# 3: a 2014-02-01 1 46.82132
# 4: a 2014-02-06 1 44.62404
# 5: a 2014-02-11 1 49.63218
# 6: a 2014-02-14 1 48.80676
# 7: a 2014-02-22 1 49.27800
# 8: a 2014-02-23 2 96.17617
# 9: a 2014-02-26 1 49.20623
# 10: a 2014-02-28 1 46.72708
The results are identical to those from your solution. This one took 0.02 seconds on my laptop, whereas yours took 0.58 seconds.
For 7 days, just change:
d3 = copy(d2)[, date := date-1]
to
d3 = copy(d2)[, date := date-6]
It's not explained very clearly in the OP what you want, but this seems to be it:
# generate the [date-1,date] sequences for each date
# adjust length.out to suit your needs
dates = d2[, list(date.seq = seq(date, by = -1, length.out = 2)), by = date]
setkey(dates, date.seq)
setkey(d2, date)
# merge and extract info needed
dates[d2][, list(.N, sum(val), date.seq[.N]), by = list(date, cat1)][, !"date"]
# cat1 N V2 V3
# 1: a 1 38.95774 2014-01-21
# 2: a 1 38.95774 2014-01-21
# 3: c 1 55.68445 2014-01-22
# 4: c 2 102.20806 2014-01-23
# 5: c 1 46.52361 2014-01-23
# ---
#164: c 1 50.17986 2014-04-27
#165: b 1 51.43489 2014-04-28
#166: b 2 100.91982 2014-04-29
#167: b 1 49.48493 2014-04-29
#168: c 1 54.93311 2014-04-30
Would it be possible to set up a binned date, and then do by on that?
d2$day7 <- as.integer(d2$date) %/% 7
d2[ , list(.N, sum(val)), by = c("cat1", "day7")]
That would give a binned value; if you want a sliding 7-day window, I'd need to think again. Also, for a binned approach, you might need to subtract an offset before doing the %/% if you want to choose the day of the week the groups start on.
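For a true sliding 7-day window, one sketch (assuming data.table >= 1.12.0 for frollsum) is to complete the daily grid within each category and then take a rolling sum; the intermediate names daily and grid are made up:
library(data.table)
# a sketch: per-category daily totals, padded to a complete daily grid
daily <- d2[, .(N = .N, val = sum(val)), by = .(cat1, date)]
grid  <- daily[, .(date = seq(min(date), max(date), by = "day")), by = cat1]
daily <- daily[grid, on = .(cat1, date)]
daily[is.na(N), `:=`(N = 0L, val = 0)]
setorder(daily, cat1, date)
# rolling 7-day counts and sums (NA for the first 6 days of each category)
daily[, `:=`(N7 = frollsum(N, 7), val7 = frollsum(val, 7)), by = cat1]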
