data.table aggregation with rolling subset on date - r

I have a set of data along these lines
d1 <- data.frame(
  cat1 = sample(c('a', 'b', 'c'), 100, replace = TRUE),
  date = rep(Sys.Date() - sample(1:100)),
  val = rnorm(100, 50, 5)
)
require(data.table)
d2 <- data.table(d1)
I can get a daily sum without problem
d2[ , list(.N, sum(val)), by = c("cat1", "date")]
I want to get a sum over 2 days (and then 7 days)
This works:
d.list <- sort(unique(d2$date))
o.list <- list()
for (i in seq_along(d.list)) {
  o.list[[i]] <- d2[d2$date >= d.list[i] - 1 & d2$date <= d.list[i], list(.N, sum(val), max(date)), by = c("cat1")]
}
do.call(rbind, o.list)
But it slows down on a bigger data set, and it doesn't seem to be the best use of data.table.
Is there a more efficient way?

This is a bit faster:
First, we join for exact matches and obtain the last index (in case of multiple matches):
setkey(d2, cat1, date)
tmp1 = d2[unique(d2, by=key(d2)), which=TRUE, mult="last", allow.cartesian=TRUE]
Then, we construct a copy of d2 and change date to date-1 by reference. Next, we perform a join with roll=-Inf, which is "next observation carried backward" (NOCB): if there's no exact match, the next available value is filled in.
d3 = copy(d2)[, date := date-1]
setkey(d3, cat1, date)
tmp2 = d2[unique(d3, by=key(d2)), roll=-Inf, which=TRUE, allow.cartesian=TRUE]
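As an aside, here's a tiny toy illustration of what roll = -Inf does when there is no exact match (toy data of my own, not part of the answer):
toy <- data.table(id = "a", date = as.IDate(c("2014-01-01", "2014-01-05")), v = 1:2)
setkey(toy, id, date)
toy[.("a", as.IDate("2014-01-03")), roll = -Inf]
# no exact match for 2014-01-03, so the value of the next later row (2014-01-05) is carried back: v = 2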
From here, we put together the indices:
idx1 = tmp1-tmp2+1L
idx2 = data.table:::vecseq(tmp2, idx1, sum(idx1))
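data.table:::vecseq is an internal helper; conceptually it expands each start index tmp2[i] into a consecutive run of length idx1[i]. A base-R equivalent, just for intuition (my own sketch):
idx2.base <- unlist(Map(function(s, l) seq(s, length.out = l), tmp2, idx1))  # same values as idx2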
Subset d2 using idx2 and generate group ids from idx1:
ans1 = d2[idx2][, grp := rep(seq_along(idx1), idx1)]
Finally aggregate by grp and get the desired result:
ans1 = ans1[, list(cat1 = cat1[1L], date = date[.N],
                   N = .N, val = sum(val)), by = grp][, grp := NULL]
> head(ans1, 10L)
# cat1 date N val
# 1: a 2014-01-20 1 47.69178
# 2: a 2014-01-25 1 52.01006
# 3: a 2014-02-01 1 46.82132
# 4: a 2014-02-06 1 44.62404
# 5: a 2014-02-11 1 49.63218
# 6: a 2014-02-14 1 48.80676
# 7: a 2014-02-22 1 49.27800
# 8: a 2014-02-23 2 96.17617
# 9: a 2014-02-26 1 49.20623
# 10: a 2014-02-28 1 46.72708
The results are identical to those from your solution. This one took 0.02 seconds on my laptop, whereas yours took 0.58 seconds.
For 7 days, just change:
d3 = copy(d2)[, date := date-1]
to
d3 = copy(d2)[, date := date-6]
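As an aside (not part of the answer above): on data.table 1.9.8+ the same windowed aggregation can also be written as a non-equi join, the technique used in some answers further down. A hedged sketch, where windows is a helper table of my own:
windows <- data.table(start = d.list - 1L, end = d.list)   # d.list from the question; use d.list - 6L for 7 days
joined  <- d2[windows,
              .(cat1, val, win.end = i.end, obs.date = x.date),
              on = .(date >= start, date <= end), nomatch = 0L]
joined[, .(N = .N, val = sum(val), date = max(obs.date)), by = .(cat1, win.end)]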

What you want isn't explained very clearly in the OP, but this seems to be it:
# generate the [date-1,date] sequences for each date
# adjust length.out to suit your needs
dates = d2[, list(date.seq = seq(date, by = -1, length.out = 2)), by = date]
setkey(dates, date.seq)
setkey(d2, date)
# merge and extract info needed
dates[d2][, list(.N, sum(val), date.seq[.N]), by = list(date, cat1)][, !"date"]
# cat1 N V2 V3
# 1: a 1 38.95774 2014-01-21
# 2: a 1 38.95774 2014-01-21
# 3: c 1 55.68445 2014-01-22
# 4: c 2 102.20806 2014-01-23
# 5: c 1 46.52361 2014-01-23
# ---
#164: c 1 50.17986 2014-04-27
#165: b 1 51.43489 2014-04-28
#166: b 2 100.91982 2014-04-29
#167: b 1 49.48493 2014-04-29
#168: c 1 54.93311 2014-04-30
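If you prefer named output columns, the same chain can name them in j (a cosmetic tweak of my own):
dates[d2][, .(N = .N, val = sum(val), win.end = date.seq[.N]), by = .(date, cat1)][, !"date"]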

Would it be possible to set up a binned date, and then do by on that?
d2$day7 <- as.integer(d2$date) %/% 7
d2[ , list(.N, sum(val)), by = c("cat1", "day7")]
That would give a binned value. If you want a sliding 7-day window, I'd need to think again. Also, for the binned approach, you might need to subtract an offset before doing the %/% if you want to choose the day of the week the groups start on; see the sketch below.
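A small sketch of that offset idea (the Monday alignment is my own example, not from the answer): as.integer(date) counts days since 1970-01-01, which was a Thursday, so subtracting 4 makes the 7-day bins start on Mondays.
offset <- 4L  # 1970-01-01 was a Thursday; shifting by 4 aligns bins to Monday starts
d2[, day7 := (as.integer(date) - offset) %/% 7L]
d2[ , list(.N, sum(val)), by = c("cat1", "day7")]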

Related

dplyr - Join tables based on a date difference

It's late and I can't figure this out. I'm using lubridate and dplyr.
My data is as follows:
table1 = 1 observation per subject, with a date
table2 = 1 or more observations per subject, with associated dates
When I left join, I actually add observations, because multiple records in table2 match the key. How can I make this a conditional join so that only the one matching record from table2 whose date is closest to the date in table1 is joined?
Sorry if this was verbose.
Use the data.table package to join, with roll = "nearest" to get the nearest match.
library(data.table)
dt1 <- data.table( id = 1:10, date = 1:10, stringsAsFactors = FALSE )
dt2 <- data.table( date = 6:15, letter = letters[1:10], stringsAsFactors = FALSE )
dt1[, letter := dt2[dt1, letter, on = "date", roll = "nearest"] ][]
# id date letter
# 1: 1 1 a
# 2: 2 2 a
# 3: 3 3 a
# 4: 4 4 a
# 5: 5 5 a
# 6: 6 6 a
# 7: 7 7 b
# 8: 8 8 c
# 9: 9 9 d
# 10: 10 10 e
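If you'd rather stay within dplyr (as in the question), one non-rolling workaround is to join on the subject key, rank the matches by absolute date difference, and keep the closest one. A hedged sketch with made-up tables t1/t2 (my own, not from the answer above):
library(dplyr)
t1 <- data.frame(subject = 1:3,
                 date1 = as.Date(c("2020-01-05", "2020-02-01", "2020-03-10")))
t2 <- data.frame(subject = c(1, 1, 2, 3),
                 date2 = as.Date(c("2020-01-01", "2020-01-20", "2020-02-02", "2020-03-01")),
                 x = letters[1:4])
t1 %>%
  left_join(t2, by = "subject") %>%                          # temporarily adds rows
  group_by(subject) %>%
  slice_min(abs(as.numeric(date1 - date2)), n = 1, with_ties = FALSE) %>%
  ungroup()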

Merge data based on nearest date R

How do I left_join 2 data frames based on the nearest date? I currently have the script written so that it joins by the exact date, but I would prefer to do it by nearest date in case there is not an exact match.
This is what I currently have:
MASTER_DATABASE <- left_join(ptnamesMID, CTDB, by = c("LAST_NAME", "FIRST_NAME", "Measure_date" = "VISIT_DATE"))
Rolling joins in data.table have the parameter roll = "nearest", which probably does what the OP expects.
Unfortunately, the OP has not provided sample data, so I had to make up my own.
Create sample datasets
set.seed(123L)
dates <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "1 day")
ptnamesMID <- data.frame(ID = 1:10, Measure_date = sample(dates, 10L))
CTDB <- data.frame(ID = sample.int(10, 30L, TRUE), VISIT_DATE = sample(dates, 30L, TRUE))
Here, ID is used in place of LAST_NAME and FIRST_NAME for simplification. ptnamesMID consists of 10 rows while CTDB has 30 rows of randomly sampled visit dates.
Rolling join
library(data.table)
# coerce to data.table and append join columns to preserve the original columns
setDT(CTDB)[, join_date := VISIT_DATE]
setDT(ptnamesMID)[, join_date := Measure_date]
# rolling join
CTDB[ptnamesMID, on = .(ID, join_date), roll = "nearest"]
ID VISIT_DATE join_date Measure_date
1: 1 2017-06-20 2017-04-15 2017-04-15
2: 2 2017-05-17 2017-10-14 2017-10-14
3: 3 2017-06-10 2017-05-29 2017-05-29
4: 4 2017-10-17 2017-11-16 2017-11-16
5: 5 2017-06-13 2017-12-06 2017-12-06
6: 6 2017-02-16 2017-01-17 2017-01-17
7: 7 2017-07-24 2017-07-09 2017-07-09
8: 8 2017-10-23 2017-12-28 2017-12-28
9: 9 2017-02-20 2017-07-16 2017-07-16
10: 10 2017-08-31 2017-06-12 2017-06-12
In data.table syntax, CTDB[ptnamesMID, ...] is equivalent to a left join of ptnamesMID with CTDB, i.e., all rows of ptnamesMID are kept in the result set.
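To get back to a plain result without the helper column, you can drop join_date afterwards (a small cleanup of my own, not in the original answer):
res <- CTDB[ptnamesMID, on = .(ID, join_date), roll = "nearest"]
res[, join_date := NULL][]   # VISIT_DATE and Measure_date remain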
Without an example it's hard to help with your use case. I'd try out the fuzzyjoin package by David Robinson:
https://cran.r-project.org/web/packages/fuzzyjoin/fuzzyjoin.pdf
Here is the example for interval_join:
if (requireNamespace("IRanges", quietly = TRUE)) {
  x1 <- data.frame(id1 = 1:3, start = c(1, 5, 10), end = c(3, 7, 15))
  x2 <- data.frame(id2 = 1:3, start = c(2, 4, 16), end = c(4, 8, 20))
  interval_inner_join(x1, x2)
  # Allow them to be separated by a gap with a maximum:
  interval_inner_join(x1, x2, maxgap = 1)   # let 1 join with 2
  interval_inner_join(x1, x2, maxgap = 20)  # everything joins each other
  # Require that they overlap by more than a particular amount
  interval_inner_join(x1, x2, minoverlap = 3)
  # other types of joins:
  interval_full_join(x1, x2)
  interval_left_join(x1, x2)
  interval_right_join(x1, x2)
  interval_semi_join(x1, x2)
  interval_anti_join(x1, x2)
}
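For the nearest-date use case specifically, fuzzyjoin's distance-based joins may be a closer fit than interval joins. A hedged sketch with made-up data (I'm assuming the difference joins accept Date columns here):
library(fuzzyjoin)
a <- data.frame(id = 1:3, day = as.Date("2017-06-01") + c(0, 10, 20))
b <- data.frame(key = 1:3, day = as.Date("2017-06-01") + c(2, 9, 40))
# keep pairs whose dates differ by at most 3 days, recording the gap
difference_inner_join(a, b, by = "day", max_dist = 3, distance_col = "days_apart")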

Factor levels reaching certain values

I need to find out how many factor levels reach certain values of a continuous variable.
The code below produces the desired result for the example data, but it is rather an awkward workaround.
My real data frame is much larger, and the real plot should show more values (or a continuous range) on the x-axis. I would really appreciate a more broadly applicable approach.
set.seed(5)
df <- data.frame(ID = factor(c("a","a","b","c","d","e","e")),values = runif(7,0,6))
seq <- 1:5
length.unique <- function(x) length(unique(x))
sub1 <- df[which(df$values >= 1), ]
sub2 <- df[which(df$values >= 2), ]
sub3 <- df[which(df$values >= 3), ]
sub4 <- df[which(df$values >= 4), ]
sub5 <- df[which(df$values >= 5), ]
N_IDs <- c(length.unique(sub1$ID),length.unique(sub2$ID),length.unique(sub3$ID),length.unique(sub4$ID),length.unique(sub5$ID))
plot(N_IDs ~ seq, type="b")
Using the tidyverse, you can save some time by first calculating the max value for each ID:
library(tidyverse)
idmax <- df %>% group_by(ID) %>% summarize(max=max(values)) %>% pull(max)
Then, for each cut point, return the count of IDs that pass:
map_df(1:5, ~data.frame(cut=., count=sum(idmax >.)))
# cut count
# 1 1 4
# 2 2 3
# 3 3 3
# 4 4 3
# 5 5 1
Using non-equi joins:
library(data.table)
setDT(df)
df[.(seq = 1:5), on = .(values >= seq), allow = T, .(N_IDs = uniqueN(ID)), by = .EACHI]
# values N_IDs
#1: 1 4
#2: 2 3
#3: 3 3
#4: 4 3
#5: 5 1
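The same counts can also be obtained in base R, which avoids the five manual subsets in the question (my own sketch):
idmax <- tapply(df$values, df$ID, max)              # highest value reached per ID
N_IDs <- sapply(1:5, function(k) sum(idmax >= k))   # how many IDs reach each cut point
plot(N_IDs ~ seq_along(N_IDs), type = "b")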

Data.Table rolling join by group

How can I find the last value, prior to test.day, for each (loc.x, loc.y) pair?
dt <- data.table(
  loc.x = as.integer(c(1, 1, 3, 1, 3, 1)),
  loc.y = as.integer(c(1, 2, 1, 2, 1, 2)),
  time = as.IDate(c("2015-03-11", "2015-05-10", "2015-09-27",
                    "2015-11-25", "2014-09-13", "2015-08-19")),
  value = letters[1:6]
)
setkey(dt, loc.x, loc.y, time)
test.day <- as.IDate("2015-10-01")
Required output:
loc.x loc.y value
1: 1 1 a
2: 1 2 f
3: 3 1 c
Another option is to use the last function:
dt[, last(value[time < test.day]), by = .(loc.x, loc.y)]
which gives:
loc.x loc.y V1
1: 1 1 a
2: 1 2 f
3: 3 1 c
You can first subset the rows where time < test.day (which should be quite efficient because it is not done by group) and then select the last value per group. To do that you can either use tail(value, 1L) or, as suggested by Floo0, value[.N], resulting in:
dt[time < test.day, tail(value, 1L), by = .(loc.x, loc.y)]
# loc.x loc.y V1
#1: 1 1 a
#2: 1 2 f
#3: 3 1 c
or
dt[time < test.day, value[.N], by = .(loc.x, loc.y)]
Note that this works because the data is sorted due to setkey(dt, loc.x, loc.y, time).
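A small variation (my own, not from the answer above): if you also want the remaining columns of that last row, e.g. time, the same idea works with .SD:
dt[time < test.day, .SD[.N], by = .(loc.x, loc.y)]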
Here's another option using a rolling join after creating a lookup table
indx <- data.table(unique(dt[ ,.(loc.x, loc.y)]), time = test.day)
dt[indx, roll = TRUE, on = names(indx)]
# loc.x loc.y time value
# 1: 1 1 2015-10-01 a
# 2: 1 2 2015-10-01 f
# 3: 3 1 2015-10-01 c
Or a very similar option suggested by @eddi:
dt[dt[, .(time = test.day), by = .(loc.x, loc.y)], roll = T, on = c('loc.x', 'loc.y', 'time')]
Or a one-liner, which will be less efficient as it calls [.data.table once per group:
dt[,
   .SD[data.table(test.day), value, roll = TRUE, on = c(time = "test.day")],
   by = .(loc.x, loc.y)
]
# loc.x loc.y V1
# 1: 1 1 a
# 2: 1 2 f
# 3: 3 1 c

Cartesian product with filter data.table

I'm trying to replace a Cartesian product produced by SQL with a data.table call.
I have a large history of assets and values, and I need a subset of all combinations.
Let's say that I have a table T = [date, contract, value]. In SQL it looks like:
SELECT a.date, a.contract, a.value, b.contract, b.value
FROM T a, T b
WHERE a.date = b.date AND a.contract <> b.contract AND a.value + b.value < 4
In R I now have the following:
library(data.table)
n <- 1500
dt <- data.table(date = rep(seq(Sys.Date() - n + 1, Sys.Date(), by = "1 day"), 3),
                 contract = c(rep("a", n), rep("b", n), rep("c", n)),
                 value = c(rep(1, n), rep(2, n), rep(3, n)))
setkey(dt, date)
dt[dt, allow.cartesian = TRUE][(contract != i.contract) & (value + i.value < 4)]
I believe that my solution creates all combinations first (in this case 13,500 rows) and then filters them (down to 3,000 rows). SQL, however (and I might be wrong), joins on a subset and, more importantly, doesn't load all combinations into RAM. Any ideas on how to use data.table more efficiently?
Use the by = .EACHI feature. In data.table, joins and subsets are very closely linked; i.e., a join is just another subset, using a data.table instead of the usual integer / logical / row names. They are designed this way with these cases in mind.
Subset-based joins allow you to incorporate j-expressions and grouping operations together while joining.
require(data.table)
dt[dt, .SD[contract != i.contract & value + i.value < 4L], by = .EACHI, allow = TRUE]
This is the idiomatic way (in case you'd like to use the i.* columns just for the condition, but not return them as well); however, .SD has not yet been optimised, and evaluating the j-expression on .SD for each group is costly.
system.time(dt[dt, .SD[contract != i.contract & value + i.value < 4L], by = .EACHI, allow = TRUE])
# user system elapsed
# 2.874 0.020 2.983
Some cases using .SD have already been optimised. Until this case is taken care of, you can work around it this way:
dt[dt, {
  idx = contract != i.contract & value + i.value < 4L
  list(contract = contract[idx],
       value = value[idx],
       i.contract = i.contract[any(idx)],
       i.value = i.value[any(idx)])
}, by = .EACHI, allow = TRUE]
This takes 0.045 seconds, as opposed to 0.005 seconds with your method. But by = .EACHI evaluates the j-expression on each group while joining (and is therefore memory efficient). That's the trade-off you'll have to accept.
Since version 1.9.8 (on CRAN 25 Nov 2016), non-equi joins are possible with data.table, which can be utilized here.
In addition, the OP's approach creates "symmetric duplicates" (a, b) and (b, a). Avoiding these duplicates would halve the size of the result set without loss of information (compare ?combn).
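To make the "symmetric duplicates" point concrete, combn lists only the three unordered contract pairs (a quick illustration of my own):
combn(c("a", "b", "c"), 2)
#      [,1] [,2] [,3]
# [1,] "a"  "a"  "b"
# [2,] "b"  "c"  "c"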
If this is the OP's intention, we can use non-equi joins to avoid those symmetric duplicates:
library(data.table)
dt[, rn := .I][dt, on = .(date, rn < rn), nomatch = 0L][value + i.value < 4]
which gives
date contract value rn i.contract i.value
1: 2013-09-24 a 1 1501 b 2
2: 2013-09-25 a 1 1502 b 2
3: 2013-09-26 a 1 1503 b 2
4: 2013-09-27 a 1 1504 b 2
5: 2013-09-28 a 1 1505 b 2
---
1496: 2017-10-28 a 1 2996 b 2
1497: 2017-10-29 a 1 2997 b 2
1498: 2017-10-30 a 1 2998 b 2
1499: 2017-10-31 a 1 2999 b 2
1500: 2017-11-01 a 1 3000 b 2
as opposed to the result using OP's code
date contract value i.contract i.value
1: 2013-09-24 b 2 a 1
2: 2013-09-24 a 1 b 2
3: 2013-09-25 b 2 a 1
4: 2013-09-25 a 1 b 2
5: 2013-09-26 b 2 a 1
---
2996: 2017-10-30 a 1 b 2
2997: 2017-10-31 b 2 a 1
2998: 2017-10-31 a 1 b 2
2999: 2017-11-01 b 2 a 1
3000: 2017-11-01 a 1 b 2
The next step is to further reduce the number of pairs which are created only to be filtered out afterwards:
dt[, val4 := 4 - value][dt, on = .(date, rn < rn, val4 > value), nomatch = 0L]
which returns the same result as above.
Note that the filter condition value + i.value < 4 is replaced by the join condition val4 > value, where val4 = 4 - value is a specially created helper column (4 - value > i.value is just a rearrangement of value + i.value < 4).
Benchmark
For a benchmark case of n <- 150000L, resulting in 450k rows in dt, the timings are:
n <- 150000L
dt <- data.table(date = rep(seq(Sys.Date() - n + 1, Sys.Date(), by = "1 day"), 3),
                 contract = c(rep("a", n), rep("b", n), rep("c", n)),
                 value = c(rep(1, n), rep(2, n), rep(3, n)))
dt0 <- copy(dt)
microbenchmark::microbenchmark(
  OP = {
    dt <- copy(dt0)
    dt[dt, on = .(date), allow.cartesian = TRUE][
      (contract != i.contract) & (value + i.value < 4)]
  },
  nej1 = {
    dt <- copy(dt0)
    dt[, rn := .I][dt, on = .(date, rn < rn), nomatch = 0L][value + i.value < 4]
  },
  nej2 = {
    dt <- copy(dt0)
    dt[, rn := .I][, val4 := 4 - value][dt, on = .(date, rn < rn, val4 > value), nomatch = 0L]
  },
  times = 20L
)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 136.3091 143.1656 246.7349 298.8648 304.8166 311.1141 20 b
nej1 127.9487 133.1772 160.8096 136.0825 146.0947 298.3348 20 a
nej2 180.4189 183.9264 219.5171 185.9385 198.7846 351.3038 20 b
So, doing the check value + i.value < 4 after the join seems to be faster than including it in the non-equi join.
