How to merge two data.table under two conditions - r

I would like to merge two tables, dt_program and dt_sale, to find the START and END using the common keys CH and ITEM_ID under the condition as follows:
ORDER_TIME has to be within START and END
or
ORDER_TIME can happen after END (the nearest ORDER_TIME to END)
Data are provided:
The time schedule table represents the program from each channel:
dt_program <- structure(list(CH = c("CH1", "CH1", "CH1", "CH1", "CH1", "CH2",
"CH2", "CH2", "CH3", "CH3", "CH3", "CH3"), ITEM_ID = c(110, 111,
110, 111, 110, 110, 111, 112, 114, 113, 110, 112), START = structure(c(1514791800,
1514799000, 1514806200, 1514813400, 1514820600, 1518602400, 1518609600,
1518616800.005, 1517560200, 1517565600, 1517570999.995, 1517576399.995
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), END = structure(c(1514795400,
1514802600, 1514809800.005, 1514817000.01, 1514824200.015, 1518604200,
1518611400, 1518618600, 1517563800, 1517569200, 1517574600, 1517580000
), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-12L), class = c("data.table", "data.frame"))
return:
CH ITEM_ID START END
1: CH1 110 2018-01-01 07:30:00 2018-01-01 08:30:00
2: CH1 111 2018-01-01 09:30:00 2018-01-01 10:30:00
3: CH1 110 2018-01-01 11:30:00 2018-01-01 12:30:00
4: CH1 111 2018-01-01 13:30:00 2018-01-01 14:30:00
5: CH1 110 2018-01-01 15:30:00 2018-01-01 16:30:00
6: CH2 110 2018-02-14 10:00:00 2018-02-14 10:30:00
7: CH2 111 2018-02-14 12:00:00 2018-02-14 12:30:00
8: CH2 112 2018-02-14 14:00:00 2018-02-14 14:30:00
9: CH3 114 2018-02-02 08:30:00 2018-02-02 09:30:00
10: CH3 113 2018-02-02 10:00:00 2018-02-02 11:00:00
11: CH3 110 2018-02-02 11:29:59 2018-02-02 12:30:00
12: CH3 112 2018-02-02 12:59:59 2018-02-02 14:00:00
Also, I have the sale transaction table, which collects the data when customers purchase products:
dt_sale <- structure(list(CUST_ID = c("A001", "A001", "A001", "A002", "A002",
"A003"), CH = c("CH1", "CH3", "CH2", "CH2", "CH3", "CH1"), ORDER_TIME = structure(c(1514793600,
1514813400, 1518619200, 1514816100, 1517565600, 1514803200), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), ITEM_ID = c(110, 110, 112, 112, 114,
111)), row.names = c(NA, -6L), class = c("data.table", "data.frame"
))
return:
CUST_ID CH ORDER_TIME ITEM_ID
1: A001 CH1 2018-01-01 08:00:00 110
2: A001 CH3 2018-01-01 13:30:00 110
3: A001 CH2 2018-02-14 14:40:00 112
4: A002 CH2 2018-01-01 14:15:00 112
5: A002 CH3 2018-02-02 10:00:00 114
6: A003 CH1 2018-01-01 10:40:00 111
The output that I expected:
CUST_ID CH ORDER_TIME ITEM_ID START END
1: A001 CH1 2018-01-01 08:00:00 110 2018-01-01 07:30:00 2018-01-01 08:30:00
2: A001 CH3 2018-01-01 13:30:00 110 <NA> <NA>
3: A001 CH2 2018-02-14 14:40:00 112 2018-02-14 14:00:00 2018-02-14 14:30:00
4: A002 CH2 2018-01-01 14:15:00 112 <NA> <NA>
5: A002 CH3 2018-02-02 10:00:00 114 2018-02-02 08:30:00 2018-02-02 09:30:00
6: A003 CH1 2018-01-01 10:40:00 111 2018-01-01 09:30:00 2018-01-01 10:30:00
Could you please give suggestions?

The output as shown in the question doesn't match the description in the beginning of the question. Rows 2 and 4 should not have values for START & END.
A possible solution using a double join:
dt_sale[dt_program
, on = .(CH, ITEM_ID, ORDER_TIME > START, ORDER_TIME < END)
, `:=` (START = i.START, END = i.END)
][dt_program
, on = .(CH, ITEM_ID, ORDER_TIME > END)
, `:=` (START = i.START, END = i.END)][]
which gives:
> dt_sale
CUST_ID CH ORDER_TIME ITEM_ID START END
1: A001 CH1 2018-01-01 08:00:00 110 2018-01-01 07:30:00 2018-01-01 08:30:00
2: A001 CH3 2018-01-01 13:30:00 110 <NA> <NA>
3: A001 CH2 2018-02-14 14:40:00 112 2018-02-14 14:00:00 2018-02-14 14:30:00
4: A002 CH2 2018-01-01 14:15:00 112 <NA> <NA>
5: A002 CH3 2018-02-02 10:00:00 114 2018-02-02 08:30:00 2018-02-02 09:30:00
6: A003 CH1 2018-01-01 10:40:00 111 2018-01-01 09:30:00 2018-01-01 10:30:00

Related

Duplicating and modifying rows based on datetime

I have got have a data.table that looks like this
library(dplyr)
library(data.table)
dt <- data.table(ID=c("A001","A002","A003","A004"),start_time=c('2019-06-18 05:18:00','2020-03-04 05:59:00',
'2019-05-10 19:00:00','2020-01-06 22:42:00'),end_time=c('2019-06-18 08:41:00','2020-03-04 06:04:00',
'2019-05-10 19:08:00','2020-01-07 03:10:00'))
ID start_time end_time duration
1: A001 2019-06-18 05:18:00 2019-06-18 08:41:00 203 mins
2: A002 2020-03-04 05:59:00 2020-03-04 06:04:00 5 mins
3: A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8 mins
4: A004 2020-01-06 22:42:00 2020-01-07 03:10:00 268 mins
Duration was simply calculated as
dt$start_time <- as.POSIXct(dt$start_time, tz='UTC')
dt$end_time <- as.POSIXct(dt$end_time, tz='UTC')
dt <- dt %>% mutate(duration = (end_time-start_time))
I need to duplicate rows where duration is larger than the end of the hour from start_time (records that cover > 1 hour). For them I need to change the start time (beginning of the hour), the end time (end of the hour, OR the original end time if it's the last row, i.e. the last viewing hour), and the duration accordingly, so that the final output would look like:
dt_expected <- data.table(ID=c("A001","A001","A001","A001","A002","A002","A003","A004","A004","A004","A004","A004","A004"),
start_time=c('2019-06-18 05:18:00','2019-06-18 06:00:00','2019-06-18 07:00:00','2019-06-18 08:00:00', '2020-03-04 05:59:00', '2020-03-04 06:00:00', '2019-05-10 19:00:00',
'2020-01-06 22:42:00', '2020-01-06 23:00:00','2020-01-07 00:00:00','2020-01-07 01:00:00','2020-01-07 02:00:00','2020-01-07 03:00:00'),
end_time=c('2019-06-18 05:59:00','2019-06-18 06:59:00','2019-06-18 07:59:00','2019-06-18 08:41:00','2020-03-04 05:59:00','2020-03-04 06:04:00', '2019-05-10 19:08:00', '2020-01-06 22:59:00','2020-01-06 23:59:00','2020-01-07 00:59:00','2020-01-07 01:59:00', '2020-01-07 02:59:00','2020-01-07 03:10:00'),
duration = c(12,60,60,41,1,4,8,18,60,60,60,60,10))
Note that records for ID A002 should also be duplicated as duration happened in 2 different hours.
ID start_time end_time duration
1: A001 2019-06-18 05:18:00 2019-06-18 05:59:00 12
2: A001 2019-06-18 06:00:00 2019-06-18 06:59:00 60
3: A001 2019-06-18 07:00:00 2019-06-18 07:59:00 60
4: A001 2019-06-18 08:00:00 2019-06-18 08:41:00 41
5: A002 2020-03-04 05:59:00 2020-03-04 05:59:00 1
6: A002 2020-03-04 06:00:00 2020-03-04 06:04:00 4
7: A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8
8: A004 2020-01-06 22:42:00 2020-01-06 22:59:00 18
9: A004 2020-01-06 23:00:00 2020-01-06 23:59:00 60
10: A004 2020-01-07 00:00:00 2020-01-07 00:59:00 60
11: A004 2020-01-07 01:00:00 2020-01-07 01:59:00 60
12: A004 2020-01-07 02:00:00 2020-01-07 02:59:00 60
13: A004 2020-01-07 03:00:00 2020-01-07 03:10:00 10
I think this is pretty close to what you're looking for.
This creates new rows of start and end times, one row for each hour using map from purrr.
Then, for each ID, it will determine start_time and end_time using pmin.
First, for the end_time, it takes the minimum value between that row's end_time and an hour later than the start_time for that row. For example, the first row for A001 would have end_time of 6:00, which is the ceiling_date time for 5:18 to the nearest hour, and less than 6:18 from the sequence generated from map. For the last row for A001, the end_time is 8:41, which is less than the ceiling_date time of 9:00.
The start_time will take the minimum value between the previous row's end_time and that row's start_time. For example, the second row of A001 will have 6:00, which is the row above's end_time and is less than 6:18 from the sequence generated from map.
Note that one row has 0 minutes for duration - the time fell right on the hour (19:00:00). These could be filtered out.
library(purrr)
library(dplyr)
library(tidyr)
library(lubridate)
dt %>%
rowwise() %>%
mutate(start_time = map(start_time, ~seq.POSIXt(., ceiling_date(end_time, "hour"), by = "hour"))) %>%
unnest(start_time) %>%
group_by(ID) %>%
mutate(end_time = pmin(ceiling_date(start_time, unit = "hour"), end_time),
start_time = pmin(floor_date(lag(end_time, default = first(end_time)), unit = "hour"), start_time),
duration = difftime(end_time, start_time, units = "mins"))
Output
ID start_time end_time duration
<chr> <dttm> <dttm> <drtn>
1 A001 2019-06-18 05:18:00 2019-06-18 06:00:00 42 mins
2 A001 2019-06-18 06:00:00 2019-06-18 07:00:00 60 mins
3 A001 2019-06-18 07:00:00 2019-06-18 08:00:00 60 mins
4 A001 2019-06-18 08:00:00 2019-06-18 08:41:00 41 mins
5 A002 2020-03-04 05:59:00 2020-03-04 06:00:00 1 mins
6 A002 2020-03-04 06:00:00 2020-03-04 06:04:00 4 mins
7 A003 2019-05-10 19:00:00 2019-05-10 19:00:00 0 mins
8 A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8 mins
9 A004 2020-01-06 22:42:00 2020-01-06 23:00:00 18 mins
10 A004 2020-01-06 23:00:00 2020-01-07 00:00:00 60 mins
11 A004 2020-01-07 00:00:00 2020-01-07 01:00:00 60 mins
12 A004 2020-01-07 01:00:00 2020-01-07 02:00:00 60 mins
13 A004 2020-01-07 02:00:00 2020-01-07 03:00:00 60 mins
14 A004 2020-01-07 03:00:00 2020-01-07 03:10:00 10 mins

data.table R: instants in which two events occur within a specific time epoch

I need to identify instances when two events occur within a specific time epoch as follows. If event A occurs first, event B must occur within 24 hours. On the other hand, if B occurs first, then A needs to be found within 72 hours. Also, when the criterion is met, I need the "onset" time, which is the time at which the first of these events occurred.
Event A
structure(list(fake_id = c("1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921",
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921",
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414"
), date = structure(c(1515063600, 1514822400, 1514822400, 1514822400,
1514822400, 1515146400, 1514901600, 1515330000, 1514822400, 1514822400,
1514822400, 1514822400, 1517385600, 1516701600, 1515142800, 1515178800,
1515178800, 1516557600), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-18L), class = "data.frame", .Names = c("fake_id",
"date"))
Event B
structure(list(fake_id = c("1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921",
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921",
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414",
"1184695414", "1184695414"), date = structure(c(1516795200, 1516795200,
1516795200, 1516917600, 1517400000, 1517400000, 1515492000, 1515492000,
1516190400, 1516190400, 1517410800, 1517410800, 1516921200, 1515070800,
1515070800, 1515052800, 1516633200, 1517374800, 1515322800, 1515322800,
1516525200, 1515232800, 1516543200, 1516550400, 1515189600, 1516543200,
1516543200, 1515142800, 1515142800, 1515142800), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -30L), class = "data.frame", .Names = c("fake_id",
"date"))
Some code
library (data.table)
event_a <- data.table(event_a[, c("fake_id", "date"), with = FALSE])
event_b <- data.table(event_b[, c("fake_id", "date"), with = FALSE])
event_a[, `:=`("criteria_a", "criteria_a")]
event_b[, `:=`("criteria_b", "criteria_b")]
setkeyv(event_a, c("fake_id", "date"))
setkeyv(event_b, c("fake_id", "date"))
join_window <- 60 * 60 * c(24, 72)
event_subset_a <- event_a[event_b, roll = join_window[1]]
event_subset_b <- event_b[event_a, roll = join_window[2]]
event_df <- rbind(event_subset_a, event_subset_b)
event_df[, `:=`(c("criteria_a", "criteria_b"), NULL)]
setkeyv(event_df, c("fake_id", "date"))
event_df <- unique(event_df)
Current output
fake_id date
1 1184695414 2018-01-05 09:00:00
2 1184695414 2018-01-05 19:00:00
3 1184695414 2018-01-05 22:00:00
4 1184695414 2018-01-21 14:00:00
5 1184695414 2018-01-21 16:00:00
6 1184695414 2018-01-21 18:00:00
7 1184695414 2018-01-23 10:00:00
Desired output
fake_id date
1 1184695414 2018-01-05 09:00:00
2 1184695414 2018-01-21 14:00:00
3 1184695414 2018-01-23 10:00:00
At first I thought this problem needed to be addresed with a non-equi join, but then I realized that a standard join is sufficient.
The overall process would be like this:
Eliminate duplicated rows
Join both tables
Filter those in which condition A appeared first. Mark them as "type A" and establish the onset time.
Filter those in which condition B appeared first. Mark them as "type B", and establish the onset time.
Drop the un-marked rows.
.
library(data.table)
library(lubridate) # we'll use the dhours() function
setDT(eventA, key = "fake_id")
setDT(eventB, key = "fake_id")
Modify the name of columns so it's easier to understand what belongs where
setnames(eventA, "date", "dateA")
setnames(eventB, "date", "dateB")
Eliminate duplicated rows
eventA <- eventA[!duplicated(eventA), ]
eventB <- eventB[!duplicated(eventB), ]
Join both tables and with chaining do steps 2 - 4 of the overall plan
eventA[eventB,
allow.cartesian = TRUE][
dateA < dateB & dateB <= dateA + dhours(24),
`:=` (type = "A",
onset = dateA)][
dateB < dateA & dateA <= dateB + dhours(72),
`:=` (type = "B",
onset = dateB)][!is.na(type), ][]
fake_id dateA dateB type onset
1: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 B 2018-01-04 08:00:00
2: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 B 2018-01-04 08:00:00
3: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 A 2018-01-04 11:00:00
4: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 B 2018-01-04 13:00:00
5: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 B 2018-01-06 10:00:00
6: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 B 2018-01-07 11:00:00
7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 B 2018-01-31 05:00:00
8: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 B 2018-01-05 09:00:00
9: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 A 2018-01-05 09:00:00
10: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 A 2018-01-05 19:00:00
11: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 B 2018-01-21 14:00:00
12: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 B 2018-01-21 14:00:00
13: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 B 2018-01-21 16:00:00
14: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 B 2018-01-21 16:00:00
The output is very different from your expected output, but looking at your data and at the rules you established (if A earlier than B and B within 24 h of A, then A. If B earlier than A and A within 72 h of B, then B) there are 11 additional matches to the ones you found (in other words: either your expected output is wrong, or your established rules are wrong).
This is similar to #PavoDive but focuses on creating the non-equi join criteria before the actual join:
library (data.table)
setDT(event_a)
setDT(event_b)
# for the join - eventB needs to be within -72 to 24 hours
event_a[, `:=`(min_date = date - 72*60*60,
max_date = date + 24*60*60)]
# join unique data.tables
unique(event_b)[unique(event_a),
#non-equi join conditions
on = .(fake_id = fake_id,
date > min_date,
date < max_date),
nomatch = 0L,
allow.cartesian = T,
#select columns - you would only include fake_id and onset for desired output
j = .(fake_id,
a_date = i.date,
b_date = x.date,
onset = pmin(i.date, x.date),
first_type = ifelse(i.date == x.date,
NA_character_,
ifelse(i.date < x.date,
'A',
'B'))
)
]
fake_id a_date b_date onset first_type
1: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 2018-01-04 11:00:00 A
2: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
3: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 2018-01-04 13:00:00 B
4: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
5: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 2018-01-07 11:00:00 B
6: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 2018-01-06 10:00:00 B
7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 2018-01-31 05:00:00 B
8: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
9: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B
10: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 2018-01-05 09:00:00 A
11: 1184695414 2018-01-05 09:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 <NA>
12: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 2018-01-05 19:00:00 A
13: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 B
14: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
15: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B
The difference in output is that in row 11, the start times are the same. My join criteria don't capture this because data.table non-equi joins currently don't support a not-equal (!=) condition.

Fill NA by last observed value from other column, modified by adding some constant

I have start of some process, end of it and process duration.
process_start process_end hourly_process_duration
2019-01-01 00:00:00 2019-01-01 12:00:00 12
2019-01-01 12:00:00 2019-01-01 13:00:00 1
NA NA 11
NA NA 15
2019-01-02 15:00:00 2019-01-02 18:00:00 3
I always have hourly_process_duration. Processes are continuous - when one process ends the next one begins.
I need to replace NA correctly. Like in the example:
process_start process_end hourly_process_duration
2019-01-01 00:00:00 2019-01-01 12:00:00 12
2019-01-01 12:00:00 2019-01-01 13:00:00 1
2019-01-01 13:00:00 2019-01-02 00:00:00 11
2019-01-02 00:00:00 2019-01-02 15:00:00 15
2019-01-02 15:00:00 2019-01-02 18:00:00 3
Here is one option to fill the missing date time
library(dplyr)
library(lubridate)
df1 %>%
mutate(process_start = coalesce(process_start, lag(process_end)),
process_end = coalesce(process_end, lead(process_start))) %>%
mutate_at(vars(process_start, process_end), ymd_hms) %>%
mutate_at(vars(process_start, process_end),
list(~ replace(., is.na(.), floor_date(.[which(is.na(.))+1], "day"))))
# process_start process_end hourly_process_duration
#1 2019-01-01 00:00:00 2019-01-01 12:00:00 12
#2 2019-01-01 12:00:00 2019-01-01 13:00:00 1
#3 2019-01-01 13:00:00 2019-01-02 00:00:00 11
#4 2019-01-02 00:00:00 2019-01-02 15:00:00 15
#5 2019-01-02 15:00:00 2019-01-02 18:00:00 3
data
df1 <- structure(list(process_start = c("2019-01-01 00:00:00",
"2019-01-01 12:00:00",
NA, NA, "2019-01-02 15:00:00"), process_end = c("2019-01-01 12:00:00",
"2019-01-01 13:00:00", NA, NA, "2019-01-02 18:00:00"),
hourly_process_duration = c(12L,
1L, 11L, 15L, 3L)), class = "data.frame", row.names = c(NA, -5L
))

R: calculate number of occurrences which have started but not ended - count if within a datetime range

I've got a dataset with the following shape
ID Start Time End Time
1 01/01/2017 00:15:00 01/01/2017 07:15:00
2 01/01/2017 04:45:00 01/01/2017 06:15:00
3 01/01/2017 10:20:00 01/01/2017 20:15:00
4 01/01/2017 02:15:00 01/01/2017 00:15:00
5 02/01/2017 15:15:00 03/01/2017 00:30:00
6 03/01/2017 07:00:00 04/01/2017 09:15:00
I would like to count every 15 min for an entire year how many items have started but not finished, so count the number of rows with a start time less than or equal to the time I'm looking at and an end time greater than the time I'm looking at.
I'm looking for an approach using tidyverse/dplyr if possible.
Any help or guidance would be very much appreciated.
If I understand correctly, the OP wants to count the number of simultaneously active events.
One possibility to tackle this question is the coverage() function from Bioconductor's IRanges package. Another one is to aggregate in a non-equi join which is available with the data.table package.
Non-equi join
# create sequence of datetimes (limited to 4 days for demonstration)
seq15 <- seq(lubridate::as_datetime("2017-01-01"),
lubridate::as_datetime("2017-01-05"), by = "15 mins")
# aggregate within a non-equi join
library(data.table)
result <- periods[.(time = seq15), on = .(Start.Time <= time, End.Time > time),
.(time, count = sum(!is.na(ID))), by = .EACHI][, .(time, count)]
result
time count
1: 2017-01-01 00:00:00 0
2: 2017-01-01 00:15:00 1
3: 2017-01-01 00:30:00 1
4: 2017-01-01 00:45:00 1
5: 2017-01-01 01:00:00 1
---
381: 2017-01-04 23:00:00 0
382: 2017-01-04 23:15:00 0
383: 2017-01-04 23:30:00 0
384: 2017-01-04 23:45:00 0
385: 2017-01-05 00:00:00 0
The result can be visualized graphically:
library(ggplot2)
ggplot(result) + aes(time, count) + geom_step()
Data
periods <- readr::read_table(
"ID Start.Time End.Time
1 01/01/2017 00:15:00 01/01/2017 07:15:00
2 01/01/2017 04:45:00 01/01/2017 06:15:00
3 01/01/2017 10:20:00 01/01/2017 20:15:00
4 01/01/2017 02:15:00 01/01/2017 00:15:00
5 02/01/2017 15:15:00 03/01/2017 00:30:00
6 03/01/2017 07:00:00 04/01/2017 09:15:00"
)
# convert date strings to class Date
library(data.table)
cols <- names(periods)[names(periods) %like% "Time$"]
setDT(periods)[, (cols) := lapply(.SD, lubridate::dmy_hms), .SDcols = cols]
periods
ID Start.Time End.Time
1: 1 2017-01-01 00:15:00 2017-01-01 07:15:00
2: 2 2017-01-01 04:45:00 2017-01-01 06:15:00
3: 3 2017-01-01 10:20:00 2017-01-01 20:15:00
4: 4 2017-01-01 02:15:00 2017-01-01 00:15:00
5: 5 2017-01-02 15:15:00 2017-01-03 00:30:00
6: 6 2017-01-03 07:00:00 2017-01-04 09:15:00

Conditional (inequality) join in data.table

I'm just trying to figure out how to do a conditional join on two data.tables.
I've written a sqldf conditional join to give me the circuits whose start or finish times are within the other's start/finish times.
sqldf("select dt2.start, dt2.finish, dt2.counts, dt1.id, dt1.circuit
from dt2
left join dt1 on (
(dt2.start >= dt1.start and dt2.start < dt1.finish) or
(dt2.finish >= dt1.start and dt2.finish < dt1.finish)
)")
This gives me the correct result, but it's too slow for my large-ish data set.
What's the data.table way to do this without a vector scan?
Here's my data:
dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L,
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200,
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000,
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""),
end = structure(c(1393626600, 1393631519, 1393639200, 1393632000,
1393660500, 1393673400, 1393667999, 1393671600, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001",
"1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
), class = "factor")), .Names = c("circuit", "start", "end",
"id"), class = "data.frame", row.names = c(NA, -9L)))
dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L,
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L,
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L,
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end",
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame"))
Using non-equi joins:
ans = dt1[dt2, on=.(start <= end, end > start),
.(i.start, i.end, counts, id, circuit, cndn = i.start < x.start & i.end >= x.end),
allow.cartesian=TRUE
][!cndn %in% TRUE]
The condition start <= end, end >= start (note the >= in both cases) would check if two intervals overlap by any means. The open interval on one side is accomplished by the end > start part (> instead of >=). But still it also picks up the intervals of type:
dt1: start=================end
dt2: start--------------------------------end ## start < start, end > end
and
dt1: start=================end
dt2: start----------end ## end == end
The cndn column is to check and remove these cases. Hopefully, those cases aren't a lot so that we don't materialise unwanted rows unnecessarily.
PS: the solution in this case is still not as straightforward as I'd like, and that's because the solution requires an OR operation. It is possible to do two conditional joins, and then bind them together though.
Perhaps at some point, we'll have to think about the feasibility of extending joins to these kinds of operations in a more straightforward manner.
No idea if this performs faster, but here's a shot at a data table method. I reshape dt1 and use findInterval to identify where the times in dt2 line up with times in dt1.
dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L,
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200,
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000,
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""),
end = structure(c(1393626600, 1393631519, 1393639200, 1393632000,
1393660500, 1393673400, 1393667999, 1393671600, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001",
"1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
), class = "factor")), .Names = c("circuit", "start", "end",
"id"), class = "data.frame", row.names = c(NA, -9L)))
dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L,
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L,
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L,
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end",
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame"))
# > dt1
# circuit start end id
# 1: b 2014-02-28 16:00:00 2014-02-28 17:30:00 1001
# 2: a 2014-02-28 17:52:00 2014-02-28 18:51:59 1002
# 3: b 2014-02-28 18:00:00 2014-02-28 21:00:00 1003
# 4: a 2014-02-28 18:52:00 2014-02-28 19:00:00 1004
# 5: b 2014-03-01 00:05:00 2014-03-01 02:55:00 1005
# 6: c 2014-02-28 23:00:00 2014-03-01 06:30:00 1006
# 7: a 2014-03-01 01:40:00 2014-03-01 04:59:59 1007
# 8: a 2014-03-01 05:00:00 2014-03-01 06:00:00 1008
# 9: b 2014-03-01 04:30:00 2014-03-01 07:30:00 1009
# > dt2
# start end seconds counts
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59 3599 1
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00 1800 1
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59 1319 0
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59 480 1
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00 3601 2
# 6: 2014-02-28 19:00:00 2014-02-28 21:00:00 7200 1
# 7: 2014-02-28 21:00:00 2014-02-28 22:59:59 7199 0
# 8: 2014-02-28 22:59:59 2014-03-01 00:04:59 3900 1
# 9: 2014-03-01 00:04:59 2014-03-01 01:39:59 5700 2
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00 4501 3
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59 5699 2
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00 5401 3
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00 1800 2
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00 3600 1
## reshapes dt1 from wide to long
## puts start and end times into one column and sorts by time
## this is so that you can use findInterval later
dt3 <- dt1[,list(time = c(start,end)), by = "circuit,id"][order(time)]
dt3[,ntvl := seq_len(nrow(dt3))]
# circuit id time ntvl
# 1: b 1001 2014-02-28 16:00:00 1
# 2: b 1001 2014-02-28 17:30:00 2
# 3: a 1002 2014-02-28 17:52:00 3
# 4: b 1003 2014-02-28 18:00:00 4
# 5: a 1002 2014-02-28 18:51:59 5
# 6: a 1004 2014-02-28 18:52:00 6
# 7: a 1004 2014-02-28 19:00:00 7
# 8: b 1003 2014-02-28 21:00:00 8
# 9: c 1006 2014-02-28 23:00:00 9
# 10: b 1005 2014-03-01 00:05:00 10
# 11: a 1007 2014-03-01 01:40:00 11
# 12: b 1005 2014-03-01 02:55:00 12
# 13: b 1009 2014-03-01 04:30:00 13
# 14: a 1007 2014-03-01 04:59:59 14
# 15: a 1008 2014-03-01 05:00:00 15
# 16: a 1008 2014-03-01 06:00:00 16
# 17: c 1006 2014-03-01 06:30:00 17
# 18: b 1009 2014-03-01 07:30:00 18
## map interval to id
dt4 <- dt3[,list(ntvl = seq(from = min(ntvl), to = max(ntvl)-1), by = 1),by = "circuit,id"]
setkey(dt4, ntvl)
# circuit id ntvl
# 1: b 1001 1
# 2: a 1002 3
# 3: a 1002 4
# 4: b 1003 4
# 5: b 1003 5
# 6: b 1003 6
# 7: a 1004 6
# 8: b 1003 7
# 9: c 1006 9
# 10: c 1006 10
# 11: b 1005 10
# 12: c 1006 11
# 13: b 1005 11
# 14: a 1007 11
# 15: c 1006 12
# 16: a 1007 12
# 17: c 1006 13
# 18: a 1007 13
# 19: b 1009 13
# 20: c 1006 14
# 21: b 1009 14
# 22: c 1006 15
# 23: b 1009 15
# 24: a 1008 15
# 25: c 1006 16
# 26: b 1009 16
# 27: b 1009 17
# circuit id ntvl
## finds intervals in dt2
dt2[,`:=`(ntvl_start = findInterval(start, dt3[["time"]], rightmost.closed = FALSE),
ntvl_end = findInterval(end, dt3[["time"]], rightmost.closed = FALSE))]
# start end seconds counts ntvl_start ntvl_end
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59 3599 1 1 1
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00 1800 1 1 2
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59 1319 0 2 2
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59 480 1 2 3
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00 3601 2 3 7
# 6: 2014-02-28 19:00:00 2014-02-28 21:00:00 7200 1 7 8
# 7: 2014-02-28 21:00:00 2014-02-28 22:59:59 7199 0 8 8
# 8: 2014-02-28 22:59:59 2014-03-01 00:04:59 3900 1 8 9
# 9: 2014-03-01 00:04:59 2014-03-01 01:39:59 5700 2 9 10
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00 4501 3 10 12
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59 5699 2 12 12
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00 5401 3 12 16
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00 1800 2 16 17
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00 3600 1 17 18
## joins, by start time, then by end time
## the commented out lines may be a better alternative
## if there are many NA values
setkey(dt2, ntvl_start)
dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA]
# dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0]
# dt_ans_start_na <- dt2[!dt4]
setkey(dt2, ntvl_end)
dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA]
# dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0]
# dt_ans_end_na <- dt2[!dt4]
## bring them all together and remove duplicates
dt_ans <- unique(rbind(dt_ans_start, dt_ans_end), by = c("start", "id"))
dt_ans <- dt_ans[!(is.na(id) & counts > 0)]
dt_ans[,ntvl := NULL]
setkey(dt_ans,start)
# start end counts id circuit
# 1: 2014-02-28 16:00:00 2014-02-28 16:59:59 1 1001 b
# 2: 2014-02-28 17:00:00 2014-02-28 17:30:00 1 1001 b
# 3: 2014-02-28 17:30:00 2014-02-28 17:51:59 0 NA NA
# 4: 2014-02-28 17:51:59 2014-02-28 17:59:59 1 1002 a
# 5: 2014-02-28 17:59:59 2014-02-28 19:00:00 2 1002 a
# 6: 2014-02-28 17:59:59 2014-02-28 19:00:00 2 1003 b
# 7: 2014-02-28 19:00:00 2014-02-28 21:00:00 1 1003 b
# 8: 2014-02-28 21:00:00 2014-02-28 22:59:59 0 NA NA
# 9: 2014-02-28 22:59:59 2014-03-01 00:04:59 1 1006 c
# 10: 2014-03-01 00:04:59 2014-03-01 01:39:59 2 1006 c
# 11: 2014-03-01 00:04:59 2014-03-01 01:39:59 2 1005 b
# 12: 2014-03-01 01:39:59 2014-03-01 02:55:00 3 1006 c
# 13: 2014-03-01 01:39:59 2014-03-01 02:55:00 3 1005 b
# 14: 2014-03-01 01:39:59 2014-03-01 02:55:00 3 1007 a
# 15: 2014-03-01 02:55:00 2014-03-01 04:29:59 2 1006 c
# 16: 2014-03-01 02:55:00 2014-03-01 04:29:59 2 1007 a
# 17: 2014-03-01 04:29:59 2014-03-01 06:00:00 3 1006 c
# 18: 2014-03-01 04:29:59 2014-03-01 06:00:00 3 1007 a
# 19: 2014-03-01 04:29:59 2014-03-01 06:00:00 3 1009 b
# 20: 2014-03-01 06:00:00 2014-03-01 06:30:00 2 1006 c
# 21: 2014-03-01 06:00:00 2014-03-01 06:30:00 2 1009 b
# 22: 2014-03-01 06:30:00 2014-03-01 07:30:00 1 1009 b
# start end counts id circuit

Resources