How to get equally spaced intervals when counting factors?

I am having some difficulty creating time intervals with 30-minute breaks that begin either on the full hour (:00) or on the half hour (:30):
For instance:
library(reshape2)
library(dplyr)
# Given some data which resemble the original data
foo <- data.frame(
  start.time = c("2012-02-01 13:47:00",
                 "2012-02-01 14:02:00",
                 "2012-02-01 14:20:00",
                 "2012-02-01 14:40:00",
                 "2012-02-01 15:08:00",
                 "2012-02-01 16:01:00",
                 "2012-02-01 16:02:00",
                 "2012-02-01 16:20:00",
                 "2012-02-01 17:09:00",
                 "2012-02-01 18:08:00",
                 "2012-02-01 18:20:00",
                 "2012-02-01 19:08:00"),
  employee = c("mike", "john", "john", "steven", "mike", "mike",
               "mike", "steven", "mike", "steven", "mike", "mike"))
# start.time employee
#1 2012-02-01 13:47:00 mike
#2 2012-02-01 14:02:00 john
#3 2012-02-01 14:20:00 john
#4 2012-02-01 14:40:00 steven
#5 2012-02-01 15:08:00 mike
#6 2012-02-01 16:01:00 mike
#7 2012-02-01 16:02:00 mike
#8 2012-02-01 16:20:00 steven
#9 2012-02-01 17:09:00 mike
#10 2012-02-01 18:08:00 steven
#11 2012-02-01 18:20:00 mike
#12 2012-02-01 19:08:00 mike
# change factor to POSIXct
foo$start.time <- as.POSIXct(foo$start.time)
# long to wide
my_emp <- dcast(foo, start.time ~ employee, fun.aggregate = length)
# 30 min breaks
my_emp_ag <- my_emp %>%
  group_by(start.time = as.POSIXct(cut(start.time, breaks = "30 min"))) %>%
  summarize(john = sum(john), mike = sum(mike), steven = sum(steven))
# missing intervals
miss_interval <- data.frame(
  start.time = seq(from = min(as.POSIXct(my_emp$start.time)),
                   to = max(as.POSIXct(my_emp$start.time)),
                   by = "30 mins"))
# join old with new
substituted <- left_join(miss_interval, my_emp_ag, by = "start.time")
# change NA to zero
substituted[is.na(substituted)] <- 0
start.time john mike steven
1 2012-02-01 13:47:00 1 1 0
2 2012-02-01 14:17:00 1 0 1
3 2012-02-01 14:47:00 0 1 0
4 2012-02-01 15:17:00 0 0 0
5 2012-02-01 15:47:00 0 2 0
6 2012-02-01 16:17:00 0 0 1
7 2012-02-01 16:47:00 0 1 0
8 2012-02-01 17:17:00 0 0 0
9 2012-02-01 17:47:00 0 0 1
10 2012-02-01 18:17:00 0 1 0
11 2012-02-01 18:47:00 0 1 0
which is almost what I want, except that the breaks should fall on the half hours: 2012-02-01 13:30:00, 2012-02-01 14:00:00, and so on.

You can round each start time down to its half-hour interval with lubridate::floor_date(), which anchors the breaks at :00 and :30:
library(data.table)
library(lubridate)
# round every start time down to the nearest 30 minutes
setDT(foo)[, round.time := floor_date(ymd_hms(start.time), "30 minutes")]
start.time employee round.time
1: 2012-02-01 13:47:00 mike 2012-02-01 13:30:00
2: 2012-02-01 14:02:00 john 2012-02-01 14:00:00
3: 2012-02-01 14:20:00 john 2012-02-01 14:00:00
4: 2012-02-01 14:40:00 steven 2012-02-01 14:30:00
5: 2012-02-01 15:08:00 mike 2012-02-01 15:00:00
6: 2012-02-01 16:01:00 mike 2012-02-01 16:00:00
7: 2012-02-01 16:02:00 mike 2012-02-01 16:00:00
8: 2012-02-01 16:20:00 steven 2012-02-01 16:00:00
9: 2012-02-01 17:09:00 mike 2012-02-01 17:00:00
10: 2012-02-01 18:08:00 steven 2012-02-01 18:00:00
11: 2012-02-01 18:20:00 mike 2012-02-01 18:00:00
12: 2012-02-01 19:08:00 mike 2012-02-01 19:00:00
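Building on this, one possible way to get the final table with the intervals anchored at :00/:30 is a sketch reusing the round.time column created above; counts, grid, and result are names I made up, and setnafill() assumes data.table >= 1.12.4:
# one column per employee, counting events per half hour
counts <- dcast(foo, round.time ~ employee,
                fun.aggregate = length, value.var = "employee")
# full grid of half-hour slots so that empty intervals also appear
grid <- data.table(round.time = seq(min(foo$round.time),
                                    max(foo$round.time), by = "30 mins"))
result <- counts[grid, on = "round.time"]  # right join onto the grid
setnafill(result, fill = 0, cols = c("john", "mike", "steven"))  # NA -> 0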

Related

Duplicating and modifying rows based on datetime

I have a data.table that looks like this
library(dplyr)
library(data.table)
dt <- data.table(ID = c("A001", "A002", "A003", "A004"),
                 start_time = c('2019-06-18 05:18:00', '2020-03-04 05:59:00',
                                '2019-05-10 19:00:00', '2020-01-06 22:42:00'),
                 end_time = c('2019-06-18 08:41:00', '2020-03-04 06:04:00',
                              '2019-05-10 19:08:00', '2020-01-07 03:10:00'))
ID start_time end_time duration
1: A001 2019-06-18 05:18:00 2019-06-18 08:41:00 203 mins
2: A002 2020-03-04 05:59:00 2020-03-04 06:04:00 5 mins
3: A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8 mins
4: A004 2020-01-06 22:42:00 2020-01-07 03:10:00 268 mins
Duration was simply calculated as
dt$start_time <- as.POSIXct(dt$start_time, tz='UTC')
dt$end_time <- as.POSIXct(dt$end_time, tz='UTC')
dt <- dt %>% mutate(duration = (end_time-start_time))
I need to duplicate the rows where the duration runs past the end of the hour of start_time (records that cover more than one clock hour). For those rows, start_time should become the beginning of each hour, end_time the end of that hour (or the original end_time if it's the last row, i.e. the last viewing hour), with duration adjusted accordingly, so that the final output would look like:
dt_expected <- data.table(ID=c("A001","A001","A001","A001","A002","A002","A003","A004","A004","A004","A004","A004","A004"),
start_time=c('2019-06-18 05:18:00','2019-06-18 06:00:00','2019-06-18 07:00:00','2019-06-18 08:00:00', '2020-03-04 05:59:00', '2020-03-04 06:00:00', '2019-05-10 19:00:00',
'2020-01-06 22:42:00', '2020-01-06 23:00:00','2020-01-07 00:00:00','2020-01-07 01:00:00','2020-01-07 02:00:00','2020-01-07 03:00:00'),
end_time=c('2019-06-18 05:59:00','2019-06-18 06:59:00','2019-06-18 07:59:00','2019-06-18 08:41:00','2020-03-04 05:59:00','2020-03-04 06:04:00', '2019-05-10 19:08:00', '2020-01-06 22:59:00','2020-01-06 23:59:00','2020-01-07 00:59:00','2020-01-07 01:59:00', '2020-01-07 02:59:00','2020-01-07 03:10:00'),
duration = c(12,60,60,41,1,4,8,18,60,60,60,60,10))
Note that the records for ID A002 should also be duplicated, as the duration spans 2 different hours.
ID start_time end_time duration
1: A001 2019-06-18 05:18:00 2019-06-18 05:59:00 12
2: A001 2019-06-18 06:00:00 2019-06-18 06:59:00 60
3: A001 2019-06-18 07:00:00 2019-06-18 07:59:00 60
4: A001 2019-06-18 08:00:00 2019-06-18 08:41:00 41
5: A002 2020-03-04 05:59:00 2020-03-04 05:59:00 1
6: A002 2020-03-04 06:00:00 2020-03-04 06:04:00 4
7: A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8
8: A004 2020-01-06 22:42:00 2020-01-06 22:59:00 18
9: A004 2020-01-06 23:00:00 2020-01-06 23:59:00 60
10: A004 2020-01-07 00:00:00 2020-01-07 00:59:00 60
11: A004 2020-01-07 01:00:00 2020-01-07 01:59:00 60
12: A004 2020-01-07 02:00:00 2020-01-07 02:59:00 60
13: A004 2020-01-07 03:00:00 2020-01-07 03:10:00 10
I think this is pretty close to what you're looking for.
This creates new rows of start and end times, one row for each hour using map from purrr.
Then, for each ID, it will determine start_time and end_time using pmin.
First, for end_time, it takes the minimum of that row's original end_time and that row's start_time rounded up to the next hour. For example, the first row for A001 gets an end_time of 6:00, the ceiling_date of 5:18 to the nearest hour, which is less than the 6:18 generated by the sequence from map. For the last row of A001, the end_time stays 8:41, which is less than the ceiling_date value of 9:00.
The start_time will take the minimum of the previous row's end_time (floored to the hour) and that row's start_time. For example, the second row of A001 gets 6:00, the row above's end_time, which is less than the 6:18 generated by the sequence from map.
Note that one row has 0 minutes for duration - the time fell right on the hour (19:00:00). These could be filtered out, as shown after the output below.
library(purrr)
library(dplyr)
library(tidyr)
library(lubridate)
dt %>%
rowwise() %>%
mutate(start_time = map(start_time, ~seq.POSIXt(., ceiling_date(end_time, "hour"), by = "hour"))) %>%
unnest(start_time) %>%
group_by(ID) %>%
mutate(end_time = pmin(ceiling_date(start_time, unit = "hour"), end_time),
start_time = pmin(floor_date(lag(end_time, default = first(end_time)), unit = "hour"), start_time),
duration = difftime(end_time, start_time, units = "mins"))
Output
ID start_time end_time duration
<chr> <dttm> <dttm> <drtn>
1 A001 2019-06-18 05:18:00 2019-06-18 06:00:00 42 mins
2 A001 2019-06-18 06:00:00 2019-06-18 07:00:00 60 mins
3 A001 2019-06-18 07:00:00 2019-06-18 08:00:00 60 mins
4 A001 2019-06-18 08:00:00 2019-06-18 08:41:00 41 mins
5 A002 2020-03-04 05:59:00 2020-03-04 06:00:00 1 mins
6 A002 2020-03-04 06:00:00 2020-03-04 06:04:00 4 mins
7 A003 2019-05-10 19:00:00 2019-05-10 19:00:00 0 mins
8 A003 2019-05-10 19:00:00 2019-05-10 19:08:00 8 mins
9 A004 2020-01-06 22:42:00 2020-01-06 23:00:00 18 mins
10 A004 2020-01-06 23:00:00 2020-01-07 00:00:00 60 mins
11 A004 2020-01-07 00:00:00 2020-01-07 01:00:00 60 mins
12 A004 2020-01-07 01:00:00 2020-01-07 02:00:00 60 mins
13 A004 2020-01-07 02:00:00 2020-01-07 03:00:00 60 mins
14 A004 2020-01-07 03:00:00 2020-01-07 03:10:00 10 mins
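As noted above, the zero-duration row for A003 (row 7, where the timestamp fell exactly on the hour) can be dropped by appending one more step to the pipeline:
# duration is a difftime, so it compares directly against 0
filter(duration > 0)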

igraph, POSIX, and data.table

In an earlier question, I learned that graphs are useful to collapse these data
require(data.table)
set.seed(333)
t <- data.table(old=1002:2001, dif=sample(1:10,1000, replace=TRUE))
t$new <- t$old + t$dif; t$foo <- rnorm(1000); t$dif <- NULL
> head(t)
old new foo
1: 1002 1007 -0.7889534
2: 1003 1004 0.3901869
3: 1004 1014 0.7907947
4: 1005 1011 2.0964612
5: 1006 1007 1.1834171
6: 1007 1015 1.1397910
to obtain only those rows that chain together, i.e. where old[i+1] = new[i]. The result could then be joined into a table with users who each have their own starting points
i <- data.table(id=1:3, start=sample(1000:1990,3))
> i
id start
1: 1 1002
2: 2 1744
3: 3 1656
Specifically, when only the first n=3 steps are calculated, the solution was
> library(igraph)
> i[, t[old %in% subcomponent(g, start, "out")[1:n]], by=.(id)]
id old new foo
1: 1 1002 1007 -0.7889534
2: 1 1007 1015 1.1397910
3: 1 1015 1022 -1.2193666
4: 2 1744 1750 -0.1368320
5: 2 1750 1758 0.3331686
6: 2 1758 1763 1.3040357
7: 3 1656 1659 -0.1556208
8: 3 1659 1663 0.1663042
9: 3 1663 1669 0.3781835
When implementing this with the same setup, except that new, old, and start are of class POSIXct,
set.seed(333)
u <- data.table(old=seq(from=as.POSIXct("2013-01-01"),
to=as.POSIXct("2013-01-02"), by="15 mins"),
dif=as.difftime(sample(seq(15,120,15),97,replace=TRUE),units="mins"))
u$new <- u$old + u$dif; u$foo <- rnorm(97); u$dif <- NULL
j <- data.table(id=1:3, start=sample(seq(from=as.POSIXct("2013-01-01"),
to=as.POSIXct("2013-01-01 22:00:00"), by="15 mins"),3))
> head(u)
old new foo
1: 2013-01-01 00:00:00 2013-01-01 01:00:00 -1.5434407
2: 2013-01-01 00:15:00 2013-01-01 00:30:00 -0.2753971
3: 2013-01-01 00:30:00 2013-01-01 02:30:00 -1.5986916
4: 2013-01-01 00:45:00 2013-01-01 02:00:00 -0.6288528
5: 2013-01-01 01:00:00 2013-01-01 01:15:00 -0.8967041
6: 2013-01-01 01:15:00 2013-01-01 02:45:00 -1.2145590
> j
id start
1: 1 2013-01-01 22:00:00
2: 2 2013-01-01 21:00:00
3: 3 2013-01-01 13:30:00
the command
> j[, u[old %in% subcomponent(h, V(h)$name %in% as.character(start), "out")[1:n]], by=.(id)]
Empty data.table (0 rows and 4 cols): id,old,new,foo
returns an empty data.table, which appears to be due to the inner part u[...]. I do not quite see where the problem is in this case and wonder whether anyone spots a mistake.
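One possible explanation (a guess, since the construction of h is not shown): subcomponent() returns a vertex sequence, so old %in% subcomponent(...) compares POSIXct values against vertex ids and can never match, whereas with the integer data the comparison happened to line up with the vertex names. A sketch of a fix, assuming h is built with graph_from_data_frame(u) so that the vertices are named by the character form of the timestamps:
library(igraph)
h <- graph_from_data_frame(u)  # assumption: vertex names are as.character(old/new)
n <- 3
j[, u[as.character(old) %in%
        names(subcomponent(h, as.character(start), "out")[1:n])],
  by = .(id)]
# both sides use as.character(), so the names stay consistent even at
# midnight, where as.character() drops the " 00:00:00" part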

Summarize data within multiple groups of a time series

I have a series of observations of birds at different locations and times. The data frame looks like this:
birdID site ts
1 A 2013-04-15 09:29
1 A 2013-04-19 01:22
1 A 2013-04-20 23:13
1 A 2013-04-22 00:03
1 B 2013-04-22 14:02
1 B 2013-04-22 17:02
1 C 2013-04-22 14:04
1 C 2013-04-22 15:18
1 C 2013-04-23 00:54
1 A 2013-04-23 01:20
1 A 2013-04-24 23:07
1 A 2013-04-30 23:47
1 B 2013-04-30 03:51
1 B 2013-04-30 04:26
2 C 2013-04-30 04:29
2 C 2013-04-30 18:49
2 A 2013-05-01 01:03
2 A 2013-05-01 23:15
2 A 2013-05-02 00:09
2 C 2013-05-03 07:57
2 C 2013-05-04 07:21
2 C 2013-05-05 02:54
2 A 2013-05-05 03:27
2 A 2013-05-14 00:16
2 D 2013-05-14 10:00
2 D 2013-05-14 15:00
I would like to summarize the data in a way that shows the first and last detection of each bird at each site, and the duration at each site, while preserving information about multiple visits to sites (i.e. if a bird went from site A > B > C > A > B, I would like show each visit to site A and B independently, not lump both visits together).
I am hoping to produce output like this, where the start (min_ts), end (max_ts), and duration (days) of each visit are preserved:
birdID site min_ts max_ts days
1 A 2013-04-15 09:29 2013-04-22 00:03 6.6
1 B 2013-04-22 14:02 2013-04-22 17:02 0.1
1 C 2013-04-22 14:04 2013-04-23 00:54 0.5
1 A 2013-04-23 01:20 2013-04-30 23:47 7.9
1 B 2013-04-30 03:51 2013-04-30 04:26 0.02
2 C 2013-04-30 04:29 2013-04-30 18:49 0.6
2 A 2013-05-01 01:03 2013-05-02 00:09 0.96
2 C 2013-05-03 07:57 2013-05-05 02:54 1.8
2 A 2013-05-05 03:27 2013-05-14 00:16 8.8
2 D 2013-05-14 10:00 2013-05-14 15:00 0.2
I have tried this code, which yields the correct variables but lumps all the information about a single site together, not preserving multiple visits:
df <- df %>%
group_by(birdID, site) %>%
summarise(min_ts = min(ts),
max_ts = max(ts),
days = difftime(max_ts, min_ts, units = "days")) %>%
arrange(birdID, min_ts)
birdID site min_ts max_ts days
1 A 2013-04-15 09:29 2013-04-30 23:47 15.6
1 B 2013-04-22 14:02 2013-04-30 04:26 7.6
1 C 2013-04-22 14:04 2013-04-23 00:54 0.5
2 C 2013-04-30 04:29 2013-05-05 02:54 4.9
2 A 2013-05-01 01:03 2013-05-14 00:16 12.9
2 D 2013-05-14 10:00 2013-05-14 15:00 0.2
I realize grouping by site is a problem, but if I remove that as a grouping variable the data are summarised without site info. I have tried this. It doesn't run, but I feel it's close to the solution:
df <- df %>%
group_by(birdID) %>%
summarize(min_ts = if_else((birdID == lag(birdID) & site != lag(site)), min(ts), NA_real_),
max_ts = if_else((birdID == lag(birdID) & site != lag(site)), max(ts), NA_real_),
min_d = min(yday(ts)),
max_d = max(yday(ts)),
days = max_d - min_d)
One possibility could be:
df %>%
group_by(birdID, site, rleid = with(rle(site), rep(seq_along(lengths), lengths))) %>%
summarise(min_ts = min(ts),
max_ts = max(ts),
days = difftime(max_ts, min_ts, units = "days")) %>%
ungroup() %>%
select(-rleid) %>%
arrange(birdID, min_ts)
birdID site min_ts max_ts days
<int> <chr> <dttm> <dttm> <drtn>
1 1 A 2013-04-15 09:29:00 2013-04-22 00:03:00 6.60694444 days
2 1 B 2013-04-22 14:02:00 2013-04-22 17:02:00 0.12500000 days
3 1 C 2013-04-22 14:04:00 2013-04-23 00:54:00 0.45138889 days
4 1 A 2013-04-23 01:20:00 2013-04-30 23:47:00 7.93541667 days
5 1 B 2013-04-30 03:51:00 2013-04-30 04:26:00 0.02430556 days
6 2 C 2013-04-30 04:29:00 2013-04-30 18:49:00 0.59722222 days
7 2 A 2013-05-01 01:03:00 2013-05-02 00:09:00 0.96250000 days
8 2 C 2013-05-03 07:57:00 2013-05-05 02:54:00 1.78958333 days
9 2 A 2013-05-05 03:27:00 2013-05-14 00:16:00 8.86736111 days
10 2 D 2013-05-14 10:00:00 2013-05-14 15:00:00 0.20833333 days
Here it creates a rleid()-like grouping variable and then calculates the difference.
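For illustration, the run-length trick turns consecutive repeats of site into one group id per visit (a toy vector, not the OP's data):
site_vec <- c("A", "A", "B", "C", "A")
with(rle(site_vec), rep(seq_along(lengths), lengths))
# [1] 1 1 2 3 4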
Or the same using rleid() from data.table explicitly:
df %>%
group_by(birdID, site, rleid = rleid(site)) %>%
summarise(min_ts = min(ts),
max_ts = max(ts),
days = difftime(max_ts, min_ts, units = "days")) %>%
ungroup() %>%
select(-rleid) %>%
arrange(birdID, min_ts)
Another alternative is to use lag and cumsum to create a grouping variable.
library(dplyr)
df %>%
group_by(birdID, group = cumsum(site != lag(site, default = first(site)))) %>%
summarise(min_ts = min(ts),
max_ts = max(ts),
days = difftime(max_ts, min_ts, units = "days")) %>%
ungroup() %>%
select(-group)
# A tibble: 10 x 4
# birdID min_ts max_ts days
# <int> <dttm> <dttm> <drtn>
# 1 1 2013-04-15 09:29:00 2013-04-22 00:03:00 6.60694444 days
# 2 1 2013-04-22 14:02:00 2013-04-22 17:02:00 0.12500000 days
# 3 1 2013-04-22 14:04:00 2013-04-23 00:54:00 0.45138889 days
# 4 1 2013-04-23 01:20:00 2013-04-30 23:47:00 7.93541667 days
# 5 1 2013-04-30 03:51:00 2013-04-30 04:26:00 0.02430556 days
# 6 2 2013-04-30 04:29:00 2013-04-30 18:49:00 0.59722222 days
# 7 2 2013-05-01 01:03:00 2013-05-02 00:09:00 0.96250000 days
# 8 2 2013-05-03 07:57:00 2013-05-05 02:54:00 1.78958333 days
# 9 2 2013-05-05 03:27:00 2013-05-14 00:16:00 8.86736111 days
#10 2 2013-05-14 10:00:00 2013-05-14 15:00:00 0.20833333 days
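On the same toy vector as above, the lag/cumsum trick yields an equivalent grouping (the ids start at 0 instead of 1, which makes no difference for grouping):
site_vec <- c("A", "A", "B", "C", "A")
cumsum(site_vec != dplyr::lag(site_vec, default = dplyr::first(site_vec)))
# [1] 0 0 1 2 3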

R: calculate number of occurrences which have started but not ended - count if within a datetime range

I've got a dataset with the following shape
ID Start Time End Time
1 01/01/2017 00:15:00 01/01/2017 07:15:00
2 01/01/2017 04:45:00 01/01/2017 06:15:00
3 01/01/2017 10:20:00 01/01/2017 20:15:00
4 01/01/2017 02:15:00 01/01/2017 00:15:00
5 02/01/2017 15:15:00 03/01/2017 00:30:00
6 03/01/2017 07:00:00 04/01/2017 09:15:00
I would like to count, every 15 minutes for an entire year, how many items have started but not finished, i.e. the number of records with a start time less than or equal to the time I'm looking at and an end time greater than the time I'm looking at.
I'm looking for an approach using tidyverse/dplyr if possible.
Any help or guidance would be very much appreciated.
If I understand correctly, the OP wants to count the number of simultaneously active events.
One possibility to tackle this question is the coverage() function from Bioconductor's IRanges package. Another is to aggregate in a non-equi join, which is available in the data.table package.
Non-equi join
# create sequence of datetimes (limited to 4 days for demonstration)
seq15 <- seq(lubridate::as_datetime("2017-01-01"),
lubridate::as_datetime("2017-01-05"), by = "15 mins")
# aggregate within a non-equi join
library(data.table)
result <- periods[.(time = seq15), on = .(Start.Time <= time, End.Time > time),
.(time, count = sum(!is.na(ID))), by = .EACHI][, .(time, count)]
result
time count
1: 2017-01-01 00:00:00 0
2: 2017-01-01 00:15:00 1
3: 2017-01-01 00:30:00 1
4: 2017-01-01 00:45:00 1
5: 2017-01-01 01:00:00 1
---
381: 2017-01-04 23:00:00 0
382: 2017-01-04 23:15:00 0
383: 2017-01-04 23:30:00 0
384: 2017-01-04 23:45:00 0
385: 2017-01-05 00:00:00 0
The result can be visualized graphically:
library(ggplot2)
ggplot(result) + aes(time, count) + geom_step()
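Since the OP asked for a tidyverse approach: a rough dplyr/purrr sketch of the same count (assuming the periods and seq15 objects defined in the Data section below; fine for 15-minute steps over a year, though slower than the non-equi join):
library(dplyr)
library(purrr)
tibble(time = seq15) %>%
  mutate(count = map_int(time, ~ sum(periods$Start.Time <= .x &
                                       periods$End.Time > .x)))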
Data
periods <- readr::read_table(
"ID Start.Time End.Time
1 01/01/2017 00:15:00 01/01/2017 07:15:00
2 01/01/2017 04:45:00 01/01/2017 06:15:00
3 01/01/2017 10:20:00 01/01/2017 20:15:00
4 01/01/2017 02:15:00 01/01/2017 00:15:00
5 02/01/2017 15:15:00 03/01/2017 00:30:00
6 03/01/2017 07:00:00 04/01/2017 09:15:00"
)
# convert datetime strings to class POSIXct
library(data.table)
cols <- names(periods)[names(periods) %like% "Time$"]
setDT(periods)[, (cols) := lapply(.SD, lubridate::dmy_hms), .SDcols = cols]
periods
ID Start.Time End.Time
1: 1 2017-01-01 00:15:00 2017-01-01 07:15:00
2: 2 2017-01-01 04:45:00 2017-01-01 06:15:00
3: 3 2017-01-01 10:20:00 2017-01-01 20:15:00
4: 4 2017-01-01 02:15:00 2017-01-01 00:15:00
5: 5 2017-01-02 15:15:00 2017-01-03 00:30:00
6: 6 2017-01-03 07:00:00 2017-01-04 09:15:00

Flag first instance of an event occurring contingent on other variable's value

I am new to R and to solving the kind of problem below, so I am not sure how certain functionality is achieved in particular instances.
I have a dataframe as such:
library(data.table)
df <- data.frame(
  DATETIME = seq(from = as.POSIXct('2014-01-01 00:00', tz = "GMT"),
                 to = as.POSIXct('2014-01-01 06:00', tz = "GMT"),
                 by = '15 mins'),
  Price = c(23,22,23,24,27,31,33,34,31,26,24,23,19,18,19,19,23,25,26,26,27,30,26,25,24),
  TroughPriceFlag = c(0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0))
df <- data.table(df)
df
DATETIME Price TroughPriceFlag
1: 2014-01-01 00:00:00 23 0
2: 2014-01-01 00:15:00 22 1
3: 2014-01-01 00:30:00 23 0
4: 2014-01-01 00:45:00 24 0
5: 2014-01-01 01:00:00 27 0
6: 2014-01-01 01:15:00 31 0
7: 2014-01-01 01:30:00 33 0
8: 2014-01-01 01:45:00 34 0
9: 2014-01-01 02:00:00 31 0
10: 2014-01-01 02:15:00 26 0
11: 2014-01-01 02:30:00 24 0
12: 2014-01-01 02:45:00 23 0
13: 2014-01-01 03:00:00 19 0
14: 2014-01-01 03:15:00 18 1
15: 2014-01-01 03:30:00 19 0
16: 2014-01-01 03:45:00 19 0
17: 2014-01-01 04:00:00 23 0
18: 2014-01-01 04:15:00 25 0
19: 2014-01-01 04:30:00 26 0
20: 2014-01-01 04:45:00 26 0
21: 2014-01-01 05:00:00 27 0
22: 2014-01-01 05:15:00 30 0
23: 2014-01-01 05:30:00 26 0
24: 2014-01-01 05:45:00 25 0
25: 2014-01-01 06:00:00 24 0
What I wish to do is two things:
(1) From where we observe a TroughPrice, flag the first instance where the price has risen by 10 or more dollars. That is, find the first instance where deltaPrice >= 10 since the trough price.
As an example: from the trough price of 22 (row 2), in the next interval price is increased to 23 which is a change of 1 dollar, so no flag. From the trough price of 22 (again row 2, since always with reference to the trough price in question), two intervals later the price is 24 dollars, so the price has increased by 2 dollars since the trough, so again no flag. However, from the trough price of 22, 5 intervals later the price has increased to 33 dollars, which is an increase of 11 dollars and is the first time the price has increased above 10 dollars. Thus the flag is 1.
(2) Determine the number of 15 minute periods which have passed between the trough price and the first instance the price has risen by 10 or more dollars.
The resulting dataframe should look like this:
DATETIME Price TroughPriceFlag FirstOver10CentsFlag CountPeriods
1 2014-01-01 00:00:00 23 0 0 NA
2 2014-01-01 00:15:00 22 1 0 5
3 2014-01-01 00:30:00 23 0 0 NA
4 2014-01-01 00:45:00 24 0 0 NA
5 2014-01-01 01:00:00 27 0 0 NA
6 2014-01-01 01:15:00 31 0 0 NA
7 2014-01-01 01:30:00 33 0 1 NA
8 2014-01-01 01:45:00 34 0 0 NA
9 2014-01-01 02:00:00 31 0 0 NA
10 2014-01-01 02:15:00 26 0 0 NA
11 2014-01-01 02:30:00 24 0 0 NA
12 2014-01-01 02:45:00 23 0 0 NA
13 2014-01-01 03:00:00 19 0 0 NA
14 2014-01-01 03:15:00 18 1 0 8
15 2014-01-01 03:30:00 19 0 0 NA
16 2014-01-01 03:45:00 19 0 0 NA
17 2014-01-01 04:00:00 23 0 0 NA
18 2014-01-01 04:15:00 25 0 0 NA
19 2014-01-01 04:30:00 26 0 0 NA
20 2014-01-01 04:45:00 26 0 0 NA
21 2014-01-01 05:00:00 27 0 0 NA
22 2014-01-01 05:15:00 30 0 1 NA
23 2014-01-01 05:30:00 26 0 0 NA
24 2014-01-01 05:45:00 25 0 0 NA
25 2014-01-01 06:00:00 24 0 0 NA
I'm not really sure where to start, since the time gaps can be quite large and I've only used indexing in the context of a few steps forward/backward. Please help!
Thanks in advance
You can chain operations with the data.table package; the idea is to group by the cumulative sum of TroughPriceFlag:
library(data.table)
df[, col1:=pmatch(Price-Price[1]>10,T, nomatch=0), cumsum(TroughPriceFlag)][
, count:=which(col1==1)-1,cumsum(TroughPriceFlag)][
TroughPriceFlag==0, count:=NA]
#> df
# DATETIME Price TroughPriceFlag col1 count
# 1: 2014-01-01 00:00:00 23 0 0 NA
# 2: 2014-01-01 00:15:00 22 1 0 5
# 3: 2014-01-01 00:30:00 23 0 0 NA
# 4: 2014-01-01 00:45:00 24 0 0 NA
# 5: 2014-01-01 01:00:00 27 0 0 NA
# 6: 2014-01-01 01:15:00 31 0 0 NA
# 7: 2014-01-01 01:30:00 33 0 1 NA
# 8: 2014-01-01 01:45:00 34 0 0 NA
# 9: 2014-01-01 02:00:00 31 0 0 NA
#10: 2014-01-01 02:15:00 26 0 0 NA
#11: 2014-01-01 02:30:00 24 0 0 NA
#12: 2014-01-01 02:45:00 23 0 0 NA
#13: 2014-01-01 03:00:00 19 0 0 NA
#14: 2014-01-01 03:15:00 18 1 0 8
#15: 2014-01-01 03:30:00 19 0 0 NA
#16: 2014-01-01 03:45:00 19 0 0 NA
#17: 2014-01-01 04:00:00 23 0 0 NA
#18: 2014-01-01 04:15:00 25 0 0 NA
#19: 2014-01-01 04:30:00 26 0 0 NA
#20: 2014-01-01 04:45:00 26 0 0 NA
#21: 2014-01-01 05:00:00 27 0 0 NA
#22: 2014-01-01 05:15:00 30 0 1 NA
#23: 2014-01-01 05:30:00 26 0 0 NA
#24: 2014-01-01 05:45:00 25 0 0 NA
#25: 2014-01-01 06:00:00 24 0 0 NA
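The less obvious part is the pmatch() call: partial matching does not allow duplicate matches, so within each group only the first TRUE matches against T and every later TRUE falls through to nomatch = 0, which gives exactly the "first instance" flag:
pmatch(c(FALSE, FALSE, TRUE, TRUE), TRUE, nomatch = 0)
# [1] 0 0 1 0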
