calculate duration of time interval while removing certain time spans - r

Assume we have an interval spanning several days (interval "A" in Figure below).
library(lubridate)
int <- interval("2018-01-01 22:00:00", "2018-01-04 10:00:00")
In hours, I get
as.period(int, unit = "hours")
"60H 0M 0S"
Now, I want to subtract all non-working-hours, here 16:00-08:00 (greyed out) in that interval, i.e. only keep the blue parts (08:00-16:00) and, again, calculate the remaining hours (see "B" in Figure below), which would be 8 + 8 + 2 = 18 hours.
One approach would be to create a list of intervals I want to keep which span the entire interval and then calculate intersections. (The code below could, of course, be setup programmatically using floor/ceiling/seq functions etc.)
int_keep <- list(
interval("2018-01-01 08:00:00", "2018-01-01 16:00:00"),
interval("2018-01-02 08:00:00", "2018-01-02 16:00:00"),
interval("2018-01-03 08:00:00", "2018-01-03 16:00:00"),
interval("2018-01-04 08:00:00", "2018-01-04 16:00:00"),
interval("2018-01-05 08:00:00", "2018-01-05 16:00:00")
)
l <- lapply(int_keep, function(x) intersect(x, int))
mns <- sapply(l, as.numeric) # returns seconds
sum(mns, na.rm = T) / 60 / 60 # sum of intersections in hours
[1] 18
While this works, it appears utterly clumsy to me. What would be a less tedious way to do this?

df <- data.frame(DateTime=seq.POSIXt(as.POSIXct("2018-01-01 22:00:00"), as.POSIXct("2018-01-04 10:00:00"), by = "1 hour"))
head(df)
#DateTime
#1 2018-01-01 22:00:00
#2 2018-01-01 23:00:00
#3 2018-01-02 00:00:00
#4 2018-01-02 01:00:00
#5 2018-01-02 02:00:00
#6 2018-01-02 03:00:00
#you want the hours worked between A and B
A <-format(strptime("8:00:00", "%H:%M:%S"),"%H:%M:%S")
B <-format(strptime("16:00:00", "%H:%M:%S"),"%H:%M:%S")
#a simple ifelse statement to assign a value of 1 to column "value" if the time is between 8 and 16 or a 0 if it's not:
df$value<-ifelse((format(df[1],"%H:%M:%S")>A & format(df[1],"%H:%M:%S")<=B),1,0)
tail(df)
#DateTime DateTime
#56 2018-01-04 05:00:00 0
#57 2018-01-04 06:00:00 0
#58 2018-01-04 07:00:00 0
#59 2018-01-04 08:00:00 0
#60 2018-01-04 09:00:00 1
#61 2018-01-04 10:00:00 1
#now taking the column sum of the value column will give you the total hours worked:
TotalHoursWorked<-colSums(df$value)
TotalHoursWorked
#DateTime
# 18

Related

Calculate Rolling 12 Hours by Group in R

I am working on a project where I have to only include patients who had lab tests ordered at least 12 hours apart, and to keep the timestamp of each included lab test. The issue is that many patients get several labs done within the 12 hour window, but the client has asked to not include those tests. I have made it this far:
#Create dummy dataset
df = data.frame(
"Encounter" = c(rep("12345", times=16), rep("67890", times = 5)),
"Timestamp" = c("01/06/2022 04:00:00", "01/07/2022 08:00:00",
"01/08/2022 00:00:00", "01/08/2022 04:00:00",
"01/08/2022 08:00:00", "01/08/2022 20:00:00",
"01/09/2022 04:00:00", "01/09/2022 08:00:00",
"01/09/2022 20:00:00", "01/09/2022 23:26:00",
"01/10/2022 00:00:00", "01/10/2022 08:00:00",
"01/10/2022 20:00:00", "01/11/2022 00:00:00",
"01/11/2022 20:00:00", "01/12/2022 04:00:00",
"11/10/2021 11:00:00", "11/10/2021 12:00:00",
"11/10/2021 13:00:00", "11/10/2021 14:00:00",
"11/11/2021 00:00:00"))
#Convert timestamp to POSIXlt format
df$Timestamp <- strptime(as.character(df$Timestamp), format="%m/%d/%Y %H:%M")
#Calculate time (in hours) between each previous timestamp by Encounter
df <- df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(difftime(Timestamp, lag(Timestamp), units="hours"))
I can't seem to figure out what to do next. It seems like I need to calculate a rolling 12-hours that then resets to 0 once a row hits 12 hours, but I'm not sure how to go about it. Below is my ideal result:
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
There is absolutely nothing elegant about this, but I believe it gives you what you’re looking for. I use a temporary variable to store the “rolling” sum before it’s reset once the hours between is 12 or greater.
library(tidyverse)
df <- df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(time_diff = difftime(Timestamp, lag(Timestamp), units="hours")) %>%
replace_na(list(time_diff = 0)) %>%
mutate(temp = ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(time_diff) + time_diff),
temp = ifelse(is.na(temp), 0, temp),
hours_between = ifelse(time_diff >= 12, time_diff,
ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(temp) + time_diff)),
keep = ifelse(hours_between >= 12 | is.na(hours_between), 1, 0)) %>%
select(-temp)
Created on 2022-01-27 by the reprex package (v2.0.1)
Here is an alternative option using accumulate. Here, you can use you differences, and once they exceed the threshold of 12 hours, reset by just using the diff value (starting over) instead of using the cumulative sum. To include the first time for each Encounter, you can either make that diff 12 hours, or add a separate mutate and check where Timestamp == first(Timestamp) and in those cases set keep to 1.
library(tidyverse)
thresh <- 12
df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(diff = difftime(Timestamp, lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)), units = "hours"),
keep = +(accumulate(diff, ~if_else(.x >= thresh, .y, .x + .y)) >= thresh))
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
Probably missing something, but wouldn't this work:
library(dplyr)
df %>%
group_by(Encounter) %>%
arrange(Encounter, Timestamp) %>%
mutate(time_dif = difftime(Timestamp, lag(Timestamp), units="hours")) %>%
filter(time_dif > 12)

Sum a part of a time series between set end points

I have a time series (xts) of rain gage data and I would like to be able to sum all the rain amounts between a beginning and end time point from a list. And then make a new data frame that is StormNumber and TotalRain over that time
> head(RainGage)
Rain_mm
2019-07-01 00:00:00 0
2019-07-01 00:15:00 0
2019-07-01 00:30:00 0
2019-07-01 00:45:00 0
2019-07-01 01:00:00 0
2019-07-01 01:15:00 0
head(StormTimes)
StormNumber RainStartTime RainEndTime
1 1 2019-07-21 20:00:00 2019-07-22 04:45:00
2 2 2019-07-22 11:30:00 2019-07-22 23:45:00
3 3 2019-07-11 09:15:00 2019-07-11 19:00:00
4 4 2019-05-29 17:00:00 2019-05-29 20:45:00
5 5 2019-06-27 14:30:00 2019-06-27 17:15:00
6 6 2019-07-11 06:15:00 2019-07-11 09:00:00
I have this code that I got from the SO community when I was trying to do something similar in the past (but extract data rather than sum it). However, I have no idea how it works so I am struggling to adapt it to this situation.
do.call(rbind, Map(function(x, y) RainGage[paste(x, y, sep="/")],
StormTimes$RainStartTime, StormTimes$RainEndTime)
In this case I would suggest just to write your own function and then use apply to achieve what you want, for example:
dates <- c('2019-07-01 00:00:00', '2019-07-01 00:15:00',
'2019-07-01 00:30:00', '2019-07-01 00:45:00',
'2019-07-01 01:00:00', '2019-07-01 01:15:00')
dates <- as.POSIXct(strptime(dates, '%Y-%m-%d %H:%M:%S'))
mm <- c(0, 10, 10, 20, 0, 0)
rain <- data.frame(dates, mm)
number <- c(1,2)
start <- c('2019-07-01 00:00:00','2019-07-01 00:18:00')
start <- as.POSIXct(strptime(start, '%Y-%m-%d %H:%M:%S'))
end <- c('2019-07-01 00:17:00','2019-07-01 01:20:00')
end <- as.POSIXct(strptime(end, '%Y-%m-%d %H:%M:%S'))
storms <- data.frame(number, start, end)
# Sum of rain
f = function(x, output) {
# Get storm number
number = x[1]
# Get starting moment
start = x[2]
# Get ending moment
end = x[3]
# Calculate sum
output <- sum(rain[rain$dates >= start & rain$dates < end, 'mm'])
}
# Apply function to each row of the dataframe
storms$rain <- apply(storms, 1, f)
print(storms)
This yields:
number start end rain
1 1 2019-07-01 00:00:00 2019-07-01 00:17:00 10
2 2 2019-07-01 00:18:00 2019-07-01 01:20:00 30
So a column rain in storms now holds the sum of rain$mm, which is what you're after.
Hope that helps you out!

How to increase the number of observations in xts object by taking previous value?

What I have:
dta <- xts(
c(1,2,3,4),
order.by = timeDate(c(
"2000-01-01 00:01:01",
"2000-01-01 00:01:05",
"2000-01-01 00:01:06",
"2000-01-01 00:01:07"
)
)
)
[,1]
2000-01-01 00:01:00 1
2000-01-01 00:05:00 2
2000-01-01 00:06:00 3
2000-01-01 00:07:00 4
What I need:
[,1]
2000-01-01 00:01:00 1
2000-01-01 00:02:00 1
2000-01-01 00:03:00 1
2000-01-01 00:04:00 1
2000-01-01 00:05:00 2
2000-01-01 00:06:00 3
2000-01-01 00:07:00 4
Similar questions on stackoverflow only deal with decreasing the periodicity of an xts object and did not help me with this problem.
I am cleaning minute price data, where prices where only recorded if there actually where trades. Obviously, if there where no trades this means that the price stays the same as in the previous minute.
Merge it with a zero width sequence (grid) having the required times:
rng <- range(time(dta))
g <- xts(, seq(rng[1], rng[2], by = 1))
na.locf(merge(dta, g))

R convert hourly to daily data up to 0:00 instead of 23:00

How do you set 0:00 as end of day instead of 23:00 in an hourly data? I have this struggle while using period.apply or to.period as both return days ending at 23:00. Here is an example :
x1 = xts(seq(as.POSIXct("2018-02-01 00:00:00"), as.POSIXct("2018-02-05 23:00:00"), by="hour"), x = rnorm(120))
The following functions show periods ends at 23:00
to.period(x1, OHLC = FALSE, drop.date = FALSE, period = "days")
x1[endpoints(x1, 'days')]
So when I am aggregating the hourly data to daily, does someone have an idea how to set the end of day at 0:00?
As already pointed out by another answer here, to.period on days computes on the data with timestamps between 00:00:00 and 23:59:59.9999999 on the day in question. so 23:00:00 is seen as the last timestamp in your data, and 00:00:00 corresponds to a value in the next day "bin".
What you can do is shift all the timestamps back 1 hour, use to.period get the daily data points from the hour points, and then using align.time to get the timestamps aligned correctly.
(More generally, to.period is useful for generating OHLCV type data, and so if you're say generating say hourly bars from ticks, it makes sense to look at all the ticks between 23:00:00 and 23:59:59.99999 in the bar creation. then 00:00:00 to 00:59:59.9999.... would form the next hourly bar and so on.)
Here is an example:
> tail(x1["2018-02-01"])
# [,1]
# 2018-02-01 18:00:00 -1.2760349
# 2018-02-01 19:00:00 -0.1496041
# 2018-02-01 20:00:00 -0.5989614
# 2018-02-01 21:00:00 -0.9691905
# 2018-02-01 22:00:00 -0.2519618
# 2018-02-01 23:00:00 -1.6081656
> head(x1["2018-02-02"])
# [,1]
# 2018-02-02 00:00:00 -0.3373271
# 2018-02-02 01:00:00 0.8312698
# 2018-02-02 02:00:00 0.9321747
# 2018-02-02 03:00:00 0.6719425
# 2018-02-02 04:00:00 -0.5597391
# 2018-02-02 05:00:00 -0.9810128
> head(x1["2018-02-03"])
# [,1]
# 2018-02-03 00:00:00 2.3746424
# 2018-02-03 01:00:00 0.8536594
# 2018-02-03 02:00:00 -0.2467268
# 2018-02-03 03:00:00 -0.1316978
# 2018-02-03 04:00:00 0.3079848
# 2018-02-03 05:00:00 0.2445634
x2 <- x1
.index(x2) <- .index(x1) - 3600
> tail(x2["2018-02-01"])
# [,1]
# 2018-02-01 18:00:00 -0.1496041
# 2018-02-01 19:00:00 -0.5989614
# 2018-02-01 20:00:00 -0.9691905
# 2018-02-01 21:00:00 -0.2519618
# 2018-02-01 22:00:00 -1.6081656
# 2018-02-01 23:00:00 -0.3373271
x.d2 <- to.period(x2, OHLC = FALSE, drop.date = FALSE, period = "days")
> x.d2
# [,1]
# 2018-01-31 23:00:00 0.12516594
# 2018-02-01 23:00:00 -0.33732710
# 2018-02-02 23:00:00 2.37464235
# 2018-02-03 23:00:00 0.51797747
# 2018-02-04 23:00:00 0.08955208
# 2018-02-05 22:00:00 0.33067734
x.d2 <- align.time(x.d2, n = 86400)
> x.d2
# [,1]
# 2018-02-01 0.12516594
# 2018-02-02 -0.33732710
# 2018-02-03 2.37464235
# 2018-02-04 0.51797747
# 2018-02-05 0.08955208
# 2018-02-06 0.33067734
Want to convince yourself? Try something like this:
x3 <- rbind(x1, xts(x = matrix(c(1,2), nrow = 2), order.by = as.POSIXct(c("2018-02-01 23:59:59.999", "2018-02-02 00:00:00"))))
x3["2018-02-01 23/2018-02-02 01"]
# [,1]
# 2018-02-01 23:00:00.000 -1.6081656
# 2018-02-01 23:59:59.999 1.0000000
# 2018-02-02 00:00:00.000 -0.3373271
# 2018-02-02 00:00:00.000 2.0000000
# 2018-02-02 01:00:00.000 0.8312698
x3.d <- to.period(x3, OHLC = FALSE, drop.date = FALSE, period = "days")
> x3.d <- align.time(x3.d, 86400)
> x3.d
[,1]
2018-02-02 1.00000000
2018-02-03 -0.09832625
2018-02-04 -0.65075506
2018-02-05 -0.09423664
2018-02-06 0.33067734
See that the value of 2 on 00:00:00 did not form the last observation in the day for 2018-02-02 (00:00:00), which went from 2018-02-01 00:00:00 to 2018-02-01 23:59:59.9999.
Of course, if you want the daily timestamp to be the start of the day, not the end of the day, which would be 2018-02-01 as start of bar for the first row, in x3.d above, you could shift back the day by one. You could do this relatively safely for most timezones, when your data doesn't involve weekend dates:
index(x3.d) = index(x3.d) - 86400
I say relatively safetly, because there are corner cases when there are time shifts in a time zone. e.g. Be careful with day light savings. Simply subtracting -86400 can be a problem when going from Sunday to Saturday in time zones where day light saving occurs:
#e.g. bad: day light savings occurs on this weekend for US EST
z <- xts(x = 9, order.by = as.POSIXct("2018-03-12", tz = "America/New_York"))
> index(z) - 86400
[1] "2018-03-10 23:00:00 EST"
i.e. the timestamp is off by one hour, when you really want the midnight timestamp (00:00:00).
You could get around this problem using something much safer like this:
library(lubridate)
# right
> index(z) - days(1)
[1] "2018-03-11 EST"
I don't think this is possible because 00:00 is the start of the day. From the manual:
These endpoints are aligned in POSIXct time to the zero second of the day at the beginning, and the 59.9999th second of the 59th minute of the 23rd hour of the final day
I think the solution here is to use minutes instead of hours. Using your example:
x1 = xts(seq(as.POSIXct("2018-02-01 00:00:00"), as.POSIXct("2018-02-05 23:59:99"), by="min"), x = rnorm(7200))
to.period(x1, OHLC = FALSE, drop.date = FALSE, period = "day")
x1[endpoints(x1, 'day')]

Creating time index that creates same intervals every day

I want to make a time index that starts at 9:15:00 and progress in 50-minute intervals before ending at 03:30:00. E.g. 9:15:00,10:05:00, 10:55:00 and so on. This is my code that perfectly creates these time indices for 1st day. However, it becomes messed up next day and begins at 09:25:00 instead of 09:15:00 and gets all intervals wrong. The start time keeps changing every day.
Intervals <- seq(as.POSIXct("2016-04-01 09:15:00", format="%Y-%m-%d %H:%M:%S"), as.POSIXct("2016-04-29 15:30:00", format="%Y-%m-%d %H:%M:%S"),
by="50 min")
As I am trying to calculate various intervals by change only argument by="50 min" for example to by="55 min" and need flexibility in fixing the end time, so I put it as before 15:30:00 . Please me to fix It?
You might be better off generating one sequence and reusing this for each day. As so:
start <- "2016-04-01"
stop <- "2016-04-29"
daylength <- difftime(as.POSIXct(stop), as.POSIXct(start), units="days")
Intervals <- seq(
as.POSIXct(paste(start, "09:15")),
as.POSIXct(paste(start, "15:30")),
by="50 min"
)
out <- Intervals + as.difftime(rep(0:daylength, each=length(Intervals)), units="days")
range(out)
#[1] "2016-04-01 09:15:00 AEST" "2016-04-29 15:05:00 AEST"
It may be worth exploring making use of the cut function. For example for the set of days:
myDays <-
seq(
from = as.Date("2016-04-01"),
to = as.Date("2016-04-29"),
by = "day"
)
the one could arrive at 50 minutes intervals for each day via:
myIntervals <-
data.frame(table(cut(x = as.POSIXct(myDays), breaks = "50 min")))
Preview
>> head(myIntervals, 10)
Var1 Freq
1 2016-04-01 01:00:00 1
2 2016-04-01 01:50:00 0
3 2016-04-01 02:40:00 0
4 2016-04-01 03:30:00 0
5 2016-04-01 04:20:00 0
6 2016-04-01 05:10:00 0
7 2016-04-01 06:00:00 0
8 2016-04-01 06:50:00 0
9 2016-04-01 07:40:00 0
10 2016-04-01 08:30:00 0

Resources