Calculate availability with dates - r

I have a tibble and want to compute monthly availability (A), defined as
A = uptime / (uptime + downtime),
where (monthly) downtime is end - start, by month and uptime is total time (1 month) - downtime. What is the way to compute monthly availability for the year 2019?
This is the sample data
structure(list(start = structure(c(1550048400, 1558008000, 1558703040,
1561032000, 1560945660, 1563451200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), end = structure(c(1550143989, 1558008000, 1558956840,
1561032000, 1560945660, 1563451200), tzone = "GMT", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))

First, you have inconsistent "tzone" attributes, one is "UTC" and the other is "GMT". It's minor (and slightly noisy), so I'll preempt the noise (though no change in the results):
attr(dat$end, "tzone") <- "UTC"
A helper function:
fun <- function(mon1, mon2, x = dat) {
# if either start/end is between mon1/mon2, include it ...
tmp <- x[with(x, (start >= mon1 & start < mon2) | (end >= mon1 & end < mon2)),] |>
# ... but if start-to-end straddles a month begin/end, then truncate it
transform(
start = pmax(start, mon1),
end = pmin(end, mon2)
)
data.frame(start = mon1, end = mon2) |>
transform(downtime = c(sum(with(tmp, as.numeric(end - start, units = "hours"))), 0)[1]) |>
transform(uptime = as.numeric(mon2 - mon1, units = "hours") - downtime) |>
transform(A = uptime / ( uptime + downtime))
}
And the work in base R:
months <- seq(as.POSIXct("2019-01-01 00:00:00", tz="UTC"), by="1 month", length.out=12)
months
# [1] "2019-01-01 UTC" "2019-02-01 UTC" "2019-03-01 UTC" "2019-04-01 UTC" "2019-05-01 UTC" "2019-06-01 UTC" "2019-07-01 UTC"
# [8] "2019-08-01 UTC" "2019-09-01 UTC" "2019-10-01 UTC" "2019-11-01 UTC" "2019-12-01 UTC"
do.call(rbind, Map(fun, months[-12], months[-1]))
# start end downtime uptime A
# 1 2019-01-01 2019-02-01 0.0000 744.0000 1.0000000
# 2 2019-02-01 2019-03-01 26.5525 645.4475 0.9604874
# 3 2019-03-01 2019-04-01 0.0000 744.0000 1.0000000
# 4 2019-04-01 2019-05-01 0.0000 720.0000 1.0000000
# 5 2019-05-01 2019-06-01 70.5000 673.5000 0.9052419
# 6 2019-06-01 2019-07-01 0.0000 720.0000 1.0000000
# 7 2019-07-01 2019-08-01 0.0000 744.0000 1.0000000
# 8 2019-08-01 2019-09-01 0.0000 744.0000 1.0000000
# 9 2019-09-01 2019-10-01 0.0000 720.0000 1.0000000
# 10 2019-10-01 2019-11-01 0.0000 744.0000 1.0000000
# 11 2019-11-01 2019-12-01 0.0000 720.0000 1.0000000

If you are trying to calculate the value of 'A' for each month, then the process would be:
sum up all the down time in each month
subtract that from the total time in the month to get the uptime
divide the uptime by the total time in the month
This is possible using the lubridate package:
library(lubridate)
library(dplyr)
data <- data %>%
mutate(downtime = end-start,
month = format(end, "%Y-%m %b"),
month_time = ceiling_date(end,
unit = "months") - floor_date(end,
unit = "months")) %>%
group_by(month) %>%
summarise(downtime = sum(downtime),
month_time = month_time[1]) %>%
mutate(uptime = month_time - downtime,
A = as.numeric(uptime) / as.numeric(uptime + downtime))

Related

Find the overlap between two timestamps in R to assign shifts

Problem
Currently, I have a large flight crew schedule dataset, with a start and end time, where my goal is to identify whether an employee was working a night shift. A night shift is defined as any portion of the shift between 01:00:00 and 05:59:59. I have looked at functions such as %overlaps%, but these seem not to work for only timestamps. Some sample data (in UTC-tz):
library(lubridate)
df <- data.frame(start = ymd_hms(c("2018-09-19 23:30:00", "2018-09-19 17:00:00", "2018-09-22 04:30:00")),
end = ymd_hms(c('2018-09-20 07:05:00', "2018-09-19 21:00:00", "2018-09-22 12:00:00")))
Solution
Ideally, I would like to get the following output, with a Boolean variable indicating whether the employee worked a night shift:
start end night.shift
2018-09-19 23:30:00 | 2018-09-20 07:05:00 | TRUE
2018-09-19 17:00:00 | 2018-09-19 21:00:00 | FALSE
2018-09-22 04:30:00 | 2018-09-22 12:00:00 | TRUE
Thanks in advance!
Using seq.POSIXt
transform(df, night.shift=mapply(\(x, y) any(
as.POSIXct(outer(as.Date(c(x, y)), c('01:00:00', '05:59:59'), paste), tz='GMT') %in%
seq.POSIXt(x, y, by='sec')),
start, end))
# start end night.shift
# 1 2018-09-19 23:30:00 2018-09-20 07:05:00 TRUE
# 2 2018-09-19 17:00:00 2018-09-19 21:00:00 FALSE
# 3 2018-09-22 04:30:00 2018-09-22 12:00:00 TRUE
or, almost twice as fast, %inrange% from the data.table package.
library(data.table)
transform(df, night.shift=mapply(\(x, y) any(
as.POSIXct(outer(as.Date(c(x, y)), c('01:00:00', '05:59:59'), paste), tz='GMT') %inrange%
c(x, y)),
start, end))
# start end night.shift
# 1 2018-09-19 23:30:00 2018-09-20 07:05:00 TRUE
# 2 2018-09-19 17:00:00 2018-09-19 21:00:00 FALSE
# 3 2018-09-22 04:30:00 2018-09-22 12:00:00 TRUE
You can use interval() or %--% to create an Interval object and int_overlaps() to test if two intervals overlap.
library(dplyr)
library(lubridate)
df %>%
mutate(
night.shift = int_overlaps(
(date(start) + hms("01:00:00")) %--% (date(start) + hms("05:59:59")),
start %--% end
) | int_overlaps(
(date(end) + hms("01:00:00")) %--% (date(end) + hms("05:59:59")),
start %--% end
)
)
Another way is using %within% to check whether a date-time object falls within an interval.
df %>%
rowwise() %>%
mutate(
night.shift = any(outer(date(c(start, end)), hms(c("01:00:00", "05:59:59")), `+`) %within% (start %--% end))
) %>%
ungroup()
Output
# # A tibble: 4 × 3
# start end night.shift
# <dttm> <dttm> <lgl>
# 1 2018-09-19 23:30:00 2018-09-20 07:05:00 TRUE
# 2 2018-09-19 17:00:00 2018-09-19 21:00:00 FALSE
# 3 2018-09-22 04:30:00 2018-09-22 12:00:00 TRUE
# 4 2018-09-22 04:30:00 2018-09-23 00:30:00 TRUE
Reference
Utilities for creation and manipulation of Interval objects
This is super janky and not optimized, but it works (and was fun to figure out). You'll want to vectorize it if possible.
library(lubridate)
df <- data.frame(start = ymd_hms(c("2018-09-19 23:30:00", "2018-09-19 17:00:00", "2018-09-22 04:30:00")),
end = ymd_hms(c('2018-09-20 07:05:00', "2018-09-19 21:00:00", "2018-09-22 12:00:00")))
night <- interval( hms::as_hms(3600), hms::as_hms(21599), tz = "UTC")
print(night)
for(i in 1:3) {
s = df$start[i]
f = df$end[i]
start_seconds = hms::as_hms(60*60*hour(s) + 60*minute(s) + second(s))
end_seconds = hms::as_hms(60*60*hour(f) + 60*minute(f) + second(f))
interval <- interval(start_seconds, end_seconds, tz = "UTC")
t <- int_overlaps(night, interval)
print(t)
}

Converting dates to hours in R

I have a start and end date for individuals and i need to estimate if the time passed from the start to the end is within 2 days
or 3 plus days.These dates are assign to record ids, how can i filter ones that ended within 2 days (from the start date)
and the ones that ended after 3 days or later.
Record_id <- c("2245","6728","5122","9287")
Start <- c("2021-01-13 CST" ,"2021-01-21 CST" ,"2021-01-17 CST","2021-01-13 CST")
End <- c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST","2021-01-25 15:00:00 CST")
I tried using
elapsed.time <- DF$start %--% DF$End
time.duration <- as.duration(elapsed.time)
but I am getting error because End date contains hour.Thank you.
Here's a dplyr pipe that will include both constraints (2 and 3 days):
df %>%
mutate(across(Start:End, as.POSIXct)) %>%
mutate(d = difftime(End, Start, units = "days")) %>%
filter(!between(difftime(End, Start, units = "days"), 2, 3))
# # A tibble: 4 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750000 days
# 2 6728 2021-01-21 00:00:00 2021-01-22 16:00:00 1.666667 days
# 3 5122 2021-01-17 00:00:00 2021-01-22 13:00:00 5.541667 days
# 4 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625000 days
I included mutate(d= so that we can see what the actual differences are. If you were looking to remove those, then use filter(between(..)) (no !).
In the case of the data you provided, all observations are less than 2 or more than 3 days. I'll expand this range so that we can see it in effect:
df %>%
mutate(across(Start:End, as.POSIXct)) %>%
mutate(d = difftime(End, Start, units = "days")) %>%
filter(!between(difftime(End, Start, units = "days"), 1, 6))
# # A tibble: 2 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750 days
# 2 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625 days
Data
df <- structure(list(Record_id = c("2245", "6728", "5122", "9287"), Start = c("2021-01-13 CST", "2021-01-21 CST", "2021-01-17 CST", "2021-01-13 CST"), End = c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST", "2021-01-25 15:00:00 CST")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I just converted the character to a date time with lubridate and then subtracted the dates. What you'll get back are days. I then filter for dates that are within 2 days.
Record_id<- c("2245","6728","5122","9287")
Start<-c("2021-01-13 CST" ,"2021-01-21 CST" ,"2021-01-17 CST","2021-01-13 CST")
End<-c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST","2021-01-25 15:00:00 CST")
df <- dplyr::tibble(x = Record_id, y = Start, z = End)
df %>%
dplyr::mutate_at(vars(y:z), ~ lubridate::as_datetime(.)) %>%
dplyr::mutate(diff = as.numeric(z - y)) %>%
dplyr::filter(diff <= 2 )

How to round datetime to nearest time of day, preferably vectorized?

Say I have a POSIXct vector like
timestamps = seq(as.POSIXct("2021-01-23"), as.POSIXct("2021-01-24"), length.out = 6)
I would like to round these times up to the nearest hour of the day in a vector:
hours_of_day = c(6, 14, 20)
i.e., the following result:
timestamps result
1 2021-01-23 00:00:00 2021-01-23 02:00:00
2 2021-01-23 04:48:00 2021-01-23 14:00:00
3 2021-01-23 09:36:00 2021-01-23 14:00:00
4 2021-01-23 14:24:00 2021-01-23 20:00:00
5 2021-01-23 19:12:00 2021-01-23 20:00:00
6 2021-01-24 00:00:00 2021-01-24 02:00:00
Is there a vectorized solution to this (or otherwise fast)? I have a few million timestamps and need to apply it for several hours_of_day.
One way to simplify this problem is to (1) find the next hours_of_day for each lubridate::hour(timestamps) and then (2) result = lubridate::floor_date(timestamps) + next_hour_of_day * 3600. But how to do step 1 vectorized?
Convert to as.POSIXlt, which allows you to extract hours and minutes, and calculate decimal hours. In an lapply/sapply combination first look up where these are less than the hours of the day vector, and choose the maximum hour using which.max. Now create new date-time using ISOdate and add one day ifelse date-time is smaller than original time.
timestamps <- as.POSIXlt(timestamps)
h <- hours_of_day[sapply(lapply(with(timestamps, hour + min/60 + sec/3600),
`<=`, hours_of_day), which.max)]
r <- with(timestamps, ISOdate(1900 + year, mon + 1, mday, h,
tz=attr(timestamps, "tzone")[[1]]))
r[r < timestamps] <- r[r < timestamps] + 86400
Result
r
# [1] "2021-01-23 06:00:00 CET" "2021-01-23 06:00:00 CET"
# [3] "2021-01-23 14:00:00 CET" "2021-01-23 20:00:00 CET"
# [5] "2021-01-23 20:00:00 CET" "2021-01-24 06:00:00 CET"
# [7] "2021-01-25 06:00:00 CET" "2021-01-27 20:00:00 CET"
data.frame(timestamps, r)
# timestamps r
# 1 2021-01-23 00:00:00 2021-01-23 06:00:00
# 2 2021-01-23 04:48:00 2021-01-23 06:00:00
# 3 2021-01-23 09:36:00 2021-01-23 14:00:00
# 4 2021-01-23 14:24:00 2021-01-23 20:00:00
# 5 2021-01-23 19:12:00 2021-01-23 20:00:00
# 6 2021-01-24 00:00:00 2021-01-24 06:00:00
# 7 2021-01-24 23:59:00 2021-01-25 06:00:00
# 8 2021-01-27 20:00:00 2021-01-27 20:00:00
Note: I've added "2021-01-24 23:59:00 CET" to timestamps to demonstrate the date change.
Benchmark
Tested on a length 1.4e6 vector.
# Unit: seconds
# expr min lq mean median uq max neval cld
# POSIX() 32.96197 33.06495 33.32104 33.16793 33.50057 33.83321 3 a
# lubridate() 47.36412 47.57762 47.75280 47.79113 47.94715 48.10316 3 b
Data:
timestamps <- structure(c(1611356400, 1611373680, 1611390960, 1611408240, 1611425520,
1611442800, 1611529140, 1611774000), class = c("POSIXct", "POSIXt"
))
hours_of_day <- c(6, 14, 20)
I would extract the hour component, use cut to bin it, and assign the binned hours back to the original:
hours_of_day = c(2, 14, 20)
library(lubridate)
library(magrittr) ## just for the pipe
new_hours = timestamps %>%
hour %>%
cut(breaks = c(0, hours_of_day), labels = hours_of_day, include.lowest = TRUE) %>%
as.character() %>%
as.integer()
result = floor_date(timestamps, "hour")
hour(result) = new_hours
result
# [1] "2021-01-23 02:00:00 EST" "2021-01-23 14:00:00 EST" "2021-01-23 14:00:00 EST"
# [4] "2021-01-23 14:00:00 EST" "2021-01-23 20:00:00 EST" "2021-01-24 02:00:00 EST"
Building on the approach by #jay.sf, I made a function for floor as well while adding support for NA values.
floor_date_to = function(timestamps, hours_of_day) {
# Handle NA with a temporary filler so code below doesn't break
na_timestamps = is.na(timestamps)
timestamps[na_timestamps] = as.POSIXct("9999-12-31")
# Proceed as usual
timestamps = as.POSIXlt(timestamps)
hours_of_day = rev(hours_of_day) # floor-specific: because which.max returns the first index by default
nearest_hour = hours_of_day[sapply(lapply(with(timestamps, hour + min/60 + sec/3600), `<`, hours_of_day), function(x) which.max(-x))] # floor-specific: negative which.max()
rounded = with(timestamps, ISOdate(1900 + year, mon + 1, mday, nearest_hour, tz = attr(timestamps, "tzone")[1]))
rounded[rounded > timestamps] = rounded[rounded > timestamps] - 86400 # floor: use minus
return(rounded)
timestamps[na_timestamps] = NA # Overwrite with NA again
}

Summarize values for overlapping time periods

I'm trying to summarize values for overlapping time periods.
I can use only tidyr, ggplot2 and dplyr libraries. Base R is preferred though.
My data looks like this, but usually it has around 100 records:
df <- structure(list(Start = structure(c(1546531200, 1546531200, 546531200, 1546638252.6316, 1546549800, 1546534800, 1546545600, 1546531200, 1546633120, 1547065942.1053), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Stop = structure(c(1546770243.1579, 1546607400, 1547110800, 1546670652.6316, 1547122863.1579, 1546638252.6316, 1546878293.5579, 1546416000, 1546849694.4, 1547186400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Value = c(12610, 520, 1500, 90, 331380, 27300, 6072, 4200, 61488, 64372)), .Names = c("Start", "Stop", "Value"), row.names = c(41L, 55L, 25L, 29L, 38L, 28L, 1L, 20L, 14L, 31L), class = c("tbl_df", "tbl", "data.frame"))
head(df) and str(df) gives:
Start Stop Value
2019-01-03 16:00:00 2019-01-06 10:24:03 12610
2019-01-03 16:00:00 2019-01-04 13:10:00 520
2019-01-03 16:00:00 2019-01-10 09:00:00 1500
2019-01-04 21:44:12 2019-01-05 06:44:12 90
2019-01-03 21:10:00 2019-01-10 12:21:03 331380
2019-01-03 17:00:00 2019-01-04 21:44:12 27300
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 10 obs. of 3 variables:
$ Start: POSIXct, format: "2019-01-03 16:00:00" "2019-01-03 16:00:00" ...
$ Stop : POSIXct, format: "2019-01-06 10:24:03" "2019-01-04 13:10:00" ...
$ Value: num 12610 520 1500 90 331380 ...
So there are overlapping time periods with "Start" and "Stop" dates with assigned value. In any given record when there is a value between df$Start and df$Stop and outside of this scope the value is 0.
I want to create another dataframe based on which I could show how this values summarize and change over time. The Desired output would look like this (the "sum" column is made up):
> head(df2)
timestamp sum
"2019-01-02 09:00:00 CET" 14352
"2019-01-03 17:00:00 CET" 6253
"2019-01-03 18:00:00 CET" 23465
"2019-01-03 21:00:00 CET" 3241
"2019-01-03 22:10:00 CET" 23235
"2019-01-04 14:10:00 CET" 123321
To get unique timestamps:
timestamps <- sort(unique(c(df$`Start`, df$`Stop`)))
With df2 dataframe I could easily draw a graph with ggplot, but how to get this sums?
I think I should iterate over df data frame either some custom function or any built-it summarize function which would work like this:
fnct <- function(date, min, max, value) {
if (date >= min && date <=max) {
a <- value
}
else {
a <- 0
}
return(a)
}
...for every given date from timestamps iterate through df and give me a sum of values for the timestamp.
It looks really simple and I'm missing something very basic.
Here's a tidyverse solution similar to my response to this recent question. I gather to bring the timestamps (Starts and Stops) into one column, with another column specifying which. The Starts add the value and the Stops subtract it, and then we just take the cumulative sum to get values at all the instants when the sum changes.
For 100 records, there won't be any perceivable speed improvement from using data.table; in my experience it starts to make more of a difference around 1M records, especially when grouping is involved.
library(dplyr); library(tidyr)
df2 <- df %>%
gather(type, time, Start:Stop) %>%
mutate(chg = if_else(type == "Start", Value, -Value)) %>%
arrange(time) %>%
mutate(sum = cumsum(chg)) # EDIT: corrected per OP comment
> head(df2)
## A tibble: 6 x 5
# Value type time chg sum
# <dbl> <chr> <dttm> <dbl> <dbl>
#1 1500 Start 1987-04-27 14:13:20 1500 1500
#2 4200 Stop 2019-01-02 08:00:00 -4200 -2700
#3 12610 Start 2019-01-03 16:00:00 12610 9910
#4 520 Start 2019-01-03 16:00:00 520 10430
#5 4200 Start 2019-01-03 16:00:00 4200 14630
#6 27300 Start 2019-01-03 17:00:00 27300 41930
In the past I have tried to solve similar problems using the tidyverse/baseR... But nothing comes even remotely close to the speeds that data.table provides for these kind of operations, so I encourage you to give it a try...
For questions like this, my favourite finction is foverlaps() from the data.table-package. With this function you can (fast!) perform an overlap-join. If you want more flexibility in your joining than foverlaps() provides, a non-equi-join (again using data.table) is probably the best (and fastest!) option. But foverlaps() will do here (I guess).
I used the sample data you provided, but filtered out rows where Stop <= Start (probably a tyop in your sample data). When df$Start is not before df$Stop, foverlaps give a warning and won't execute.
library( data.table )
#create data.table with periods you wish to simmarise on
#NB: UTC is used as timezone, since this is also the case in the sample data provided!!
dt.dates <- data.table( id = paste0( "Day", 1:31 ),
Start = seq( as.POSIXct( "2019-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
as.POSIXct( "2019-01-31 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
by = "1 days"),
Stop = seq( as.POSIXct( "2019-01-02 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 1,
as.POSIXct( "2019-02-01 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 1,
by = "1 days") )
If you do not want to summarise on a daily basis, but by hour, minute, second, of year. Just change the values (and stepsize) in dt.dates data.table so that they match your periods.
#set df as data.table
dt <- as.data.table( df )
#filter out any row where Stop is smaller than Start
dt <- dt[ Start < Stop, ]
#perform overlap join
#first set keys
setkey(dt, Start, Stop)
#then perform join
result <- foverlaps( dt.dates, dt, type = "within" )
#summarise
result[, .( Value = sum( Value , na.rm = TRUE ) ), by = .(Day = i.Start) ]
output
# Day Value
# 1: 2019-01-01 1500
# 2: 2019-01-02 1500
# 3: 2019-01-03 1500
# 4: 2019-01-04 351562
# 5: 2019-01-05 413050
# 6: 2019-01-06 400440
# 7: 2019-01-07 332880
# 8: 2019-01-08 332880
# 9: 2019-01-09 332880
# 10: 2019-01-10 64372
# 11: 2019-01-11 0
# 12: 2019-01-12 0
# 13: 2019-01-13 0
# 14: 2019-01-14 0
# 15: 2019-01-15 0
# 16: 2019-01-16 0
# 17: 2019-01-17 0
# 18: 2019-01-18 0
# 19: 2019-01-19 0
# 20: 2019-01-20 0
# 21: 2019-01-21 0
# 22: 2019-01-22 0
# 23: 2019-01-23 0
# 24: 2019-01-24 0
# 25: 2019-01-25 0
# 26: 2019-01-26 0
# 27: 2019-01-27 0
# 28: 2019-01-28 0
# 29: 2019-01-29 0
# 30: 2019-01-30 0
# 31: 2019-01-31 0
# Day Value
plot
#summarise for plot
result.plot <- result[, .( Value = sum( Value , na.rm = TRUE ) ), by = .(Day = i.Start) ]
library( ggplot2 )
ggplot( data = result.plot, aes( x = Day, y = Value ) ) + geom_col()

Combining time series data with different resolution in R

I have read in and formatted my data set like shown under.
library(xts)
#Read data from file
x <- read.csv("data.dat", header=F)
x[is.na(x)] <- c(0) #If empty fill in zero
#Construct data frames
rawdata.h <- data.frame(x[,2],x[,3],x[,4],x[,5],x[,6],x[,7],x[,8]) #Hourly data
rawdata.15min <- data.frame(x[,10]) #15 min data
#Convert time index to proper format
index.h <- as.POSIXct(strptime(x[,1], "%d.%m.%Y %H:%M"))
index.15min <- as.POSIXct(strptime(x[,9], "%d.%m.%Y %H:%M"))
#Set column names
names(rawdata.h) <- c("spot","RKup", "RKdown","RKcon","anm", "pp.stat","prod.h")
names(rawdata.15min) <- c("prod.15min")
#Convert data frames to time series objects
data.htemp <- xts(rawdata.h,order.by=index.h)
data.15mintemp <- xts(rawdata.15min,order.by=index.15min)
#Select desired subset period
data.h <- data.htemp["2013"]
data.15min <- data.15mintemp["2013"]
I want to be able to combine hourly data from data.h$prod.h with data, with 15 min resolution, from data.15min$prod.15min corresponding to the same hour.
An example would be to take the average of the hourly value at time 2013-12-01 00:00-01:00 with the last 15 minute value in that same hour, i.e. the 15 minute value from time 2013-12-01 00:45-01:00. I'm looking for a flexible way to do this with an arbitrary hour.
Any suggestions?
Edit: Just to clarify further: I want to do something like this:
N <- NROW(data.h$prod.h)
for (i in 1:N){
prod.average[i] <- mean(data.h$prod.h[i] + #INSERT CODE THAT FINDS LAST 15 MIN IN HOUR i )
}
I found a solution to my problem by converting the 15 minute data into hourly data using the very useful .index* function from the xts package like shown under.
prod.new <- data.15min$prod.15min[.indexmin(data.15min$prod.15min) %in% c(45:59)]
This creates a new time series with only the values occuring in the 45-59 minute interval each hour.
For those curious my data looked like this:
Original hourly series:
> data.h$prod.h[1:4]
2013-01-01 00:00:00 19.744
2013-01-01 01:00:00 27.866
2013-01-01 02:00:00 26.227
2013-01-01 03:00:00 16.013
Original 15 minute series:
> data.15min$prod.15min[1:4]
2013-09-30 00:00:00 16.4251
2013-09-30 00:15:00 18.4495
2013-09-30 00:30:00 7.2125
2013-09-30 00:45:00 12.1913
2013-09-30 01:00:00 12.4606
2013-09-30 01:15:00 12.7299
2013-09-30 01:30:00 12.9992
2013-09-30 01:45:00 26.7522
New series with only the last 15 minutes in each hour:
> prod.new[1:4]
2013-09-30 00:45:00 12.1913
2013-09-30 01:45:00 26.7522
2013-09-30 02:45:00 5.0332
2013-09-30 03:45:00 2.6974
Short answer
df %>%
group_by(t = cut(time, "30 min")) %>%
summarise(v = mean(value))
Long answer
Since, you want to compress the 15 minutes time series to a smaller resolution (30 minutes), you should use dplyr package or any other package that computes the "group by" concept.
For instance:
s = seq(as.POSIXct("2017-01-01"), as.POSIXct("2017-01-02"), "15 min")
df = data.frame(time = s, value=1:97)
df is a time series with 97 rows and two columns.
head(df)
time value
1 2017-01-01 00:00:00 1
2 2017-01-01 00:15:00 2
3 2017-01-01 00:30:00 3
4 2017-01-01 00:45:00 4
5 2017-01-01 01:00:00 5
6 2017-01-01 01:15:00 6
The cut.POSIXt, group_by and summarise functions do the work:
df %>%
group_by(t = cut(time, "30 min")) %>%
summarise(v = mean(value))
t v
1 2017-01-01 00:00:00 1.5
2 2017-01-01 00:30:00 3.5
3 2017-01-01 01:00:00 5.5
4 2017-01-01 01:30:00 7.5
5 2017-01-01 02:00:00 9.5
6 2017-01-01 02:30:00 11.5
A more robust way is to convert 15 minutes values into hourly values by taking average. Then do whatever operation you want to.
### 15 Minutes Data
min15 <- structure(list(V1 = structure(1:8, .Label = c("2013-01-01 00:00:00",
"2013-01-01 00:15:00", "2013-01-01 00:30:00", "2013-01-01 00:45:00",
"2013-01-01 01:00:00", "2013-01-01 01:15:00", "2013-01-01 01:30:00",
"2013-01-01 01:45:00"), class = "factor"), V2 = c(16.4251, 18.4495,
7.2125, 12.1913, 12.4606, 12.7299, 12.9992, 26.7522)), .Names = c("V1",
"V2"), class = "data.frame", row.names = c(NA, -8L))
min15
### Hourly Data
hourly <- structure(list(V1 = structure(1:4, .Label = c("2013-01-01 00:00:00",
"2013-01-01 01:00:00", "2013-01-01 02:00:00", "2013-01-01 03:00:00"
), class = "factor"), V2 = c(19.744, 27.866, 26.227, 16.013)), .Names = c("V1",
"V2"), class = "data.frame", row.names = c(NA, -4L))
hourly
### Convert 15min data into hourly data by taking average of 4 values
min15$V1 <- as.POSIXct(min15$V1,origin="1970-01-01 0:0:0")
min15 <- aggregate(. ~ cut(min15$V1,"60 min"),min15[setdiff(names(min15), "V1")],mean)
min15
names(min15) <- c("time","min15")
names(hourly) <- c("time","hourly")
### merge the corresponding values
combined <- merge(hourly,min15)
### average of hourly and 15min values
rowMeans(combined[,2:3])

Resources