Identify Weekdays and Time from a xts object R - r

I have an xts object x2 like this:
str(x2)
An ‘xts’ object on 2016-01-31 23:15:00/2016-02-26 22:55:00 containing:
Data: num [1:5700, 1] 1.08 1.08 1.08 1.08 1.08 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr "close"
Indexed by objects of class: [POSIXct,POSIXt] TZ: America/New_York
xts Attributes:
NULL
head(x2, 10)
close
2016-01-31 23:15:00 1.083390
2016-01-31 23:20:00 1.083350
2016-01-31 23:25:00 1.083125
2016-01-31 23:30:00 1.083360
2016-01-31 23:35:00 1.083240
2016-01-31 23:40:00 1.083190
2016-01-31 23:45:00 1.083165
2016-01-31 23:50:00 1.083020
2016-01-31 23:55:00 1.082965
2016-02-01 00:00:00 1.082200
And now I would like to identify for example all Mondays from 8:00 to 10:00. Is there a smart way to get this? Thank you guys.

You can use .indexwday() and .indexhour(). A little example follows:
library(xts)
# 1000 observations on a 5-minute (300 second) grid starting 2016-01-01
seqTime <- seq(as.POSIXct("2016-01-01"), by = 300, length.out = 1000)
myXts <- xts(rnorm(1000), seqTime)
# Weekday 1 is Monday (0 = Sunday); hours 8 and 9 cover 08:00 through 09:55
isMonday <- .indexwday(myXts) == 1
isMorning <- .indexhour(myXts) %in% c(8, 9)
myXts[isMonday & isMorning]
with output:
[,1]
2016-01-04 08:00:00 0.74224022
2016-01-04 08:05:00 -0.50372235
2016-01-04 08:10:00 0.94655985
2016-01-04 08:15:00 -0.80261212
2016-01-04 08:20:00 0.90475246
2016-01-04 08:25:00 -0.72225021
2016-01-04 08:30:00 -0.32635167
2016-01-04 08:35:00 0.94919253
2016-01-04 08:40:00 0.33799147
2016-01-04 08:45:00 1.19636284
2016-01-04 08:50:00 0.13022675
2016-01-04 08:55:00 -0.61397227
2016-01-04 09:00:00 -2.14580209
2016-01-04 09:05:00 -0.02778257
2016-01-04 09:10:00 -0.73649967
2016-01-04 09:15:00 0.31217192
2016-01-04 09:20:00 -0.30923692
2016-01-04 09:25:00 0.64499992
2016-01-04 09:30:00 -1.84125238
2016-01-04 09:35:00 2.43008526
2016-01-04 09:40:00 -1.85907819
2016-01-04 09:45:00 0.31648160
2016-01-04 09:50:00 -0.02847419
2016-01-04 09:55:00 -0.09911078

Related

Fill Missing Interval Values in r

I have a data with 4 variables, for which 2 of them are date variables. I would like to check whether the intervals for rows with TYPE == “OT” or TYPE == “NON-OT” fall within the interval of the preceding row with TYPE == “ICU”.
Data:
df <- structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), TYPE = c("NON-OT", "NON-OT", "OT", "ICU", "OT",
"NON-OT", "OT", "NON-OT", "ICU", "OT", "OT", "ICU", "OT", "OT",
"NON-OT", "OT", "NON-OT"), DATE1 = structure(c(1427214540, 1427216280,
1427279700, 1427370420, 1427543700, 1427564520, 1427800800, 1427849280,
1427850240, 1427927400, 1428155400, 1428166380, 1428514500, 1428927000,
1429167600, 1429264500, 1429388160), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), DATE2 = structure(c(1427216280, 1427370420,
1427279700, 1427564520, 1427543700, 1427849280, 1427800800, 1427850240,
1428166380, 1427927400, 1428155400, 1429388160, 1428514500, 1428927000,
1429167600, 1429264500, 1430362020), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), .Names = c("id", "TYPE", "DATE1", "DATE2"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-17L))
# id TYPE DATE1 DATE2
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00
This is what I have done:
Obtain a new variable, INT that gives the interval between DATE1 and DATE2 for every row.
Obtain another variable, INT_ICU that gives the interval for rows with TYPE == “ICU” only and fill down (This is where the problem comes as the fill function in tidyr could not fill in the missing interval values.)
Obtain a logical variable, WITHIN_ICU, which gives TRUE if the interval is within the interval of ICU and FALSE otherwise.
Code:
library(tidyverse)
df %>%
mutate(INT = interval(DATE1, DATE2),
INT_ICU = if_else(TYPE == "ICU", interval(DATE1, DATE2), NA_real_)) %>%
fill(INT_ICU) %>%
mutate(WITHIN_ICU = INT %within% INT_ICU)
Output:
As you can see, there are a lot of missing values in INT_ICU variables even when I have applied fill function.
# id TYPE DATE1 DATE2 INT INT_ICU WITHIN_ICU
# <dbl> <chr> <dttm> <dttm> <S4: Interval> <S4: Interval> <lgl>
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 2015-03-24 16:29:00 UTC--2015-03-24 16:58:00 UTC NA--NA NA
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 2015-03-24 16:58:00 UTC--2015-03-26 11:47:00 UTC NA--NA NA
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00 2015-03-25 10:35:00 UTC--2015-03-25 10:35:00 UTC NA--NA NA
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00 2015-03-28 11:55:00 UTC--2015-03-28 11:55:00 UTC NA--NA NA
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 2015-03-28 17:42:00 UTC--2015-04-01 00:48:00 UTC NA--NA NA
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00 2015-03-31 11:20:00 UTC--2015-03-31 11:20:00 UTC NA--NA NA
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 2015-04-01 00:48:00 UTC--2015-04-01 01:04:00 UTC NA--NA NA
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00 2015-04-01 22:30:00 UTC--2015-04-01 22:30:00 UTC NA--NA NA
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00 2015-04-04 13:50:00 UTC--2015-04-04 13:50:00 UTC NA--NA NA
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00 2015-04-08 17:35:00 UTC--2015-04-08 17:35:00 UTC NA--NA NA
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00 2015-04-13 12:10:00 UTC--2015-04-13 12:10:00 UTC NA--NA NA
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 2015-04-16 07:00:00 UTC--2015-04-16 07:00:00 UTC NA--NA NA
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00 2015-04-17 09:55:00 UTC--2015-04-17 09:55:00 UTC NA--NA NA
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 2015-04-18 20:16:00 UTC--2015-04-30 02:47:00 UTC NA--NA NA
Desired Output:
# id TYPE DATE1 DATE2 WITHIN_ICU
# <dbl> <chr> <dttm> <dttm> <lgl>
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 NA
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 NA
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00 NA
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 TRUE
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00 TRUE
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 FALSE
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00 FALSE
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 FALSE
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 TRUE
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00 TRUE
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00 TRUE
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 TRUE
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00 TRUE
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00 TRUE
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 TRUE
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00 TRUE
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 FALSE
This should work
# use own function to fill rather than using dplyr's fill
# Forward-fill a vector of intervals: every element whose `start` slot is NA
# is replaced by the element immediately before it. Leading NA elements are
# left as they are because there is no earlier value to copy.
#
# x: an S4 object with a `start` slot that supports `[` and `[<-`
#    (for example a lubridate Interval).
# Returns x with the NA gaps filled.
f2 <- function(x) {
  for (i in seq_along(x)[-1]) {
    # Check whether the start of the S4 interval object is NA
    if (is.na(x@start[i])) {
      x[i] <- x[i - 1]
    }
  }
  x
}
# INT is each row's own interval; INT_ICU is the interval of the most recent
# ICU row (NA before the first ICU row); WITHIN_ICU tests containment.
df %>%
  mutate(INT = interval(DATE1, DATE2),
         INT_ICU = if_else(TYPE == "ICU", interval(DATE1, DATE2), NA_real_)) %>%
  # Refer to the column directly; mutate() evaluates INT_ICU inside the data
  mutate(INT_ICU = f2(INT_ICU)) %>% # instead of fill
  mutate(WITHIN_ICU = INT %within% INT_ICU)
The output:
# A tibble: 17 x 6
id TYPE DATE1 DATE2 INT_ICU WITHIN_ICU
<dbl> <chr> <dttm> <dttm> <S4: Interval> <lgl>
1 1. NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 NA--NA NA
2 1. NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 NA--NA NA
3 1. OT 2015-03-25 10:35:00 2015-03-25 10:35:00 NA--NA NA
4 1. ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
5 1. OT 2015-03-28 11:55:00 2015-03-28 11:55:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
6 1. NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
7 1. OT 2015-03-31 11:20:00 2015-03-31 11:20:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
8 1. NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
9 1. ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
10 1. OT 2015-04-01 22:30:00 2015-04-01 22:30:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
11 1. OT 2015-04-04 13:50:00 2015-04-04 13:50:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
12 1. ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
13 1. OT 2015-04-08 17:35:00 2015-04-08 17:35:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
14 1. OT 2015-04-13 12:10:00 2015-04-13 12:10:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
15 1. NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
16 1. OT 2015-04-17 09:55:00 2015-04-17 09:55:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
17 1. NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC FALSE

Populating missing Date and Time in time-series data in R, with zoo package

I have a quarter- hour (15 min interval) frequency data.
sasan<-read.csv("sasanhz.csv", header = TRUE)
head(sasan)
Timestamp Avg.Hz
1 12/27/2017 12:15:00 AM 50.05
2 12/27/2017 12:30:00 AM 49.99
3 12/27/2017 12:45:00 AM 49.98
4 12/27/2017 01:00:00 AM 50.01
5 12/27/2017 01:15:00 AM 49.97
6 12/27/2017 01:30:00 AM 49.98
str(sasan)
'data.frame': 5501 obs. of 2 variables:
$ Timestamp: Factor w/ 5501 levels "01/01/2018 00:00:00 AM",..: 5112 5114 5116 5023 5025
5027 5029 5031 5033 5035 ...
$ Avg.Hz : num 50 50 50 50 50 ...
#change to posixct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")
In this time series, some date-time values in the column "Timestamp" are missing, and I want to impute them.
I have tried with zoo.
z<-zoo(sasan)
> head(z[1489:1497])
Timestamp Avg.Hz
1489 2018-01-11 12:15:00 50.02
1490 2018-01-11 12:30:00 49.99
1491 2018-01-11 12:45:00 49.94
1492 <NA> 49.98
1493 <NA> 50.02
1494 <NA> 49.95
While imputing NA value of dates and time with "na.locf" function in zoo package I am getting following error.
sasan_mis<-seq(start(z), end(z), by = times("00:15:00"))
> na.locf(z, xout = sasan_mis)
Error in approx(x[!na], y[!na], xout, ...) : zero non-NA points
In addition: Warning message:
In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
How to overcome this error? How can I impute this missing date-time? Appreciate your suggestion.
dput(head(z))
structure(c("2017-12-27 00:15:00", "2017-12-27 00:30:00", "2017-12-27 00:45:00",
"2017-12-27 01:00:00", "2017-12-27 01:15:00", "2017-12-27 01:30:00",
"50.05", "49.99", "49.98", "50.01", "49.97", "49.98"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("Timestamp", "Avg.Hz")), index = 1:6, class = "zoo")
The library package I have used are
library(ggplot2)
library(forecast)
library(tseries)
library(xts)
library(zoo)
library(dplyr)
Assuming that the OP has missing values in the Timestamp variable and is looking for a way to populate them:
na.approx from zoo package comes very handy in such cases.
# na.approx from zoo to populate missing values of Timestamp by linear
# interpolation. na.rm = FALSE keeps leading/trailing NAs (which cannot be
# interpolated) in place, so the result always has the same length as the
# column being replaced.
sasan$Timestamp <- as.POSIXct(na.approx(sasan$Timestamp, na.rm = FALSE), origin = "1970-1-1")
sasan
# 1 2017-12-27 00:15:00 50.05
# 2 2017-12-27 00:30:00 49.99
# 3 2017-12-27 00:45:00 49.98
# 4 2017-12-27 01:00:00 50.01
# 5 2017-12-27 01:15:00 49.97
# 6 2017-12-27 01:30:00 49.98
# 7 2017-12-27 01:45:00 49.98
# 8 2017-12-27 02:00:00 50.02
# 9 2017-12-27 02:15:00 49.95
# 10 2017-12-27 02:30:00 49.98
Data
# OP's data has been slightly modified to include NAs
sasan <- read.table(text =
"Timestamp Avg.Hz
1 '12/27/2017 12:15:00 AM' 50.05
2 '12/27/2017 12:30:00 AM' 49.99
3 '12/27/2017 12:45:00 AM' 49.98
4 '12/27/2017 01:00:00 AM' 50.01
5 '12/27/2017 01:15:00 AM' 49.97
6 '12/27/2017 01:30:00 AM' 49.98
7 <NA> 49.98
8 <NA> 50.02
9 <NA> 49.95
10 '12/27/2017 02:30:00 AM' 49.98",
header = TRUE, stringsAsFactors = FALSE)
# convert to POSIXct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")

subset data with timestamp irrespective of date in R

The Dataset
head(data)
Date OPEN
2015-11-30 10:00:00 951.15
2015-11-30 10:30:00 949.90
2015-11-30 11:00:00 943.45
2015-11-30 11:30:00 944.30
2015-11-30 12:00:00 942.00
2015-11-30 12:30:00 940.60
2015-01-01 10:00:00 951.15
2015-01-01 10:30:00 949.90
2015-01-02 10:30:00 943.45
2015-01-02 11:30:00 944.30
2015-01-03 10:00:00 943.45
2015-01-03 10:30:00 943.45
2015-01-03 11:30:00 944.30
2015-01-06 10:00:00 942.00
2015-01-06 10:30:00 940.60
2015-01-06 11:00:00 940.60
2015-01-06 11:30:00 942.00
str(data)
'data.frame': 32023 obs. of 2 variables:
$ Date : POSIXct, format: "2015-11-30 10:00:00" "2015-11-30 10:30:00" "2015-11-30 11:00:00" ...
$ OPEN : num 951 950 943 944 942 ...
Hi,
Dataframe is mentioned above. I want to extract OPEN prices with timestamps 10:00 and 10:30 for all the dates available. I only need to keep timestamps 10:00 to 10:30 in filter condition irrespective of dates. Please suggest in R.
Thanks.
We can format the 'Date' to extract the HH:MM part, use %in% to get a logical vector and subset based on that.
subset(data, format(Date, "%H:%M") %in% c("10:00", "10:30"), select="OPEN")
# OPEN
#1 951.15
#2 949.90
#7 951.15
#8 949.90
#9 943.45
#11 943.45
#12 943.45
#14 942.00
#15 940.60
If it is between those intervals
library(chron)
# Keep rows whose time of day lies in [10:00:00, 10:30:00], whatever the date.
# Plain comparisons on chron "times" values are used so that no package other
# than chron is needed.
subset(
  data,
  times(format(Date, "%H:%M:%S")) >= times("10:00:00") &
    times(format(Date, "%H:%M:%S")) <= times("10:30:00")
)
you can use lubridate package to make a friendly subset:
library(lubridate)
res <- subset(data, minute(Date) <=30 & hour(Date) == 10)

R: Compare data.table and pass variable while respecting key

I have two data.tables:
original <- data.frame(id = c(rep("RE01",5),rep("RE02",5)),date.time = head(seq.POSIXt(as.POSIXct("2015-11-01 01:00:00"),as.POSIXct("2015-11-05 01:00:00"),60*60*10),10))
compare <- data.frame(id = c("RE01","RE02"),seq = c(1,2),start = as.POSIXct(c("2015-11-01 20:00:00","2015-11-04 08:00:00")),end = as.POSIXct(c("2015-11-02 08:00:00","2015-11-04 20:00:00")))
setDT(original)
setDT(compare)
I would like to check the date in each row of original and see if it lies between the start and finish dates of compare whilst respecting the id. If it does lie between the two elements, a variable should be passed to original (compare$diff.seq). The output should look like this:
original
id date.time diff.seq
1 RE01 2015-11-01 01:00:00 NA
2 RE01 2015-11-01 11:00:00 NA
3 RE01 2015-11-01 21:00:00 1
4 RE01 2015-11-02 07:00:00 1
5 RE01 2015-11-02 17:00:00 NA
6 RE02 2015-11-03 03:00:00 NA
7 RE02 2015-11-03 13:00:00 NA
8 RE02 2015-11-03 23:00:00 NA
9 RE02 2015-11-04 09:00:00 2
10 RE02 2015-11-04 19:00:00 2
I've been reading the manual and SO for hours and trying "on", "by" and so on.. without any success. Can anybody point me in the right direction?
As said in the comments, this is very straight forward using data.table::foverlaps
You basically have to create an additional column in the original data set in order to set the join boundaries, then key the two data sets by the columns you want to join on, and then simply run foverlaps and select the desired columns.
original[, end := date.time]
setkey(original, id, date.time, end)
setkey(compare, id, start, end)
foverlaps(original, compare)[, .(id, date.time, seq)]
# id date.time seq
# 1: RE01 2015-11-01 01:00:00 NA
# 2: RE01 2015-11-01 11:00:00 NA
# 3: RE01 2015-11-01 21:00:00 1
# 4: RE01 2015-11-02 07:00:00 1
# 5: RE01 2015-11-02 17:00:00 NA
# 6: RE02 2015-11-03 03:00:00 NA
# 7: RE02 2015-11-03 13:00:00 NA
# 8: RE02 2015-11-03 23:00:00 NA
# 9: RE02 2015-11-04 09:00:00 2
# 10: RE02 2015-11-04 19:00:00 2
Alternatively, you can run foverlaps the other way around and then just update the original data set by reference while selecting the correct rows to update
# which = TRUE returns the matching row numbers only:
# xid indexes rows of compare, yid indexes rows of original.
indx <- foverlaps(compare, original, which = TRUE)
# Look up the seq value of the matched compare row instead of storing the row
# number itself; the two only coincide when seq happens to be 1, 2, ...
original[indx$yid, diff.seq := compare$seq[indx$xid]]
original
# id date.time end diff.seq
# 1: RE01 2015-11-01 01:00:00 2015-11-01 01:00:00 NA
# 2: RE01 2015-11-01 11:00:00 2015-11-01 11:00:00 NA
# 3: RE01 2015-11-01 21:00:00 2015-11-01 21:00:00 1
# 4: RE01 2015-11-02 07:00:00 2015-11-02 07:00:00 1
# 5: RE01 2015-11-02 17:00:00 2015-11-02 17:00:00 NA
# 6: RE02 2015-11-03 03:00:00 2015-11-03 03:00:00 NA
# 7: RE02 2015-11-03 13:00:00 2015-11-03 13:00:00 NA
# 8: RE02 2015-11-03 23:00:00 2015-11-03 23:00:00 NA
# 9: RE02 2015-11-04 09:00:00 2015-11-04 09:00:00 2
# 10: RE02 2015-11-04 19:00:00 2015-11-04 19:00:00 2

Compute column average based on date and time in R

I have a matrix, which looks a bit like this:
Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492
It contains a year's worth of data, with each day having 96 rows (15 minute intervals from 00:00 to 23:45). My question is that I'd like to average the data column, for each day, based on the time range I specify. For example, if I wanted to average over times 06:00 - 08:00 for each day, in the code above I should get an answer of 1.0964 for the date 04/09/2014.
I have no idea how to do this using the date and time columns as filters, and wondered if someone could help?
To make things even more complicated, I would also like to compute 45 minute rolling averages for each day, within a different time period, say 04:00 - 09:00. Again, as this is for each day, it would be good to get the result in a matrix for which each row is a certain date, then the columns would represent the rolling averages from say, 04:00 - 04:45, 04:15 - 05:00...
Any ideas?!
check the following code and let me know if anything is unclear
# Sample of the question's data: row index, date (dd/mm/yyyy), time and value.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
data <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "Index Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492")
library("magrittr")
data$parsed.timestamp = paste(data$Date, data$Time) %>% strptime(., format = "%d/%m/%Y %H:%M:%S")
# Hourly Average
desiredGroupingUnit = cut(data$parsed.timestamp, breaks = "hour") #You can use substr for that also
aggregate(data$Data, by = list(desiredGroupingUnit), FUN = mean )
# Group.1 x
# 1 2014-09-04 05:00:00 0.908
# 2 2014-09-04 06:00:00 1.286
# 3 2014-09-04 07:00:00 0.951
# 4 2014-09-04 08:00:00 0.937
# 5 2014-09-04 09:00:00 1.113
# 6 2014-09-04 10:00:00 0.492
# Moving average
# Mean of data$Data over the rows whose data$parsed.timestamp lies in the
# closed range [startTime, endTime].
#
# data:      data frame with columns `parsed.timestamp` and `Data`
# startTime: lower bound of the window (inclusive)
# endTime:   upper bound of the window (inclusive)
# Returns the mean as a single number (NaN when no row falls in the window).
getAvgBetweenTwoTimeStamps <- function(data, startTime, endTime) {
  stamps <- data$parsed.timestamp
  inWindow <- stamps >= startTime & stamps <= endTime
  # which() turns the mask into row numbers and drops any NA comparisons
  mean(data$Data[which(inWindow)])
}
# Window length in seconds (45 minutes); adding a number to a date-time
# advances it by that many seconds.
movingAvgWindow <- 45 * 60
# One window per observation, starting at that observation's timestamp
movingAvgTimestamps <- data.frame(from = data$parsed.timestamp, to = data$parsed.timestamp + movingAvgWindow)
# Iterate over row numbers instead of apply(): apply() would first turn the
# data frame into a character matrix and pass the timestamps on as strings.
movingAvgTimestamps$movingAvg <- vapply(
  seq_len(nrow(movingAvgTimestamps)),
  function(i) {
    getAvgBetweenTwoTimeStamps(
      data = data,
      startTime = movingAvgTimestamps$from[i],
      endTime = movingAvgTimestamps$to[i]
    )
  },
  numeric(1)
)
print(movingAvgTimestamps)
# from to movingAvg
# 1 2014-09-04 05:45:00 2014-09-04 06:30:00 1.1010000
# 2 2014-09-04 06:00:00 2014-09-04 06:45:00 1.2860000
# 3 2014-09-04 06:15:00 2014-09-04 07:00:00 1.3550000
# 4 2014-09-04 06:30:00 2014-09-04 07:15:00 1.2530000
# 5 2014-09-04 06:45:00 2014-09-04 07:30:00 1.1000000
# 6 2014-09-04 07:00:00 2014-09-04 07:45:00 0.9510000
# 7 2014-09-04 07:15:00 2014-09-04 08:00:00 0.8900000
# 8 2014-09-04 07:30:00 2014-09-04 08:15:00 0.9120000
# 9 2014-09-04 07:45:00 2014-09-04 08:30:00 0.9500000
# 10 2014-09-04 08:00:00 2014-09-04 08:45:00 0.9370000
# 11 2014-09-04 08:15:00 2014-09-04 09:00:00 1.0620000
# 12 2014-09-04 08:30:00 2014-09-04 09:15:00 1.1320000
# 13 2014-09-04 08:45:00 2014-09-04 09:30:00 1.0880000
# 14 2014-09-04 09:00:00 2014-09-04 09:45:00 1.1130000
# 15 2014-09-04 09:15:00 2014-09-04 10:00:00 0.8810000
# 16 2014-09-04 09:30:00 2014-09-04 10:15:00 0.8626667
# 17 2014-09-04 09:45:00 2014-09-04 10:30:00 0.7960000
# 18 2014-09-04 10:00:00 2014-09-04 10:45:00 0.4920000

Resources