Merge on date and hour range - r

I have a dataframe of datetimes, like so:
library(lubridate)
date_seq <- seq.POSIXt(ymd_hm('2016-04-01 0:00'), ymd_hm('2016-04-30 23:30'), by = '30 mins')
datetimes <- data.frame(datetime = date_seq)
I've also got a dataframe containing opening times that specify a range of days over which the opening times apply and an hour range over which the store is open for the days in the date range, like so:
opening_times <- data.frame(from_date = c('2016-03-01', '2016-04-15'),
                            till_date = c('2016-04-15', '2016-05-20'),
                            from_time = c('11:00', '10:30'),
                            till_time = c('22:00', '23:00'))
What I would like is to mark in datetimes those rows which are inside the opening hours. That is, I want a column that is TRUE whenever the datetime in the row is within both from_date and till_date and within from_time and till_time.

If the dataset isn't too big, I'd recommend creating a new dataset from opening_times -
opening_times$from_date = as.Date(opening_times$from_date, '%Y-%m-%d')
opening_times$till_date = as.Date(opening_times$till_date, '%Y-%m-%d')
opening_times2 = do.call(
  rbind,
  lapply(
    seq(nrow(opening_times)),
    function(rownumber) {
      data.frame(
        # name the date column so it can serve as the merge key
        date = seq.Date(
          from = opening_times[rownumber, 'from_date'],
          to = opening_times[rownumber, 'till_date'],
          by = 1
        ),
        from_time = opening_times[rownumber, 'from_time'],
        till_time = opening_times[rownumber, 'till_time']
      )
    }
  )
)
and then merging it with datetimes by date and checking for whether time falls between the two values.
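For completeness, here is a minimal sketch of that merge-and-check step (it assumes the date column name date used above; note that a date covered by more than one range, such as 2016-04-15 here, will produce duplicate rows after the merge):
datetimes$date <- as.Date(datetimes$datetime)
# merge() returns the rows sorted by the key column
merged <- merge(datetimes, opening_times2, by = 'date', all.x = TRUE)
# compare zero-padded "HH:MM" strings; as.character() guards against factor columns
merged$open <- !is.na(merged$from_time) &
  format(merged$datetime, '%H:%M') >= as.character(merged$from_time) &
  format(merged$datetime, '%H:%M') <= as.character(merged$till_time)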

lubridate has a %within% function for checking whether a time is within a lubridate::interval which can make this easy once you create a vector of intervals:
# make a sequence of days in each set from opening_times
open_intervals <- apply(opening_times, 1, function(x){
  dates <- seq.Date(ymd(x[1]), ymd(x[2]), by = 'day')
})
# turn each date into a lubridate::interval object with the appropriate times
open_intervals <- mapply(function(dates, from, to){
  interval(ymd_hm(paste(dates, from)), ymd_hm(paste(dates, to)))
}, open_intervals, opening_times$from_time, opening_times$till_time)
# combine list items into one vector of intervals
open_intervals <- do.call(c, open_intervals)
# use lubridate::%within% to check if each datetime is in any open interval
datetimes$open <- sapply(datetimes$datetime, function(x){
  any(x %within% open_intervals)
})
datetimes[20:26,]
# datetime open
# 20 2016-04-01 09:30:00 FALSE
# 21 2016-04-01 10:00:00 FALSE
# 22 2016-04-01 10:30:00 FALSE
# 23 2016-04-01 11:00:00 TRUE
# 24 2016-04-01 11:30:00 TRUE
# 25 2016-04-01 12:00:00 TRUE
# 26 2016-04-01 12:30:00 TRUE
Edit
If you have exactly two sets of hours, you can condense the whole thing into a (somewhat huge) ifelse:
datetimes$open <- ifelse(as.Date(datetimes$datetime) %within%
                           interval(opening_times$from_date[1],
                                    opening_times$till_date[1]),
                         hm(format(datetimes$datetime, '%H:%M')) >= hm(opening_times$from_time)[1] &
                           hm(format(datetimes$datetime, '%H:%M')) <= hm(opening_times$till_time)[1],
                         hm(format(datetimes$datetime, '%H:%M')) >= hm(opening_times$from_time)[2] &
                           hm(format(datetimes$datetime, '%H:%M')) <= hm(opening_times$till_time)[2])
or
datetimes$open <- ifelse(as.Date(datetimes$datetime) %within%
                           interval(opening_times$from_date[1],
                                    opening_times$till_date[1]),
                         datetimes$datetime %within%
                           interval(ymd_hm(paste(as.Date(datetimes$datetime), opening_times$from_time[1])),
                                    ymd_hm(paste(as.Date(datetimes$datetime), opening_times$till_time[1]))),
                         datetimes$datetime %within%
                           interval(ymd_hm(paste(as.Date(datetimes$datetime), opening_times$from_time[2])),
                                    ymd_hm(paste(as.Date(datetimes$datetime), opening_times$till_time[2]))))

Related

Compare date intervals within the same data frame

I have searched around and found similar questions, but I can't make them work for my data.
I have a data frame with start and end dates, as well as several other factors. Ideally, the start date of a row should be later than the end date of any previous row, but the data has duplicated starts or ends, and sometimes the date intervals overlap.
I tried to make a reproducible example:
df = data.frame(start = c("2018/04/15 9:00:00","2018/04/15 9:00:00","2018/04/16 10:20:00",
                          "2018/04/16 15:30:00","2018/04/17 12:40:00","2018/04/17 18:50:00"),
                end = c("2018/04/16 8:00:00","2018/04/16 7:10:00","2018/04/17 18:20:00",
                        "2018/04/16 16:30:00","2018/04/17 16:40:00","2018/04/17 19:50:00"),
                value = c(10,15,11,13,14,12))
I was able to remove the duplicated start or end dates, but I can't remove the overlapping intervals. I want to create a loop that "cleans" the intervals contained within any larger interval, so the result looks like this:
result = df[c(1,3,6),]
I thought I could make a loop that would "clean" both duplicates and overlapping intervals, but I can't make it work.
Any suggestions?
The data.table package is well suited to this kind of problem, using the overlapping-join function foverlaps (inspired by the findOverlaps function from the Bioconductor package IRanges) and then an anti-join (data.table syntax B[!A, on]) to remove the inner intervals.
library(data.table)
cols <- c("start", "end")
setDT(df)
df[, (cols) := lapply(.SD, function(x) as.POSIXct(x, format="%Y/%m/%d %H:%M:%S")), .SDcols=cols]
setkeyv(df, cols)
anti <- foverlaps(df, df, type="within")[start!=i.start | end!=i.end | value!=i.value]
df[!anti, on=.(start=i.start, end=i.end, value=i.value)]
# start end value
# 1: 2018-04-15 09:00:00 2018-04-16 08:00:00 10
# 2: 2018-04-16 10:20:00 2018-04-17 18:20:00 11
# 3: 2018-04-17 18:50:00 2018-04-17 19:50:00 12
An alternative approach is to use %within% from the lubridate package:
library(lubridate)
# transform characters to dates
start_time <- as_datetime(df[ , "start"], tz = "UTC")
end_time <- as_datetime(df[ , "end"], tz = "UTC")
# construct intervals
start_end_intrvls <- interval(start_time, end_time)
# find indices of the non-within intervals
not_within <- !(sapply(FUN = function(i) any(start_end_intrvls[i] %within% start_end_intrvls[-i]),
                       X = seq(along.with = df[ , "start"])))
df[not_within, ]
# start end value
# 1 2018/04/15 9:00:00 2018/04/16 8:00:00 10
# 3 2018/04/16 10:20:00 2018/04/17 18:20:00 11
# 6 2018/04/17 18:50:00 2018/04/17 19:50:00 12
Update
The as_datetime() function causes an error when applied to a tibble:
as_datetime(tibble("2018/04/15 9:00:00"), tz = "UTC")
Error in as.POSIXct.default(x) :
do not know how to convert 'x' to class “POSIXct”
The solution above may be modified to resolve this issue by substituting as.POSIXlt() for as_datetime():
df_tibble <- tibble(start = c("2018/04/15 9:00:00","2018/04/15 9:00:00","2018/04/16 10:20:00",
                              "2018/04/16 15:30:00","2018/04/17 12:40:00","2018/04/17 18:50:00"),
                    end = c("2018/04/16 8:00:00","2018/04/16 7:10:00","2018/04/17 18:20:00",
                            "2018/04/16 16:30:00","2018/04/17 16:40:00","2018/04/17 19:50:00"),
                    value = c(10,15,11,13,14,12))
start_time_lst <- lapply(FUN = function(i) as.POSIXlt(as.character(df_tibble[i, "start"]), tz = "UTC"),
                         X = seq(along.with = unlist(df_tibble[ , "start"])))
end_time_lst <- lapply(FUN = function(i) as.POSIXlt(as.character(df_tibble[i, "end"]), tz = "UTC"),
                       X = seq(along.with = unlist(df_tibble[ , "end"])))
start_end_intrvls <- lapply(FUN = function(i) interval(start_time_lst[[i]], end_time_lst[[i]]),
                            X = seq(along.with = unlist(df_tibble[ , "start"])))
not_within <- sapply(FUN = function(i) !(any(unlist(Map(`%within%`,
                                                        start_end_intrvls[[i]], start_end_intrvls[-i])))),
                     X = seq(along.with = unlist(df_tibble[ , "start"])))
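The kept rows can then be extracted with the logical index, just as in the data.frame version:
df_tibble[not_within, ]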

How to clean a time column in r

I have a time column in R as:
22:34:47
06:23:15
7:35:15
5:45
How do I make all the time values in the column into hh:mm:ss format? I have used
as_date(a$time, tz = NULL) but I am not able to get the format I wanted.
Here is an option with parse_date_time which can take multiple formats
library(lubridate)
format(parse_date_time(time, c("HMS", "HM"), tz = "GMT"), "%H:%M:%S")
#[1] "22:34:47" "06:23:15" "07:35:15" "05:45:00"
data
time <- c("22:34:47", "06:23:15", "7:35:15", "5:45")
Nothing a bit of formatting can't take care of:
x <- c("22:34:47","06:23:15","7:35:15","5:45")
format(
  pmax(
    as.POSIXct(x, format="%T", tz="UTC"),
    as.POSIXct(x, format="%R", tz="UTC"), na.rm=TRUE
  ),
  "%T"
)
#[1] "22:34:47" "06:23:15" "07:35:15" "05:45:00"
The pmax means any additional seconds will be taken in preference to just hh:mm.
You could get functional if you wanted to get a similar result with less typing, and more opportunity for turning it into a repeatable function.
do.call(pmax, c(lapply(c("%T","%R"), as.POSIXct, x=x, tz="UTC"), na.rm=TRUE))
Using a tidyverse approach with dplyr and hms verbs.
library(dplyr)
library(hms)
a <- tibble(time = c("22:34:47", "06:23:15", "7:35:15", "5:45"))
a %>%
  mutate(
    time = case_when(
      is.na(parse_hms(time)) ~ parse_hm(time),
      TRUE ~ parse_hms(time)
    )
  )
# # A tibble: 4 x 1
# time
# <time>
# 1 22:34
# 2 06:23
# 3 07:35
# 4 05:45
Note that the use of case_when could be replaced with an ifelse. The reason for this conditional is that parse_hms will return NA for values without seconds.
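For example, here is a minimal sketch of that variant; it uses dplyr::if_else rather than base ifelse, since base ifelse would drop the hms class and return bare seconds:
a %>%
  mutate(time = if_else(is.na(parse_hms(time)), parse_hm(time), parse_hms(time)))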
You may also want the output to be a POSIXct value; you can adapt the previous solution to do so.
a %>%
  mutate(
    time = case_when(
      is.na(parse_hms(time)) ~ as.POSIXct(parse_hm(time)),
      TRUE ~ as.POSIXct(parse_hms(time))
    )
  )
# # A tibble: 4 x 1
# time
# <dttm>
# 1 1970-01-01 22:34:47
# 2 1970-01-01 06:23:15
# 3 1970-01-01 07:35:15
# 4 1970-01-01 05:45:00
Note this will set the date to origin, which is 1970-01-01 by default.

Convert continuous time-series data into daily-hourly representation using R

I have time-series data in xts representation as
library(xts)
xtime <- timeBasedSeq('2015-01-01/2015-01-30 23')
df <- xts(rnorm(length(xtime),30,4),xtime)
Now I want to calculate the correlation between different days, and hence I want to represent df as a matrix with one row per day and one column per hour.
To achieve this I used
p_mat= split(df,f="days",drop=FALSE,k=1)
Using this I get a list of days, but I am not able to arrange this list in matrix form. Also I used
p_mat<- df[.indexday(df) %in% c(1:30) & .indexhour(df) %in% c(1:24)]
With this I do not get any output.
Also I tried to use rollapply(), but was not able to arrange it properly.
Could I get help forming the matrix using xts/zoo objects?
Maybe you could use something like this:
#convert to a data.frame with an hour column and a day column
df2 <- data.frame(value = df,
                  hour = format(index(df), '%H:%M:%S'),
                  day = format(index(df), '%Y:%m:%d'),
                  stringsAsFactors = FALSE)
#then use xtabs, which outputs a matrix in the format you need
tab <- xtabs(value ~ day + hour, df2)
Output:
hour
day 00:00:00 01:00:00 02:00:00 03:00:00 04:00:00 05:00:00 06:00:00 07:00:00 08:00:00 09:00:00 10:00:00 11:00:00 12:00:00
2015:01:01 28.15342 35.72913 27.39721 29.17048 28.42877 28.72003 28.88355 31.97675 29.29068 27.97617 35.37216 29.14168 29.28177
2015:01:02 23.85420 28.79610 27.88688 27.39162 29.77241 22.34256 34.70633 23.34011 28.14588 25.53632 26.99672 38.34867 30.06958
2015:01:03 37.47716 31.70040 29.04541 34.23393 33.54569 27.52303 38.82441 28.97989 24.30202 29.42240 30.83015 39.23191 30.42321
2015:01:04 24.13100 32.08409 29.36498 35.85835 26.93567 28.27915 26.29556 29.29158 31.60805 27.07301 33.32149 25.16767 25.80806
2015:01:05 32.16531 29.94640 32.04043 29.34250 31.68278 28.39901 24.51917 33.95135 36.07898 28.76504 24.98684 32.56897 29.82116
2015:01:06 18.44432 27.43807 32.28203 29.76111 29.60729 32.24328 25.25417 34.38711 29.97862 32.82924 34.13643 30.89392 26.48517
2015:01:07 34.58491 20.38762 32.29096 31.49890 28.29893 33.80405 28.44305 28.86268 33.42964 36.87851 31.08022 28.31126 25.24355
2015:01:08 33.67921 31.59252 28.36989 35.29703 27.19507 27.67754 25.99571 27.32729 33.78074 31.73481 34.02064 28.43953 31.50548
2015:01:09 28.46547 36.61658 36.04885 30.33186 32.26888 25.90181 31.29203 34.17445 30.39631 28.18345 27.37687 29.85631 34.27665
2015:01:10 30.68196 26.54386 32.71692 28.69160 23.72367 28.53020 35.45774 28.66287 32.93100 33.78634 30.01759 28.59071 27.88122
2015:01:11 32.70907 31.51985 29.22881 36.31157 32.38494 25.30569 29.37743 22.32436 29.21896 19.63069 35.25601 27.45783 28.28008
2015:01:12 29.96676 30.51542 29.41650 29.34436 37.05421 33.05035 34.44572 26.30717 30.65737 34.61930 29.77391 21.48256 31.37938
2015:01:13 33.46089 34.29776 37.58262 27.58801 28.43653 28.33511 28.49737 28.53348 28.81729 35.76728 27.20985 28.44733 32.61015
2015:01:14 22.96213 32.27889 36.44939 23.45088 26.88173 27.43529 27.27547 21.86686 32.00385 23.87281 29.90001 32.37194 29.20722
2015:01:15 28.30359 30.94721 20.62911 33.84679 27.58230 26.98849 23.77755 24.18443 30.22533 32.03748 21.60847 25.98255 32.14309
2015:01:16 23.52449 29.56138 31.76356 35.40398 24.72556 31.45754 30.93400 34.77582 29.88836 28.57080 25.41274 27.93032 28.55150
2015:01:17 25.56436 31.23027 25.57242 31.39061 26.50694 30.30921 28.81253 25.26703 30.04517 33.96640 36.37587 24.50915 29.00156
...and so on
Here's one way to do it using a helper function that will account for days that do not have 24 observations.
library(xts)
xtime <- timeBasedSeq('2015-01-01/2015-01-30 23')
set.seed(21)
df <- xts(rnorm(length(xtime),30,4), xtime)
tHourly <- function(x) {
  # initialize result matrix for all 24 hours
  dnames <- list(format(index(x[1]), "%Y-%m-%d"),
                 paste0("H", 0:23))
  res <- matrix(NA, 1, 24, dimnames = dnames)
  # transpose day's rows and set colnames
  tx <- t(x)
  colnames(tx) <- paste0("H", .indexhour(x))
  # update result object and return
  res[, colnames(tx)] <- tx
  res
}
# split on days, apply tHourly to each day, rbind results
p_mat <- split(df, f="days", drop=FALSE, k=1)
p_list <- lapply(p_mat, tHourly)
p_hmat <- do.call(rbind, p_list)
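Since the stated goal was the correlation between days, one minimal sketch of that final step (assuming a day-by-day correlation matrix is what's wanted) could be:
# rows of p_hmat are days and columns are hours; cor() correlates columns,
# so transpose first; pairwise-complete handles any missing hours
day_cor <- cor(t(p_hmat), use = "pairwise.complete.obs")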

Find range of values in each unique day

I have the following example:
Date1 <- seq(from = as.POSIXct("2010-05-01 02:00"),
             to = as.POSIXct("2010-10-10 22:00"), by = 3600)
Dat <- data.frame(DateTime = Date1,
                  t = rnorm(length(Date1)))
I would like to find the range of values in a given day (i.e. maximum - minimum).
First, I've defined additional columns which define the unique days in terms of the date and in terms of the day of year (doy).
Dat$date <- format(Dat$DateTime, format = "%Y-%m-%d") # find the unique days
Dat$doy <- as.numeric(format(Dat$DateTime, format="%j")) # find the unique days
To then find the range I tried
by(Dat$t, Dat$doy, function(x) range(x))
but this returns the range as two values, not a single value. So my question is: how do I find the calculated range for each day and return the results in a data.frame of the form
new_data <- data.frame(date = unique(Dat$date),
                       range = ...)
Can anyone suggest a method for doing this?
I tend to use tapply for this kind of thing. ave is also useful sometimes. Here:
> dr = tapply(Dat$t,Dat$doy,function(x){diff(range(x))})
Always check tricksy stuff:
> dr[1]
121
3.084317
> diff(range(Dat$t[Dat$doy==121]))
[1] 3.084317
Use the names attribute to get the day-of-year and the values to make a data frame:
> new_data = data.frame(date=names(dr),range=dr)
> head(new_data)
date range
121 121 3.084317
122 122 4.204053
Did you want to convert the number day-of-year back to a date object?
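If so, a minimal sketch, assuming all observations fall in 2010 as in the example data:
new_data$date <- as.Date(as.numeric(names(dr)) - 1, origin = "2010-01-01")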
# Use the data.table package
require(data.table)
# Set seed so data is reproducible
set.seed(42)
# Create data.table
Date1 <- seq(from = as.POSIXct("2010-05-01 02:00"), to = as.POSIXct("2010-10-10 22:00"), by = 3600)
DT <- data.table(date = as.IDate(Date1), t = rnorm(length(Date1)))
# Set key on data.table so that it is sorted by date
setkey(DT, "date")
# Make a new data.table with the required information (can be used as a data.frame)
new_data <- DT[, diff(range(t)), by = date]
# date V1
# 1: 2010-05-01 4.943101
# 2: 2010-05-02 4.309401
# 3: 2010-05-03 4.568818
# 4: 2010-05-04 2.707036
# 5: 2010-05-05 4.362990
# ---
# 159: 2010-10-06 2.659115
# 160: 2010-10-07 5.820803
# 161: 2010-10-08 4.516654
# 162: 2010-10-09 4.010017
# 163: 2010-10-10 3.311408

How to select time range during weekdays and associated data on the next column

Here is an example of a subset of the data from the .csv file. There are three columns with no header. The first column is the date/time, the second column is the load [kW], and the third column is 1 = weekday, 0 = weekend/holiday.
9/9/2010 3:00 153.94 1
9/9/2010 3:15 148.46 1
I would like to write R code that selects the first and second columns within the time range from 10:00 to 20:00 for all weekdays (when the third column is 1) within the month of September, and I do not know the best and most efficient way to code it.
dt <- read.csv("file", header = F, sep=",")
#Select a column with weekday designation = 1, weekend or holiday = 0
y <- data.frame(dt[,3])
#Select a column with timestamps and loads
x <- data.frame(dt[,1:2])
t <- data.frame(dt[,1])
#convert timestamps into readable format
s <- strptime("9/1/2010 0:00", format="%m/%d/%Y %H:%M")
e <- strptime("9/30/2010 23:45", format="%m/%d/%Y %H:%M")
range <- seq(s,e, by = "min")
df <- data.frame(range)
The OP asks for the "best and efficient way to code" this without showing any "inefficient code", so #Justin is right.
It seems that the OP is new to R (and it's officially the summer of love), so I'll give it a try and offer a solution (not sure about efficiency..).
index <- c("9/9/2010 19:00", "9/9/2010 21:15", "10/9/2010 11:00", "3/10/2010 10:30")
index <- as.POSIXct(index, format = "%d/%m/%Y %H:%M")
set.seed(1)
Data <- data.frame(Date = index, load = rnorm(4, mean = 120, sd = 10), weeks = c(0, 1, 1, 1))
## Data
## Date load weeks
## 1 2010-09-09 19:00:00 113.74 0
## 2 2010-09-09 21:15:00 121.84 1
## 3 2010-09-10 11:00:00 111.64 1
## 4 2010-10-03 10:30:00 135.95 1
cond <- expression(format(Date, "%H:%M") < "20:00" &
                     format(Date, "%H:%M") > "10:00" &
                     weeks == 1 &
                     format(Date, "%m") == "09")
subset(Data, eval(cond))
## Date load weeks
## 3 2010-09-10 11:00:00 111.64 1
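Since the question also asks for only the first and second columns, subset's select argument can drop the weekday flag in the same call (Date and load are the column names used in this example data):
subset(Data, eval(cond), select = c(Date, load))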
