R: Cut datetimes by time of day - r

I have a data_frame with POSIXct date-times. I would now like to create a variable that cuts these date-times into timebands: 1 -- [00:00:00, 08:00:00), 2 -- [08:00:00, 17:00:00), 3 -- [17:00:00, 18:30:00), 4 -- [18:30:00, 00:00:00).
Here is some sample data:
df_times = data_frame(
datetime = seq.POSIXt(
from = as.POSIXct(strftime("2016-01-01 00:00:00", format = "%Y-%m-%d :%H:%M:%S")),
by = "min",
length.out = 100000
),
value = rnorm(100000)
)
Here is the expected output:
> df_times
# A tibble: 100,000 × 3
datetime value band
<dttm> <dbl> <dbl>
1 2016-01-01 00:00:00 0.5855288 1
2 2016-01-01 00:01:00 0.7094660 1
3 2016-01-01 00:02:00 -0.1093033 1
4 2016-01-01 00:03:00 -0.4534972 1
5 2016-01-01 00:04:00 0.6058875 1
6 2016-01-01 00:05:00 -1.8179560 1
7 2016-01-01 00:06:00 0.6300986 1
8 2016-01-01 00:07:00 -0.2761841 1
9 2016-01-01 00:08:00 -0.2841597 1
10 2016-01-01 00:09:00 -0.9193220 1
# ... with 99,990 more rows
I have tried cut.POSIXt but that insists on keeping track of dates. An ideal solution will use dplyr::recode or forcats::.

Here is the solution I think directly translates the intent of the question into code:
set.seed(12345)
# create a dataset
df_times = data_frame(
datetime = seq.POSIXt(
from = as.POSIXct("2016-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
by = "min",
length.out = 100000
),
value = rnorm(100000)
) %>%
mutate(
time = times(format(datetime, "%H:%M:%S")),
cut(
time,
breaks = times(c(
"00:00:00",
"08:00:00",
"17:00:00",
"18:30:00",
"23:59:59"
)),
labels = c(
"1",
"2",
"3",
"4"
),
include.lowest = TRUE,
right = FALSE
)
)

You could create an hour column and then cut that:
df_times$hour = as.numeric(df_times$datetime) %% (24*60*60) / 3600
df_times$band = cut(df_times$hour, breaks=c(0,8,17,18.5,24), include.lowest=TRUE,
right=FALSE)

Related

Calculate part of duration that occur in each hour of day

I have a dataframe with start and end times:
id start_time end_time
1 1 2018-09-02 11:13:00 2018-09-02 11:54:00
2 2 2018-09-02 14:34:00 2018-09-02 14:37:00
3 3 2018-09-02 03:00:00 2018-09-02 03:30:00
4 4 2018-09-02 03:49:00 2018-09-02 03:53:00
5 5 2018-09-02 07:05:00 2018-09-02 08:05:00
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
8 8 2018-09-02 07:51:00 2018-09-02 08:15:00
9 9 2018-09-02 08:16:00 2018-09-02 08:55:00
From such periods, how can I calculate the total number of minutes that occurred in each hour, each day? E.g. if a period started at 9:45 and ended at 10:15, I want to assign 15 minutes to the 9:00 hour and 15 minutes to the 10:00 hour.
Or checking the hour 06 in the data above, that hour is included in two different rows (periods):
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
In the first row, 13 minutes should be assigned to 06, and in the second row 56 minutes. Thus, a total of 69 minutes for the hour 06 that date.
Expected output from sample data:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 69 mins
3 07 2018-09-02 124 mins
4 08 2018-09-02 93 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
My attempt: I couldn't make it with lubridate, then I found this old question here. I tried to use POSIXct, but the output is correct for some hours and incorrect for another hours. What am I missing here?
df %>%
mutate(minutes = difftime(end_time,start_time),
hourOfDay = format(as.POSIXct(start_time), "%H"),
Day = format(as.POSIXct(start_time),"%Y-%m-%d")) %>%
group_by(hourOfDay, Day) %>%
summarize(totalMinutes = sum(minutes))
Wrong output:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 163 mins
3 07 2018-09-02 84 mins
4 08 2018-09-02 39 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
Sample data :
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
Here is an alternate solution, similar to Ronak's but without creating a minute-by-minute data-frame.
library(dplyr)
library(lubridate)
df %>%
mutate(hour = (purrr::map2(hour(start_time), hour(end_time), seq, by = 1))) %>%
tidyr::unnest(hour) %>% mutate(minu=case_when(hour(start_time)!=hour & hour(end_time)==hour ~ 1*minute(end_time),
hour(start_time)==hour & hour(end_time)!=hour ~ 60-minute(start_time),
hour(start_time)==hour & hour(end_time)==hour ~ 1*minute(end_time)-1*minute(start_time),
TRUE ~ 60)) %>% group_by(hour) %>% summarise(sum(minu))
# A tibble: 6 x 2
hour `sum(minu)`
<dbl> <dbl>
1 3 34
2 6 69
3 7 124
4 8 93
5 11 41
6 14 3
Not the best solution since it expands the data but I think it works :
library(dplyr)
library(lubridate)
df %>%
mutate_at(-1, ymd_hms) %>%
mutate(time = purrr::map2(start_time, end_time, seq, by = 'min')) %>%
tidyr::unnest(time) %>%
mutate(hour = hour(time), date = as.Date(time)) %>%
count(date, hour)
# A tibble: 6 x 3
# date hour n
# <date> <int> <int>
#1 2018-09-02 3 36
#2 2018-09-02 6 70
#3 2018-09-02 7 124
#4 2018-09-02 8 97
#5 2018-09-02 11 42
#6 2018-09-02 14 4
We create a sequence from start_time to end_time with 1 minute intervals, extract hours and count occurrence of for each date and hour.
A data.table / lubridate alternative.
library(data.table)
library(lubridate)
setDT(df)
df[ , ceil_start := ceiling_date(start_time, "hour")]
d = df[ , {
if(ceil_start > end_time){
.SD[ , .(start_time, dur = as.double(end_time - start_time, units = "mins"))]
} else {
time <- c(start_time,
seq(from = ceil_start, to = floor_date(end_time, "hour"), by = "hour"),
end_time)
.(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
}
},
by = id]
setorder(d, start_time)
d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))]
# date hour n_min
# 1: 2018-09-02 3 34
# 2: 2018-09-02 6 69
# 3: 2018-09-02 7 124
# 4: 2018-09-02 8 93
# 5: 2018-09-02 11 41
# 6: 2018-09-02 14 3
Explanation
Convert data.frame to data.table (setDT). Round up start times to nearest hour (ceiling_date(start, "hour")).
if the up-rounded time is larger than end time (if(ceil_start > end_time)), select start time and calculate duration for that hour (as.double(end_time - start_time, units = "mins")).
else, create a sequence from the up-rounded start time, to the down-rounded end time, with an hourly increment (seq(from = ceil_start, to = floor_date(end, "hour"), by = "hour")). Concatenate with start and end times. Return all times except the last (head(time, -1)) and calculate difference between time each step in minutes (`units<-`(diff(time), "mins")).
Order data by start time (setorder(d, start_time)). Sum duration by date and hour d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))].
Here is an option using data.table::foverlaps:
#create a data.table of hourly intervals
hours <- seq(df[, trunc(min(start_time)-60*60, "hours")],
df[, trunc(max(end_time)+60*60, "hours")],
by="1 hour")
hourly <- data.table(start_time=hours[-length(hours)], end_time=hours[-1L],
key=cols)
#set keys and find overlaps
#and then calculate overlapping minutes
setkeyv(df, cols)
foverlaps(hourly, df, nomatch=0L)[,
sum(as.numeric(pmin(end_time, i.end_time) - pmax(start_time, i.start_time))) / 60,
.(i.start_time, i.end_time)]
output:
i.start_time i.end_time V1
1: 2018-09-02 02:00:00 2018-09-02 03:00:00 0
2: 2018-09-02 03:00:00 2018-09-02 04:00:00 34
3: 2018-09-02 06:00:00 2018-09-02 07:00:00 69
4: 2018-09-02 07:00:00 2018-09-02 08:00:00 124
5: 2018-09-02 08:00:00 2018-09-02 09:00:00 93
6: 2018-09-02 11:00:00 2018-09-02 12:00:00 41
7: 2018-09-02 14:00:00 2018-09-02 15:00:00 3
data:
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
library(data.table)
cols <- c("start_time", "end_time")
fmt <- "%Y-%m-%d %T"
setDT(df)[, (cols) := lapply(.SD, as.POSIXct, format=fmt), .SDcols=cols]
Here comes a base R solution, which "reshapes" such lines into a long format whose time interval is not in the same hour.
It uses a helper function doTime that generates time sequences.
This updated version calculates with numeric dates (seconds) and internally uses vapply rather than sapply for sake of performance.
decompDayHours <- function(data) {
## convert dates into POSIXct if they're not
if (!all(sapply(data[c("start_time", "end_time")], class) == "POSIXct")) {
data[c("start_time", "end_time")] <-
lapply(data[c("start_time", "end_time")], as.POSIXct)
}
doTime2 <- function(x, date) {
## helper function generating time sequences
xd <- as.double(x) - date
hf <- floor(xd/3600)
hs <- `:`(hf[1], hf[2])[-1]*3600
`attr<-`(mapply(`+`, date, hs), "hours", hf)
}
## Reshape time intervals not in same hour
M <- do.call(rbind, sapply(1:nrow(data), function(i) {
h <- vapply(2:3, function(s) as.double(substr(data[i, s], 12, 13)), 0)
date <- as.double(as.POSIXct(format(data[i, 2], "%F")))
if (h[1] != h[2]) {
hr <- c(as.double(data[i, 2]), dt2 <- doTime2(data[i, 2:3], date))
fh <- attr(dt2, "hours")
fhs <- fh[1]:fh[2]
r1 <- t(vapply(seq_along(hr[-1]) - 1, function(j)
c(id=data[i, 1], start_time=hr[1 + j],
end_time=unname(hr[2 + j]), date=date, hour=fhs[j + 1]), c(0, 0, 0, 0, 0)))
rbind(r1,
c(id=data[i, 1], start_time=r1[nrow(r1), 3],
end_time=as.double(data[i, 3]), date=date, hour=fhs[length(fhs)]))
} else {
c(vapply(data[i, ], as.double, 0), date=date, hour=el(h))
}
}))
## calculating difftime
DF <- cbind.data.frame(M, diff=(M[,3] - M[,2])/60)
## aggregating
res <- aggregate(diff ~ date + hour, DF, sum)
res <- transform(res, date=as.POSIXct(res$date, origin="1970-01-01"))
res[order(res$date, res$hour), ]
}
Result
decompDayHours(df1)
# date hour diff
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
decompDayHours(df2)
# date hour diff
# 1 2018-09-02 3 30
# 9 2018-09-02 11 41
# 10 2018-09-02 14 3
# 2 2018-09-03 3 4
# 3 2018-09-03 6 13
# 5 2018-09-03 7 55
# 7 2018-09-03 8 5
# 4 2018-09-04 6 56
# 6 2018-09-04 7 69
# 8 2018-09-04 8 88
Benchmarks
I was curious and did a vanilla-benchmark of all solutions so far. Date columns are converted to POSIXct. Not all solutions did scale up to the extended data sets, though.
## df1
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 20.022136 20.445664 20.789341 20.566980 20.791374 25.04604 100 e
# dplyr.bas 103.827770 104.705059 106.631214 105.461541 108.365255 127.12306 100 f
# dplyr.otw 8.972915 9.293750 9.623298 9.464182 9.721488 14.28079 100 ab
# data.tbl.hen 9.258668 9.708603 9.960635 9.872784 10.002138 14.14301 100 b
# data.tbl.chi 10.053165 10.348614 10.673600 10.553489 10.714481 15.43605 100 c
# decomp 8.998939 9.259435 9.372276 9.319774 9.392999 13.13701 100 a
# decomp.old 15.567698 15.795918 16.129622 15.896570 16.029114 20.35637 100 d
## df2
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 19.982590 20.411347 20.949345 20.598873 20.895342 27.24736 100 d
# dplyr.bas 103.513187 104.958665 109.305938 105.942346 109.538759 253.80958 100 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen 9.392105 9.708858 10.077967 9.922025 10.121671 15.02859 100 ab
# data.tbl.chi 11.308439 11.701862 12.089154 11.909543 12.167486 16.46731 100 b
# decomp 9.111200 9.317223 9.496347 9.398229 9.574146 13.46945 100 a
# decomp.old 15.561829 15.838653 16.163180 16.031282 16.221232 20.41045 100 c
## df3
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 382.32849 385.27367 389.42564 388.21884 392.97421 397.72959 3 b
# dplyr.bas 10558.87492 10591.51307 10644.58889 10624.15122 10687.44588 10750.74054 3 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen NA NA NA NA NA NA NA NA
# data.tbl.chi 12.85534 12.91453 17.23170 12.97372 19.41988 25.86605 3 a
# decomp 785.81346 795.86114 811.73947 805.90882 824.70247 843.49612 3 c
# decomp.old 1564.06747 1592.72370 1614.21763 1621.37992 1639.29271 1657.20550 3 d
Data:
## OP data
df1 <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00")), class = "data.frame", row.names = c(NA,
-9L))
## OP data, modified for alternating dates
df2 <- structure(list(id = 1:9, start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-03 03:49:00",
"2018-09-03 07:05:00", "2018-09-03 06:44:00", "2018-09-04 06:04:00",
"2018-09-04 07:51:00", "2018-09-04 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-03 03:53:00",
"2018-09-03 08:05:00", "2018-09-03 06:57:00", "2018-09-04 08:34:00",
"2018-09-04 08:15:00", "2018-09-04 08:55:00")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
## df2 sampled to 1k rows
set.seed(42)
df3 <- df2[sample(1:nrow(df2), 1e3, replace=T), ]
Old version:
# decompDayHours.old <- function(df) {
# df[c("start_time", "end_time")] <-
# lapply(df[c("start_time", "end_time")], as.POSIXct)
# doTime <- function(x) {
# ## helper function generating time sequences
# x <- as.POSIXct(sapply(x, strftime, format="%F %H:00"))
# seq.POSIXt(x[1], x[2], "hours")[-1]
# }
# ## Reshape time intervals not in same hour
# df.long <- do.call(rbind, lapply(1:nrow(df), function(i) {
# if (substr(df[i, 2], 12, 13) != substr(df[i, 3], 12, 13)) {
# tt <- c(df[i, 2], doTime(df[i, 2:3]))
# r <- lapply(seq_along(tt[-1]) - 1, function(j)
# data.frame(id=df[i,1], start_time=tt[1 + j], end_time=tt[2 + j]))
# rr <- do.call(rbind, r)
# rbind(rr, data.frame(id=df[i, 1], start_time=rr[nrow(rr), 3], end_time=df[i, 3]))
# } else {
# df[i, ]
# }
# }))
# ## calculating difftime
# df.long$diff <- apply(df.long[-1], 1, function(x) abs(difftime(x[1], x[2], units="mins")))
# ## aggregating
# with(df.long, aggregate(list(totalMinutes=diff),
# by=list(Day=as.Date(start_time),
# hourOfDay=substr(start_time, 12, 13)),
# FUN=sum))[c(2, 1, 3)]
# }
An alternative solution that does not expand the data, but requires a helper function:
library(dplyr)
library(lubridate)
count_minutes <- function(start_time, end_time) {
time_interval <- interval(start_time, end_time)
start_hour <- floor_date(start_time, unit = "hour")
end_hour <- ceiling_date(end_time, unit = "hour")
diff_hours <- as.double(difftime(end_hour, start_hour, "hours"))
hours <- start_hour + hours(0:diff_hours)
hour_intervals <- int_diff(hours)
minutes_per_hour <- as.double(intersect(time_interval, hour_intervals), units = "minutes")
hours <- hours[1:(length(hours)-1)]
tibble(Day = date(hours),
hourOfDay = hour(hours),
totalMinutes = minutes_per_hour)
}
df %>%
mutate(start_time = as_datetime(start_time),
end_time = as_datetime(end_time)) %>%
as_tibble() %>%
mutate(minutes_per_hour = purrr::map2(start_time, end_time, count_minutes)) %>%
unnest(minutes_per_hour) %>%
group_by(Day, hourOfDay) %>%
summarise(totalMinutes = sum(totalMinutes)) %>%
ungroup()
# A tibble: 6 x 3
# Day hourOfDay totalMinutes
# <date> <int> <dbl>
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
The helper function counts for every hour within one pair of start_time, end_time how many minutes it contains, and returns this as a tibble. This can then be applied for every such pair in your data, and unnested and summarized to calculate the totals.

Convert start time and total duration to elapsed time per hour

I have data on start time ('startTime', a date-time variable, POSIXct) and duration in minutes ('duration_minutes'):
df <- data.frame(id = c(1, 2, 3),
startTime = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11")),
duration_minutes = c(315, 120, 45))
I want to convert the start time and duration to elapsed time per hour, for each hour, from the hour of the start time to the last hour at the end of the duration:
df_result <- data.frame(id = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 3),
startTime = c("2018-01-01 12:15:31","2018-01-01 13:00:00",
"2018-01-01 14:00:00","2018-01-01 15:00:00",
"2018-01-01 16:00:00","2018-01-01 17:00:00",
"2018-01-02 23:43:00","2018-01-03 00:00:00",
"2018-01-03 01:00:00",
"2018-01-03 11:00:11"),
duration_minutes = c(44.48, 60, 60, 60, 60, 30.5, 17, 60, 43, 45))
Please, advice with the possible solution.
Another possibility:
library(data.table)
library(lubridate)
setDT(df)
df[ , ceil_start := ceiling_date(start, "hour", change_on_boundary = TRUE)]
df[ , {
if(difftime(ceil_start, start, units = "min") > dur) {
.SD[ , .(start, dur)]
} else {
end <- start + dur * 60
time <- c(start,
seq(from = ceil_start,
to = floor_date(end, "hour"),
by = "hour"),
end)
.(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
}
},
by = id]
# id start dur
# 1: 1 2018-01-01 12:15:31 44.48333 mins
# 2: 1 2018-01-01 13:00:00 60.00000 mins
# 3: 1 2018-01-01 14:00:00 60.00000 mins
# 4: 1 2018-01-01 15:00:00 60.00000 mins
# 5: 1 2018-01-01 16:00:00 60.00000 mins
# 6: 1 2018-01-01 17:00:00 30.51667 mins
# 7: 2 2018-01-02 23:43:00 17.00000 mins
# 8: 2 2018-01-03 00:00:00 60.00000 mins
# 9: 2 2018-01-03 01:00:00 43.00000 mins
# 10: 3 2018-01-03 11:00:11 45.00000 mins
# 11: 4 2018-01-03 11:35:00 25.00000 mins
# 12: 4 2018-01-03 12:00:00 10.00000 mins
# 13: 5 2018-01-03 00:00:00 60.00000 mins
# 14: 5 2018-01-03 01:00:00 0.00000 mins
Explanation
Convert data.frame to data.table (setDT). Round up start times to nearest hour (ceiling_date(start, "hour", ...). Use change_on_boundary = TRUE for easier handling of times without minutes and seconds (not in the data, but tested).
To handle cases when the end time (start + duration) is in the same hour as the start time (e.g. id = 3), check if difference between rounded time and start time is larger than duration (if(difftime(ceil_start, start, units = "min") > dur))). If so, just select the start and duration columns (.SD[ , .(start, dur)).
For other cases (else), calculate end time: end <- start + dur * 60. Create a sequence from the up-rounded start time ('ceil_start'), to the down-rounded end time, with an hourly increment (seq(from = ceil_start, to = floor_date(end, "hour"), by = "hour")). Concatenate with 'start' and 'end' times. Return all times except the last (head(time, -1) and calculate difference between time steps in minutes (`units<-`(diff(time), "mins")).
For times with H:M:S = 00:00:00 and duration is a multiple of 60 min, like id = 5, the current solution gives a row with a duration of 0 minutes for the last hour. While waiting for a more elegant solution, a quick and dirty way is just to delete such rows with duration = 0.
Data
Please note that I have added a case not included in original data, id = 4 (see also my comment above) and id = 5.
df <- data.frame(id = 1:5,
start = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-03 11:35:00",
"2018-01-03 00:00:00")),
dur = c(315, 120, 45, 35, 60))
Try this:
library(data.table)
library(lubridate)
library(magrittr)
df <-
setDT(df)[, start_ceiling := ceiling_date(startTime, "hour", change_on_boundary = TRUE)] %>%
.[, `:=` (
reps = ifelse(
startTime + (duration_minutes * 60) <= start_ceiling, 1, pmax(2, floor(duration_minutes / 60) + 1)
),
initial_diff = as.numeric(difftime(start_ceiling[1], startTime[1], units = "mins"))
), by = id] %>%
.[, df[df[, rep(.I, reps)]]] %>%
.[, startTime := pmax(startTime, floor_date(startTime, "hour") + hours(0:(.N - 1))), by = id] %>%
.[reps > 1, duration_minutes := c(initial_diff[.N],
rep(60, reps[.N] - 2),
(duration_minutes[.N] - initial_diff[.N]) %% 60), by = id] %>%
.[!(duration_minutes == 0 & reps > 1), ] %>%
.[, c("reps", "start_ceiling", "initial_diff") := NULL]
I've tested this with all the scenarios we've gathered so far, and this is the output:
id startTime duration_minutes
1: 1 2018-01-01 12:15:31 44.48333
2: 1 2018-01-01 13:00:00 60.00000
3: 1 2018-01-01 14:00:00 60.00000
4: 1 2018-01-01 15:00:00 60.00000
5: 1 2018-01-01 16:00:00 60.00000
6: 1 2018-01-01 17:00:00 30.51667
7: 2 2018-01-02 23:43:00 17.00000
8: 2 2018-01-03 00:00:00 60.00000
9: 2 2018-01-03 01:00:00 43.00000
10: 3 2018-01-03 11:00:11 45.00000
11: 4 2018-01-04 10:00:00 60.00000
12: 4 2018-01-04 11:00:00 5.00000
13: 5 2018-01-05 00:00:00 60.00000
14: 6 2018-01-06 11:35:00 25.00000
15: 6 2018-01-06 12:00:00 10.00000
16: 7 2018-01-07 00:00:00 60.00000
17: 7 2018-01-07 01:00:00 60.00000
Data used:
df <- data.frame(
id = c(1, 2, 3, 4, 5, 6, 7),
startTime = as.POSIXct(
c(
"2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-04 10:00:00",
"2018-01-05 00:00:00",
"2018-01-06 11:35:00",
"2018-01-07 00:00:00"
)
),
duration_minutes = c(315, 120, 45, 65, 60, 35, 120)
)
df
id startTime duration_minutes
1 1 2018-01-01 12:15:31 315
2 2 2018-01-02 23:43:00 120
3 3 2018-01-03 11:00:11 45
4 4 2018-01-04 10:00:00 65
5 5 2018-01-05 00:00:00 60
6 6 2018-01-06 11:35:00 35
7 7 2018-01-07 00:00:00 120

Summarize values for overlapping time periods

I'm trying to summarize values for overlapping time periods.
I can use only tidyr, ggplot2 and dplyr libraries. Base R is preferred though.
My data looks like this, but usually it has around 100 records:
df <- structure(list(Start = structure(c(1546531200, 1546531200, 546531200, 1546638252.6316, 1546549800, 1546534800, 1546545600, 1546531200, 1546633120, 1547065942.1053), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Stop = structure(c(1546770243.1579, 1546607400, 1547110800, 1546670652.6316, 1547122863.1579, 1546638252.6316, 1546878293.5579, 1546416000, 1546849694.4, 1547186400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Value = c(12610, 520, 1500, 90, 331380, 27300, 6072, 4200, 61488, 64372)), .Names = c("Start", "Stop", "Value"), row.names = c(41L, 55L, 25L, 29L, 38L, 28L, 1L, 20L, 14L, 31L), class = c("tbl_df", "tbl", "data.frame"))
head(df) and str(df) gives:
Start Stop Value
2019-01-03 16:00:00 2019-01-06 10:24:03 12610
2019-01-03 16:00:00 2019-01-04 13:10:00 520
2019-01-03 16:00:00 2019-01-10 09:00:00 1500
2019-01-04 21:44:12 2019-01-05 06:44:12 90
2019-01-03 21:10:00 2019-01-10 12:21:03 331380
2019-01-03 17:00:00 2019-01-04 21:44:12 27300
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 10 obs. of 3 variables:
$ Start: POSIXct, format: "2019-01-03 16:00:00" "2019-01-03 16:00:00" ...
$ Stop : POSIXct, format: "2019-01-06 10:24:03" "2019-01-04 13:10:00" ...
$ Value: num 12610 520 1500 90 331380 ...
So there are overlapping time periods with "Start" and "Stop" dates with assigned value. In any given record when there is a value between df$Start and df$Stop and outside of this scope the value is 0.
I want to create another dataframe based on which I could show how this values summarize and change over time. The Desired output would look like this (the "sum" column is made up):
> head(df2)
timestamp sum
"2019-01-02 09:00:00 CET" 14352
"2019-01-03 17:00:00 CET" 6253
"2019-01-03 18:00:00 CET" 23465
"2019-01-03 21:00:00 CET" 3241
"2019-01-03 22:10:00 CET" 23235
"2019-01-04 14:10:00 CET" 123321
To get unique timestamps:
timestamps <- sort(unique(c(df$`Start`, df$`Stop`)))
With df2 dataframe I could easily draw a graph with ggplot, but how to get this sums?
I think I should iterate over df data frame either some custom function or any built-it summarize function which would work like this:
fnct <- function(date, min, max, value) {
if (date >= min && date <=max) {
a <- value
}
else {
a <- 0
}
return(a)
}
...for every given date from timestamps iterate through df and give me a sum of values for the timestamp.
It looks really simple and I'm missing something very basic.
Here's a tidyverse solution similar to my response to this recent question. I gather to bring the timestamps (Starts and Stops) into one column, with another column specifying which. The Starts add the value and the Stops subtract it, and then we just take the cumulative sum to get values at all the instants when the sum changes.
For 100 records, there won't be any perceivable speed improvement from using data.table; in my experience it starts to make more of a difference around 1M records, especially when grouping is involved.
library(dplyr); library(tidyr)
df2 <- df %>%
gather(type, time, Start:Stop) %>%
mutate(chg = if_else(type == "Start", Value, -Value)) %>%
arrange(time) %>%
mutate(sum = cumsum(chg)) # EDIT: corrected per OP comment
> head(df2)
## A tibble: 6 x 5
# Value type time chg sum
# <dbl> <chr> <dttm> <dbl> <dbl>
#1 1500 Start 1987-04-27 14:13:20 1500 1500
#2 4200 Stop 2019-01-02 08:00:00 -4200 -2700
#3 12610 Start 2019-01-03 16:00:00 12610 9910
#4 520 Start 2019-01-03 16:00:00 520 10430
#5 4200 Start 2019-01-03 16:00:00 4200 14630
#6 27300 Start 2019-01-03 17:00:00 27300 41930
In the past I have tried to solve similar problems using the tidyverse/baseR... But nothing comes even remotely close to the speeds that data.table provides for these kind of operations, so I encourage you to give it a try...
For questions like this, my favourite finction is foverlaps() from the data.table-package. With this function you can (fast!) perform an overlap-join. If you want more flexibility in your joining than foverlaps() provides, a non-equi-join (again using data.table) is probably the best (and fastest!) option. But foverlaps() will do here (I guess).
I used the sample data you provided, but filtered out rows where Stop <= Start (probably a tyop in your sample data). When df$Start is not before df$Stop, foverlaps give a warning and won't execute.
library( data.table )
#create data.table with periods you wish to simmarise on
#NB: UTC is used as timezone, since this is also the case in the sample data provided!!
dt.dates <- data.table( id = paste0( "Day", 1:31 ),
Start = seq( as.POSIXct( "2019-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
as.POSIXct( "2019-01-31 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
by = "1 days"),
Stop = seq( as.POSIXct( "2019-01-02 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 1,
as.POSIXct( "2019-02-01 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 1,
by = "1 days") )
If you do not want to summarise on a daily basis, but by hour, minute, second, of year. Just change the values (and stepsize) in dt.dates data.table so that they match your periods.
#set df as data.table
dt <- as.data.table( df )
#filter out any row where Stop is smaller than Start
dt <- dt[ Start < Stop, ]
#perform overlap join
#first set keys
setkey(dt, Start, Stop)
#then perform join
result <- foverlaps( dt.dates, dt, type = "within" )
#summarise
result[, .( Value = sum( Value , na.rm = TRUE ) ), by = .(Day = i.Start) ]
output
# Day Value
# 1: 2019-01-01 1500
# 2: 2019-01-02 1500
# 3: 2019-01-03 1500
# 4: 2019-01-04 351562
# 5: 2019-01-05 413050
# 6: 2019-01-06 400440
# 7: 2019-01-07 332880
# 8: 2019-01-08 332880
# 9: 2019-01-09 332880
# 10: 2019-01-10 64372
# 11: 2019-01-11 0
# 12: 2019-01-12 0
# 13: 2019-01-13 0
# 14: 2019-01-14 0
# 15: 2019-01-15 0
# 16: 2019-01-16 0
# 17: 2019-01-17 0
# 18: 2019-01-18 0
# 19: 2019-01-19 0
# 20: 2019-01-20 0
# 21: 2019-01-21 0
# 22: 2019-01-22 0
# 23: 2019-01-23 0
# 24: 2019-01-24 0
# 25: 2019-01-25 0
# 26: 2019-01-26 0
# 27: 2019-01-27 0
# 28: 2019-01-28 0
# 29: 2019-01-29 0
# 30: 2019-01-30 0
# 31: 2019-01-31 0
# Day Value
plot
#summarise for plot
result.plot <- result[, .( Value = sum( Value , na.rm = TRUE ) ), by = .(Day = i.Start) ]
library( ggplot2 )
ggplot( data = result.plot, aes( x = Day, y = Value ) ) + geom_col()

Creating intervals from time series data

I have a data frame of users and access times. Access times can be duplicated.
I am trying to create a list of users grouped and named by a given time interval, e.g. year.
timestamp user
1 2013-03-06 01:00:00 1
2 2014-07-06 21:00:00 1
3 2014-07-31 23:00:00 2
4 2014-08-09 17:00:00 2
5 2014-08-14 20:00:00 2
6 2014-08-14 22:00:00 3
7 2014-08-16 15:00:00 3
8 2014-08-19 02:00:00 1
9 2014-12-28 18:00:00 1
10 2015-01-17 17:00:00 1
11 2015-01-22 22:00:00 2
12 2015-01-22 22:00:00 3
13 2015-03-23 15:00:00 4
14 2015-04-05 18:00:00 1
15 2015-04-06 01:00:00 2
My code example already creates a list of users grouped by year.
My problem is that I need to modify the table in this approach, which becomes a problem with my tables of a million entries.
test <- structure(list(timestamp = c("2013-03-06 01:00:00", "2014-07-06 21:00:00",
"2014-07-31 23:00:00", "2014-08-09 17:00:00", "2014-08-14 20:00:00",
"2014-08-14 22:00:00", "2014-08-16 15:00:00", "2014-08-19 02:00:00",
"2014-12-28 18:00:00", "2015-01-17 17:00:00", "2015-01-22 22:00:00",
"2015-01-22 22:00:00", "2015-03-23 15:00:00", "2015-04-05 18:00:00",
"2015-04-06 01:00:00"), user = c(1L, 1L, 2L, 2L, 2L, 3L, 3L,
1L, 1L, 1L, 2L, 3L, 4L, 1L, 2L)), .Names = c("timestamp", "user"
), class = "data.frame", row.names = c(NA, -15L))
require(lubridate)
#Creating "POSIXct" object from string timestamp
timestamp <- lapply(test$timestamp,
function(x)parse_date_time(x, "y-m-d H:M:S"))
test$timestamp <- do.call(c,timestamp)
print(class(test$timestamp))
#Adding column for year
test <- cbind(test,sapply(timestamp, function(x)year(x)))
colnames(test)[3]<- "year"
#Creating list of year time intervals for users
intervals <- names(table(test$year))
users <- lapply(intervals, function(x)test[test$year %in% x,"user"])
names(users) <- intervals
without timestamps
treat the timestamp as a character. Only works if for every timestap, the first 4 digits represent the year.
library(dplyr)
test %>%
group_by( user, substr(timestamp,1,4 ) ) %>%
summarise( )
# user `substr(timestamp, 1, 4)`
# <int> <chr>
# 1 1 2013
# 2 1 2014
# 3 1 2015
# 4 2 2014
# 5 2 2015
# 6 3 2014
# 7 3 2015
# 8 4 2015
dplyr + lubridate
will extract the year from the timestamp
library( dplyr )
library( lubridate )
test %>%
mutate( timestamp = as.POSIXct( timestamp, format = "%Y-%m-%d %H:%M:%S" ) ) %>%
group_by( user, lubridate::year( timestamp ) ) %>%
summarise( )
# # Groups: user [?]
# user `year(timestamp)`
# <int> <dbl>
# 1 1 2013
# 2 1 2014
# 3 1 2015
# 4 2 2014
# 5 2 2015
# 6 3 2014
# 7 3 2015
# 8 4 2015
table
a frequency table is also quickly made
table( test$user, substr( test$timestamp, 1, 4 ) )
# 2013 2014 2015
# 1 1 3 2
# 2 0 3 2
# 3 0 2 1
# 4 0 0 1
there are any more alternatives... pick one
edit
if speed is an issue, ty data.table
dcast(
setDT( test )[, timestamp := as.POSIXct( timestamp, format = "%Y-%m-%d %H:%M:%S" )][, .N, by = list( user, data.table::year(timestamp) )],
user ~ data.table,
value.var = "N")
# user 2013 2014 2015
# 1: 1 1 3 2
# 2: 2 NA 3 2
# 3: 3 NA 2 1
# 4: 4 NA NA 1
Another option using the lightning fast data.table package:
library(data.table)
setDT(test) # make `test` a data.frame 'by reference' (no copy is made at all)
test[, j=.(users=list(unique(user))),
by=.(year=substr(test$timestamp,1,4))]
year users
1: 2013 1
2: 2014 1,2,3
3: 2015 1,2,3,4
Again assuming your test$timestamp column is a character vector - otherwise substitute lubridate::year() as needed.
Update:
Simple change to show grouping instead by month (just as it was mentioned in a comment):
test[, j=.(users=list(unique(user))),
by=.(ym=substr(test$timestamp,1,7))]
ym users
1: 2013-03 1
2: 2014-07 1,2
3: 2014-08 2,3,1
4: 2014-12 1
5: 2015-01 1,2,3
6: 2015-03 4
7: 2015-04 1,2
Or group by day, to help demonstrate how to subset with chaining:
test[, j=.(users=list(unique(user))),
by=.(ymd=substr(test$timestamp,1,11))][ymd>='2014-08-01' & ymd<= '2014-08-21']
ymd users
1: 2014-08-09 2
2: 2014-08-14 2,3
3: 2014-08-16 3
4: 2014-08-19 1
Note for filtering/subsetting, if you are only interested in a subset of dates for a "one off" calculation (and not otherwise saving the whole aggregated set to be stored for other purposes) it will likely be more efficient to do the subset in i of DT[i, j, by] for the "one off" calculation.
You could also use base (stats) function aggregate() as follows:
aggregate( x = test$user,
by = list(year=substr(test$timestamp,1,4)),
FUN = unique )
Result:
year x
1 2013 1
2 2014 1, 2, 3
3 2015 1, 2, 3, 4
Above working on assumption that your timestamp column is initially just a character vector exactly as included in your structured example data. In which case you may directly substr out the year with substr(test$timestamp,1,4) avoiding the need to first convert to dates.
However, if you have the timestamp column already as a date, simply substitute the lubridate::year() function you demonstrated in your attempted solution.

ddply summarize data hourly

I would like to summarize frequency of a dataset hourly and two-hourly. The time column's format is hh:mm:ss.
The below code is working to summarize data monthly but I have not found any similar code for hourly or two-hourly.
Thanks in advance.
data2$StartDate <- as.Date(data2$StartDate, "%m/%d/%Y")
data4 <- ddply(data2, .(format(StartDate, "%m")), summarize, freq=length(StartDate))
The dataset is like this:
TripId StartDate StartTime
<int> <date> <S3: times>
1 15335543 2016-01-01 00:14:00
2 15335544 2016-01-01 00:14:00
3 15335607 2016-01-01 02:00:00
4 15335608 2016-01-01 02:01:00
5 15335613 2016-01-01 02:16:00
6 15335639 2016-01-01 02:50:00
If I understood the question correctly then
For hourly frequency:
library(dplyr)
df %>%
mutate(start_timestamp = as.POSIXct(paste(df$StartDate, df$StartTime), tz="UTC", format="%Y-%m-%d %H")) %>%
right_join(data.frame(seq_h = as.POSIXct(unlist(lapply(unique(df$StartDate),
function(x) seq(from=as.POSIXct(paste(x, "00:00:00"), tz="UTC"),
to=as.POSIXct(paste(x, "23:00:00"), tz="UTC"),
by="hour"))), origin="1970-01-01", tz="UTC")), by=c("start_timestamp" = "seq_h")) %>%
group_by(start_timestamp) %>%
summarise(freq=sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 2
2 2016-01-01 01:00:00 1
3 2016-01-01 02:00:00 1
4 2016-01-01 03:00:00 0
5 2016-01-01 04:00:00 0
...
For two-hourly frequency:
library(dplyr)
df %>%
mutate(start_timestamp = as.POSIXct(cut(as.POSIXct(paste(df$StartDate, df$StartTime), tz="UTC"), breaks="2 hours"), tz="UTC")) %>%
right_join(data.frame(seq_h = as.POSIXct(unlist(lapply(unique(df$StartDate),
function(x) seq(from=as.POSIXct(paste(x, "00:00:00"), tz="UTC"),
to=as.POSIXct(paste(x, "23:00:00"), tz="UTC"),
by="2 hours"))), origin="1970-01-01", tz="UTC")), by=c("start_timestamp" = "seq_h")) %>%
group_by(start_timestamp) %>%
summarise(freq=sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 3
2 2016-01-01 02:00:00 1
3 2016-01-01 04:00:00 0
4 2016-01-01 06:00:00 0
5 2016-01-01 08:00:00 0
...
Sample data:
df <- structure(list(TripId = c(15335543L, 15335544L, 15335607L, 15335608L,
15335613L, 15335639L), StartDate = c("2016-01-01", "2016-01-01",
"2016-01-01", "2016-01-01", "2016-01-02", "2016-01-02"), StartTime = c("00:14:00",
"00:14:00", "01:00:00", "02:01:00", "02:16:00", "02:50:00")), .Names = c("TripId",
"StartDate", "StartTime"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Resources