I have a dataframe with start and end times:
id start_time end_time
1 1 2018-09-02 11:13:00 2018-09-02 11:54:00
2 2 2018-09-02 14:34:00 2018-09-02 14:37:00
3 3 2018-09-02 03:00:00 2018-09-02 03:30:00
4 4 2018-09-02 03:49:00 2018-09-02 03:53:00
5 5 2018-09-02 07:05:00 2018-09-02 08:05:00
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
8 8 2018-09-02 07:51:00 2018-09-02 08:15:00
9 9 2018-09-02 08:16:00 2018-09-02 08:55:00
From such periods, how can I calculate the total number of minutes that occurred in each hour, each day? E.g. if a period started at 9:45 and ended at 10:15, I want to assign 15 minutes to the 9:00 hour and 15 minutes to the 10:00 hour.
Or, looking at hour 06 in the data above: that hour appears in two different rows (periods):
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
In the first row, 13 minutes should be assigned to 06, and in the second row 56 minutes. Thus, a total of 69 minutes for hour 06 on that date.
Expected output from sample data:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 69 mins
3 07 2018-09-02 124 mins
4 08 2018-09-02 93 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
My attempt: I couldn't make it work with lubridate, then I found this old question here. I tried to use POSIXct, but the output is correct for some hours and incorrect for others. What am I missing here?
df %>%
mutate(minutes = difftime(end_time,start_time),
hourOfDay = format(as.POSIXct(start_time), "%H"),
Day = format(as.POSIXct(start_time),"%Y-%m-%d")) %>%
group_by(hourOfDay, Day) %>%
summarize(totalMinutes = sum(minutes))
Wrong output:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 163 mins
3 07 2018-09-02 84 mins
4 08 2018-09-02 39 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
Sample data :
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
Here is an alternative solution, similar to Ronak's, but without creating a minute-by-minute data frame.
library(dplyr)
library(lubridate)
df %>%
  mutate(hour = purrr::map2(hour(start_time), hour(end_time), seq, by = 1)) %>%
  tidyr::unnest(hour) %>%
  mutate(minu = case_when(
    hour(start_time) != hour & hour(end_time) == hour ~ 1 * minute(end_time),
    hour(start_time) == hour & hour(end_time) != hour ~ 60 - minute(start_time),
    hour(start_time) == hour & hour(end_time) == hour ~ 1 * minute(end_time) - 1 * minute(start_time),
    TRUE ~ 60)) %>%
  group_by(hour) %>%
  summarise(sum(minu))
# A tibble: 6 x 2
hour `sum(minu)`
<dbl> <dbl>
1 3 34
2 6 69
3 7 124
4 8 93
5 11 41
6 14 3
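Note that the grouping above is by hour only. If the data spans several days, a variant sketch (my addition, not part of the answer above; it assumes no interval crosses midnight, as in the sample data) carries the calendar day into the grouping:
library(dplyr)
library(lubridate)
df %>%
  mutate(hour = purrr::map2(hour(start_time), hour(end_time), seq, by = 1)) %>%
  tidyr::unnest(hour) %>%
  mutate(minu = case_when(
    hour(start_time) != hour & hour(end_time) == hour ~ 1 * minute(end_time),
    hour(start_time) == hour & hour(end_time) != hour ~ 60 - minute(start_time),
    hour(start_time) == hour & hour(end_time) == hour ~ 1 * minute(end_time) - 1 * minute(start_time),
    TRUE ~ 60)) %>%
  group_by(Day = as.Date(start_time), hour) %>%
  summarise(totalMinutes = sum(minu))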
Not the best solution since it expands the data, but I think it works:
library(dplyr)
library(lubridate)
df %>%
mutate_at(-1, ymd_hms) %>%
mutate(time = purrr::map2(start_time, end_time, seq, by = 'min')) %>%
tidyr::unnest(time) %>%
mutate(hour = hour(time), date = as.Date(time)) %>%
count(date, hour)
# A tibble: 6 x 3
# date hour n
# <date> <int> <int>
#1 2018-09-02 3 36
#2 2018-09-02 6 70
#3 2018-09-02 7 124
#4 2018-09-02 8 97
#5 2018-09-02 11 42
#6 2018-09-02 14 4
We create a sequence from start_time to end_time in 1-minute intervals, extract the hour, and count the occurrences for each date and hour.
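Note that seq() includes both endpoints, so each interval contributes one extra minute (hence 36 rather than 34 for hour 03, and so on). One possible tweak, my assumption rather than part of the original answer, is to stop one minute before the end before expanding:
df %>%
  mutate_at(-1, ymd_hms) %>%
  # stop one minute early so the closing endpoint is not counted;
  # assumes every interval is at least one minute long
  mutate(time = purrr::map2(start_time, end_time - 60, seq, by = 'min')) %>%
  tidyr::unnest(time) %>%
  count(date = as.Date(time), hour = hour(time))
This reproduces the expected totals (34, 69, 124, 93, 41, 3).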
A data.table / lubridate alternative.
library(data.table)
library(lubridate)
setDT(df)
df[ , ceil_start := ceiling_date(start_time, "hour")]
d = df[ , {
if(ceil_start > end_time){
.SD[ , .(start_time, dur = as.double(end_time - start_time, units = "mins"))]
} else {
time <- c(start_time,
seq(from = ceil_start, to = floor_date(end_time, "hour"), by = "hour"),
end_time)
.(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
}
},
by = id]
setorder(d, start_time)
d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))]
# date hour n_min
# 1: 2018-09-02 3 34
# 2: 2018-09-02 6 69
# 3: 2018-09-02 7 124
# 4: 2018-09-02 8 93
# 5: 2018-09-02 11 41
# 6: 2018-09-02 14 3
Explanation
Convert the data.frame to a data.table (setDT). Round each start time up to the nearest hour (ceiling_date(start_time, "hour")).
If the rounded-up time is later than the end time (if(ceil_start > end_time)), keep the start time and compute the duration of that single stretch (as.double(end_time - start_time, units = "mins")).
Else, create a sequence from the rounded-up start time to the rounded-down end time in hourly increments (seq(from = ceil_start, to = floor_date(end_time, "hour"), by = "hour")) and concatenate it with the start and end times. Return all times except the last (head(time, -1)) and the differences between consecutive times in minutes (`units<-`(diff(time), "mins")).
Order the data by start time (setorder(d, start_time)). Sum the durations by date and hour: d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))].
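To see the splitting logic in isolation, here is a minimal sketch on the 09:45-10:15 example from the question (my illustration; the timestamps and tz are assumptions):
library(lubridate)
s <- as.POSIXct("2018-09-02 09:45:00", tz = "UTC")
e <- as.POSIXct("2018-09-02 10:15:00", tz = "UTC")
# start, every full-hour boundary in between, end
time <- c(s, seq(from = ceiling_date(s, "hour"), to = floor_date(e, "hour"), by = "hour"), e)
data.frame(start = head(time, -1),
           dur = as.double(diff(time), units = "mins"))
#                 start dur
# 1 2018-09-02 09:45:00  15
# 2 2018-09-02 10:00:00  15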
Here is an option using data.table::foverlaps (the conversion of df and the definition of cols are in the data section below):
#create a data.table of hourly intervals
hours <- seq(df[, trunc(min(start_time)-60*60, "hours")],
df[, trunc(max(end_time)+60*60, "hours")],
by="1 hour")
hourly <- data.table(start_time=hours[-length(hours)], end_time=hours[-1L],
key=cols)
#set keys and find overlaps
#and then calculate overlapping minutes
setkeyv(df, cols)
foverlaps(hourly, df, nomatch=0L)[,
  sum(as.numeric(pmin(end_time, i.end_time) - pmax(start_time, i.start_time),
                 units="secs")) / 60,
  .(i.start_time, i.end_time)]
output:
i.start_time i.end_time V1
1: 2018-09-02 02:00:00 2018-09-02 03:00:00 0
2: 2018-09-02 03:00:00 2018-09-02 04:00:00 34
3: 2018-09-02 06:00:00 2018-09-02 07:00:00 69
4: 2018-09-02 07:00:00 2018-09-02 08:00:00 124
5: 2018-09-02 08:00:00 2018-09-02 09:00:00 93
6: 2018-09-02 11:00:00 2018-09-02 12:00:00 41
7: 2018-09-02 14:00:00 2018-09-02 15:00:00 3
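The leading zero row appears because foverlaps() treats both sets of intervals as closed, so an hourly window that only touches a period at its boundary still matches. If those rows are unwanted, a possible follow-up (my addition, reusing the objects built above) stores the result and drops them:
res <- foverlaps(hourly, df, nomatch=0L)[,
  .(totalMinutes = sum(as.numeric(pmin(end_time, i.end_time) -
                                    pmax(start_time, i.start_time),
                                  units="secs")) / 60),
  .(i.start_time, i.end_time)]
res[totalMinutes > 0]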
data:
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
library(data.table)
cols <- c("start_time", "end_time")
fmt <- "%Y-%m-%d %T"
setDT(df)[, (cols) := lapply(.SD, as.POSIXct, format=fmt), .SDcols=cols]
Here comes a base R solution, which "reshapes" rows whose time interval does not fall within a single hour into a long format.
It uses a helper function, doTime2, that generates the time sequences.
This updated version calculates with numeric dates (seconds) and internally uses vapply rather than sapply for the sake of performance.
decompDayHours <- function(data) {
## convert dates into POSIXct if they're not
if (!all(sapply(data[c("start_time", "end_time")], class) == "POSIXct")) {
data[c("start_time", "end_time")] <-
lapply(data[c("start_time", "end_time")], as.POSIXct)
}
doTime2 <- function(x, date) {
## helper function generating time sequences
xd <- as.double(x) - date
hf <- floor(xd/3600)
hs <- `:`(hf[1], hf[2])[-1]*3600
`attr<-`(mapply(`+`, date, hs), "hours", hf)
}
## Reshape time intervals not in same hour
M <- do.call(rbind, sapply(1:nrow(data), function(i) {
h <- vapply(2:3, function(s) as.double(substr(data[i, s], 12, 13)), 0)
date <- as.double(as.POSIXct(format(data[i, 2], "%F")))
if (h[1] != h[2]) {
hr <- c(as.double(data[i, 2]), dt2 <- doTime2(data[i, 2:3], date))
fh <- attr(dt2, "hours")
fhs <- fh[1]:fh[2]
r1 <- t(vapply(seq_along(hr[-1]) - 1, function(j)
c(id=data[i, 1], start_time=hr[1 + j],
end_time=unname(hr[2 + j]), date=date, hour=fhs[j + 1]), c(0, 0, 0, 0, 0)))
rbind(r1,
c(id=data[i, 1], start_time=r1[nrow(r1), 3],
end_time=as.double(data[i, 3]), date=date, hour=fhs[length(fhs)]))
} else {
c(vapply(data[i, ], as.double, 0), date=date, hour=el(h))
}
}))
## calculating difftime
DF <- cbind.data.frame(M, diff=(M[,3] - M[,2])/60)
## aggregating
res <- aggregate(diff ~ date + hour, DF, sum)
res <- transform(res, date=as.POSIXct(res$date, origin="1970-01-01"))
res[order(res$date, res$hour), ]
}
Result
decompDayHours(df1)
# date hour diff
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
decompDayHours(df2)
# date hour diff
# 1 2018-09-02 3 30
# 9 2018-09-02 11 41
# 10 2018-09-02 14 3
# 2 2018-09-03 3 4
# 3 2018-09-03 6 13
# 5 2018-09-03 7 55
# 7 2018-09-03 8 5
# 4 2018-09-04 6 56
# 6 2018-09-04 7 69
# 8 2018-09-04 8 88
Benchmarks
I was curious and ran a vanilla benchmark of all solutions so far. Date columns were converted to POSIXct beforehand. Not all solutions scaled up to the extended data sets, though (shown as NA rows below).
## df1
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 20.022136 20.445664 20.789341 20.566980 20.791374 25.04604 100 e
# dplyr.bas 103.827770 104.705059 106.631214 105.461541 108.365255 127.12306 100 f
# dplyr.otw 8.972915 9.293750 9.623298 9.464182 9.721488 14.28079 100 ab
# data.tbl.hen 9.258668 9.708603 9.960635 9.872784 10.002138 14.14301 100 b
# data.tbl.chi 10.053165 10.348614 10.673600 10.553489 10.714481 15.43605 100 c
# decomp 8.998939 9.259435 9.372276 9.319774 9.392999 13.13701 100 a
# decomp.old 15.567698 15.795918 16.129622 15.896570 16.029114 20.35637 100 d
## df2
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 19.982590 20.411347 20.949345 20.598873 20.895342 27.24736 100 d
# dplyr.bas 103.513187 104.958665 109.305938 105.942346 109.538759 253.80958 100 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen 9.392105 9.708858 10.077967 9.922025 10.121671 15.02859 100 ab
# data.tbl.chi 11.308439 11.701862 12.089154 11.909543 12.167486 16.46731 100 b
# decomp 9.111200 9.317223 9.496347 9.398229 9.574146 13.46945 100 a
# decomp.old 15.561829 15.838653 16.163180 16.031282 16.221232 20.41045 100 c
## df3
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 382.32849 385.27367 389.42564 388.21884 392.97421 397.72959 3 b
# dplyr.bas 10558.87492 10591.51307 10644.58889 10624.15122 10687.44588 10750.74054 3 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen NA NA NA NA NA NA NA NA
# data.tbl.chi 12.85534 12.91453 17.23170 12.97372 19.41988 25.86605 3 a
# decomp 785.81346 795.86114 811.73947 805.90882 824.70247 843.49612 3 c
# decomp.old 1564.06747 1592.72370 1614.21763 1621.37992 1639.29271 1657.20550 3 d
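The timings come from microbenchmark; a minimal reconstruction of the harness (an assumption, the original harness is not shown) looks like this, with one entry per competing solution:
library(microbenchmark)
microbenchmark(decomp = decompDayHours(df1),
               times = 100)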
Data:
## OP data
df1 <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00")), class = "data.frame", row.names = c(NA,
-9L))
## OP data, modified for alternating dates
df2 <- structure(list(id = 1:9, start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-03 03:49:00",
"2018-09-03 07:05:00", "2018-09-03 06:44:00", "2018-09-04 06:04:00",
"2018-09-04 07:51:00", "2018-09-04 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-03 03:53:00",
"2018-09-03 08:05:00", "2018-09-03 06:57:00", "2018-09-04 08:34:00",
"2018-09-04 08:15:00", "2018-09-04 08:55:00")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
## df2 sampled to 1k rows
set.seed(42)
df3 <- df2[sample(1:nrow(df2), 1e3, replace=T), ]
Old version:
# decompDayHours.old <- function(df) {
# df[c("start_time", "end_time")] <-
# lapply(df[c("start_time", "end_time")], as.POSIXct)
# doTime <- function(x) {
# ## helper function generating time sequences
# x <- as.POSIXct(sapply(x, strftime, format="%F %H:00"))
# seq.POSIXt(x[1], x[2], "hours")[-1]
# }
# ## Reshape time intervals not in same hour
# df.long <- do.call(rbind, lapply(1:nrow(df), function(i) {
# if (substr(df[i, 2], 12, 13) != substr(df[i, 3], 12, 13)) {
# tt <- c(df[i, 2], doTime(df[i, 2:3]))
# r <- lapply(seq_along(tt[-1]) - 1, function(j)
# data.frame(id=df[i,1], start_time=tt[1 + j], end_time=tt[2 + j]))
# rr <- do.call(rbind, r)
# rbind(rr, data.frame(id=df[i, 1], start_time=rr[nrow(rr), 3], end_time=df[i, 3]))
# } else {
# df[i, ]
# }
# }))
# ## calculating difftime
# df.long$diff <- apply(df.long[-1], 1, function(x) abs(difftime(x[1], x[2], units="mins")))
# ## aggregating
# with(df.long, aggregate(list(totalMinutes=diff),
# by=list(Day=as.Date(start_time),
# hourOfDay=substr(start_time, 12, 13)),
# FUN=sum))[c(2, 1, 3)]
# }
An alternative solution that does not expand the data, but requires a helper function:
library(dplyr)
library(lubridate)
count_minutes <- function(start_time, end_time) {
time_interval <- interval(start_time, end_time)
start_hour <- floor_date(start_time, unit = "hour")
end_hour <- ceiling_date(end_time, unit = "hour")
diff_hours <- as.double(difftime(end_hour, start_hour, units = "hours"))
hours <- start_hour + hours(0:diff_hours)
hour_intervals <- int_diff(hours)
minutes_per_hour <- as.double(intersect(time_interval, hour_intervals), units = "minutes")
hours <- hours[1:(length(hours)-1)]
tibble(Day = date(hours),
hourOfDay = hour(hours),
totalMinutes = minutes_per_hour)
}
df %>%
mutate(start_time = as_datetime(start_time),
end_time = as_datetime(end_time)) %>%
as_tibble() %>%
mutate(minutes_per_hour = purrr::map2(start_time, end_time, count_minutes)) %>%
unnest(minutes_per_hour) %>%
group_by(Day, hourOfDay) %>%
summarise(totalMinutes = sum(totalMinutes)) %>%
ungroup()
# A tibble: 6 x 3
# Day hourOfDay totalMinutes
# <date> <int> <dbl>
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
For every hour within one start_time/end_time pair, the helper function counts how many minutes that pair contributes and returns the result as a tibble. It is then applied to every such pair in your data, unnested, and summarized to calculate the totals.
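For instance, on the 9:45-10:15 example from the question, the helper splits the half hour evenly (a quick check, assuming count_minutes as defined above):
count_minutes(as_datetime("2018-09-02 09:45:00"),
              as_datetime("2018-09-02 10:15:00"))
# A tibble: 2 x 3
#   Day        hourOfDay totalMinutes
#   <date>         <int>        <dbl>
# 1 2018-09-02         9           15
# 2 2018-09-02        10           15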
There are two issues:
I have time data in factor format and I want to change it into date format for later manipulation.
The goal is to sum values of precipitation over the same time unit, e.g. precipitation per hour.
I tried to convert the time using as.POSIXct() or as_date() from lubridate but always get NA values after defining the format. This is the code I used:
tt = as.POSIXct(FixUseNew$StartTimestamp)
df$time <- as.Date(df$time, "%d-%m-%Y")
If I leave out the format and do the following:
tt=as.POSIXct(df$time)
tt
hour(tt)
The date data looks like this now: "0010-07-14 00:38:00 LMT"
I wanted to use the aggregate function to sum the precipitation within the same hour interval or day, but couldn't do it as I am stuck with the date format.
Just a brain dump: I was going to change the factor date into character and then into date format, as follows. Please advise if that is a good idea.
df$time <-paste(substr(df$time,6,7),
substr(df$time,9,10),
substr(df$time,1,4),sep="/")
Here is a subset of the data, hope this helps to illustrate the question better:
Id <- c(1,2,3,4)
Time <- c("10/7/2014 12:30:00 am", "10/7/2014 01:00:05 am","10/7/2014 01:30:10 am", "10/7/2014 02:00:15 am")
Precipitation <- c(0.06, 0.02,0,0.25)
cbind(Id, Time, Precipitation)
Thank you so much.
Here is the outcome:
It seems like the order is distorted:
6 1/1/15 0:35 602
7 1/1/15 0:36 582
8 1/1/15 0:37 958
9 1/1/15 0:38 872
10 1/10/14 0:31 500
11 1/10/14 0:32 571
12 1/10/14 0:33 487
13 1/10/14 0:34 220
14 1/10/14 0:35 550
15 1/10/14 0:36 582
16 1/10/14 0:37 524
17 1/10/14 0:38 487
⋮
106 10/10/14 15:16 494
107 10/10/14 7:53 37
108 10/10/14 7:56 24
109 10/10/14 8:01 3
110 10/11/14 0:30 686
111 10/11/14 0:31 592
112 10/11/14 0:32 368
113 10/11/14 0:33 702
114 10/11/14 0:34 540
115 10/11/14 0:35 564
Using the dplyr and lubridate packages, we can extract the hour from each Time and sum.
library(dplyr)
library(lubridate)
df %>%
mutate(hour = hour(dmy_hms(Time))) %>%
group_by(hour) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
For aggregation by date, we can do
df %>%
mutate(day = as.Date(dmy_hms(Time))) %>%
group_by(day) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
Using base R, we could do
df$Hour <- format(as.POSIXct(df$Time, format = "%d/%m/%Y %I:%M:%S %p"), "%H")
df$Day <- as.Date(as.POSIXct(df$Time, format = "%d/%m/%Y %I:%M:%S %p"))
#Aggregation by hour
aggregate(Precipitation~Hour, df, sum, na.rm = TRUE)
#Aggregation by date
aggregate(Precipitation~Day, df, sum, na.rm = TRUE)
EDIT
Based on updated data and information, we can do
df <- readxl::read_xlsx("/path/to/file/df (1).xlsx")
hour_df <- df %>%
group_by(hour = hour(Time)) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
day_df <- df %>%
mutate(day = as.Date(Time)) %>%
group_by(day) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
So hour_df contains the hourly sum of values without taking the date into consideration, and day_df contains the sum of Precipitation for each day.
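If totals per hour within each day are wanted instead, one possible variant (my sketch, assuming Time is already parsed as a date-time as in the update above) combines both groupings:
per_day_hour_df <- df %>%
  group_by(day = as.Date(Time), hour = hour(Time)) %>%
  summarise(Precipitation = sum(Precipitation, na.rm = TRUE))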
data
Id <- c(1,2,3,4)
Time <- c("10/7/2014 12:30:00 am", "10/7/2014 01:00:05 am",
"10/7/2014 01:30:10 am", "10/7/2014 02:00:15 am")
Precipitation <- c(0.06, 0.02,0,0.25)
df <- data.frame(Id, Time, Precipitation)