Grouping every n minutes with dplyr - r

I have a dataset containing 10 events occurring at a certain time on a given day, with a corresponding value for each event:
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
"21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
"21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
"21/05/2010 15:01:29", "21/05/2010 09:02:06"), format ="%d/%m/%Y %H:%M:%S"),
value = c(11313,42423,64645,643426,1313313,1313,3535,6476,11313,9875))
I want to aggregate the results every 3 minutes, in a standard dataframe format (from "21/05/2010 00:00:00" to "21/05/2010 23:57:00", so that the dataframe has 480 bins of 3 minutes each).
First, I create a dataframe containing bins of 3 minutes each:
d2 <- data.frame(date = seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=(1440/3)))
Then, I merge the two dataframes together and remove NAs:
library(dplyr)
m <- merge(d1, d2, all=TRUE) %>% mutate(value = ifelse(is.na(value),0,value))
Finally, I use period.apply() from the xts package to sum the values for each bin:
library(xts)
a <- period.apply(m$value, endpoints(m$date, "minutes", 3), sum)
Is there a more efficient way to do this? It does not feel optimal.
Update #1
I adjusted my code after Joshua's answer:
library(xts)
startpoints <- function (x, on = "months", k = 1) {
head(endpoints(x, on, k) + 1, -1)
}
m <- seq(as.POSIXct("2010-05-21 00:00:00"), by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m))
y <- period.apply(x, c(0,startpoints(x, "minutes", 3)), sum, na.rm=TRUE)
I wasn't aware that na.rm=TRUE could be used with period.apply(), which now allows me to skip mutate(value = ifelse(is.na(value),0,value)). It's a step forward, and I'm actually pleased with the xts approach here, but I would like to know whether there is a pure dplyr solution I could use in such a situation.
Update #2
After trying Khashaa's answer, I got incorrect results because my timezone was not specified. I had:
> tail(d4)
interval sumvalue
476 2010-05-21 23:45:00 NA
477 2010-05-21 23:48:00 NA
478 2010-05-21 23:51:00 NA
479 2010-05-21 23:54:00 NA
480 2010-05-21 23:57:00 11313
481 2010-05-22 02:27:00 643426
> d4[450,]
interval sumvalue
450 2010-05-21 22:27:00 NA
Now, after Sys.setenv(TZ="UTC"), it all works fine.
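As an alternative to changing the session-wide TZ, here is a sketch (my addition, not from the answers): build the bin sequence in the same timezone that as.POSIXct() gave d1$date, by passing tz explicitly.
library(lubridate)
# ymd_hms() defaults to UTC; passing the session timezone keeps the bins
# in the same timezone as d1$date, so no Sys.setenv(TZ=...) is needed
d2 <- data.frame(interval = seq(ymd_hms("2010-05-21 00:00:00",
                                        tz = Sys.timezone()),
                                by = "3 min", length.out = 1440/3))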

A lubridate-dplyr-esque solution:
library(lubridate)
library(dplyr)
d2 <- data.frame(interval = seq(ymd_hms('2010-05-21 00:00:00'), by = '3 min',length.out=(1440/3)))
d3 <- d1 %>%
mutate(interval = floor_date(date, unit="hour")+minutes(floor(minute(date)/3)*3)) %>%
group_by(interval) %>%
mutate(sumvalue=sum(value)) %>%
select(interval,sumvalue)
d4 <- merge(d2,d3, all=TRUE) # better if left_join is used
tail(d4)
# interval sumvalue
#475 2010-05-21 23:42:00 NA
#476 2010-05-21 23:45:00 NA
#477 2010-05-21 23:48:00 NA
#478 2010-05-21 23:51:00 NA
#479 2010-05-21 23:54:00 NA
#480 2010-05-21 23:57:00 NA
d4[450,]
# interval sumvalue
#450 2010-05-21 22:27:00 643426
If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join.
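For reference, a minimal sketch of that replacement (assuming the interval values in d3 are unique, as they are for this sample data):
library(dplyr)
d4 <- left_join(d2, d3, by = "interval")  # keeps all 480 bins from d2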

If you need to group data into n-minute bins, floor_date accepts a multiple of a unit in its unit argument. For example:
library(lubridate)
x <- ymd_hms("2009-08-03 12:25:59.23")
floor_date(x, unit = "3minutes")
"2009-08-03 12:24:00 UTC"
Using your example:
library(lubridate)
library(tidyverse)
# make complete time sequence
d2 <- data.frame(timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=(1440/3)))
d1 %>%
mutate(timePeriod = floor_date(date, "3minutes")) %>%
group_by(timePeriod) %>%
summarise(sum = sum(value)) %>%
right_join(d2)
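A variant sketch that skips the scaffold data frame entirely, letting tidyr::complete() fill in the empty bins (assumes the full day is wanted):
d1 %>%
  mutate(timePeriod = floor_date(date, "3minutes")) %>%
  group_by(timePeriod) %>%
  summarise(sum = sum(value)) %>%
  tidyr::complete(timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"),
                                   by = "3 min", length.out = 1440/3))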

I'm not sure about a dplyr solution, but here's an xts solution:
startpoints <- function (x, on = "months", k = 1) {
head(endpoints(x, on, k) + 1, -1)
}
m3 <- seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, c(0,startpoints(x, "minutes", 3)), sum, na.rm=TRUE)
Update: Here's another xts solution that is a bit more careful about correctly aligning the aggregated values. Not to suggest the prior solution was wrong, but this solution is easier to follow and to repeat in other analyses.
m3 <- seq(as.POSIXct("2010-05-20 23:59:59.999"),
by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, endpoints(x, "minutes", 3), sum, na.rm=TRUE)
y <- align.time(y, 60*3)
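# align.time() rounds each timestamp up to the next 3-minute boundary,
# so the aggregated sums are labeled by the end of their bin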

More recently, the padr package has been developed, which can also solve this in a clean way.
library(lubridate)
library(dplyr)
library(padr)
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
"21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
"21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
"21/05/2010 15:01:29", "21/05/2010 09:02:06"), format ="%d/%m/%Y %H:%M:%S"),
value = c(11313,42423,64645,643426,1313313,1313,3535,6476,11313,9875))
res <- d1 %>%
as_tibble() %>%
arrange(date) %>%
# Thicken the results to fall in 3 minute buckets
thicken(
interval = '3 min',
start_val = as.POSIXct('2010-05-21 00:00:00'),
colname = "date_pad") %>%
# Pad the results to fill in the rest of the 3 minute buckets
pad(
interval = '3 min',
by = 'date_pad',
start_val = as.POSIXct('2010-05-21 00:00:00'),
end_val = as.POSIXct('2010-05-21 23:57:00')) %>%
select(date_pad, value)
res
#> # A tibble: 480 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 00:00:00 NA
#> 2 2010-05-21 00:03:00 NA
#> 3 2010-05-21 00:06:00 NA
#> 4 2010-05-21 00:09:00 NA
#> 5 2010-05-21 00:12:00 NA
#> 6 2010-05-21 00:15:00 NA
#> 7 2010-05-21 00:18:00 NA
#> 8 2010-05-21 00:21:00 NA
#> 9 2010-05-21 00:24:00 NA
#> 10 2010-05-21 00:27:00 NA
#> # ... with 470 more rows
res[450,]
#> # A tibble: 1 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 22:27:00 643426
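If you would rather have zeros than NA in the empty bins, padr also provides fill_by_value; a minimal sketch:
res <- res %>% fill_by_value(value)  # fills NA in `value` with 0 by default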

Related

Calculate the difference between two date columns of a dataframe

How can I get the difference between Date1 and Date2 columns of my dataframe?
Date1 Tfd Date2 Sex
13/08/1936 3 09/01/2013 M
25/04/1948 2 14/05/2014 M
26/01/1939 1 03/07/2015 F
13/02/1935 8 03/08/2012 F
I have tried:
age<-apply(df[, c("Date1", "Date2")], function(x, y) difftime(strptime(y, format = "%d.%m.%Y"), strptime(x, format = "%d.%m.%Y"),units="years"))
but I get this error:
Error in strptime(y, format = "%d.%m.%Y") :
argument "y" is missing, with no default
Do you know how I can solve this?
You don't need apply here:
as.numeric(as.Date(df$Date2, "%d/%m/%Y") - as.Date(df$Date1, "%d/%m/%Y"))
#[1] 27908 24125 27917 28296
difftime does not offer 'years' as a unit; the largest unit it supports is weeks. You can divide the weeks value by 52.25 to approximate years, or use lubridate's time_length function.
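A quick base R check of that weeks-to-years approximation, using the first row (a sketch; the result is approximate by construction):
wk <- difftime(as.Date("2013-01-09"), as.Date("1936-08-13"), units = "weeks")
as.numeric(wk) / 52.25
# roughly 76.3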
Or using dplyr with difftime
library(dplyr)
library(lubridate)
df %>%
mutate_at(vars(starts_with('date')), lubridate::dmy) %>%
mutate(diff = time_length(difftime(Date2, Date1), 'years'))
# Date1 Tfd Date2 Sex diff
#1 1936-08-13 3 2013-01-09 M 76.4
#2 1948-04-25 2 2014-05-14 M 66.1
#3 1939-01-26 1 2015-07-03 F 76.4
#4 1935-02-13 8 2012-08-03 F 77.5

Calculate part of duration that occurs in each hour of day

I have a dataframe with start and end times:
id start_time end_time
1 1 2018-09-02 11:13:00 2018-09-02 11:54:00
2 2 2018-09-02 14:34:00 2018-09-02 14:37:00
3 3 2018-09-02 03:00:00 2018-09-02 03:30:00
4 4 2018-09-02 03:49:00 2018-09-02 03:53:00
5 5 2018-09-02 07:05:00 2018-09-02 08:05:00
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
8 8 2018-09-02 07:51:00 2018-09-02 08:15:00
9 9 2018-09-02 08:16:00 2018-09-02 08:55:00
From such periods, how can I calculate the total number of minutes that occurred in each hour, each day? E.g. if a period started at 9:45 and ended at 10:15, I want to assign 15 minutes to the 9:00 hour and 15 minutes to the 10:00 hour.
Or checking the hour 06 in the data above, that hour is included in two different rows (periods):
6 6 2018-09-02 06:44:00 2018-09-02 06:57:00
7 7 2018-09-02 06:04:00 2018-09-02 08:34:00
In the first row, 13 minutes should be assigned to 06, and in the second row 56 minutes. Thus, a total of 69 minutes for the hour 06 on that date.
Expected output from sample data:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 69 mins
3 07 2018-09-02 124 mins
4 08 2018-09-02 93 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
My attempt: I couldn't make it with lubridate, then I found this old question here. I tried to use POSIXct, but the output is correct for some hours and incorrect for other hours. What am I missing here?
df %>%
mutate(minutes = difftime(end_time,start_time),
hourOfDay = format(as.POSIXct(start_time), "%H"),
Day = format(as.POSIXct(start_time),"%Y-%m-%d")) %>%
group_by(hourOfDay, Day) %>%
summarize(totalMinutes = sum(minutes))
Wrong output:
hourOfDay Day totalMinutes
<chr> <chr> <drtn>
1 03 2018-09-02 34 mins
2 06 2018-09-02 163 mins
3 07 2018-09-02 84 mins
4 08 2018-09-02 39 mins
5 11 2018-09-02 41 mins
6 14 2018-09-02 3 mins
Sample data :
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
Here is an alternate solution, similar to Ronak's, but without creating a minute-by-minute data frame.
library(dplyr)
library(lubridate)
df %>%
mutate(hour = purrr::map2(hour(start_time), hour(end_time), seq, by = 1)) %>%
tidyr::unnest(hour) %>%
mutate(minu = case_when(
  hour(start_time) != hour & hour(end_time) == hour ~ 1 * minute(end_time),
  hour(start_time) == hour & hour(end_time) != hour ~ 60 - minute(start_time),
  hour(start_time) == hour & hour(end_time) == hour ~ 1 * minute(end_time) - 1 * minute(start_time),
  TRUE ~ 60)) %>%
group_by(hour) %>%
summarise(sum(minu))
# A tibble: 6 x 2
hour `sum(minu)`
<dbl> <dbl>
1 3 34
2 6 69
3 7 124
4 8 93
5 11 41
6 14 3
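If the data spans several days, a hedged tweak is to carry the date into the grouping as well, replacing the final group_by(hour) %>% summarise(sum(minu)) with:
group_by(date = as.Date(start_time), hour) %>%
  summarise(sum(minu))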
Not the best solution, since it expands the data, but I think it works:
library(dplyr)
library(lubridate)
df %>%
mutate_at(-1, ymd_hms) %>%
mutate(time = purrr::map2(start_time, end_time, seq, by = 'min')) %>%
tidyr::unnest(time) %>%
mutate(hour = hour(time), date = as.Date(time)) %>%
count(date, hour)
# A tibble: 6 x 3
# date hour n
# <date> <int> <int>
#1 2018-09-02 3 36
#2 2018-09-02 6 70
#3 2018-09-02 7 124
#4 2018-09-02 8 97
#5 2018-09-02 11 42
#6 2018-09-02 14 4
We create a sequence from start_time to end_time in 1-minute intervals, extract the hour, and count occurrences for each date and hour. Note that seq includes both endpoints, so each interval counts one minute more than its difftime length, which is why these totals run slightly higher than the expected output; a fix is sketched below.
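A sketch of that fix: stop each per-minute sequence one minute before end_time, so every interval contributes exactly its difftime length (assumes end_time is always later than start_time):
df %>%
  mutate_at(-1, ymd_hms) %>%
  mutate(time = purrr::map2(start_time, end_time - 60, seq, by = 'min')) %>%
  tidyr::unnest(time) %>%
  mutate(hour = hour(time), date = as.Date(time)) %>%
  count(date, hour)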
A data.table / lubridate alternative.
library(data.table)
library(lubridate)
setDT(df)
df[ , ceil_start := ceiling_date(start_time, "hour")]
d = df[ , {
if(ceil_start > end_time){
.SD[ , .(start_time, dur = as.double(end_time - start_time, units = "mins"))]
} else {
time <- c(start_time,
seq(from = ceil_start, to = floor_date(end_time, "hour"), by = "hour"),
end_time)
.(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
}
},
by = id]
setorder(d, start_time)
d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))]
# date hour n_min
# 1: 2018-09-02 3 34
# 2: 2018-09-02 6 69
# 3: 2018-09-02 7 124
# 4: 2018-09-02 8 93
# 5: 2018-09-02 11 41
# 6: 2018-09-02 14 3
Explanation
Convert the data.frame to a data.table (setDT). Round start times up to the nearest hour (ceiling_date(start_time, "hour")).
If the rounded-up time is later than the end time (if(ceil_start > end_time)), select the start time and calculate the duration for that hour (as.double(end_time - start_time, units = "mins")).
Else, create a sequence from the rounded-up start time to the rounded-down end time, with an hourly increment (seq(from = ceil_start, to = floor_date(end_time, "hour"), by = "hour")). Concatenate with the start and end times. Return all times except the last (head(time, -1)) and calculate the difference between consecutive times in minutes (`units<-`(diff(time), "mins")).
Order the data by start time (setorder(d, start_time)). Sum the durations by date and hour: d[ , .(n_min = sum(dur)), by = .(date = as.Date(start_time), hour(start_time))].
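For clarity, a tiny standalone demo of the `units<-`(diff(time), "mins") idiom, using the times from row 7:
x <- as.POSIXct(c("2018-09-02 06:04:00", "2018-09-02 07:00:00",
                  "2018-09-02 08:00:00", "2018-09-02 08:34:00"))
`units<-`(diff(x), "mins")
# Time differences in mins
# [1] 56 60 34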
Here is an option using data.table::foverlaps:
#create a data.table of hourly intervals
hours <- seq(df[, trunc(min(start_time)-60*60, "hours")],
df[, trunc(max(end_time)+60*60, "hours")],
by="1 hour")
hourly <- data.table(start_time=hours[-length(hours)], end_time=hours[-1L],
key=cols)
#set keys and find overlaps
#and then calculate overlapping minutes
setkeyv(df, cols)
foverlaps(hourly, df, nomatch=0L)[,
sum(as.numeric(pmin(end_time, i.end_time) - pmax(start_time, i.start_time))) / 60,
.(i.start_time, i.end_time)]
output:
i.start_time i.end_time V1
1: 2018-09-02 02:00:00 2018-09-02 03:00:00 0
2: 2018-09-02 03:00:00 2018-09-02 04:00:00 34
3: 2018-09-02 06:00:00 2018-09-02 07:00:00 69
4: 2018-09-02 07:00:00 2018-09-02 08:00:00 124
5: 2018-09-02 08:00:00 2018-09-02 09:00:00 93
6: 2018-09-02 11:00:00 2018-09-02 12:00:00 41
7: 2018-09-02 14:00:00 2018-09-02 15:00:00 3
data:
df <- data.frame(
id = c(1,2,3,4,5,6,7,8,9),
start_time = c("2018-09-02 11:13:00", "2018-09-02 14:34:00",
"2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"),
end_time = c("2018-09-02 11:54:00", "2018-09-02 14:37:00",
"2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00"))
library(data.table)
cols <- c("start_time", "end_time")
fmt <- "%Y-%m-%d %T"
setDT(df)[, (cols) := lapply(.SD, as.POSIXct, format=fmt), .SDcols=cols]
Here comes a base R solution, which "reshapes" rows whose time interval does not fall within a single hour into a long format.
It uses a helper function (doTime2 below) that generates time sequences.
This updated version calculates with numeric dates (seconds) and internally uses vapply rather than sapply for the sake of performance.
decompDayHours <- function(data) {
## convert dates into POSIXct if they're not
if (!all(sapply(data[c("start_time", "end_time")], class) == "POSIXct")) {
data[c("start_time", "end_time")] <-
lapply(data[c("start_time", "end_time")], as.POSIXct)
}
doTime2 <- function(x, date) {
## helper function generating time sequences
xd <- as.double(x) - date
hf <- floor(xd/3600)
hs <- `:`(hf[1], hf[2])[-1]*3600
`attr<-`(mapply(`+`, date, hs), "hours", hf)
}
## Reshape time intervals not in same hour
M <- do.call(rbind, sapply(1:nrow(data), function(i) {
h <- vapply(2:3, function(s) as.double(substr(data[i, s], 12, 13)), 0)
date <- as.double(as.POSIXct(format(data[i, 2], "%F")))
if (h[1] != h[2]) {
hr <- c(as.double(data[i, 2]), dt2 <- doTime2(data[i, 2:3], date))
fh <- attr(dt2, "hours")
fhs <- fh[1]:fh[2]
r1 <- t(vapply(seq_along(hr[-1]) - 1, function(j)
c(id=data[i, 1], start_time=hr[1 + j],
end_time=unname(hr[2 + j]), date=date, hour=fhs[j + 1]), c(0, 0, 0, 0, 0)))
rbind(r1,
c(id=data[i, 1], start_time=r1[nrow(r1), 3],
end_time=as.double(data[i, 3]), date=date, hour=fhs[length(fhs)]))
} else {
c(vapply(data[i, ], as.double, 0), date=date, hour=el(h))
}
}))
## calculating difftime
DF <- cbind.data.frame(M, diff=(M[,3] - M[,2])/60)
## aggregating
res <- aggregate(diff ~ date + hour, DF, sum)
res <- transform(res, date=as.POSIXct(res$date, origin="1970-01-01"))
res[order(res$date, res$hour), ]
}
Result
decompDayHours(df1)
# date hour diff
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
decompDayHours(df2)
# date hour diff
# 1 2018-09-02 3 30
# 9 2018-09-02 11 41
# 10 2018-09-02 14 3
# 2 2018-09-03 3 4
# 3 2018-09-03 6 13
# 5 2018-09-03 7 55
# 7 2018-09-03 8 5
# 4 2018-09-04 6 56
# 6 2018-09-04 7 69
# 8 2018-09-04 8 88
Benchmarks
I was curious and ran a vanilla benchmark of all solutions so far, with the date columns converted to POSIXct beforehand. Not all solutions scaled up to the extended data sets, though.
## df1
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 20.022136 20.445664 20.789341 20.566980 20.791374 25.04604 100 e
# dplyr.bas 103.827770 104.705059 106.631214 105.461541 108.365255 127.12306 100 f
# dplyr.otw 8.972915 9.293750 9.623298 9.464182 9.721488 14.28079 100 ab
# data.tbl.hen 9.258668 9.708603 9.960635 9.872784 10.002138 14.14301 100 b
# data.tbl.chi 10.053165 10.348614 10.673600 10.553489 10.714481 15.43605 100 c
# decomp 8.998939 9.259435 9.372276 9.319774 9.392999 13.13701 100 a
# decomp.old 15.567698 15.795918 16.129622 15.896570 16.029114 20.35637 100 d
## df2
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 19.982590 20.411347 20.949345 20.598873 20.895342 27.24736 100 d
# dplyr.bas 103.513187 104.958665 109.305938 105.942346 109.538759 253.80958 100 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen 9.392105 9.708858 10.077967 9.922025 10.121671 15.02859 100 ab
# data.tbl.chi 11.308439 11.701862 12.089154 11.909543 12.167486 16.46731 100 b
# decomp 9.111200 9.317223 9.496347 9.398229 9.574146 13.46945 100 a
# decomp.old 15.561829 15.838653 16.163180 16.031282 16.221232 20.41045 100 c
## df3
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr.ron 382.32849 385.27367 389.42564 388.21884 392.97421 397.72959 3 b
# dplyr.bas 10558.87492 10591.51307 10644.58889 10624.15122 10687.44588 10750.74054 3 e
# dplyr.otw NA NA NA NA NA NA NA NA
# data.tbl.hen NA NA NA NA NA NA NA NA
# data.tbl.chi 12.85534 12.91453 17.23170 12.97372 19.41988 25.86605 3 a
# decomp 785.81346 795.86114 811.73947 805.90882 824.70247 843.49612 3 c
# decomp.old 1564.06747 1592.72370 1614.21763 1621.37992 1639.29271 1657.20550 3 d
Data:
## OP data
df1 <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-02 03:49:00",
"2018-09-02 07:05:00", "2018-09-02 06:44:00", "2018-09-02 06:04:00",
"2018-09-02 07:51:00", "2018-09-02 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-02 03:53:00",
"2018-09-02 08:05:00", "2018-09-02 06:57:00", "2018-09-02 08:34:00",
"2018-09-02 08:15:00", "2018-09-02 08:55:00")), class = "data.frame", row.names = c(NA,
-9L))
## OP data, modified for alternating dates
df2 <- structure(list(id = 1:9, start_time = c("2018-09-02 11:13:00",
"2018-09-02 14:34:00", "2018-09-02 03:00:00", "2018-09-03 03:49:00",
"2018-09-03 07:05:00", "2018-09-03 06:44:00", "2018-09-04 06:04:00",
"2018-09-04 07:51:00", "2018-09-04 08:16:00"), end_time = c("2018-09-02 11:54:00",
"2018-09-02 14:37:00", "2018-09-02 03:30:00", "2018-09-03 03:53:00",
"2018-09-03 08:05:00", "2018-09-03 06:57:00", "2018-09-04 08:34:00",
"2018-09-04 08:15:00", "2018-09-04 08:55:00")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
## df2 sampled to 1k rows
set.seed(42)
df3 <- df2[sample(1:nrow(df2), 1e3, replace=T), ]
Old version:
# decompDayHours.old <- function(df) {
# df[c("start_time", "end_time")] <-
# lapply(df[c("start_time", "end_time")], as.POSIXct)
# doTime <- function(x) {
# ## helper function generating time sequences
# x <- as.POSIXct(sapply(x, strftime, format="%F %H:00"))
# seq.POSIXt(x[1], x[2], "hours")[-1]
# }
# ## Reshape time intervals not in same hour
# df.long <- do.call(rbind, lapply(1:nrow(df), function(i) {
# if (substr(df[i, 2], 12, 13) != substr(df[i, 3], 12, 13)) {
# tt <- c(df[i, 2], doTime(df[i, 2:3]))
# r <- lapply(seq_along(tt[-1]) - 1, function(j)
# data.frame(id=df[i,1], start_time=tt[1 + j], end_time=tt[2 + j]))
# rr <- do.call(rbind, r)
# rbind(rr, data.frame(id=df[i, 1], start_time=rr[nrow(rr), 3], end_time=df[i, 3]))
# } else {
# df[i, ]
# }
# }))
# ## calculating difftime
# df.long$diff <- apply(df.long[-1], 1, function(x) abs(difftime(x[1], x[2], units="mins")))
# ## aggregating
# with(df.long, aggregate(list(totalMinutes=diff),
# by=list(Day=as.Date(start_time),
# hourOfDay=substr(start_time, 12, 13)),
# FUN=sum))[c(2, 1, 3)]
# }
An alternative solution that does not expand the data, but requires a helper function:
library(dplyr)
library(lubridate)
count_minutes <- function(start_time, end_time) {
time_interval <- interval(start_time, end_time)
start_hour <- floor_date(start_time, unit = "hour")
end_hour <- ceiling_date(end_time, unit = "hour")
diff_hours <- as.double(difftime(end_hour, start_hour, units = "hours"))
hours <- start_hour + hours(0:diff_hours)
hour_intervals <- int_diff(hours)
minutes_per_hour <- as.double(intersect(time_interval, hour_intervals), units = "minutes")
hours <- hours[1:(length(hours)-1)]
tibble(Day = date(hours),
hourOfDay = hour(hours),
totalMinutes = minutes_per_hour)
}
df %>%
mutate(start_time = as_datetime(start_time),
end_time = as_datetime(end_time)) %>%
as_tibble() %>%
mutate(minutes_per_hour = purrr::map2(start_time, end_time, count_minutes)) %>%
unnest(minutes_per_hour) %>%
group_by(Day, hourOfDay) %>%
summarise(totalMinutes = sum(totalMinutes)) %>%
ungroup()
# A tibble: 6 x 3
# Day hourOfDay totalMinutes
# <date> <int> <dbl>
# 1 2018-09-02 3 34
# 2 2018-09-02 6 69
# 3 2018-09-02 7 124
# 4 2018-09-02 8 93
# 5 2018-09-02 11 41
# 6 2018-09-02 14 3
For every pair of start_time and end_time, the helper function counts how many minutes fall into each hour the interval touches, and returns this as a tibble. It can then be applied to every such pair in your data, unnested, and summarized to calculate the totals.

Efficient Group Variable to Note When Values Fall Between Two Times

I have a dataset that contains start and end time stamps, as well as a performance percentage. I'd like to calculate group statistics over hourly blocks, e.g. "the average performance for the midnight hour was x%."
My question is whether there is a more efficient way to do this than a series of ifelse() statements.
# some sample data
pre.starting <- data.frame(starting = format(seq.POSIXt(from =
as.POSIXct(Sys.Date()), to = as.POSIXct(Sys.Date()+1), by = "5 min"),
"%H:%M", tz="GMT"))
pre.ending <- data.frame(ending = pre.starting[seq(1, nrow(pre.starting),
2), ])
ending2 <- pre.ending[-c(1), ]
starting2 <- data.frame(pre.starting = pre.starting[!(pre.starting$starting
%in% pre.ending$ending),])
dataset <- data.frame(starting = starting2
, ending = ending2
, perct = rnorm(nrow(starting2), 0.5, 0.2))
For example, I could create hour blocks with code along the lines of the following:
dataset2 <- dataset %>%
mutate(hour = ifelse(starting >= 00:00 & ending < 01:00, 12
, ifelse(starting >= 01:00 & ending < 02:00, 1
, ifelse(starting >= 02:00 & ending < 03:00, 13)))
) %>%
group_by(hour) %>%
summarise(mean.perct = mean(perct, na.rm=T))
Is there a way to make this code more efficient, or improve beyond ifelse()?
We can use cut to bin each ending time into hourly intervals after converting the timestamps to POSIXct, and then take the mean for each hour.
library(dplyr)
dataset %>%
mutate_at(vars(pre.starting, ending), as.POSIXct, format = "%H:%M") %>%
group_by(ending_hour = cut(ending, breaks = "1 hour")) %>%
summarise(mean.perct = mean(perct, na.rm = TRUE))
# ending_hour mean.perct
# <fct> <dbl>
# 1 2019-09-30 00:00:00 0.540
# 2 2019-09-30 01:00:00 0.450
# 3 2019-09-30 02:00:00 0.612
# 4 2019-09-30 03:00:00 0.470
# 5 2019-09-30 04:00:00 0.564
# 6 2019-09-30 05:00:00 0.437
# 7 2019-09-30 06:00:00 0.413
# 8 2019-09-30 07:00:00 0.397
# 9 2019-09-30 08:00:00 0.492
#10 2019-09-30 09:00:00 0.613
# … with 14 more rows

How to aggregate/sum values by time in R

There are 2 issues:
I have time data in factor format and I want to change it into date format for later manipulation.
The goal is to sum values of precipitation of the same time unit, eg. precipitation per hour.
I tried to convert the time using as.POSIXct() or lubridate's as_date(), but I always get NA values after defining the format. This is the code I used:
tt=as.POSIXct(FixUseNew$StartTimestamp, )
df$time <- as.Date(df$time, "%d-%m-%Y")
If I leave out the format and do the following :
tt=as.POSIXct(df$time)
tt
hour(tt)
The date data looks like this now: "0010-07-14 00:38:00 LMT"
I wanted to use the aggregate function to sum the precipitation in the same hour interval or day, but couldn't do it as I am stuck with the date format.
Just a brain dump: I was going to change the factor date into character and then into date format as follows. Please advise if that is a good idea.
df$time <-paste(substr(df$time,6,7),
substr(df$time,9,10),
substr(df$time,1,4),sep="/")
Here is a subset of the data, hope this helps to illustrate the question better:
Id <- c(1,2,3,4)
Time <- c("10/7/2014 12:30:00 am", "10/7/2014 01:00:05 am","10/7/2014 01:30:10 am", "10/7/2014 02:00:15 am")
Precipitation <- c(0.06, 0.02,0,0.25)
cbind(Id, Time, Precipitation)
Thank you so much.
Here is the outcome:
It seems like the order is distorted:
6 1/1/15 0:35 602
7 1/1/15 0:36 582
8 1/1/15 0:37 958
9 1/1/15 0:38 872
10 1/10/14 0:31 500
11 1/10/14 0:32 571
12 1/10/14 0:33 487
13 1/10/14 0:34 220
14 1/10/14 0:35 550
15 1/10/14 0:36 582
16 1/10/14 0:37 524
17 1/10/14 0:38 487
⋮
106 10/10/14 15:16 494
107 10/10/14 7:53 37
108 10/10/14 7:56 24
109 10/10/14 8:01 3
110 10/11/14 0:30 686
111 10/11/14 0:31 592
112 10/11/14 0:32 368
113 10/11/14 0:33 702
114 10/11/14 0:34 540
115 10/11/14 0:35 564
Using the dplyr and lubridate packages, we can extract the hour from each Time and sum:
library(dplyr)
library(lubridate)
df %>%
mutate(hour = hour(dmy_hms(Time))) %>%
group_by(hour) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
For aggregation by date, we can do
df %>%
mutate(day = as.Date(dmy_hms(Time))) %>%
group_by(day) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
Using base R, we could do
df$Hour <- format(as.POSIXct(df$Time, format = "%d/%m/%Y %I:%M:%S %p"), "%H")
df$Day <- as.Date(as.POSIXct(df$Time, format = "%d/%m/%Y %I:%M:%S %p"))
#Aggregation by hour
aggregate(Precipitation~Hour, df, sum, na.rm = TRUE)
#Aggregation by date
aggregate(Precipitation~Day, df, sum, na.rm = TRUE)
EDIT
Based on updated data and information, we can do
df <- readxl::read_xlsx("/path/to/file/df (1).xlsx")
hour_df <- df %>%
group_by(hour = hour(Time)) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
day_df <- df %>%
mutate(day = as.Date(Time)) %>%
group_by(day) %>%
summarise(Precipitation = sum(Precipitation, na.rm = TRUE))
So hour_df has the hourly sum of values without taking the date into consideration, and day_df has the sum of Precipitation for each day.
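If instead you want hourly totals within each day, a hedged sketch combining both groupings (using the sample data below, where Time is still character):
df %>%
  mutate(Time = dmy_hms(Time)) %>%
  group_by(day = as.Date(Time), hour = hour(Time)) %>%
  summarise(Precipitation = sum(Precipitation, na.rm = TRUE))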
data
Id <- c(1,2,3,4)
Time <- c("10/7/2014 12:30:00 am", "10/7/2014 01:00:05 am",
"10/7/2014 01:30:10 am", "10/7/2014 02:00:15 am")
Precipitation <- c(0.06, 0.02,0,0.25)
df <- data.frame(Id, Time, Precipitation)

Build datetime column in R

I have 2 columns
one is date :
2011-04-13
2013-07-29
2010-11-23
the other is time :
3
22
15
I want to make a new column that contains the datetime, like this:
2011-04-13 3:00:00
2013-07-29 22:00:00
2010-11-23 15:00:00
I managed to combine them as a string, but when I convert it to datetime I only get the date; the time disappears.
Any idea how to get date and time in one column?
My script:
data <- read.csv("d:\\__r\\hour.csv")
data$date <- as.POSIXct(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "))
As an example, you can use the ymd_hm function from lubridate:
a <- c("2014-09-08", "2014-09-08", "2014-09-08")
b <- c(3, 4, 5)
library(lubridate)
library(tidyverse)
tibble(a, b) %>%
mutate(time = paste0(a, " ", b, "-0"),
time = ymd_hm(time))
output would be:
# A tibble: 3 x 3
a b time
<chr> <dbl> <dttm>
1 2014-09-08 3 2014-09-08 03:00:00
2 2014-09-08 4 2014-09-08 04:00:00
3 2014-09-08 5 2014-09-08 05:00:00
I found that this fixed the problem:
data$date <- as.POSIXct(strptime(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "), "%Y-%m-%d %H:%M:%S"))
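An equivalent sketch without strptime, zero-padding the hour via sprintf (assumes dteday strings like "2011-04-13" and hr values 0-23):
data$date <- as.POSIXct(paste0(data$dteday, sprintf(" %02d:00:00", data$hr)),
                        format = "%Y-%m-%d %H:%M:%S")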
