I have data on start time ('startTime', a date-time variable, POSIXct) and duration in minutes ('duration_minutes'):
df <- data.frame(id = c(1, 2, 3),
startTime = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11")),
duration_minutes = c(315, 120, 45))
I want to split each start time and duration into elapsed time per hour, for every hour from the hour of the start time through the last hour reached by the duration:
df_result <- data.frame(id = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 3),
startTime = c("2018-01-01 12:15:31","2018-01-01 13:00:00",
"2018-01-01 14:00:00","2018-01-01 15:00:00",
"2018-01-01 16:00:00","2018-01-01 17:00:00",
"2018-01-02 23:43:00","2018-01-03 00:00:00",
"2018-01-03 01:00:00",
"2018-01-03 11:00:11"),
duration_minutes = c(44.48, 60, 60, 60, 60, 30.5, 17, 60, 43, 45))
Please advise on a possible solution.
Another possibility:
library(data.table)
library(lubridate)
setDT(df)
df[ , ceil_start := ceiling_date(start, "hour", change_on_boundary = TRUE)]
df[ , {
if(difftime(ceil_start, start, units = "min") > dur) {
.SD[ , .(start, dur)]
} else {
end <- start + dur * 60
time <- c(start,
seq(from = ceil_start,
to = floor_date(end, "hour"),
by = "hour"),
end)
.(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
}
},
by = id]
# id start dur
# 1: 1 2018-01-01 12:15:31 44.48333 mins
# 2: 1 2018-01-01 13:00:00 60.00000 mins
# 3: 1 2018-01-01 14:00:00 60.00000 mins
# 4: 1 2018-01-01 15:00:00 60.00000 mins
# 5: 1 2018-01-01 16:00:00 60.00000 mins
# 6: 1 2018-01-01 17:00:00 30.51667 mins
# 7: 2 2018-01-02 23:43:00 17.00000 mins
# 8: 2 2018-01-03 00:00:00 60.00000 mins
# 9: 2 2018-01-03 01:00:00 43.00000 mins
# 10: 3 2018-01-03 11:00:11 45.00000 mins
# 11: 4 2018-01-03 11:35:00 25.00000 mins
# 12: 4 2018-01-03 12:00:00 10.00000 mins
# 13: 5 2018-01-03 00:00:00 60.00000 mins
# 14: 5 2018-01-03 01:00:00 0.00000 mins
Explanation
Convert the data.frame to a data.table (setDT). Round up the start times to the nearest hour (ceiling_date(start, "hour", ...)). Use change_on_boundary = TRUE for easier handling of times without minutes and seconds (not present in the data, but tested).
To handle cases where the end time (start + duration) falls in the same hour as the start time (e.g. id = 3), check if the difference between the rounded time and the start time is larger than the duration (if (difftime(ceil_start, start, units = "min") > dur)). If so, just select the start and duration columns (.SD[ , .(start, dur)]).
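For id = 3, for instance, the gap to the next full hour exceeds the 45-minute duration, so the row passes through unchanged. A quick check (values taken from the data above):
library(lubridate)
start <- as.POSIXct("2018-01-03 11:00:11")
difftime(ceiling_date(start, "hour"), start, units = "min")
# Time difference of 59.81667 mins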
For the other cases (else), calculate the end time: end <- start + dur * 60. Create a sequence from the up-rounded start time ('ceil_start') to the down-rounded end time, with an hourly increment (seq(from = ceil_start, to = floor_date(end, "hour"), by = "hour")). Concatenate with the 'start' and 'end' times. Return all times except the last (head(time, -1)) and calculate the differences between time steps in minutes (`units<-`(diff(time), "mins")).
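The mechanics are easy to trace by hand for id = 1 (a minimal illustration; values taken from the data above):
library(lubridate)
start <- as.POSIXct("2018-01-01 12:15:31")
end <- start + 315 * 60  # 2018-01-01 17:30:31
time <- c(start,
          seq(from = ceiling_date(start, "hour", change_on_boundary = TRUE),
              to = floor_date(end, "hour"),
              by = "hour"),
          end)
`units<-`(diff(time), "mins")
# Time differences in mins
# [1] 44.48333 60.00000 60.00000 60.00000 60.00000 30.51667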
For times with H:M:S = 00:00:00 where the duration is a multiple of 60 min, like id = 5, the current solution gives a row with a duration of 0 minutes for the last hour. While waiting for a more elegant solution, a quick and dirty fix is simply to delete such rows with duration = 0, as sketched below.
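For example (a sketch; 'res' is just an assumed name for the result of the query shown above):
res <- df[ , {
  if (difftime(ceil_start, start, units = "min") > dur) {
    .SD[ , .(start, dur)]
  } else {
    end <- start + dur * 60
    time <- c(start, seq(ceil_start, floor_date(end, "hour"), by = "hour"), end)
    .(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
  }
}, by = id]
res[as.numeric(dur) > 0]  # drop the spurious 0-minute rows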
Data
Please note that I have added cases not included in the original data: id = 4 (see also my comment above) and id = 5.
df <- data.frame(id = 1:5,
start = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-03 11:35:00",
"2018-01-03 00:00:00")),
dur = c(315, 120, 45, 35, 60))
Try this:
library(data.table)
library(lubridate)
library(magrittr)
df <-
setDT(df)[, start_ceiling := ceiling_date(startTime, "hour", change_on_boundary = TRUE)] %>%
.[, `:=` (
reps = ifelse(
startTime + (duration_minutes * 60) <= start_ceiling, 1, pmax(2, floor(duration_minutes / 60) + 1)
),
initial_diff = as.numeric(difftime(start_ceiling[1], startTime[1], units = "mins"))
), by = id] %>%
.[, df[df[, rep(.I, reps)]]] %>%
.[, startTime := pmax(startTime, floor_date(startTime, "hour") + hours(0:(.N - 1))), by = id] %>%
.[reps > 1, duration_minutes := c(initial_diff[.N],
rep(60, reps[.N] - 2),
(duration_minutes[.N] - initial_diff[.N]) %% 60), by = id] %>%
.[!(duration_minutes == 0 & reps > 1), ] %>%
.[, c("reps", "start_ceiling", "initial_diff") := NULL]
I've tested this with all the scenarios we've gathered so far, and this is the output:
id startTime duration_minutes
1: 1 2018-01-01 12:15:31 44.48333
2: 1 2018-01-01 13:00:00 60.00000
3: 1 2018-01-01 14:00:00 60.00000
4: 1 2018-01-01 15:00:00 60.00000
5: 1 2018-01-01 16:00:00 60.00000
6: 1 2018-01-01 17:00:00 30.51667
7: 2 2018-01-02 23:43:00 17.00000
8: 2 2018-01-03 00:00:00 60.00000
9: 2 2018-01-03 01:00:00 43.00000
10: 3 2018-01-03 11:00:11 45.00000
11: 4 2018-01-04 10:00:00 60.00000
12: 4 2018-01-04 11:00:00 5.00000
13: 5 2018-01-05 00:00:00 60.00000
14: 6 2018-01-06 11:35:00 25.00000
15: 6 2018-01-06 12:00:00 10.00000
16: 7 2018-01-07 00:00:00 60.00000
17: 7 2018-01-07 01:00:00 60.00000
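A quick sanity check (a sketch; after the pipeline above, df holds the result): the per-hour pieces should sum back to the original durations listed under "Data used" below.
df[, .(total_minutes = sum(duration_minutes)), by = id]
# totals: 315, 120, 45, 65, 60, 35, 120 -- matching the input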
Data used:
df <- data.frame(
id = c(1, 2, 3, 4, 5, 6, 7),
startTime = as.POSIXct(
c(
"2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-04 10:00:00",
"2018-01-05 00:00:00",
"2018-01-06 11:35:00",
"2018-01-07 00:00:00"
)
),
duration_minutes = c(315, 120, 45, 65, 60, 35, 120)
)
df
id startTime duration_minutes
1 1 2018-01-01 12:15:31 315
2 2 2018-01-02 23:43:00 120
3 3 2018-01-03 11:00:11 45
4 4 2018-01-04 10:00:00 65
5 5 2018-01-05 00:00:00 60
6 6 2018-01-06 11:35:00 35
7 7 2018-01-07 00:00:00 120
Related
I am trying to see how I can create a variable which summarizes observations across multiple dates.
library(data.table)
library(lubridate)
library(magrittr)
sample <- data.table(start = c("2018-12-22 23:00:00",
"2018-12-23 06:00:00",
"2018-12-22 06:00:00",
"2018-12-23 06:00:00"),
end = c("2018-12-23 06:00:00",
"2018-12-23 13:00:00",
"2018-12-23 12:00:00",
"2018-12-24 01:00:00"),
store = c("A", "A", "B", "B"))
sample[, start:= ymd_hms(start)]
sample[, end := ymd_hms(end)]
sample
> sample
start end store
1: 2018-12-22 23:00:00 2018-12-23 06:00:00 A
2: 2018-12-23 06:00:00 2018-12-23 13:00:00 A
3: 2018-12-22 06:00:00 2018-12-23 12:00:00 B
4: 2018-12-23 06:00:00 2018-12-24 01:00:00 B
Here, sample is a time card of "shifts" worked at each store. We see that store A has two observations, each with a start and end time. If there were no "bleeding" across dates (e.g. the first observation begins on 2018-12-22 and ends on 2018-12-23), I would simply subtract the start and end times and sum within stores to get the total number of minutes used at each store. Something like:
worked_mins <- sample %>%
.[, date := ymd(substr(start,1,10))] %>%
.[, minutes := end - start] %>%
.[, .(worked_mins = sum(minutes)), by = .(store,date)]
However, I am trying to see how to best sum the number of minutes when shifts overlap across multiple days (potentially even >=2 days).
From the above, the desired output would be:
worked_mins = data.table(store = c("A","A", "B", "B", "B"),
date = c("2018-12-22", "2018-12-23",
"2018-12-22", "2018-12-23",
"2018-12-24"),
worked_mins = c(1, 13, 18, 30, 1))
> worked_mins
store date worked_mins
1: A 2018-12-22 1
2: A 2018-12-23 13
3: B 2018-12-22 18
4: B 2018-12-23 30
5: B 2018-12-24 1
Thanks!
An updated solution that counts actual elapsed time, not just whole hours, so it takes fractional hours into account.
library(data.table) # data.table, rbindlist
library(lubridate)  # ceiling_date, floor_date
func <- function(st, en, units = "hours") {
midns <- ceiling_date(seq(st, en, by = "day"), unit = "day")
times <- unique(sort(c(midns[ st < midns & midns < en], st, en)))
if (length(times) < 2) {
data.table(date = as.Date(floor_date(st, unit = "days")), d = structure(0, class = "difftime", units = units))
} else {
data.table(date = as.Date(floor_date(times[-length(times)], unit = "days")), d = `units<-`(diff(times), units))
}
}
sample[, rbindlist(Map(func, start, end)), by = .(store)
][, .(d = sum(d)), by = .(store, date)]
# store date d
# <char> <Date> <difftime>
# 1: A 2018-12-22 1 hours
# 2: A 2018-12-23 13 hours
# 3: B 2018-12-22 18 hours
# 4: B 2018-12-23 30 hours
# 5: B 2018-12-24 1 hours
(The 1 hours entries are still numeric values; they just carry a units label, which can be removed easily by wrapping the diff in as.numeric.)
func works by inserting the midnights that fall between st and en; creating an ordered vector (times) of these unique timestamps lets us diff across them, and flooring each interval's start tells us the date on which each diff began.
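Here is that idea on a single shift (a minimal sketch; values taken from the sample data):
library(lubridate)
st <- as.POSIXct("2018-12-22 23:00:00")
en <- as.POSIXct("2018-12-23 06:00:00")
midns <- ceiling_date(seq(st, en, by = "day"), unit = "day")
times <- unique(sort(c(midns[st < midns & midns < en], st, en)))
diff(times)
# Time differences in hours
# [1] 1 6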
You can see what func is doing with this quick demo, one that makes the first line a 0-second difference (for testing and validation):
copy(sample)[1, end:=start][, rbindlist(Map(func, start, end)), by = .(store)]
# store date d
# <char> <Date> <difftime>
# 1: A 2018-12-22 0 hours
# 2: A 2018-12-23 7 hours
# 3: B 2018-12-22 18 hours
# 4: B 2018-12-23 12 hours
# 5: B 2018-12-23 18 hours
# 6: B 2018-12-24 1 hours
Does this achieve what you need?
library(dplyr)
library(purrr)     # map2
library(tidyr)     # unnest
library(lubridate) # floor_date

sample %>%
rowwise() %>%
mutate(
worked_hours = map2(start, end, ~seq(.x, .y, "hours") %>% head(-1))
) %>%
unnest(cols = c(worked_hours)) %>%
select(store, worked_hours) %>%
mutate(date = floor_date(worked_hours, "days")) %>%
group_by(store, date) %>%
count(name = "worked_mins")
# A tibble: 5 x 3
# Groups: store, date [5]
store date worked_mins
<chr> <dttm> <int>
1 A 2018-12-22 00:00:00 1
2 A 2018-12-23 00:00:00 13
3 B 2018-12-22 00:00:00 18
4 B 2018-12-23 00:00:00 30
5 B 2018-12-24 00:00:00 1
I have a sample dataset of the trajectory of one bike. My objective is to figure out, on average, the amount of time that elapses in between visits to station B.
So far, I have been able to simply order the dataset with:
test[order(test$starttime, decreasing = FALSE),]
and find the row index of where start_station and end_station equal B.
which(test$start_station == 'B')
which(test$end_station == 'B')
The next part is where I run into trouble. In order to calculate the time that elapses in between visits to Station B, we must take the difftime() between the time where start_station == "B" (the bike leaves) and the next occurring record where end_station == "B" (the bike returns), even if that record happens to be in the same row (see row 6).
Using the dataset below, we know that the bike spent 510 minutes between 7:30:00 and 16:00:00 outside of Station B, 30 minutes between 18:00:00 and 18:30:00 outside of Station B, and 210 minutes between 19:00:00 and 22:30:00 outside of Station B, which averages to 250 minutes.
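A quick check of that average:
mean(c(510, 30, 210))
# [1] 250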
How would one reproduce this output in R using difftime()?
> test
bikeid start_station starttime end_station endtime
1 1 A 2017-09-25 01:00:00 B 2017-09-25 01:30:00
2 1 B 2017-09-25 07:30:00 C 2017-09-25 08:00:00
3 1 C 2017-09-25 10:00:00 A 2017-09-25 10:30:00
4 1 A 2017-09-25 13:00:00 C 2017-09-25 13:30:00
5 1 C 2017-09-25 15:30:00 B 2017-09-25 16:00:00
6 1 B 2017-09-25 18:00:00 B 2017-09-25 18:30:00
7 1 B 2017-09-25 19:00:00 A 2017-09-25 19:30:00
8 1 A 2017-09-25 20:00:00 C 2017-09-25 20:30:00
9 1 C 2017-09-25 22:00:00 B 2017-09-25 22:30:00
10 1 B 2017-09-25 23:00:00 C 2017-09-25 23:30:00
Here is the sample data:
> dput(test)
structure(list(bikeid = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), start_station = c("A",
"B", "C", "A", "C", "B", "B", "А", "C", "B"), starttime = structure(c(1506315600,
1506339000, 1506348000, 1506358800, 1506367800, 1506376800, 1506380400,
1506384000, 1506391200, 1506394800), class = c("POSIXct", "POSIXt"
), tzone = ""), end_station = c("B", "C", "A", "C", "B", "B",
"A", "C", "B", "C"), endtime = structure(c(1506317400, 1506340800,
1506349800, 1506360600, 1506369600, 1506378600, 1506382200, 1506385800,
1506393000, 1506396600), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("bikeid",
"start_station", "starttime", "end_station", "endtime"), row.names = c(NA,
-10L), class = "data.frame")
This will calculate the difference as asked, in the order it occurs, but does not append it to the data.frame:
lapply(test$starttime[test$start_station == "B"], function(x, et) difftime(et[x < et][1], x, units = "mins"), et = test$endtime[test$end_station == "B"])
[[1]]
Time difference of 510 mins
[[2]]
Time difference of 30 mins
[[3]]
Time difference of 210 mins
[[4]]
Time difference of NA mins
To calculate the average time:
v1 <- sapply(test$starttime[test$start_station == "B"], function(x, et) difftime(et[x < et][1], x, units = "mins"), et = test$endtime[test$end_station == "B"])
mean(v1, na.rm = TRUE)
[1] 250
Another possibility:
library(data.table)
d <- setDT(test)[ , {
start = starttime[start_station == "B"]
end = endtime[end_station == "B"]
.(start = start, end = end, duration = difftime(end, start, units = "min"))
}
, by = .(trip = cumsum(start_station == "B"))]
d
# trip start end duration
# 1: 0 <NA> 2017-09-25 01:30:00 NA mins
# 2: 1 2017-09-25 07:30:00 2017-09-25 16:00:00 510 mins
# 3: 2 2017-09-25 18:00:00 2017-09-25 18:30:00 30 mins
# 4: 3 2017-09-25 19:00:00 2017-09-25 22:30:00 210 mins
# 5: 4 2017-09-25 23:00:00 <NA> NA mins
d[ , mean(duration, na.rm = TRUE)]
# Time difference of 250 mins
# or
d[ , mean(as.integer(duration), na.rm = TRUE)]
# [1] 250
The data is grouped by a counter which increases by 1 each time a bike starts from "B" (by = .(trip = cumsum(start_station == "B"))).
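You can inspect the counter itself:
cumsum(test$start_station == "B")
# [1] 0 1 1 1 1 2 3 3 3 4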
I have a dataset that looks like this.
id1 = c(1,1,1,1,1,1,1,1,2,2)
id2 = c(3,3,3,3,3,3,3,3,3,3)
lat = c(-62.81559,-62.82330, -62.78693,-62.70136, -62.76476,-62.48157,-62.49064,-62.45838,42.06258,42.06310)
lon = c(-61.15518, -61.14885,-61.17801,-61.00363, -59.14270, -59.22009, -59.32967, -59.04125 ,154.70579, 154.70625)
start_date = as.POSIXct(c('2016-03-24 15:30:00', '2016-03-24 15:30:00', '2016-03-24 23:40:00',
                          '2016-03-25 12:50:00', '2016-03-29 18:20:00', '2016-06-01 02:40:00',
                          '2016-06-01 08:00:00', '2016-06-01 16:30:00', '2016-07-29 20:20:00',
                          '2016-07-29 20:20:00'), tz = 'UTC')
end_date = as.POSIXct(c('2016-03-24 23:40:00', '2016-03-24 18:50:00', '2016-03-25 03:00:00',
                        '2016-03-25 19:20:00', '2016-04-01 03:30:00', '2016-06-02 01:40:00',
                        '2016-06-01 14:50:00', '2016-06-02 01:40:00', '2016-07-30 07:00:00',
                        '2016-07-30 07:00:00'), tz = 'UTC')
speed = c(2.9299398, 2.9437502, 0.0220565, 0.0798409, 1.2824859, 1.8685429, 3.7927680, 1.8549291, 0.8140249,0.8287073)
df = data.frame(id1, id2, lat, lon, start_date, end_date, speed)
id1 id2 lat lon start_date end_date speed
1 1 3 -62.81559 -61.15518 2016-03-24 15:30:00 2016-03-24 23:40:00 2.9299398
2 1 3 -62.82330 -61.14885 2016-03-24 15:30:00 2016-03-24 18:50:00 2.9437502
3 1 3 -62.78693 -61.17801 2016-03-24 23:40:00 2016-03-25 03:00:00 0.0220565
4 1 3 -62.70136 -61.00363 2016-03-25 12:50:00 2016-03-25 19:20:00 0.0798409
5 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
6 1 3 -62.48157 -59.22009 2016-06-01 02:40:00 2016-06-02 01:40:00 1.8685429
7 1 3 -62.49064 -59.32967 2016-06-01 08:00:00 2016-06-01 14:50:00 3.7927680
8 1 3 -62.45838 -59.04125 2016-06-01 16:30:00 2016-06-02 01:40:00 1.8549291
9 2 3 42.06258 154.70579 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8140249
10 2 3 42.06310 154.70625 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8287073
The actual dataset is larger. What I would like to do is consolidate this dataset based on date ranges, grouped by id1 and id2, such that if the date/time range on one row is within 12 hours of the next date/time range (ABS(end_date[1] - start_date[2]) < 12 hrs), the rows should be consolidated, with the new start_date being the earliest date and the end_date the latest (rows whose ranges overlap outright, like rows 6 and 7, likewise represent the same event and should be merged). All other values (lat, lon, speed) will be averaged. This is in some sense a 'deduping' effort, as rows within 12 hours of each other actually represent the same 'event'. For the above example the final result would be
id1 id2 lat lon start_date end_date speed
1 1 3 -62.7818 -61.12142 2016-03-24 15:30:00 2016-03-25 19:20:00 1.493897
2 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
3 1 3 -62.47686 -59.197 2016-06-01 02:40:00 2016-06-02 01:40:00 2.505413
4 2 3 42.06284 154.706 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8213661
With the first four rows consolidated (into row 1), the 5th row left alone (row 2), rows 6-8 consolidated (row 3), and rows 9-10 consolidated (row 4).
I have been trying to do this with dplyr group_by and summarize, but I can't seem to get the date ranges to come out correctly.
Hopefully someone can determine a simple means of solving the problem. Extra points if you know how to do it in SQL ;-) so I can dedupe before even pulling this into R.
Here is a first, very naive implementation. Warning: it is slow, not pretty, and still missing the start and end dates in the output! Note that it expects the rows to be ordered by date and time; if that's not the case in your data set, you can sort it in R or SQL first. Sorry that I can't think of a dplyr or SQL solution; I'd also like to see those two, if anyone has an idea.
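Sorting first, if needed (a one-line sketch):
df <- df[order(df$id1, df$id2, df$start_date), ]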
dedupe <- function(df) {
counter = 1
temp_vector = unlist(df[1, ])
summarized_df = df[0, c(1, 2, 3, 4, 7)]
colnames(summarized_df) = colnames(df)[c(1, 2, 3, 4, 7)]
summarized_df$counter = NULL
for (i in 2:nrow(df)) {
if (((abs(difftime(df[i, "start_date"], df[i - 1, "end_date"], units = "h")) <
12) ||
abs(difftime(df[i, "start_date"], df[i - 1, "start_date"], units = "h")) <
12) &&
df[i, "id1"] == df[i - 1, "id1"] &&
df[i, "id2"] == df[i - 1, "id2"]) {
#group events because id is the same and time range overlap
#sum up columns and select maximum end_date
temp_vector[c(3, 4, 7)] = temp_vector[c(3, 4, 7)] + unlist(df[i, c(3, 4, 7)])
temp_vector["end_date"] = max(temp_vector["end_date"], df[i, "end_date"])
counter = counter + 1
if (i == nrow(df)) {
#in the last iteration we need to create a new group
summarized_df[nrow(summarized_df) + 1, c(1, 2)] = df[i, c(1, 2)]
summarized_df[nrow(summarized_df), 3:5] = temp_vector[c(3, 4, 7)] / counter
summarized_df[nrow(summarized_df), "counter"] = counter
}
    } else {
      #new event: summarize the previous group, then reset the accumulator and counter
      #(take the ids from the previous row, which belongs to the group being closed)
      summarized_df[nrow(summarized_df) + 1, c(1, 2)] = df[i - 1, c(1, 2)]
      summarized_df[nrow(summarized_df), 3:5] = temp_vector[c(3, 4, 7)] / counter
      summarized_df[nrow(summarized_df), "counter"] = counter
      counter = 1
      temp_vector = unlist(df[i, ])
      if (i == nrow(df)) {
        #the last row opens a group of its own, so emit it as well
        summarized_df[nrow(summarized_df) + 1, c(1, 2)] = df[i, c(1, 2)]
        summarized_df[nrow(summarized_df), 3:5] = temp_vector[c(3, 4, 7)]
        summarized_df[nrow(summarized_df), "counter"] = 1
      }
    }
}
return(summarized_df)
}
Function call
> dedupe(df)
   id1 id2       lat       lon     speed counter
4    1   3 -62.78179 -61.12142 1.4938968       4
5    1   3 -62.76476 -59.14270 1.2824859       1
8    1   3 -62.47686 -59.19700 2.5054133       3
10   2   3  42.06284 154.70602 0.8213661       2
This can be easily achieved by using insurancerating::reduce():
df |>
insurancerating::reduce(begin = start_date, end = end_date, id1, id2,
agg_cols = c(lat, lon, speed), agg = "mean",
min.gapwidth = 12 * 3600)
#> id1 id2 index end_date start_date lat lon
#> 1 1 3 0 2016-03-25 19:20:00 2016-03-24 15:30:00 -62.78180 -61.12142
#> 2 1 3 1 2016-04-01 03:30:00 2016-03-29 18:20:00 -62.76476 -59.14270
#> 3 1 3 2 2016-06-02 01:40:00 2016-06-01 02:40:00 -62.47686 -59.19700
#> 4 2 3 0 2016-07-30 07:00:00 2016-07-29 20:20:00 42.06284 154.70602
#> speed
#> 1 1.4938969
#> 2 1.2824859
#> 3 2.5054133
#> 4 0.8213661
Created on 2022-06-13 by the reprex package (v2.0.1)
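For those who would prefer to stay in dplyr, here is a sketch of the same 12-hour gap rule (column names are taken from the question; like the naive loop above, it also merges rows whose start times are within 12 hours of each other, which catches outright overlaps such as rows 6 and 7):
library(dplyr)
df %>%
  group_by(id1, id2) %>%
  arrange(start_date, .by_group = TRUE) %>%
  mutate(new_event = row_number() == 1 |
           !(abs(difftime(start_date, lag(end_date), units = "hours")) < 12 |
             abs(difftime(start_date, lag(start_date), units = "hours")) < 12),
         grp = cumsum(new_event)) %>%
  group_by(id1, id2, grp) %>%
  summarise(lat = mean(lat), lon = mean(lon),
            start_date = min(start_date), end_date = max(end_date),
            speed = mean(speed), .groups = "drop")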
I have a data_frame with POSIXct date-times. I would now like to create a variable that cuts these date-times into timebands: 1 -- [00:00:00, 08:00:00), 2 -- [08:00:00, 17:00:00), 3 -- [17:00:00, 18:30:00), 4 -- [18:30:00, 00:00:00).
Here is some sample data:
df_times = data_frame(
  datetime = seq.POSIXt(
    from = as.POSIXct("2016-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
    by = "min",
    length.out = 100000
  ),
  value = rnorm(100000)
)
Here is the expected output:
> df_times
# A tibble: 100,000 × 3
datetime value band
<dttm> <dbl> <dbl>
1 2016-01-01 00:00:00 0.5855288 1
2 2016-01-01 00:01:00 0.7094660 1
3 2016-01-01 00:02:00 -0.1093033 1
4 2016-01-01 00:03:00 -0.4534972 1
5 2016-01-01 00:04:00 0.6058875 1
6 2016-01-01 00:05:00 -1.8179560 1
7 2016-01-01 00:06:00 0.6300986 1
8 2016-01-01 00:07:00 -0.2761841 1
9 2016-01-01 00:08:00 -0.2841597 1
10 2016-01-01 00:09:00 -0.9193220 1
# ... with 99,990 more rows
I have tried cut.POSIXt, but that insists on keeping track of dates. An ideal solution will use dplyr::recode or forcats.
Here is the solution I think directly translates the intent of the question into code:
library(chron)  # times()
library(dplyr)

set.seed(12345)
# create a dataset
df_times = data_frame(
datetime = seq.POSIXt(
from = as.POSIXct("2016-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
by = "min",
length.out = 100000
),
value = rnorm(100000)
) %>%
mutate(
time = times(format(datetime, "%H:%M:%S")),
band = cut(
time,
breaks = times(c(
"00:00:00",
"08:00:00",
"17:00:00",
"18:30:00",
"23:59:59"
)),
labels = c(
"1",
"2",
"3",
"4"
),
include.lowest = TRUE,
right = FALSE
)
)
You could create an hour column and then cut that (note that the modulo arithmetic below uses the UTC clock; for data in another timezone, derive the hour from the formatted local time instead, e.g. as.numeric(format(datetime, "%H"))):
df_times$hour = as.numeric(df_times$datetime) %% (24*60*60) / 3600
df_times$band = cut(df_times$hour, breaks=c(0,8,17,18.5,24), include.lowest=TRUE,
right=FALSE)
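To get the numeric band codes 1-4 shown in the expected output instead of interval labels, pass labels = FALSE:
df_times$band = cut(df_times$hour, breaks=c(0,8,17,18.5,24), labels=FALSE,
                    include.lowest=TRUE, right=FALSE)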
I have data on 'Start' and 'End' time for different jobs, grouped by 'owner':
Data <- data.frame(
job = c(1, 2, 3, 4, 5),
owner = c("name1", "name2", "name1", "name1", "name2"),
Start = as.POSIXct(c("2015-01-01 15:00:00", "2015-01-01 15:01:00", "2015-01-01 15:13:00", "2015-01-01 15:20:00", "2015-01-01 15:39:02"), format="%Y-%m-%d %H:%M:%S"),
End = as.POSIXct(c("2015-01-01 15:11:11", "2015-01-01 15:17:21", "2015-01-01 15:17:00", "2015-01-01 15:31:21", "2015-01-01 15:40:11"), format="%Y-%m-%d %H:%M:%S")
)
For each owner, I want to calculate the idle time between jobs, i.e. the difference between the 'End' time of one job and the 'Start' time of that owner's next job.
How do I use difftime() to calculate this time difference between specific rows and times in different columns?
The result should look something like this:
job, owner, idletime
1, name1, NA
2, name2, NA
3, name1, 1.816667 # Start of row 3 minus End of row 1
4, name1, 3.0 # Start of row 4 minus End of row 3
...
Here's a possible solution using data.table
library(data.table) # v 1.9.5+
setDT(Data)[, idletime := difftime(Start, shift(End), units = "mins"), by = owner]
# job owner Start End idletime
# 1: 1 name1 2015-01-01 15:00:00 2015-01-01 15:11:11 NA mins
# 2: 2 name2 2015-01-01 15:01:00 2015-01-01 15:17:21 NA mins
# 3: 3 name1 2015-01-01 15:13:00 2015-01-01 15:17:00 1.816667 mins
# 4: 4 name1 2015-01-01 15:20:00 2015-01-01 15:31:21 3.000000 mins
# 5: 5 name2 2015-01-01 15:39:02 2015-01-01 15:40:11 21.683333 mins
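The key piece is shift(), data.table's lead/lag helper; by default it lags by one position, and inside a grouped query it works within each group:
library(data.table)
shift(1:5)                 # default type = "lag": NA 1 2 3 4
shift(1:5, type = "lead")  # 2 3 4 5 NA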
Or using dplyr
library(dplyr)
Data %>%
group_by(owner) %>%
mutate(idletime = difftime(Start, lag(End), units = "mins"))
# Source: local data frame [5 x 5]
# Groups: owner
#
# job owner Start End idletime
# 1 1 name1 2015-01-01 15:00:00 2015-01-01 15:11:11 NA mins
# 2 2 name2 2015-01-01 15:01:00 2015-01-01 15:17:21 NA mins
# 3 3 name1 2015-01-01 15:13:00 2015-01-01 15:17:00 1.816667 mins
# 4 4 name1 2015-01-01 15:20:00 2015-01-01 15:31:21 3.000000 mins
# 5 5 name2 2015-01-01 15:39:02 2015-01-01 15:40:11 21.683333 mins
If we are using base R, ave would be one option. We get the lag of the 'End' grouped by 'owner' using ave, and use that as the second argument in difftime to create 'idtime'. (Note that lag here must be dplyr::lag; base stats::lag operates on time series and will not do what we want, so it is spelled out explicitly below.)
Data$idtime <- with(Data, difftime(Start, ave(End, owner, FUN = dplyr::lag), units = 'mins'))
Data
# job owner Start End idtime
#1 1 name1 2015-01-01 15:00:00 2015-01-01 15:11:11 NA mins
#2 2 name2 2015-01-01 15:01:00 2015-01-01 15:17:21 NA mins
#3 3 name1 2015-01-01 15:13:00 2015-01-01 15:17:00 1.816667 mins
#4 4 name1 2015-01-01 15:20:00 2015-01-01 15:31:21 3.000000 mins
#5 5 name2 2015-01-01 15:39:02 2015-01-01 15:40:11 21.683333 mins
NOTE: I named the column 'idtime' to keep the code on a single line :-)
library(dplyr)
Data <- data.frame(
job = c(1, 2, 3, 4, 5),
owner = c("name1", "name2", "name1", "name1", "name2"),
Start = as.POSIXct(c("2015-01-01 15:00:00", "2015-01-01 15:01:00", "2015-01-01 15:13:00", "2015-01-01 15:20:00", "2015-01-01 15:39:02"), format="%Y-%m-%d %H:%M:%S"),
End = as.POSIXct(c("2015-01-01 15:11:11", "2015-01-01 15:17:21", "2015-01-01 15:17:00", "2015-01-01 15:31:21", "2015-01-01 15:40:11"), format="%Y-%m-%d %H:%M:%S")
)
Data %>%
group_by(owner) %>%
arrange(Start) %>%
mutate(lagEnd = lag(End),
idletime = difftime(Start,lagEnd, units="mins")) %>%
ungroup %>%
arrange(job) %>%
select(job,owner,idletime)
# job owner idletime
# 1 1 name1 NA mins
# 2 2 name2 NA mins
# 3 3 name1 1.816667 mins
# 4 4 name1 3.000000 mins
# 5 5 name2 21.683333 mins