lag function with variable n - r

I am having some problems using the lag function in dplyr. This is my dataset:
ID <- c(100, 100, 100, 200, 200, 300, 300)
daytime <- c("2010-12-21 06:00:00", "2010-12-21 09:00:00", "2010-12-21 13:00:00", "2010-12-23 23:00:00", "2010-12-24 02:00:00", "2010-12-25 19:00:00", "2010-12-31 08:00:00")
lagfirstvisit <- c(0, 0, 2, 0, 1, 0, 0)
table <- cbind(ID, daytime, lagfirstvisit)
table <- as.data.frame(table)
table$daytime <- as.POSIXct(table$daytime)
My aim is to generate a new column containing the lag of the daytime variable, lagged by the number given in the lagfirstvisit column. That is, if lagfirstvisit == 2, I want the lag-2 daytime value within that particular ID; if lagfirstvisit == 0, the row keeps its original daytime value.
My expected result is as follows:
ID <- c(100, 100, 100, 200, 200, 300, 300)
daytime <- c("2010-12-21 06:00:00", "2010-12-21 09:00:00", "2010-12-21 13:00:00", "2010-12-23 23:00:00", "2010-12-24 02:00:00", "2010-12-25 19:00:00", "2010-12-31 08:00:00")
lagfirstvisit <- c(0, 0, 2, 0, 1, 0, 0)
result <- c("2010-12-21 06:00:00", "2010-12-21 09:00:00", "2010-12-21 06:00:00", "2010-12-23 23:00:00", "2010-12-23 23:00:00", "2010-12-25 19:00:00", "2010-12-31 08:00:00")
table.results <- cbind(ID, daytime, lagfirstvisit, result)
Currently, the code I am using is:
table <- table %>%
  group_by(ID) %>%
  mutate(result = lag(as.POSIXct(daytime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
                      n = as.integer(lagfirstvisit)))
However, I get the error:
Error in mutate_impl(.data, dots) :
Evaluation error: n must be a non-negative integer scalar, not integer of length 3.
Does anyone know how I can resolve this problem? Thank you very much!

One option is to index each group's vector with mapply(), so the lag can vary by row; the output below uses a simplified dataset with a numeric day column in place of daytime:

table.results %>%
  group_by(ID) %>%
  mutate(result2 = mapply(`[`, list(day), row_number() - lagfirstvisit))
# A tibble: 7 x 5
# Groups:   ID [3]
     ID   day lagfirstvisit result result2
  <dbl> <dbl>         <dbl>  <dbl>   <dbl>
1  100.   21.            0.    21.     21.
2  100.   22.            0.    22.     22.
3  100.   23.            2.    21.     21.
4  200.   12.            0.    12.     12.
5  200.   13.            1.    12.     12.
6  300.   19.            0.    19.     19.
7  300.   22.            0.    22.     22.

table %>%
  mutate_all(~ as.numeric(as.character(.x))) %>% # First ensure all columns are numeric
  mutate(result = day[1:n() - lagfirstvisit])    # you can also use row_number() instead of 1:n()
   ID day lagfirstvisit result
1 100  21             0     21
2 100  22             0     22
3 100  23             2     21
4 200  12             0     12
5 200  13             1     12
6 300  19             0     19
7 300  22             0     22
Caution: refrain from using built-in function names as variable names; for example, you should not use the name table, as this is a function in base R.
EDIT:
With the new data, the procedure remains the same, as long as lagfirstvisit is numeric:

table %>%
  mutate(result = daytime[1:n() - as.numeric(as.character(lagfirstvisit))])
   ID             daytime lagfirstvisit              result
1 100 2010-12-21 06:00:00             0 2010-12-21 06:00:00
2 100 2010-12-21 09:00:00             0 2010-12-21 09:00:00
3 100 2010-12-21 13:00:00             2 2010-12-21 06:00:00
4 200 2010-12-23 23:00:00             0 2010-12-23 23:00:00
5 200 2010-12-24 02:00:00             1 2010-12-23 23:00:00
6 300 2010-12-25 19:00:00             0 2010-12-25 19:00:00
7 300 2010-12-31 08:00:00             0 2010-12-31 08:00:00

I think it is a little cleaner to group by ID and index directly, since dplyr::lag() only accepts a single non-negative integer for n:

table %>%
  group_by(ID) %>%
  mutate(result = daytime[row_number() - as.integer(as.character(lagfirstvisit))]) %>%
  ungroup()

Within each ID, row_number() - lagfirstvisit is the position of the observation to pull from, so a lag of 0 returns the row's own value.

Related

Calculate Rolling 12 Hours by Group in R

I am working on a project where I must include only patients who had lab tests ordered at least 12 hours apart, keeping the timestamp of each included lab test. The issue is that many patients get several labs done within the 12-hour window, but the client has asked not to include those tests. I have made it this far:
#Create dummy dataset
df = data.frame(
"Encounter" = c(rep("12345", times=16), rep("67890", times = 5)),
"Timestamp" = c("01/06/2022 04:00:00", "01/07/2022 08:00:00",
"01/08/2022 00:00:00", "01/08/2022 04:00:00",
"01/08/2022 08:00:00", "01/08/2022 20:00:00",
"01/09/2022 04:00:00", "01/09/2022 08:00:00",
"01/09/2022 20:00:00", "01/09/2022 23:26:00",
"01/10/2022 00:00:00", "01/10/2022 08:00:00",
"01/10/2022 20:00:00", "01/11/2022 00:00:00",
"01/11/2022 20:00:00", "01/12/2022 04:00:00",
"11/10/2021 11:00:00", "11/10/2021 12:00:00",
"11/10/2021 13:00:00", "11/10/2021 14:00:00",
"11/11/2021 00:00:00"))
#Convert timestamp to POSIXct format
df$Timestamp <- as.POSIXct(df$Timestamp, format = "%m/%d/%Y %H:%M:%S")
#Calculate time (in hours) between each timestamp and the previous one, by Encounter
df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours"))
I can't seem to figure out what to do next. It seems like I need a rolling 12-hour total that resets to 0 once a row reaches 12 hours, but I'm not sure how to go about it. Below is my ideal result:
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
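In other words, the rule is a cumulative gap that resets to zero each time a row is kept. As a plain-loop sketch of just that logic (keep_rows is a hypothetical helper, not part of the question):

# cumulative gap in hours that resets whenever a test is kept;
# ts must be sorted date-times for one Encounter
keep_rows <- function(ts, thresh = 12) {
  keep <- integer(length(ts))
  acc <- thresh                       # seed so the first test is always kept
  for (i in seq_along(ts)) {
    if (i > 1) acc <- acc + as.numeric(difftime(ts[i], ts[i - 1], units = "hours"))
    if (acc >= thresh) {
      keep[i] <- 1
      acc <- 0                        # reset after keeping a test
    }
  }
  keep
}

Applied per Encounter with a grouped mutate(), this reproduces the Keep.Row vector above; the answers below vectorize the same idea.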
There is absolutely nothing elegant about this, but I believe it gives you what you're looking for. I use a temporary variable to store the "rolling" sum before it is reset once the hours between rows reach 12 or more.
library(tidyverse)

df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  replace_na(list(time_diff = 0)) %>%
  mutate(temp = ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(time_diff) + time_diff),
         temp = ifelse(is.na(temp), 0, temp),
         hours_between = ifelse(time_diff >= 12, time_diff,
                                ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(temp) + time_diff)),
         keep = ifelse(hours_between >= 12 | is.na(hours_between), 1, 0)) %>%
  select(-temp)
Created on 2022-01-27 by the reprex package (v2.0.1)
Here is an alternative option using accumulate(). Take the differences and, once the running total reaches the threshold of 12 hours, reset it by starting over from the current diff value instead of continuing the cumulative sum. To include the first time for each Encounter, you can either make that first diff 12 hours (done below via the default argument of lag()) or add a separate mutate() that sets keep to 1 wherever Timestamp == first(Timestamp). The unary + at the end just converts the logical keep flag to 0/1.
library(tidyverse)

thresh <- 12

df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(diff = difftime(Timestamp,
                         lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)),
                         units = "hours"),
         keep = +(accumulate(diff, ~ if_else(.x >= thresh, .y, .x + .y)) >= thresh))
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
Probably missing something, but wouldn't this work:
library(dplyr)

df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_dif = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  filter(time_dif > 12)

Convert start time and total duration to elapsed time per hour

I have data on start time ('startTime', a date-time variable, POSIXct) and duration in minutes ('duration_minutes'):
df <- data.frame(id = c(1, 2, 3),
startTime = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11")),
duration_minutes = c(315, 120, 45))
I want to convert the start time and duration into elapsed time per hour: one row for each hour, from the hour of the start time through the last hour covered by the duration:
df_result <- data.frame(id = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 3),
startTime = c("2018-01-01 12:15:31","2018-01-01 13:00:00",
"2018-01-01 14:00:00","2018-01-01 15:00:00",
"2018-01-01 16:00:00","2018-01-01 17:00:00",
"2018-01-02 23:43:00","2018-01-03 00:00:00",
"2018-01-03 01:00:00",
"2018-01-03 11:00:11"),
duration_minutes = c(44.48, 60, 60, 60, 60, 30.5, 17, 60, 43, 45))
Please advise on a possible solution.
Another possibility:
library(data.table)
library(lubridate)

setDT(df)
df[, ceil_start := ceiling_date(start, "hour", change_on_boundary = TRUE)]

df[, {
    if (difftime(ceil_start, start, units = "min") > dur) {
      .SD[, .(start, dur)]
    } else {
      end <- start + dur * 60
      time <- c(start,
                seq(from = ceil_start,
                    to = floor_date(end, "hour"),
                    by = "hour"),
                end)
      .(start = head(time, -1), dur = `units<-`(diff(time), "mins"))
    }
  },
  by = id]
# id start dur
# 1: 1 2018-01-01 12:15:31 44.48333 mins
# 2: 1 2018-01-01 13:00:00 60.00000 mins
# 3: 1 2018-01-01 14:00:00 60.00000 mins
# 4: 1 2018-01-01 15:00:00 60.00000 mins
# 5: 1 2018-01-01 16:00:00 60.00000 mins
# 6: 1 2018-01-01 17:00:00 30.51667 mins
# 7: 2 2018-01-02 23:43:00 17.00000 mins
# 8: 2 2018-01-03 00:00:00 60.00000 mins
# 9: 2 2018-01-03 01:00:00 43.00000 mins
# 10: 3 2018-01-03 11:00:11 45.00000 mins
# 11: 4 2018-01-03 11:35:00 25.00000 mins
# 12: 4 2018-01-03 12:00:00 10.00000 mins
# 13: 5 2018-01-03 00:00:00 60.00000 mins
# 14: 5 2018-01-03 01:00:00 0.00000 mins
Explanation
Convert the data.frame to a data.table (setDT). Round the start times up to the nearest hour (ceiling_date(start, "hour", ...)). Use change_on_boundary = TRUE for easier handling of times without minutes and seconds (not in the data, but tested).
To handle cases where the end time (start + duration) falls in the same hour as the start time (e.g. id = 3), check whether the difference between the rounded time and the start time is larger than the duration (if (difftime(ceil_start, start, units = "min") > dur)). If so, just select the start and duration columns (.SD[, .(start, dur)]).
For the other cases (else), calculate the end time: end <- start + dur * 60. Create a sequence from the up-rounded start time ('ceil_start') to the down-rounded end time, with an hourly increment (seq(from = ceil_start, to = floor_date(end, "hour"), by = "hour")). Concatenate it with the 'start' and 'end' times. Return all times except the last (head(time, -1)) and calculate the difference between time steps in minutes (`units<-`(diff(time), "mins")).
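To make the seq/diff step concrete, here is the calculation for id = 1 in isolation (the values match the first six rows of the output above):

library(lubridate)

start <- as.POSIXct("2018-01-01 12:15:31")
dur <- 315  # minutes
ceil_start <- ceiling_date(start, "hour", change_on_boundary = TRUE)
end <- start + dur * 60
time <- c(start,
          seq(from = ceil_start, to = floor_date(end, "hour"), by = "hour"),
          end)
`units<-`(diff(time), "mins")
# Time differences in mins
# [1] 44.48333 60.00000 60.00000 60.00000 60.00000 30.51667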
For start times with H:M:S = 00:00:00 whose duration is a multiple of 60 min, like id = 5, the current solution gives a row with a duration of 0 minutes for the last hour. While waiting for a more elegant solution, a quick and dirty fix is just to delete such rows with duration = 0, as shown below.
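That deletion is a one-line filter on the result (res is a hypothetical name for the data.table returned by the grouped j-expression above):

res <- res[as.numeric(dur) > 0]  # drop the spurious 0-minute rows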
Data
Please note that I have added cases not included in the original data, id = 4 (see also my comment above) and id = 5.
df <- data.frame(id = 1:5,
start = as.POSIXct(c("2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-03 11:35:00",
"2018-01-03 00:00:00")),
dur = c(315, 120, 45, 35, 60))
Try this:
library(data.table)
library(lubridate)
library(magrittr)

df <-
  setDT(df)[, start_ceiling := ceiling_date(startTime, "hour", change_on_boundary = TRUE)] %>%
  .[, `:=`(
    reps = ifelse(startTime + (duration_minutes * 60) <= start_ceiling,
                  1,
                  pmax(2, floor(duration_minutes / 60) + 1)),
    initial_diff = as.numeric(difftime(start_ceiling[1], startTime[1], units = "mins"))
  ), by = id] %>%
  .[, df[df[, rep(.I, reps)]]] %>%
  .[, startTime := pmax(startTime, floor_date(startTime, "hour") + hours(0:(.N - 1))), by = id] %>%
  .[reps > 1, duration_minutes := c(initial_diff[.N],
                                    rep(60, reps[.N] - 2),
                                    (duration_minutes[.N] - initial_diff[.N]) %% 60), by = id] %>%
  .[!(duration_minutes == 0 & reps > 1), ] %>%
  .[, c("reps", "start_ceiling", "initial_diff") := NULL]
I've tested this with all the scenarios we've gathered so far, and this is the output:
id startTime duration_minutes
1: 1 2018-01-01 12:15:31 44.48333
2: 1 2018-01-01 13:00:00 60.00000
3: 1 2018-01-01 14:00:00 60.00000
4: 1 2018-01-01 15:00:00 60.00000
5: 1 2018-01-01 16:00:00 60.00000
6: 1 2018-01-01 17:00:00 30.51667
7: 2 2018-01-02 23:43:00 17.00000
8: 2 2018-01-03 00:00:00 60.00000
9: 2 2018-01-03 01:00:00 43.00000
10: 3 2018-01-03 11:00:11 45.00000
11: 4 2018-01-04 10:00:00 60.00000
12: 4 2018-01-04 11:00:00 5.00000
13: 5 2018-01-05 00:00:00 60.00000
14: 6 2018-01-06 11:35:00 25.00000
15: 6 2018-01-06 12:00:00 10.00000
16: 7 2018-01-07 00:00:00 60.00000
17: 7 2018-01-07 01:00:00 60.00000
Data used:
df <- data.frame(
id = c(1, 2, 3, 4, 5, 6, 7),
startTime = as.POSIXct(
c(
"2018-01-01 12:15:31",
"2018-01-02 23:43:00",
"2018-01-03 11:00:11",
"2018-01-04 10:00:00",
"2018-01-05 00:00:00",
"2018-01-06 11:35:00",
"2018-01-07 00:00:00"
)
),
duration_minutes = c(315, 120, 45, 65, 60, 35, 120)
)
df
id startTime duration_minutes
1 1 2018-01-01 12:15:31 315
2 2 2018-01-02 23:43:00 120
3 3 2018-01-03 11:00:11 45
4 4 2018-01-04 10:00:00 65
5 5 2018-01-05 00:00:00 60
6 6 2018-01-06 11:35:00 35
7 7 2018-01-07 00:00:00 120

calculate duration of time interval while removing certain time spans

Assume we have an interval spanning several days (interval "A" in Figure below).
library(lubridate)
int <- interval("2018-01-01 22:00:00", "2018-01-04 10:00:00")
In hours, I get
as.period(int, unit = "hours")
"60H 0M 0S"
Now, I want to subtract all non-working hours, here 16:00-08:00 (greyed out), from that interval, i.e. only keep the blue parts (08:00-16:00) and, again, calculate the remaining hours (see "B" in Figure below), which would be 8 + 8 + 2 = 18 hours.
One approach would be to create a list of intervals I want to keep, spanning the entire interval, and then calculate intersections. (The code below could, of course, be set up programmatically using floor/ceiling/seq functions etc.; a sketch of that follows the example.)
int_keep <- list(
interval("2018-01-01 08:00:00", "2018-01-01 16:00:00"),
interval("2018-01-02 08:00:00", "2018-01-02 16:00:00"),
interval("2018-01-03 08:00:00", "2018-01-03 16:00:00"),
interval("2018-01-04 08:00:00", "2018-01-04 16:00:00"),
interval("2018-01-05 08:00:00", "2018-01-05 16:00:00")
)
l <- lapply(int_keep, function(x) intersect(x, int))
mns <- sapply(l, as.numeric) # returns seconds
sum(mns, na.rm = T) / 60 / 60 # sum of intersections in hours
[1] 18
While this works, it appears utterly clumsy to me. What would be a less tedious way to do this?
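For reference, the programmatic construction mentioned above might look like the following sketch, which builds one 08:00-16:00 interval per calendar day touched by int using lubridate's vectorized intervals (day_seq is just an illustrative name):

library(lubridate)

int <- interval("2018-01-01 22:00:00", "2018-01-04 10:00:00")

day_seq <- seq(floor_date(int_start(int), "day"),
               ceiling_date(int_end(int), "day"),
               by = "day")
int_keep <- interval(day_seq + hours(8), day_seq + hours(16))

overlaps <- intersect(int_keep, int)            # NA where there is no overlap
sum(as.numeric(overlaps), na.rm = TRUE) / 3600  # sum of intersections in hours
# [1] 18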
df <- data.frame(DateTime = seq.POSIXt(as.POSIXct("2018-01-01 22:00:00"), as.POSIXct("2018-01-04 10:00:00"), by = "1 hour"))
head(df)
#             DateTime
#1 2018-01-01 22:00:00
#2 2018-01-01 23:00:00
#3 2018-01-02 00:00:00
#4 2018-01-02 01:00:00
#5 2018-01-02 02:00:00
#6 2018-01-02 03:00:00
#you want the hours worked between A and B
A <- "08:00:00"
B <- "16:00:00"
#a simple ifelse statement to assign a value of 1 to column "value" if the time is between 8 and 16, or a 0 if it's not:
df$value <- ifelse(format(df$DateTime, "%H:%M:%S") > A & format(df$DateTime, "%H:%M:%S") <= B, 1, 0)
tail(df)
#              DateTime value
#56 2018-01-04 05:00:00     0
#57 2018-01-04 06:00:00     0
#58 2018-01-04 07:00:00     0
#59 2018-01-04 08:00:00     0
#60 2018-01-04 09:00:00     1
#61 2018-01-04 10:00:00     1
#now taking the sum of the value column will give you the total hours worked:
TotalHoursWorked <- sum(df$value)
TotalHoursWorked
#[1] 18

R Difference in time between rows

I've triangulated information from other SO answers for the code below, but I'm getting stuck with an error message. I've searched SO for similar errors and resolutions but haven't been able to figure it out, so help is appreciated.
For every group ("id"), I want to get the difference between the start times for consecutive rows.
Reproducible data:
require(dplyr)
df <-data.frame(id=as.numeric(c("1","1","1","2","2","2")),
start= c("1/31/17 10:00","1/31/17 10:02","1/31/17 10:45",
"2/10/17 12:00", "2/10/17 12:20","2/11/17 09:40"))
time <- strptime(df$start, format = "%m/%d/%y %H:%M")
df %>%
  group_by(id) %>%
  mutate(diff = time - lag(time),
         diff_mins = as.numeric(diff, units = 'mins'))
Gets me error:
Error in mutate_impl(.data, dots) :
Column diff must be length 3 (the group size) or one, not 6
In addition: Warning message:
In unclass(time1) - unclass(time2) :
longer object length is not a multiple of shorter object length
Do you mean something like this? The error arises because time is a free-standing vector of length 6, so inside the grouped mutate it no longer matches the group size of 3; computing the datetime inside the pipeline avoids that.
There is also no need for lag here: a simple diff on the grouped times is sufficient.
df %>%
  mutate(start = as.POSIXct(start, format = "%m/%d/%y %H:%M")) %>%
  group_by(id) %>%
  mutate(diff = c(0, diff(start)))
# A tibble: 6 x 3
# Groups:   id [2]
#     id start                diff
#  <dbl> <dttm>              <dbl>
#1    1. 2017-01-31 10:00:00    0.
#2    1. 2017-01-31 10:02:00    2.
#3    1. 2017-01-31 10:45:00   43.
#4    2. 2017-02-10 12:00:00    0.
#5    2. 2017-02-10 12:20:00   20.
#6    2. 2017-02-11 09:40:00 1280.
You can use lag and difftime (per Hadley):
df %>%
  mutate(time = as.POSIXct(start, format = "%m/%d/%y %H:%M")) %>%
  group_by(id) %>%
  mutate(diff = difftime(time, lag(time)))
# A tibble: 6 x 4
# Groups: id [2]
id start time diff
<dbl> <fct> <dttm> <time>
1 1. 1/31/17 10:00 2017-01-31 10:00:00 <NA>
2 1. 1/31/17 10:02 2017-01-31 10:02:00 2
3 1. 1/31/17 10:45 2017-01-31 10:45:00 43
4 2. 2/10/17 12:00 2017-02-10 12:00:00 <NA>
5 2. 2/10/17 12:20 2017-02-10 12:20:00 20
6 2. 2/11/17 09:40 2017-02-11 09:40:00 1280
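If you want the difference pinned to minutes regardless of magnitude, pass units to difftime() explicitly; a small variation on the answers above:

df %>%
  mutate(time = as.POSIXct(start, format = "%m/%d/%y %H:%M")) %>%
  group_by(id) %>%
  mutate(diff_mins = as.numeric(difftime(time, lag(time), units = "mins")))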

Consolidating rows by max and min dates

I have a dataset that looks like this.
id1 = c(1,1,1,1,1,1,1,1,2,2)
id2 = c(3,3,3,3,3,3,3,3,3,3)
lat = c(-62.81559,-62.82330, -62.78693,-62.70136, -62.76476,-62.48157,-62.49064,-62.45838,42.06258,42.06310)
lon = c(-61.15518, -61.14885,-61.17801,-61.00363, -59.14270, -59.22009, -59.32967, -59.04125 ,154.70579, 154.70625)
start_date= as.POSIXct(c('2016-03-24 15:30:00', '2016-03-24 15:30:00','2016-03-24 23:40:00','2016-03-25 12:50:00','2016-03-29 18:20:00','2016-06-01 02:40:00','2016-06-01 08:00:00','2016-06-01 16:30:00','2016-07-29 20:20:00','2016-07-29 20:20:00'), tz = 'UTC')
end_date = as.POSIXct(c('2016-03-24 23:40:00', '2016-03-24 18:50:00','2016-03-25 03:00:00','2016-03-25 19:20:00','2016-04-01 03:30:00','2016-06-02 01:40:00','2016-06-01 14:50:00','2016-06-02 01:40:00','2016-07-30 07:00:00','2016-07-30 07:00:00'),tz = 'UTC')
speed = c(2.9299398, 2.9437502, 0.0220565, 0.0798409, 1.2824859, 1.8685429, 3.7927680, 1.8549291, 0.8140249,0.8287073)
df = data.frame(id1, id2, lat, lon, start_date, end_date, speed)
id1 id2 lat lon start_date end_date speed
1 1 3 -62.81559 -61.15518 2016-03-24 15:30:00 2016-03-24 23:40:00 2.9299398
2 1 3 -62.82330 -61.14885 2016-03-24 15:30:00 2016-03-24 18:50:00 2.9437502
3 1 3 -62.78693 -61.17801 2016-03-24 23:40:00 2016-03-25 03:00:00 0.0220565
4 1 3 -62.70136 -61.00363 2016-03-25 12:50:00 2016-03-25 19:20:00 0.0798409
5 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
6 1 3 -62.48157 -59.22009 2016-06-01 02:40:00 2016-06-02 01:40:00 1.8685429
7 1 3 -62.49064 -59.32967 2016-06-01 08:00:00 2016-06-01 14:50:00 3.7927680
8 1 3 -62.45838 -59.04125 2016-06-01 16:30:00 2016-06-02 01:40:00 1.8549291
9 2 3 42.06258 154.70579 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8140249
10 2 3 42.06310 154.70625 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8287073
The actual dataset is larger. What I would like to do is consolidate this dataset based on date ranges, grouped by id1 and id2: if the date/time range of one row is within 12 hours of the next (ABS(end_date[1] - start_date[2]) < 12 hrs), the rows should be consolidated, with the new start_date being the earliest date and the end_date the latest. All other values (lat, lon, speed) are averaged. This is in some sense a 'deduping' effort, as rows within 12 hours of each other actually represent the same 'event'. For the above example the final result would be
id1 id2 lat lon start_date end_date speed
1 1 3 -62.7818 -61.12142 2016-03-24 15:30:00 2016-03-25 19:20:00 1.493897
2 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
3 1 3 -62.47686 -59.197 2016-06-01 02:40:00 2016-06-02 01:40:00 2.505413
4 2 3 42.06284 154.706 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8213661
With the first four rows consolidated (into row 1), the 5th row left alone (row 2), rows 6-8 consolidated (row 3), and rows 9-10 consolidated (row 4).
I have been trying to do this with dplyr group_by and summarize, but I can't seem to get the date ranges to come out correctly.
Hopefully someone can determine a simple means of solving the problem. Extra points if you know how to do it in SQL ;-) so I can dedupe before even pulling this into R.
Here is a first, very naive implementation. Warning: it is slow, not pretty, and still missing the start and end dates in the output! Note that it expects the rows to be ordered by date and time; if that's not the case in your data set, sort in R or SQL first. Sorry that I can't think of a dplyr or SQL solution myself; I'd also like to see those two if anyone has an idea.
dedupe <- function(df) {
  counter <- 1
  temp_vector <- unlist(df[1, ])
  summarized_df <- df[0, c(1, 2, 3, 4, 7)]
  colnames(summarized_df) <- colnames(df)[c(1, 2, 3, 4, 7)]
  summarized_df$counter <- numeric(0)  # initialise the counter column
  for (i in 2:nrow(df)) {
    if (((abs(difftime(df[i, "start_date"], df[i - 1, "end_date"], units = "hours")) < 12) ||
         abs(difftime(df[i, "start_date"], df[i - 1, "start_date"], units = "hours")) < 12) &&
        df[i, "id1"] == df[i - 1, "id1"] &&
        df[i, "id2"] == df[i - 1, "id2"]) {
      # group events because the ids are the same and the time ranges overlap:
      # sum up the value columns and keep the maximum end_date
      temp_vector[c(3, 4, 7)] <- temp_vector[c(3, 4, 7)] + unlist(df[i, c(3, 4, 7)])
      temp_vector["end_date"] <- max(temp_vector["end_date"], df[i, "end_date"])
      counter <- counter + 1
      if (i == nrow(df)) {
        # in the last iteration we need to close the current group
        summarized_df[nrow(summarized_df) + 1, c(1, 2)] <- df[i, c(1, 2)]
        summarized_df[nrow(summarized_df), 3:5] <- temp_vector[c(3, 4, 7)] / counter
        summarized_df[nrow(summarized_df), "counter"] <- counter
      }
    } else {
      # new event, so we write out the group statistics accumulated in temp_vector
      # and reset it along with the counter; the ids come from the previous row,
      # which belongs to the group being closed
      summarized_df[nrow(summarized_df) + 1, c(1, 2)] <- df[i - 1, c(1, 2)]
      summarized_df[nrow(summarized_df), 3:5] <- temp_vector[c(3, 4, 7)] / counter
      summarized_df[nrow(summarized_df), "counter"] <- counter
      counter <- 1
      temp_vector[c(3, 4, 7)] <- unlist(df[i, c(3, 4, 7)])
    }
  }
  return(summarized_df)
}
Function call
> dedupe(df)
   id1 id2       lat       lon     speed counter
5    1   3 -62.78179 -61.12142 1.4938968       4
6    1   3 -62.76476 -59.14270 1.2824859       1
9    1   3 -62.47686 -59.19700 2.5054133       3
10   2   3  42.06284 154.70602 0.8213661       2
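Since the answer above asks for a dplyr version, here is a hedged sketch of the same consolidation. It assumes an event keeps chaining whenever a row starts less than 12 hours after the latest end time seen so far in its group (hence the cummax), which reproduces the expected output for the sample data, start and end dates included:

library(dplyr)

df %>%
  group_by(id1, id2) %>%
  arrange(start_date, .by_group = TRUE) %>%
  # gap (in seconds) between this row's start and the latest end seen so far;
  # a gap of 12 hours or more starts a new event
  mutate(gap_secs = as.numeric(start_date) - lag(cummax(as.numeric(end_date)), default = -Inf),
         event = cumsum(gap_secs >= 12 * 3600)) %>%
  group_by(id1, id2, event) %>%
  summarise(lat = mean(lat), lon = mean(lon),
            start_date = min(start_date), end_date = max(end_date),
            speed = mean(speed), .groups = "drop")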
This can be easily achieved by using insurancerating::reduce():
df |>
  insurancerating::reduce(begin = start_date, end = end_date, id1, id2,
                          agg_cols = c(lat, lon, speed), agg = "mean",
                          min.gapwidth = 12 * 3600)
#> id1 id2 index end_date start_date lat lon
#> 1 1 3 0 2016-03-25 19:20:00 2016-03-24 15:30:00 -62.78180 -61.12142
#> 2 1 3 1 2016-04-01 03:30:00 2016-03-29 18:20:00 -62.76476 -59.14270
#> 3 1 3 2 2016-06-02 01:40:00 2016-06-01 02:40:00 -62.47686 -59.19700
#> 4 2 3 0 2016-07-30 07:00:00 2016-07-29 20:20:00 42.06284 154.70602
#> speed
#> 1 1.4938969
#> 2 1.2824859
#> 3 2.5054133
#> 4 0.8213661
Created on 2022-06-13 by the reprex package (v2.0.1)
