Let's say I have three variables: id, date, and trad (which has 3 values and can be any one of them at any time point):
library(tidyverse)
dput(df)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2), date = structure(c(16436, 16437, 16438, 16439,
16440, 16441, 16442, 16443, 16444, 16445, 16446, 16447, 16448,
16449, 16450, 16451, 16452, 16453, 16454), class = "Date"), trad = c("Free",
"Suspended", "Suspended", "Free", "Suspended", "Withdrawn", "Withdrawn",
"Free", "Withdrawn", "Free", "Free", "Withdrawn", "Suspended",
"Withdrawn", "Withdrawn", "Free", "Withdrawn", "Suspended", "Free"
)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-19L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), date = structure(list(format = "%d/%m/%Y"), class = c("collector_date",
"collector")), trad = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
df
# A tibble: 19 x 3
id date trad
<dbl> <date> <chr>
1 1 2015-01-01 Free
2 1 2015-01-02 Suspended
3 1 2015-01-03 Suspended
4 1 2015-01-04 Free
5 1 2015-01-05 Suspended
6 1 2015-01-06 Withdrawn
7 1 2015-01-07 Withdrawn
8 1 2015-01-08 Free
9 1 2015-01-09 Withdrawn
10 1 2015-01-10 Free
11 1 2015-01-11 Free
12 1 2015-01-12 Withdrawn
13 1 2015-01-13 Suspended
14 1 2015-01-14 Withdrawn
15 1 2015-01-15 Withdrawn
16 1 2015-01-16 Free
17 2 2015-01-17 Withdrawn
18 2 2015-01-18 Suspended
19 2 2015-01-19 Free
I would like to generate new columns with the start and end dates of each period. A period starts when trad moves to status "Withdrawn", with the caveat that if there is a "Suspended" status immediately before the "Withdrawn" row, the start date moves to that row. If there are multiple consecutive "Suspended" rows before the "Withdrawn", the period starts at the first "Suspended". Similarly, the end date is when trad goes to "Free" after being "Withdrawn". This is the required final dataset:
dfnew
# A tibble: 19 x 6
id date trad start end period
<dbl> <date> <chr> <date> <date> <dbl>
1 1 2015-01-01 Free NA NA NA
2 1 2015-01-02 Suspended NA NA NA
3 1 2015-01-03 Suspended NA NA NA
4 1 2015-01-04 Free NA NA NA
5 1 2015-01-05 Suspended 2015-01-05 NA 1
6 1 2015-01-06 Withdrawn NA NA 1
7 1 2015-01-07 Withdrawn NA NA 1
8 1 2015-01-08 Free NA 2015-01-08 1
9 1 2015-01-09 Withdrawn 2015-01-09 NA 2
10 1 2015-01-10 Free NA 2015-01-10 2
11 1 2015-01-11 Free NA NA NA
12 1 2015-01-12 Withdrawn 2015-01-12 NA 3
13 1 2015-01-13 Suspended NA NA 3
14 1 2015-01-14 Withdrawn NA NA 3
15 1 2015-01-15 Withdrawn NA NA 3
16 1 2015-01-16 Free NA 2015-01-16 NA
17 2 2015-01-17 Withdrawn 2015-01-17 NA 1
18 2 2015-01-18 Suspended NA NA 1
19 2 2015-01-19 Free NA 2015-01-19 1
There is no pattern in trad, so you could have any sequence of "Withdrawn"/"Suspended" before "Free". A solution like this doesn't work (in theory it could, but I would need too many conditions to implement it):
dfnew <- df %>%
  group_by(id) %>%
  mutate(start = ifelse(trad == "Withdrawn" & lag(trad) == "Free", date, NA))
These questions are helpful but don't answer the question:
How to extract the previous n rows where a certain column value cannot be a particular value?
R - Conditional lagging - How to lag a certain amount of cells until a condition is met?
Would anyone have a flexible solution?
Not very flexible, but at least a try.
I don't know what happens when we have the sequence Suspended, Suspended, Withdrawn, Withdrawn.
For example, change trad on 2015-01-04 to "Suspended": when is the start date in this case?
I give two solutions; the first puts the start date on 2015-01-02 and the second on 2015-01-05.
dfnew1 <- df %>%
  mutate(startGroups = cumsum(trad == "Free")) %>%
  group_by(startGroups) %>% # make a group from every occurrence of "Free" in trad
  mutate(wds = cumsum(trad == "Withdrawn"),
         start = ifelse(max(wds) > 0 & row_number() == 2, date, NA) # if there is any "Withdrawn" in the group, set the start date right after "Free"
  ) %>%
  ungroup() %>%
  mutate(endGroups = cumsum(!is.na(start))) %>%
  group_by(endGroups) %>% # group on every open trade now
  mutate(frees = cumsum(trad == "Free"),
         end = ifelse(frees == 1 & endGroups > 0, date, NA) # end on first occurrence of "Free" in trad
  ) # %>% select(-startGroups, -wds, -endGroups, -frees) # remove helper cols
dfnew2 <- df %>%
  mutate(startGroups = cumsum(trad == "Free")) %>%
  group_by(startGroups) %>% # make a group from every occurrence of "Free" in trad
  mutate(wds = cumsum(trad == "Withdrawn"),
         start = ifelse(
           (trad == "Suspended" & lead(trad) == "Withdrawn" & lead(wds) == 1) |
             (trad == "Withdrawn" & lag(trad) != "Suspended" & wds == 1),
           date, NA) # first qualifying row in the group
  ) %>%
  ungroup() %>%
  mutate(endGroups = cumsum(!is.na(start))) %>%
  group_by(endGroups) %>%
  mutate(frees = cumsum(trad == "Free"),
         end = ifelse(frees == 1 & endGroups > 0, date, NA)
  ) # %>% select(-startGroups, -wds, -endGroups, -frees)
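For intuition, the grouping trick both solutions rely on is cumsum over a logical: every TRUE opens a new group. A toy illustration (made-up vector, not the question's data):
trad <- c("Free", "Suspended", "Withdrawn", "Free", "Withdrawn")
cumsum(trad == "Free")
#> [1] 1 1 1 2 2  # a new group starts at every "Free"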
Related
(daily return percentage) / 100 = (today's close - yesterday's close) / yesterday's close
I have a data frame like this,
date close
1 2018-09-21 3410.486
2 2018-09-22 3310.126
3 2018-09-23 3312.482
4 2018-09-24 3269.432
5 2018-09-25 3204.922
I'd like to calculate daily returns and make it like this,
date close change
1 2018-09-21 3410.486 3.03%
2 2018-09-22 3310.126 -0.07%
3 2018-09-23 3312.482 1.32%
4 2018-09-24 3269.432 2.01%
5 2018-09-25 3321.825 NA
library(tidyverse)
library(tidyquant)
df %>%
tq_mutate(select = close,
mutate_fun = periodReturn,
period = "daily",
col_rename = "return")
# A tibble: 5 x 3
date close return
<date> <dbl> <dbl>
1 2018-09-21 3410. 0
2 2018-09-22 3310. -0.0294
3 2018-09-23 3312. 0.000712
4 2018-09-24 3269. -0.0130
5 2018-09-25 3205. -0.0197
Just using dplyr.
df1 %>%
mutate(change = (close - lag(close)) / lag(close))
date close change
1 2018-09-21 3410.486 NA
2 2018-09-22 3310.126 -0.0294268911
3 2018-09-23 3312.482 0.0007117554
4 2018-09-24 3269.432 -0.0129962970
5 2018-09-25 3204.922 -0.0197312561
data:
df1 <- structure(list(date = structure(c(17795, 17796, 17797, 17798,
17799), class = "Date"), close = c(3410.486, 3310.126, 3312.482,
3269.432, 3204.922), change = structure(c(0, 0, 0, 0, 0), tsp = c(0,
4, 1))), row.names = c(NA, -5L), class = "data.frame")
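If you also want the change formatted as percentages like in the desired output, one option is scales::percent (a sketch, not part of the original answers):
library(dplyr)
library(scales)
df1 %>%
  mutate(change = percent((close - lag(close)) / lag(close), accuracy = 0.01))
# percent() formats e.g. -0.0294 as "-2.94%"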
Here is example data:
df = data.frame(id = (1:5),
                type = c("a_type", "a_type", "b_type", "b_type", "c_type"),
                start_date = lubridate::dmy(c("01/01/2014", "30/04/2014", "30/04/2015",
                                              "10/05/2015", "30/03/2016")),
                fail_date = lubridate::dmy(c("30/04/2015", rep(NA, 4))))
> df
id type start_date fail_date
1  1 a_type 2014-01-01 2015-04-30
2  2 a_type 2014-04-30       <NA>
3  3 b_type 2015-04-30       <NA>
4  4 b_type 2015-05-10       <NA>
5  5 c_type 2016-03-30       <NA>
I want to fill in the fail_date of "a_type" rows where it is NA. This needs to be the start_date of the next entry whose type is "c_type" or "a_type". So the output should be:
> df1
id type start_date fail_date
1 1 a_type 2014-01-01 2015-04-30
2 2 a_type 2014-04-30 2016-03-30
3 3 b_type 2015-04-30 <NA>
4 4 b_type 2015-05-10 <NA>
5 5 c_type 2016-03-30 <NA>
I have started with this code, but don't know how to specify the start_date of the next entry of type "c_type" or "a_type":
df1= df%>%
mutate(fail_date = case_when(is.na(fail_date) & type =="a_type" ~ ??? ,
TRUE ~ fail_date))
I do not want to just filter out the "b_type" rows.
#######Edit##############
Following the answer given by @Allan Cameron I have tried this on my data but have come across an issue.
If the last type is "b_type", the code won't work, as cumsum(rev(df$type) != 'b_type') starts with a 0. Here is an example with amended data:
df = data.frame(id = (1:6),
                type = c("a_type", "a_type", "b_type", "b_type", "c_type", "b_type"),
                start_date = lubridate::dmy(c("01/12/2013", "01/01/2014", "30/04/2014",
                                              "30/04/2015", "10/05/2015", "30/03/2016")),
                fail_date = rep(NA, 6))
df1= df%>%
arrange(start_date) %>%
mutate(next_start = lead(rev(rev(start_date[type != 'b_type'])[
cumsum(rev(type) != 'b_type')])),
fail_date = if_else(type == 'a_type' &
is.na(fail_date) ,
next_start,
lubridate::dmy(fail_date)))
Error: Problem with `mutate()` column `next_start`.
ℹ `next_start = lead(...)`.
ℹ `next_start` must be size 6 or 1, not 5.
Here are some of the individual elements which have helped me to understand the error, but I don't know how to overcome it:
df1= df%>%
mutate( rev = rev(type),
qa = rev(type) != 'b_type',
qw= cumsum(rev(type)!='b_type')
)
df1
id type start_date fail_date rev qa qw
1 1 a_type 2013-12-01 NA b_type FALSE 0
2 2 a_type 2014-01-01 NA c_type TRUE 1
3 3 b_type 2014-04-30 NA b_type FALSE 1
4 4 b_type 2015-04-30 NA b_type FALSE 1
5 5 c_type 2015-05-10 NA a_type TRUE 2
6 6 b_type 2016-03-30 NA a_type TRUE 3
Because the first entry in qw is 0, rev(rev(start_date[type != 'b_type'])[cumsum(rev(type) != 'b_type')]) produces a vector that is one element too short.
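The underlying reason is how R treats a 0 subscript: it is silently dropped, so the subscript vector yields one element fewer than expected. A one-line illustration:
c(10, 20, 30)[c(0, 1, 2)]
#> [1] 10 20  # the 0 is dropped, so the result has length 2, not 3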
This is tricky. You can do it all within the pipe, but the hard part is back-propagating the next available start date, which requires a combination of lead, rev and cumsum to create a temporary column. Choosing whether to insert a value from the temporary column then just comes down to specifying your logic inside an if_else.
df %>%
arrange(start_date) %>%
mutate(next_start = lead(rev(rev(start_date[type != 'b_type'])[
cumsum(rev(type) != 'b_type')])),
fail_date = if_else(type == 'a_type' &
is.na(fail_date) &
lead(type, default = last(type)) != 'a_type',
next_start,
fail_date)) %>%
select(-next_start)
#> id type start_date fail_date
#> 1 1 a_type 2014-01-01 2015-04-30
#> 2 2 a_type 2014-04-30 2016-03-30
#> 3 3 b_type 2015-04-30 <NA>
#> 4 4 b_type 2015-05-10 <NA>
#> 5 5 c_type 2016-03-30 <NA>
If you have multiple types (rather than just 3) you might need to change the instances of type != 'b_type' to type %in% allowed_types, where allowed_types is a pre-defined vector, as shown below:
allowed_types <- c('a_type', 'c_type')
df %>%
arrange(start_date) %>%
mutate(next_start = lead(rev(rev(start_date[type %in% allowed_types])[
cumsum(rev(type %in% allowed_types))])),
fail_date = if_else(type == 'a_type' &
is.na(fail_date) &
lead(type, default = last(type)) != 'a_type',
next_start,
fail_date)) %>%
select(-next_start)
This generates the same result on your example data but is more generalizable.
I have a dataframe like this in R:
Start date   End date     Date 1       Date 2       Date 3       Date 4
11/12/2018   29/11/2019   08/03/2021   NA           NA           NA
07/03/2018   24/04/2019   08/03/2021   12/09/2016   NA           NA
04/06/2018   23/04/2019   08/03/2021   02/10/2017   05/10/2018   NA
26/07/2018   29/08/2019   08/03/2021   03/08/2015   02/10/2017   23/01/2017
I want to create a new column in R that returns 1 if Date 1, Date 2, Date 3 or Date 4 is between Start date and End date, and 0 otherwise, as in the table below:
Start date   End date     Date 1       Date 2       Date 3       Date 4       Change
11/12/2018   29/11/2019   08/03/2021   NA           NA           NA           0
07/03/2018   24/04/2019   08/03/2021   12/09/2016   NA           NA           0
04/06/2018   23/04/2019   08/03/2021   02/10/2017   05/10/2018   NA           1
26/07/2018   29/08/2019   08/03/2021   03/08/2015   02/10/2017   23/01/2017   0
Does anyone have a suggestion on how to solve this? Thank you :)
It'll make it much easier for people to help you if you post code/data which we can run directly. The easiest way to do this is a handy R function called dput, which generates instructions to exactly recreate any R object. You might run dput(MY_DATA), or if your data is much larger than needed to demonstrate your question, dput(head(MY_DATA)) to get the first six rows, and paste the output into your question.
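For instance, on a toy data frame (not the question's data):
dput(data.frame(x = 1:2, y = c("a", "b")))
#> structure(list(x = 1:2, y = c("a", "b")), class = "data.frame", row.names = c(NA, -2L))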
Here's code to generate your example data:
my_data <- data.frame(
stringsAsFactors = FALSE,
Start.date = c("11/12/2018", "07/03/2018", "04/06/2018", "26/07/2018"),
End.date = c("29/11/2019", "24/04/2019", "23/04/2019", "29/08/2019"),
Date.1 = c("08/03/2021", "08/03/2021", "08/03/2021", "08/03/2021"),
Date.2 = c(NA, "12/09/2016", "02/10/2017", "03/08/2015"),
Date.3 = c(NA, NA, "05/10/2018", "02/10/2017"),
Date.4 = c(NA, NA, NA, "23/01/2017")
)
Here's a tidyverse approach: first convert your day/month/year dates to R's Date type using lubridate::dmy, then compare each of Date.1 through Date.4 against your start and end dates, and finally check whether any comparison produced a 1 (a date within range).
library(dplyr); library(lubridate)
my_data %>%
mutate(across(.fns = ~dmy(.x))) %>%
mutate(across(.cols = starts_with("Date"),
.fns = ~coalesce(.x >= Start.date & .x <= End.date, FALSE)*1)) %>%
mutate(Change = pmax(Date.1, Date.2, Date.3, Date.4))
coalesce(..., FALSE) is used here to treat NA like FALSE.
(...)*1 converts TRUE/FALSE to 1/0.
pmax(...) grabs the largest of the 1/0's, i.e. "are there any 1's?" All three pieces are illustrated in the sketch below.
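A minimal illustration of those three pieces on toy vectors (made-up values):
library(dplyr)
coalesce(c(TRUE, NA, FALSE), FALSE) * 1  # 1 0 0: NA treated as FALSE, then TRUE/FALSE -> 1/0
pmax(c(0, 1, 0), c(1, 0, 0))             # 1 1 0: elementwise max, i.e. "any 1 across columns?"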
Edit: alternative to leave Date columns intact:
my_data %>%
mutate(across(.fns = ~dmy(.x))) %>%
mutate(across(.cols = starts_with("Date"),
.names = "Check_{.col}",
.fns = ~coalesce(.x >= Start.date & .x <= End.date, FALSE)*1)) %>%
rowwise() %>%
mutate(Change = max(c_across(starts_with("Check")))) %>%
select(-starts_with("Check"))
Start.date End.date Date.1 Date.2 Date.3 Date.4 Change
<date> <date> <date> <date> <date> <date> <dbl>
1 2018-12-11 2019-11-29 2021-03-08 NA NA NA 0
2 2018-03-07 2019-04-24 2021-03-08 2016-09-12 NA NA 0
3 2018-06-04 2019-04-23 2021-03-08 2017-10-02 2018-10-05 NA 1
4 2018-07-26 2019-08-29 2021-03-08 2015-08-03 2017-10-02 2017-01-23 0
library(tidyverse)
library(lubridate)
df <- read.table(textConnection("start_date;end_date;date_1;date_2;date_3;date_4
11/12/2018;29/11/2019;08/03/2021;NA;NA;NA
07/03/2018;24/04/2019;08/03/2021;12/09/2016;NA;NA
04/06/2018;23/04/2019;08/03/2021;02/10/2017;05/10/2018;NA
26/07/2018;29/08/2019;08/03/2021;03/08/2015;02/10/2017;23/01/2017"),
sep=";",
header = TRUE)
df %>%
  mutate(
    across(everything(), lubridate::dmy),
    change = ((date_1 > start_date & date_1 < end_date) |
                (date_2 > start_date & date_2 < end_date) |
                (date_3 > start_date & date_3 < end_date) |
                (date_4 > start_date & date_4 < end_date)  # date_4 was missing from the original chain
    ) %>%
      coalesce(FALSE) %>%
      as.integer()
  )
#> start_date end_date date_1 date_2 date_3 date_4 change
#> 1 2018-12-11 2019-11-29 2021-03-08 <NA> <NA> <NA> 0
#> 2 2018-03-07 2019-04-24 2021-03-08 2016-09-12 <NA> <NA> 0
#> 3 2018-06-04 2019-04-23 2021-03-08 2017-10-02 2018-10-05 <NA> 1
#> 4 2018-07-26 2019-08-29 2021-03-08 2015-08-03 2017-10-02 2017-01-23 0
Below is an example of a table I am working with.
df = data.frame(Test_ID = c('a1','a1','a1','a1','a1','a1','a1','a2','a2','a2','a2','a2','a2'),
Event_ID = c('Failure_x', 'Failure_x', 'Failure_y', 'Failure_y', 'Failure_x',
'Failure_x', 'Failure_y', 'Failure_x', 'Failure_y', 'Failure_y',
'Failure_x','Failure_x', 'Failure_y'),
Fail_Date = c('2018-10-10 17:52:20', '2018-10-11 17:02:16', '2018-10-14 12:52:20',
'2018-11-11 16:18:34', '2018-11-12 17:03:06', '2018-11-25 10:50:10',
'2018-12-01 10:28:50', '2018-09-12 19:02:08', '2018-09-20 11:32:25',
'2018-10-13 14:43:30', '2018-10-15 14:22:28', '2018-10-30 21:55:45',
'2018-11-17 11:53:35'))
I want to compute the time between failure dates (by Test_ID) only where Failure_y occurs after Failure_x: the Fail_Date of the Failure_x is subtracted from the Fail_Date of the following Failure_y. Within a group I can have multiple Failure_y's; the second Failure_y is paired with the Failure_x occurring after the first instance of Failure_y.
I have tried to use dplyr to create a column TIME_BETWEEN_FAILURES.
library(dplyr)
library(lubridate)
df$Fail_Date = as.POSIXct(as.character(df$Fail_Date), format = "%Y-%m-%d %H:%M:%S")
df = df %>% group_by(Test_ID) %>%
  mutate(TIME_BETWEEN_FAILURES = ifelse(Event_ID == "Failure_y" & lag(Event_ID) == "Failure_x",
                                        difftime(Fail_Date, first(Fail_Date), units = "hours"), ''))
I was able to create TIME_BETWEEN_FAILURES correctly only for the first instance, using first() in dplyr. That's where I am currently stuck. Any help on this matter will be appreciated.
[Screenshot: result from the code snippet above]
[Screenshot: ideal output required for my analysis]
Thanks.
Cheers.
df %>%
  group_by(gr = rev(cumsum(rev(Event_ID) == "Failure_y")), Test_ID) %>%
  mutate(time_between_failures = ifelse(n() > 1 & Event_ID == "Failure_y",
                                        difftime(Fail_Date[n()], Fail_Date[1L], units = "hours"), NA))
# A tibble: 13 x 5
# Groups: gr, Test_ID [6]
Test_ID Event_ID Fail_Date gr time_between_failures
<fct> <fct> <dttm> <int> <dbl>
1 a1 Failure_x 2018-10-10 17:52:20 6 NA
2 a1 Failure_x 2018-10-11 17:02:16 6 NA
3 a1 Failure_y 2018-10-14 12:52:20 6 91
4 a1 Failure_y 2018-11-11 16:18:34 5 NA
5 a1 Failure_x 2018-11-12 17:03:06 4 NA
6 a1 Failure_x 2018-11-25 10:50:10 4 NA
7 a1 Failure_y 2018-12-01 10:28:50 4 449.
8 a2 Failure_x 2018-09-12 19:02:08 3 NA
9 a2 Failure_y 2018-09-20 11:32:25 3 185.
10 a2 Failure_y 2018-10-13 14:43:30 2 NA
11 a2 Failure_x 2018-10-15 14:22:28 1 NA
12 a2 Failure_x 2018-10-30 21:55:45 1 NA
13 a2 Failure_y 2018-11-17 11:53:35 1 790.
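For intuition, the grouping trick gr = rev(cumsum(rev(Event_ID) == "Failure_y")) assigns each row to the group of the next "Failure_y" at or below it; a toy illustration (made-up vector):
ev <- c("x", "x", "y", "y", "x", "y")
rev(cumsum(rev(ev) == "y"))
#> [1] 3 3 3 2 1 1  # each row shares a group id with the next "y"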
I have a cohort of data with multiple visits per person and want to group visits under a common ID based on person # and the time of the visit. The condition is: if an admission starts within 24 hours of the previous discharge, those visits should share the same ID.
A sample of what the data looks like:
dat <- data.frame(
  Person_ID = c(1, 1, 1, 2, 3, 3, 3, 4, 4),
  Admit_Date_Time = as.POSIXct(c("2017-02-07 15:26:00", "2017-04-21 10:20:00",
                                 "2017-04-22 12:12:00", "2017-10-16 01:31:00",
                                 "2017-01-24 02:41:00", "2017-01-24 05:31:00",
                                 "2017-01-28 04:26:00", "2017-12-01 01:31:00",
                                 "2017-12-01 01:31:00"), format = "%Y-%m-%d %H:%M"),
  Discharge_Date_Time = as.POSIXct(c("2017-03-01 11:42:00", "2017-04-22 05:56:00",
                                     "2017-04-26 21:01:00", "2017-10-18 20:11:00",
                                     "2017-01-27 22:15:00", "2017-01-26 15:35:00",
                                     "2017-01-28 09:25:00", "2017-12-05 18:33:00",
                                     "2017-12-04 16:41:00"), format = "%Y-%m-%d %H:%M"),
  Visit_ID = c(1:9))
This is what I tried to start with:
dat1 <- dat %>%
  arrange(Person_ID, Admit_Date_Time) %>%
  group_by(Person_ID) %>%
  mutate(Previous_Visit_Interval = difftime(lag(Discharge_Date_Time, 1),
                                            Admit_Date_Time, units = "hours")) %>%
  mutate(start = c(1, Previous_Visit_Interval[-1] < hours(-24)),
         run = cumsum(start))
dat1$ID = as.numeric(as.factor(paste0(dat1$Person_ID, dat1$run)))
Which is almost right, except it does not give the correct ID for visit 7 (person #3): of that person's three visits, the second falls entirely within the first, and the third starts within 24 hours of the first but not of the second.
There's probably a way to shorten this, but here's an approach using tidyr::gather and spread. By gathering into long format, we can track the cumulative admissions inside each visit. A new visit is recorded whenever there's a new Person_ID or that Person_ID completed a visit (cumulative admissions went to zero) at least 24 hours prior.
library(tidyr)
library(lubridate) # needed for ddays() below
dat1 <- dat %>%
# Gather into long format with event type in one column, timestamp in another
gather(event, time, Admit_Date_Time:Discharge_Date_Time) %>%
# I want discharges to have an effect up to 24 hours later. Sort using that.
mutate(time_adj = if_else(event == "Discharge_Date_Time",
time + ddays(1),
time)) %>%
arrange(Person_ID, time_adj) %>%
# For each Person_ID, track cumulative admissions. 0 means a visit has completed.
# (b/c we sorted by time_adj, these reflect the 24hr period after discharges.)
group_by(Person_ID) %>%
mutate(admissions = if_else(event == "Admit_Date_Time", 1, -1)) %>%
mutate(admissions_count = cumsum(admissions)) %>%
ungroup() %>%
# Record a new Hosp_ID when either (a) a new Person, or (b) preceded by a
# completed visit (ie admissions_count was zero).
mutate(Hosp_ID_chg = 1 *
(Person_ID != lag(Person_ID, default = 1) | # (a)
lag(admissions_count, default = 1) == 0), # (b)
Hosp_ID = cumsum(Hosp_ID_chg)) %>%
# Spread back into original format
select(-time_adj, -admissions, -admissions_count, -Hosp_ID_chg) %>%
spread(event, time)
Results
> dat1
# A tibble: 9 x 5
Person_ID Visit_ID Hosp_ID Admit_Date_Time Discharge_Date_Time
<dbl> <int> <dbl> <dttm> <dttm>
1 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
2 1 2 2 2017-04-21 10:20:00 2017-04-22 05:56:00
3 1 3 2 2017-04-22 12:12:00 2017-04-26 21:01:00
4 2 4 3 2017-10-16 01:31:00 2017-10-18 20:11:00
5 3 5 4 2017-01-24 02:41:00 2017-01-27 22:15:00
6 3 6 4 2017-01-24 05:31:00 2017-01-26 15:35:00
7 3 7 4 2017-01-28 04:26:00 2017-01-28 09:25:00
8 4 8 5 2017-12-01 01:31:00 2017-12-05 18:33:00
9 4 9 5 2017-12-01 01:31:00 2017-12-04 16:41:00
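The heart of this is the running admissions counter; on a toy event sequence (made-up values, not the real data):
events <- c(1, 1, -1, -1, 1, -1)  # admit = +1, discharge = -1
cumsum(events)
#> [1] 1 2 1 0 1 0  # zeros mark moments when all open visits have completed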
Here's a data.table approach using an overlap-join
library( data.table )
library( lubridate )
setDT( dat )
setorder( dat, Person_ID, Admit_Date_Time )
#create a 1-day extension after each discharge
dt2 <- dat[, discharge_24h := Discharge_Date_Time %m+% days(1)][]
#now create id
setkey( dat, Admit_Date_Time, discharge_24h )
#create data-table with overlap-join, create groups based on overlapping ranges
dt2 <- setorder(
foverlaps( dat,
dat,
mult = "first",
type = "any",
nomatch = 0L
),
Visit_ID )[, list( Visit_ID = i.Visit_ID,
Hosp_ID = .GRP ),
by = .( Visit_ID )][, Visit_ID := NULL]
#reorder the result
setorder( dt2[ dat, on = "Visit_ID" ][, discharge_24h := NULL], Visit_ID )[]
# Visit_ID Hosp_ID Person_ID Admit_Date_Time Discharge_Date_Time
# 1: 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
# 2: 2 2 1 2017-04-21 10:20:00 2017-04-22 05:56:00
# 3: 3 2 1 2017-04-22 12:12:00 2017-04-26 21:01:00
# 4: 4 3 2 2017-10-16 01:31:00 2017-10-18 20:11:00
# 5: 5 4 3 2017-01-24 02:41:00 2017-01-27 22:15:00
# 6: 6 4 3 2017-01-24 05:31:00 2017-01-26 15:35:00
# 7: 7 4 3 2017-01-28 04:26:00 2017-01-28 09:25:00
# 8: 8 5 4 2017-12-01 01:31:00 2017-12-05 18:33:00
# 9: 9 5 4 2017-12-01 01:31:00 2017-12-04 16:41:00
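As a minimal sketch of what foverlaps does here (toy intervals, hypothetical values): each row is matched to the first overlapping interval in the keyed table, which is what lets overlapping stays collapse into one group.
library(data.table)
y <- data.table(start = c(1L, 4L, 10L), end = c(5L, 6L, 12L))
setkey(y, start, end)  # foverlaps requires the join table keyed on the interval columns
foverlaps(y, y, type = "any", mult = "first")
#> [1,5] and [4,6] both match [1,5] first; [10,12] matches itself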