Related
I have dataset where length of stay of booking going in two or three month i want to create a row for every such bookings where revenue will be divided for every month and remaining information about the booking will remain same. if a booking length is in same month then it will show that as it is.
structure(list(channel = c("109", "109", "Agent"), room_stay_status = c("ENQUIRY",
"ENQUIRY", "CHECKED_OUT"), start_date = structure(c(1637971200,
1640995200, 1640995200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), end_date = structure(c(1643155200, 1642636800, 1641168000
), tzone = "UTC", class = c("POSIXct", "POSIXt")), los = c(60,
19, 2), booker = c("Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa",
"Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b"), Revenue = c(90223.666,
5979, 7015.9), Booking_ref = c("aed97", "b497h9", "bde65")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
output should look like this
structure(list(channel = c("109", "109", "109", "109", "Agent"
), room_stay_status = c("ENQUIRY", "ENQUIRY", "ENQUIRY", "ENQUIRY",
"CHECKED_OUT"), start_date = structure(c(1637971200, 1638316800,
1640995200, 1640995200, 1640995200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), end_date = structure(c(1638230400, 1640908800, 1643155200,
1642636800, 1641168000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), los = c(4, 31, 25, 19, 2), booker = c("Anuj", "Anuj",
"Anuj", "Anuj", "Anuj"), area = c("Goa", "Goa", "Goa", "Goa",
"Goa"), property_sku = c("Amna-3b", "Amna-3b", "Amna-3b", "Amna-3b",
"Amna-3b"), Revenue = c(6014.91106666667, 46615.5607666667, 37593.1941666667,
5979, 7015.9), Booking_ref = c("aed97", "aed97", "aed97", "b497h9",
"bde65")), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
Many thanks in advance.
An quick attempt here (assuming your data is named df_in and df_out) which seems to do the trick:
library("dplyr")
library("tidyr")
library("lubridate")
# Function for creating a vector from start (st) to end (nd) with intermediate
# months inside
cut_months <- function(st, nd) {
repeat {
# Grow vector, keep adding next month
next_month <- ceiling_date(tail(st, 1) + seconds(1), "month")
if (next_month > nd) {
st <- append(st, nd)
break
} else {
st <- append(st, next_month)
}
}
return(st)
}
# Let's try it
print(cut_months(df_in$start_date[1], df_in$end_date[2]))
# [1] "2021-11-27 01:00:00 CET" "2021-12-01 01:00:00 CET" "2022-01-01 00:00:00 CET" "2022-01-20 01:00:00 CET"
# Function for expanding months:
expand_months <- function(df) {
expand_rows <-
df %>%
# Expand months and unnest list-column
mutate(key_dates = mapply(cut_months, start_date, end_date)) %>%
select(-start_date, -end_date) %>%
unnest(key_dates) %>%
# Compute needed values
group_by(Booking_ref) %>%
arrange(Booking_ref, key_dates) %>%
mutate(
start_date = key_dates,
end_date = lead(key_dates),
los = as.numeric(as.duration(start_date %--% end_date), "days"), # Ceiling this?
Revenue = Revenue * los / sum(los, na.rm = TRUE)
) %>%
arrange(Booking_ref, start_date) %>%
# Clean-up
filter(!is.na(end_date)) %>%
select(-key_dates)
expand_rows
}
# Print results and compare:
expand_months(df_in)
## A tibble: 5 x 10
## Groups: Booking_ref [3]
#channel room_stay_status los booker area property_~1 Revenue Booki~2 start_date end_date
#<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <dttm> <dttm>
#1 109 ENQUIRY 4 Anuj Goa Amna-3b 6015. aed97 2021-11-27 01:00:00 2021-12-01 01:00:00
#2 109 ENQUIRY 31.0 Anuj Goa Amna-3b 46553. aed97 2021-12-01 01:00:00 2022-01-01 00:00:00
#3 109 ENQUIRY 25.0 Anuj Goa Amna-3b 37656. aed97 2022-01-01 00:00:00 2022-01-26 01:00:00
#4 109 ENQUIRY 19 Anuj Goa Amna-3b 5979 b497h9 2022-01-01 01:00:00 2022-01-20 01:00:00
#5 Agent CHECKED_OUT 2 Anuj Goa Amna-3b 7016. bde65 2022-01-01 01:00:00 2022-01-03 01:00:00
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref
df_out
## A tibble: 5 x 10
#channel room_stay_status start_date end_date los booker area property_~1 Revenue Booki~2
#<chr> <chr> <dttm> <dttm> <dbl> <chr> <chr> <chr> <dbl> <chr>
#1 109 ENQUIRY 2021-11-27 00:00:00 2021-11-30 00:00:00 4 Anuj Goa Amna-3b 6015. aed97
#2 109 ENQUIRY 2021-12-01 00:00:00 2021-12-31 00:00:00 31 Anuj Goa Amna-3b 46616. aed97
#3 109 ENQUIRY 2022-01-01 00:00:00 2022-01-26 00:00:00 25 Anuj Goa Amna-3b 37593. aed97
#4 109 ENQUIRY 2022-01-01 00:00:00 2022-01-20 00:00:00 19 Anuj Goa Amna-3b 5979 b497h9
#5 Agent CHECKED_OUT 2022-01-01 00:00:00 2022-01-03 00:00:00 2 Anuj Goa Amna-3b 7016. bde65
## ... with abbreviated variable names 1: property_sku, 2: Booking_ref
I do not understand entirely how you distribute the Revenue. Consider that left as an exercise to fix :).
Hint: you need a ceiling() around the computation of the new los which computes decimal days.
Using solution from this post to split date:
df2 <- df %>%
group_by(id = row_number()) %>% # for each row
mutate(seq = list(seq(start_date, end_date, "day")), # create a sequence of dates with 1 day step
month = map(seq, month)) %>% # get the month for each one of those dates in sequence
unnest() %>% # unnest data
group_by(Booking_ref, id, month) %>% # for each group, row and month
summarise(start_date = min(seq), # get minimum date as start
end_date = max(seq)) %>% # get maximum date as end
ungroup() %>% # ungroup
select(-id, - month)%>%
group_by(Booking_ref)%>%
mutate(last_date=max(end_date)) # get last_date to determine los
df3 <- merge(df2,df%>%select(-start_date,-end_date),by=c('Booking_ref'),all.x=T)%>%
mutate(timespam=end_date-start_date)%>%
mutate(los2=as.numeric(case_when(last_date==end_date~timespam,
T~timespam+1)),
Revenue2=Revenue*los2/los)
out_df <- df3%>%
select(-Revenue,-los,-timespam,-last_date)%>%
rename(Revenue=Revenue2,
los=los2)
> out_df
Booking_ref start_date end_date channel room_stay_status booker area property_sku los Revenue
1 aed97 2022-01-01 2022-01-26 109 ENQUIRY Anuj Goa Amna-3b 25 37593.194
2 aed97 2021-11-27 2021-11-30 109 ENQUIRY Anuj Goa Amna-3b 4 6014.911
3 aed97 2021-12-01 2021-12-31 109 ENQUIRY Anuj Goa Amna-3b 31 46615.561
4 b497h9 2022-01-01 2022-01-20 109 ENQUIRY Anuj Goa Amna-3b 19 5979.000
5 bde65 2022-01-01 2022-01-03 Agent CHECKED_OUT Anuj Goa Amna-3b 2 7015.900
I am trying to get the time difference between elements of an array a sample of the data is below and the image at the bottom describes the problem I am trying to solve. I have a dataframe column events where each value is an array of date and time entries that correspond to events and other columns which partition time into a before, evaluation and after period. I would like to calculate the statistics on the time between events.
** Update **
Using the excellent answer by danlooo below which gives me almost exactly what I need if I
add the four boundary events corresponding to before_eval_begin, eval_month, after_eval_end to the event array
duration is calculated for consecutive events
the before and after case_when statement is tweaked
the following code appears to work:
duration <-
data %>% mutate(across(before_event_eval:after_eval_end,as.character)) %>%
as_tibble() %>%
mutate(
events = events %>% str_remove_all("[\\[\\]\\\"]")
) %>%
mutate( events = ifelse(events == "",events,paste0(events,",",
before_event_eval,",",as.character(as.Date(eval_month)-days(1)),
",",as.character(ceiling_date(as.Date('2021-02-01'),"month")),
",",after_eval_end))) %>%
separate_rows(events, sep = ",") %>%
rename(event = events) %>%
filter(event != "") %>%
mutate(across(before_event_eval:after_eval_end,parse_datetime)) %>%
mutate(
event = event %>% parse_datetime(),
position = case_when(
event >= before_event_eval &
event < eval_month ~ "before",
event <= after_eval_end &
event > eval_month ~ "after"
)
) %>%
arrange(id,event) %>% group_by(id) %>%
mutate(duration = as.numeric(event - lag(event))) %>%
group_by(id,position) %>%
summarise(time_until_first = first(duration[!is.na(duration)]),
timebetween_last = last(duration[!is.na(duration)]),
min_duration = min(duration,na.rm=TRUE),
avg_duration = mean(duration,na.rm=TRUE),
max_duration = max(duration,na.rm=TRUE))
I think a general strategy would be as follows but I am not sure how to proceed after step 1 and perform computations on the cleaned array:
remove brackets and parenthesis from string
create ordered vector of events
Determine if event falls before or after eval month:
Before: event is >= before_eval_begin and < eval_month
After: event is > eval_month and <= after_eval_end
Determine time between events for each period (before, after) including times
relative to before_eval_begin, eval_month, after_eval_end
Return the below statistics:
If events is missing then all the values below should be set to 185
• Time to first event in pre period
• Time between last event in pre period and end of pre period
• Average time between events for pre period
• Minimum of time between events in pre period
• Maximum of time between events in pre period
• Time to first event in post period
• Time between last event in post period and end of post period
• Minimum of time between events in post period
• Maximum of time between events in post period
*Edit: removed duplicate events and added id column
Data
structure(list(id = c(1, 2, 3, 4), before_event_eval = structure(c(1596240000,
1596240000, 1604188800, 1604188800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), eval_month = structure(c(1612137600, 1612137600,
1619827200, 1619827200), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
after_eval_end = structure(c(1627776000, 1627776000, 1635724800,
1635724800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
events = c("[\"2021-01-28 13:25:32\",\"2021-01-28 18:25:32\"]",
"[\"2021-04-30 18:25:32\",\"2021-01-15 11:25:32\",\"2021-01-30 18:25:32\",\"2021-03-30 18:25:32\",\"2021-01-27 11:25:32\",\"2021-01-30 18:26:32\"]",
"[]", "[\"2021-04-28 13:25:32\",\"2021-05-28 10:25:32\"]"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-4L))
Picture of Problem
Something like this?
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
data <- structure(list(
before_event_eval = structure(c(
1596240000, 1596240000,
1604188800, 1604188800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
eval_month = structure(c(
1612137600, 1612137600, 1619827200,
1619827200
), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
after_eval_end = structure(c(
1627776000, 1627776000, 1635724800,
1635724800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
events = c(
"[\"2021-01-28 13:25:32\",\"2021-01-28 18:25:32\"]",
"[\"2021-04-30 18:25:32\",\"2021-01-15 11:25:32\",\"2021-01-30 18:25:32\",\"2021-03-30 18:25:32\",\"2021-01-27 11:25:32\",\"2021-01-30 18:25:32\",\"2021-01-30 18:25:32\"]",
"[]", "[\"2021-04-28 13:25:32\",\"2021-05-28 10:25:32\"]"
)
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(
NA,
-4L
))
events <-
data %>%
as_tibble() %>%
mutate(
id = row_number(),
events = events %>% str_remove_all("[\\[\\]\\\"]")
) %>%
separate_rows(events, sep = ",") %>%
rename(event = events) %>%
filter(event != "") %>%
mutate(
event = event %>% parse_datetime(),
position = case_when(
event >= before_event_eval &
year(event) == year(eval_month) &
month(event) < month(eval_month) ~ "before",
event <= after_eval_end &
year(event) == year(eval_month) &
month(event) > month(eval_month) ~ "after"
)
) %>%
arrange(event)
events
#> # A tibble: 11 × 6
#> before_event_eval eval_month after_eval_end
#> <dttm> <dttm> <dttm>
#> 1 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 2 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 3 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 4 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 5 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 6 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 7 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 8 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 9 2020-11-01 00:00:00 2021-05-01 00:00:00 2021-11-01 00:00:00
#> 10 2020-08-01 00:00:00 2021-02-01 00:00:00 2021-08-01 00:00:00
#> 11 2020-11-01 00:00:00 2021-05-01 00:00:00 2021-11-01 00:00:00
#> # … with 3 more variables: event <dttm>, id <int>, position <chr>
durations <-
events$event %>%
as.character() %>%
unique() %>%
combn(2) %>%
t() %>%
as_tibble() %>%
transmute(
from = parse_datetime(V1),
to = parse_datetime(V2),
duration = to - from
) %>%
left_join(events, by = c("from" = "event"))
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
durations
#> # A tibble: 44 × 8
#> from to duration before_event_eval
#> <dttm> <dttm> <drtn> <dttm>
#> 1 2021-01-15 11:25:32 2021-01-27 11:25:32 288 hours 2020-08-01 00:00:00
#> 2 2021-01-15 11:25:32 2021-01-28 13:25:32 314 hours 2020-08-01 00:00:00
#> 3 2021-01-15 11:25:32 2021-01-28 18:25:32 319 hours 2020-08-01 00:00:00
#> 4 2021-01-15 11:25:32 2021-01-30 18:25:32 367 hours 2020-08-01 00:00:00
#> 5 2021-01-15 11:25:32 2021-03-30 18:25:32 1783 hours 2020-08-01 00:00:00
#> 6 2021-01-15 11:25:32 2021-04-28 13:25:32 2474 hours 2020-08-01 00:00:00
#> 7 2021-01-15 11:25:32 2021-04-30 18:25:32 2527 hours 2020-08-01 00:00:00
#> 8 2021-01-15 11:25:32 2021-05-28 10:25:32 3191 hours 2020-08-01 00:00:00
#> 9 2021-01-27 11:25:32 2021-01-28 13:25:32 26 hours 2020-08-01 00:00:00
#> 10 2021-01-27 11:25:32 2021-01-28 18:25:32 31 hours 2020-08-01 00:00:00
#> # … with 34 more rows, and 4 more variables: eval_month <dttm>,
#> # after_eval_end <dttm>, id <int>, position <chr>
durations %>%
group_by(position) %>%
summarise(
min_duration = min(duration),
avg_duration = mean(duration),
max_duration = max(duration)
)
#> # A tibble: 2 × 4
#> position min_duration avg_duration max_duration
#> <chr> <drtn> <drtn> <drtn>
#> 1 after 664 hours 876.750 hours 1408 hours
#> 2 before 5 hours 1600.925 hours 3191 hours
Created on 2022-04-26 by the reprex package (v2.0.0)
To only look at consecutive events, one can do
durations <-
events %>%
arrange(position, event) %>%
mutate(
from = event,
to = lead(event)
)
I'm doing something quite simple. Given a dataframe of start dates and end dates for specific periods I want to expand/create a full sequence for each period binned by week (with the factor for each row), then output this in a single large dataframe.
For instance:
library(tidyverse)
library(lubridate)
# Dataset
start_dates = ymd_hms(c("2019-05-08 00:00:00",
"2020-01-17 00:00:00",
"2020-03-03 00:00:00",
"2020-05-28 00:00:00",
"2020-12-10 00:00:00",
"2021-05-07 00:00:00",
"2022-01-04 00:00:00"), tz = "UTC")
end_dates = ymd_hms(c( "2019-10-24 00:00:00",
"2020-03-03 00:00:00",
"2020-05-28 00:00:00",
"2020-12-10 00:00:00",
"2021-05-07 00:00:00",
"2022-01-04 00:00:00",
"2022-01-19 00:00:00"), tz = "UTC")
df1 = data.frame(studying = paste0("period",seq(1:7),sep = ""),start_dates,end_dates)
It was suggested to me to use do(), which currently works fine but I hate it when things are superseded. I also have a way of doing it using map2. But reading the file (https://dplyr.tidyverse.org/reference/do.html) suggests you can use nest_by(), across() and summarise() to do the same job as do(), how would I go about getting same result? I've tried a lot of things but I just can't seem to get it.
# do() way to do it
df1 %>%
group_by(studying) %>%
do(data.frame(week=seq(.$start_dates,.$end_dates,by="1 week")))
# transmute() way to do it
df1 %>%
transmute(weeks = map2(start_dates,end_dates, seq, by = "1 week"), studying)
%>% unnest(cols = c(weeks))
As the documentation of ?do suggests, we can now use summarise and replace the . with across():
library(tidyverse)
library(lubridate)
df1 %>%
group_by(studying) %>%
summarise(week = seq(across()$start_dates,
across()$end_dates,
by = "1 week"))
#> `summarise()` has grouped output by 'studying'. You can override using the
#> `.groups` argument.
#> # A tibble: 134 x 2
#> # Groups: studying [7]
#> studying week
#> <chr> <dttm>
#> 1 period1 2019-05-08 00:00:00
#> 2 period1 2019-05-15 00:00:00
#> 3 period1 2019-05-22 00:00:00
#> 4 period1 2019-05-29 00:00:00
#> 5 period1 2019-06-05 00:00:00
#> 6 period1 2019-06-12 00:00:00
#> 7 period1 2019-06-19 00:00:00
#> 8 period1 2019-06-26 00:00:00
#> 9 period1 2019-07-03 00:00:00
#> 10 period1 2019-07-10 00:00:00
#> # … with 124 more rows
Created on 2022-01-19 by the reprex package (v0.3.0)
You can also use tidyr::complete:
df1 %>%
group_by(studying) %>%
complete(start_dates = seq(from = start_dates, to = end_dates, by = "1 week")) %>%
select(-end_dates, weeks = start_dates)
# A tibble: 134 x 2
# Groups: studying [7]
studying weeks
<chr> <dttm>
1 period1 2019-05-08 00:00:00
2 period1 2019-05-15 00:00:00
3 period1 2019-05-22 00:00:00
4 period1 2019-05-29 00:00:00
5 period1 2019-06-05 00:00:00
6 period1 2019-06-12 00:00:00
7 period1 2019-06-19 00:00:00
8 period1 2019-06-26 00:00:00
9 period1 2019-07-03 00:00:00
10 period1 2019-07-10 00:00:00
# ... with 124 more rows
Although marked Experimental the help file for group_modify does say that
‘group_modify()’ is an evolution of ‘do()’
and, in fact, the code for the example in the question using group_modify is nearly the same as with do.
# with group_modify
df2 <- df1 %>%
group_by(studying) %>%
group_modify(~ data.frame(week = seq(.$start_dates, .$end_dates, by = "1 week")))
# with do
df0 <- df1 %>%
group_by(studying) %>%
do(data.frame(week = seq(.$start_dates, .$end_dates, by = "1 week")))
identical(df2, df0)
## [1] TRUE
Not sure if this exactly what you are looking for, but here is my attempt with rowwise and unnest
df1 %>%
rowwise() %>%
mutate(week = list(seq(start_dates, end_dates, by = "1 week"))) %>%
select(studying, week) %>%
unnest(cols = c(week))
Another approach:
library(tidyverse)
df1 %>%
group_by(studying) %>%
summarise(df = tibble(weeks = seq(start_dates, end_dates, by = 'week'))) %>%
unnest(df)
#> `summarise()` has grouped output by 'studying'. You can override using the `.groups` argument.
#> # A tibble: 134 × 2
#> # Groups: studying [7]
#> studying weeks
#> <chr> <dttm>
#> 1 period1 2019-05-08 00:00:00
#> 2 period1 2019-05-15 00:00:00
#> 3 period1 2019-05-22 00:00:00
#> 4 period1 2019-05-29 00:00:00
#> 5 period1 2019-06-05 00:00:00
#> 6 period1 2019-06-12 00:00:00
#> 7 period1 2019-06-19 00:00:00
#> 8 period1 2019-06-26 00:00:00
#> 9 period1 2019-07-03 00:00:00
#> 10 period1 2019-07-10 00:00:00
#> # … with 124 more rows
Created on 2022-01-20 by the reprex package (v2.0.1)
I have a start and end date for individuals and i need to estimate if the time passed from the start to the end is within 2 days
or 3 plus days.These dates are assign to record ids, how can i filter ones that ended within 2 days (from the start date)
and the ones that ended after 3 days or later.
Record_id <- c("2245","6728","5122","9287")
Start <- c("2021-01-13 CST" ,"2021-01-21 CST" ,"2021-01-17 CST","2021-01-13 CST")
End <- c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST","2021-01-25 15:00:00 CST")
I tried using
elapsed.time <- DF$start %--% DF$End
time.duration <- as.duration(elapsed.time)
but I am getting error because End date contains hour.Thank you.
Here's a dplyr pipe that will include both constraints (2 and 3 days):
df %>%
mutate(across(Start:End, as.POSIXct)) %>%
mutate(d = difftime(End, Start, units = "days")) %>%
filter(!between(difftime(End, Start, units = "days"), 2, 3))
# # A tibble: 4 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750000 days
# 2 6728 2021-01-21 00:00:00 2021-01-22 16:00:00 1.666667 days
# 3 5122 2021-01-17 00:00:00 2021-01-22 13:00:00 5.541667 days
# 4 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625000 days
I included mutate(d= so that we can see what the actual differences are. If you were looking to remove those, then use filter(between(..)) (no !).
In the case of the data you provided, all observations are less than 2 or more than 3 days. I'll expand this range so that we can see it in effect:
df %>%
mutate(across(Start:End, as.POSIXct)) %>%
mutate(d = difftime(End, Start, units = "days")) %>%
filter(!between(difftime(End, Start, units = "days"), 1, 6))
# # A tibble: 2 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750 days
# 2 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625 days
Data
df <- structure(list(Record_id = c("2245", "6728", "5122", "9287"), Start = c("2021-01-13 CST", "2021-01-21 CST", "2021-01-17 CST", "2021-01-13 CST"), End = c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST", "2021-01-25 15:00:00 CST")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I just converted the character to a date time with lubridate and then subtracted the dates. What you'll get back are days. I then filter for dates that are within 2 days.
Record_id<- c("2245","6728","5122","9287")
Start<-c("2021-01-13 CST" ,"2021-01-21 CST" ,"2021-01-17 CST","2021-01-13 CST")
End<-c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST","2021-01-25 15:00:00 CST")
df <- dplyr::tibble(x = Record_id, y = Start, z = End)
df %>%
dplyr::mutate_at(vars(y:z), ~ lubridate::as_datetime(.)) %>%
dplyr::mutate(diff = as.numeric(z - y)) %>%
dplyr::filter(diff <= 2 )
I have a cohort of data with multiple person visits and want to group visits with a common ID based on person # and the time of the visit. The condition is if an start is within 24 hours of a the previous exit, then I want those to have the same ID.
Sample of what data looks like:
dat <- data.frame(
Person_ID = c(1,1,1,2,3,3,3,4,4),
Admit_Date_Time = as.POSIXct(c("2017-02-07 15:26:00","2017-04-21 10:20:00",
"2017-04-22 12:12:00", "2017-10-16 01:31:00","2017-01-24 02:41:00","2017- 01-24 05:31:00", "2017-01-28 04:26:00", "2017-12-01 01:31:00","2017-12-01
01:31:00"), format = "%Y-%m-%d %H:%M"),
Discharge_Date_Time = as.POSIXct(c("2017-03-01 11:42:00","2017-04-22
05:56:00",
"2017-04-26 21:01:00",
"2017-10-18 20:11:00",
"2017-01-27 22:15:00",
"2017-01-26 15:35:00",
"2017-01-28 09:25:00",
"2017-12-05 18:33:00",
"2017-12-04 16:41:00"),format = "%Y-%m-%d %H:%M" ),
Visit_ID = c(1:9))
this is what I tried to start:
dat1 <-
dat %>%
arrange(Person_ID, Admit_Date_Time) %>%
group_by(Person_ID) %>%
mutate(Previous_Visit_Interval = difftime(lag(Discharge_Date_Time,
1),Admit_Date_Time, units = "hours")) %>%
mutate(start = c(1,Previous_Visit_Interval[-1] < hours(-24)), run =
cumsum(start))
dat1$ID = as.numeric(as.factor(paste0(dat1$Person_ID,dat1$run)))
Which is almost right, except it does not give the correct ID for visit 7 (person #3). Since there are three visits and the second visit is entirely within the first, and the third starts within 24 hours of the first but not the second.
There's probably a way to shorten this, but here's an approach using tidyr::gather and spread. By gathering into long format, we can track the cumulative admissions inside each visit. A new visit is recorded whenever there's a new Person_ID or that Person_ID completed a visit (cumulative admissions went to zero) at least 24 hours prior.
library(tidyr)
dat1 <- dat %>%
# Gather into long format with event type in one column, timestamp in another
gather(event, time, Admit_Date_Time:Discharge_Date_Time) %>%
# I want discharges to have an effect up to 24 hours later. Sort using that.
mutate(time_adj = if_else(event == "Discharge_Date_Time",
time + ddays(1),
time)) %>%
arrange(Person_ID, time_adj) %>%
# For each Person_ID, track cumulative admissions. 0 means a visit has completed.
# (b/c we sorted by time_adj, these reflect the 24hr period after discharges.)
group_by(Person_ID) %>%
mutate(admissions = if_else(event == "Admit_Date_Time", 1, -1)) %>%
mutate(admissions_count = cumsum(admissions)) %>%
ungroup() %>%
# Record a new Hosp_ID when either (a) a new Person, or (b) preceded by a
# completed visit (ie admissions_count was zero).
mutate(Hosp_ID_chg = 1 *
(Person_ID != lag(Person_ID, default = 1) | # (a)
lag(admissions_count, default = 1) == 0), # (b)
Hosp_ID = cumsum(Hosp_ID_chg)) %>%
# Spread back into original format
select(-time_adj, -admissions, -admissions_count, -Hosp_ID_chg) %>%
spread(event, time)
Results
> dat1
# A tibble: 9 x 5
Person_ID Visit_ID Hosp_ID Admit_Date_Time Discharge_Date_Time
<dbl> <int> <dbl> <dttm> <dttm>
1 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
2 1 2 2 2017-04-21 10:20:00 2017-04-22 05:56:00
3 1 3 2 2017-04-22 12:12:00 2017-04-26 21:01:00
4 2 4 3 2017-10-16 01:31:00 2017-10-18 20:11:00
5 3 5 4 2017-01-24 02:41:00 2017-01-27 22:15:00
6 3 6 4 2017-01-24 05:31:00 2017-01-26 15:35:00
7 3 7 4 2017-01-28 04:26:00 2017-01-28 09:25:00
8 4 8 5 2017-12-01 01:31:00 2017-12-05 18:33:00
9 4 9 5 2017-12-01 01:31:00 2017-12-04 16:41:00
Here's a data.table approach using an overlap-join
library( data.table )
library( lubridate )
setDT( dat )
setorder( dat, Person_ID, Admit_Date_Time )
#create a 1-day extension after each discharge
dt2 <- dat[, discharge_24h := Discharge_Date_Time %m+% days(1)][]
#now create id
setkey( dat, Admit_Date_Time, discharge_24h )
#create data-table with overlap-join, create groups based on overlapping ranges
dt2 <- setorder(
foverlaps( dat,
dat,
mult = "first",
type = "any",
nomatch = 0L
),
Visit_ID )[, list( Visit_ID = i.Visit_ID,
Hosp_ID = .GRP ),
by = .( Visit_ID )][, Visit_ID := NULL]
#reorder the result
setorder( dt2[ dat, on = "Visit_ID" ][, discharge_24h := NULL], Visit_ID )[]
# Visit_ID Hosp_ID Person_ID Admit_Date_Time Discharge_Date_Time
# 1: 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
# 2: 2 2 1 2017-04-21 10:20:00 2017-04-22 05:56:00
# 3: 3 2 1 2017-04-22 12:12:00 2017-04-26 21:01:00
# 4: 4 3 2 2017-10-16 01:31:00 2017-10-18 20:11:00
# 5: 5 4 3 2017-01-24 02:41:00 2017-01-27 22:15:00
# 6: 6 4 3 2017-01-24 05:31:00 2017-01-26 15:35:00
# 7: 7 4 3 2017-01-28 04:26:00 2017-01-28 09:25:00
# 8: 8 5 4 2017-12-01 01:31:00 2017-12-05 18:33:00
# 9: 9 5 4 2017-12-01 01:31:00 2017-12-04 16:41:00