Generate dates based on conditions before and after index dates in R

I have a data frame with 10,000+ dates. For example:
indexdt
01-02-2019
08-15-2019
I need to create two data frames based on the following conditions:
1) Same day of the week, up to 3 weeks before and after the index date. The output should be:
Table 1
indexdt dates
01-02-2019 12-26-2018
01-02-2019 12-19-2018
01-02-2019 12-12-2018
01-02-2019 01-09-2019
01-02-2019 01-16-2019
01-02-2019 01-23-2019
08-15-2019 07-25-2019
08-15-2019 08-01-2019
08-15-2019 08-08-2019
08-15-2019 08-22-2019
08-15-2019 08-29-2019
08-15-2019 09-05-2019
2) Same day of the week, same month. The output should be:
Table 2
indexdt dates
01-02-2019 01-09-2019
01-02-2019 01-16-2019
01-02-2019 01-23-2019
01-02-2019 01-30-2019
08-15-2019 08-01-2019
08-15-2019 08-08-2019
08-15-2019 08-22-2019
08-15-2019 08-29-2019

I have answered both questions here, but note that you should ask only one question per post:
library(dplyr)
library(purrr)
library(tidyr)     # for unnest()
library(lubridate)

# Convert indexdt from character ("%m-%d-%Y") to Date
df <- df %>% mutate(indexdt = mdy(indexdt))
1) Same day of the week, up to 3 weeks before and after the index date

We use seq to generate the before and after dates separately; [-1] drops the indexdt date itself, since we don't want it in the final output.
df %>%
  mutate(dates = map(indexdt, ~ c(seq(.x, length.out = 4, by = -7)[-1],
                                  seq(.x, length.out = 4, by = 7)[-1]))) %>%
  unnest(dates)
# indexdt dates
# <date> <date>
# 1 2019-01-02 2018-12-26
# 2 2019-01-02 2018-12-19
# 3 2019-01-02 2018-12-12
# 4 2019-01-02 2019-01-09
# 5 2019-01-02 2019-01-16
# 6 2019-01-02 2019-01-23
# 7 2019-08-15 2019-08-08
# 8 2019-08-15 2019-08-01
# 9 2019-08-15 2019-07-25
#10 2019-08-15 2019-08-22
#11 2019-08-15 2019-08-29
#12 2019-08-15 2019-09-05
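If the final output needs to match the question's mm-dd-yyyy character format, the Date columns can be converted back at the end. A small sketch (my addition, not part of the original answer; result is a hypothetical name for the unnested tibble, and across() assumes dplyr >= 1.0):

# Optional: render both Date columns back as mm-dd-yyyy strings
result %>%
  mutate(across(c(indexdt, dates), ~ format(.x, "%m-%d-%Y")))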
2) Same day of the week, same month

Here we create one sequence from the indexdt date back to the start of the month (floor_date) and another from indexdt forward to the end of the month (ceiling_date - 1).
df %>%
  mutate(dates = map(indexdt, ~ c(seq(.x, floor_date(.x, 'month'), by = -7)[-1],
                                  seq(.x, ceiling_date(.x, 'month') - 1, by = 7)[-1]))) %>%
  unnest(dates)
# indexdt dates
# <date> <date>
#1 2019-01-02 2019-01-09
#2 2019-01-02 2019-01-16
#3 2019-01-02 2019-01-23
#4 2019-01-02 2019-01-30
#5 2019-08-15 2019-08-08
#6 2019-08-15 2019-08-01
#7 2019-08-15 2019-08-22
#8 2019-08-15 2019-08-29
Data

df <- structure(list(indexdt = c("01-02-2019", "08-15-2019")),
                class = "data.frame", row.names = c(NA, -2L))
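For completeness, the first condition can also be written in base R. A minimal sketch, assuming indexdt has already been converted to Date (out is a hypothetical name; this is not part of the original answer):

# Base R equivalent of Table 1: 3 same-weekday dates before and after each index date
out <- do.call(rbind, lapply(df$indexdt, function(d) {
  data.frame(indexdt = d,
             dates = c(seq(d, length.out = 4, by = -7)[-1],   # 3 weeks before
                       seq(d, length.out = 4, by = 7)[-1]))   # 3 weeks after
}))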

Related

Joining two data frames on the closest date in R

I have two datasets that I would like to join based on date. One is a survey dataset, and the other is a list of prices at various dates. The dates don't match exactly, so I would like to join on the nearest date in the survey dataset (the price data is weekly).
Here's a brief snippet of what the survey dataset looks like (there are many other variables, but here's the two most relevant):
ID          actual.date
20120377    2012-09-26
2020455822  2020-11-23
20126758    2012-10-26
20124241    2012-10-25
2020426572  2020-11-28
And here's the price dataset (also much larger, but you get the idea):
date        price.var1        price.var2
2017-10-30  2.74733926399869  2.73994826674735
2015-03-16  2.77028200438506  2.74079930272231
2010-10-18  3.4265947805337   3.41591263539176
2012-10-29  4.10095806545397  4.14717556976502
2012-01-09  3.87888859352037  3.93074237884497
What I would like to do is join the price dataset to the survey dataset, joining on the nearest date.
I've tried a number of different things, none of which have worked to my satisfaction.
#reading in sample data
library(data.table)
library(dplyr)
survey <- fread("  ID actual.date
1:   20120377  2012-09-26
2: 2020455822  2020-11-23
3:   20126758  2012-10-26
4:   20124241  2012-10-25
5: 2020426572  2020-11-28") %>% select(-V1)

price <- fread("date price.var1 price.var2
1: 2017-10-30 2.747339 2.739948
2: 2015-03-16 2.770282 2.740799
3: 2010-10-18 3.426595 3.415913
4: 2012-10-29 4.100958 4.147176
5: 2012-01-09 3.878889 3.930742") %>% select(-V1)
#using data.table
setDT(survey)[,DT_DATE := actual.date]
setDT(price)[,DT_DATE := date]
survey_price <- survey[price,on=.(DT_DATE),roll="nearest"]
#This works, and they join, but it drops a ton of observations, which won't work
#using dplyr
library(dplyr)
survey_price <- left_join(survey,price,by=c("actual.date"="date"))
#this joins them without dropping observations, but all of the price variables become NAs
You were almost there. In the DT[i, on] syntax, i should be survey so that the join keeps all of its rows:
setDT(survey)
setDT(price)
survey_price <- price[survey,on=.(date=actual.date),roll="nearest"]
survey_price
date price.var1 price.var2 ID
<IDat> <num> <num> <int>
1: 2012-09-26 4.100958 4.147176 20120377
2: 2020-11-23 2.747339 2.739948 2020455822
3: 2012-10-26 4.100958 4.147176 20126758
4: 2012-10-25 4.100958 4.147176 20124241
5: 2020-11-28 2.747339 2.739948 2020426572
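If you also want to keep the price date that each survey row was matched to (the rolled join shows the survey date in the date column), the x. prefix can select it explicitly. A sketch, assuming the same column names as above (not part of the original answer):

# x.date refers to price's own date column inside the join
survey_price <- price[survey, on = .(date = actual.date), roll = "nearest",
                      .(ID, actual.date = date, price.var1, price.var2,
                        matched_price_date = x.date)]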
Alternatively: convert the dates to numeric, find the closest survey date for each price date with Closest() from DescTools, and take that value.
Example datasets
survey <- tibble(
  ID = sample(20000:40000, 9, replace = TRUE),
  actual.date = seq(today() %m+% days(5),
                    today() %m+% days(5) %m+% months(2),
                    by = "week")
)

price <- tibble(
  date = seq(today(), today() %m+% months(2), by = "week"),
  price_1 = sample(2:6, 9, replace = TRUE),
  price_2 = sample(2:6, 9, replace = TRUE)
)
survey
# A tibble: 9 x 2
ID actual.date
<int> <date>
1 34592 2022-05-07
2 37846 2022-05-14
3 22715 2022-05-21
4 22510 2022-05-28
5 30143 2022-06-04
6 34348 2022-06-11
7 21538 2022-06-18
8 39802 2022-06-25
9 36493 2022-07-02
price
# A tibble: 9 x 3
date price_1 price_2
<date> <int> <int>
1 2022-05-02 6 6
2 2022-05-09 3 2
3 2022-05-16 6 4
4 2022-05-23 6 2
5 2022-05-30 2 6
6 2022-06-06 2 4
7 2022-06-13 2 2
8 2022-06-20 3 5
9 2022-06-27 5 6
library(tidyverse)
library(lubridate)
library(DescTools)

price <- price %>%
  mutate(date = Closest(as.numeric(survey$actual.date),
                        as.numeric(date)) %>%
           as_date())
# A tibble: 9 x 3
date price_1 price_2
<date> <int> <int>
1 2022-05-07 6 6
2 2022-05-14 3 2
3 2022-05-21 6 4
4 2022-05-28 6 2
5 2022-06-04 2 6
6 2022-06-11 2 4
7 2022-06-18 2 2
8 2022-06-25 3 5
9 2022-07-02 5 6
merge(survey, price, by.x = "actual.date", by.y = "date")
actual.date ID price_1 price_2
1 2022-05-07 34592 6 6
2 2022-05-14 37846 3 2
3 2022-05-21 22715 6 4
4 2022-05-28 22510 6 2
5 2022-06-04 30143 2 6
6 2022-06-11 34348 2 4
7 2022-06-18 21538 2 2
8 2022-06-25 39802 3 5
9 2022-07-02 36493 5 6

R create week numbers with specified start date

This seems like it should be straightforward but I cannot find a way to do this.
I have a sales cycle that begins ~ August 1 of each year and need to sum sales by week number. I need to create a "week number" field where week #1 begins on a date that I specify. Thus far I have looked at lubridate, base R, and strftime, and I cannot find a way to change the "start" date from 01/01/YYYY to something else.
Solution needs to let me specify the start date and iterate week numbers as 7 days from the start date. The actual start date doesn't always occur on a Sunday or Monday.
EG Data Frame
eg_data <- data.frame(
  cycle = c("cycle2019", "cycle2019", "cycle2018", "cycle2018",
            "cycle2017", "cycle2017", "cycle2016", "cycle2016"),
  dates = as.POSIXct(c("2019-08-01", "2019-08-10", "2018-07-31", "2018-08-16",
                       "2017-08-03", "2017-08-14", "2016-08-05", "2016-08-29")),
  week_n = c("1", "2", "1", "3", "1", "2", "1", "4")
)
I'd like the result to look like the above: take the min date for each cycle as the starting point, then count week numbers up based on each date's distance from that cycle start.
This almost works. (Doing date arithmetic gives us durations in seconds: there may be a smoother way to convert with lubridate tools?)
secs_per_week <- 60 * 60 * 24 * 7
eg_data %>%
  group_by(cycle) %>%
  mutate(nw = 1 + as.numeric(round((dates - min(dates)) / secs_per_week)))
The results don't match for 2017, because there is an 11-day gap between the first and second observation ...
  cycle     dates               week_n    nw
  <chr>     <dttm>              <chr>  <dbl>
5 cycle2017 2017-08-03 00:00:00 1          1
6 cycle2017 2017-08-14 00:00:00 2          3
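A smoother conversion (my suggestion, not part of the original answer) is difftime() with units = "weeks", which avoids the seconds arithmetic; using floor() instead of round() also reproduces week_n for the 11-day gap in 2017:

eg_data %>%
  group_by(cycle) %>%
  mutate(nw = 1 + floor(as.numeric(difftime(dates, min(dates), units = "weeks"))))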
If someone has a better answer, please post it, but this works. Take the data frame in the example, eg_data:

eg_data %>%
  group_by(cycle) %>%
  mutate(
    cycle_start = as.Date(min(dates)),
    days_diff = as.Date(dates) - cycle_start,
    week_n = days_diff / 7,
    week_n_whole = ceiling(days_diff / 7)
  ) -> eg_data_check
(First time I've answered my own question)
library("lubridate")
eg_data %>%
as_tibble() %>%
group_by(cycle) %>%
mutate(new_week = week(dates)-31)
This doesn't quite work the same as your example, but perhaps with some fiddling based on your domain experience you could adapt it:
library(lubridate)
library(stringr)   # for str_sub()

eg_data %>%
  mutate(aug1 = ymd_h(paste(str_sub(cycle, start = -4), "080100")),
         week_n2 = ceiling((dates - aug1) / ddays(7)))
EDIT: If you have specific known start dates for each cycle, it might be helpful to join those dates to your data for the calculation:
library(lubridate)
cycle_starts <- data.frame(
  cycle = c("cycle2019", "cycle2018", "cycle2017", "cycle2016"),
  start_date = ymd_h(c(2019080100, 2018072500, 2017080500, 2016071300))
)

eg_data %>%
  left_join(cycle_starts) %>%
  mutate(week_n2 = ceiling((dates - start_date) / ddays(7)))
#Joining, by = "cycle"
# cycle dates week_n start_date week_n2
#1 cycle2019 2019-08-01 1 2019-08-01 1
#2 cycle2019 2019-08-10 2 2019-08-01 2
#3 cycle2018 2018-07-31 1 2018-07-25 1
#4 cycle2018 2018-08-16 3 2018-07-25 4
#5 cycle2017 2017-08-03 1 2017-08-05 0
#6 cycle2017 2017-08-14 2 2017-08-05 2
#7 cycle2016 2016-08-05 1 2016-07-13 4
#8 cycle2016 2016-08-29 4 2016-07-13 7
This is a concise solution using lubridate:

library(lubridate)
eg_data %>%
  group_by(cycle) %>%
  mutate(new_week = floor(as.period(ymd(dates) - ymd(min(dates))) / weeks()) + 1)
# A tibble: 8 x 4
# Groups:   cycle [4]
  cycle     dates               week_n new_week
  <chr>     <dttm>              <chr>     <dbl>
1 cycle2019 2019-08-01 00:00:00 1             1
2 cycle2019 2019-08-10 00:00:00 2             2
3 cycle2018 2018-07-31 00:00:00 1             1
4 cycle2018 2018-08-16 00:00:00 3             3
5 cycle2017 2017-08-03 00:00:00 1             1
6 cycle2017 2017-08-14 00:00:00 2             2
7 cycle2016 2016-08-05 00:00:00 1             1
8 cycle2016 2016-08-29 00:00:00 4             4
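An equivalent formulation (an assumption on my part, not from the original answer) uses a lubridate interval with integer division by a one-week period:

library(lubridate)
eg_data %>%
  group_by(cycle) %>%
  mutate(new_week = (ymd(min(dates)) %--% ymd(dates)) %/% weeks(1) + 1)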

R: Moving average length of time between two dates

I have a dataset of observations with start and end dates. I would like to calculate the moving average difference between the start and end dates.
I've included an example dataset below.
require(dplyr)
df <- data.frame(id = c(1, 2, 3),
                 start = c("2019-01-01", "2019-01-10", "2019-01-05"),
                 end = c("2019-02-01", "2019-01-15", "2019-01-10"))
df[, c("start", "end")] <- lapply(df[, c("start", "end")], as.Date)
  id      start        end
1  1 2019-01-01 2019-02-01
2  2 2019-01-10 2019-01-15
3  3 2019-01-05 2019-01-10
The overall date ranges are 2019-01-01 to 2019-02-01. I would like to calculate the average difference between the start and end dates for each of the dates in that range.
The result would look exactly like this. I've included the actual values for the averages that should show up:
date avg
2019-01-01 0
2019-01-02 1
2019-01-03 2
2019-01-04 3
2019-01-05 4
2019-01-06 3
2019-01-07 4
2019-01-08 5
2019-01-09 6
2019-01-10 7
2019-01-11 5.5
. .
. .
. .
Creating a reproducible example:
df <- data.frame(id = c(1, 2, 3, 4),
                 start = c("2019-01-01", "2019-01-01", "2019-01-10", "2019-01-05"),
                 end = c("2019-01-04", "2019-01-05", "2019-01-12", "2019-01-08"))
df[, c("start", "end")] <- lapply(df[, c("start", "end")], as.Date)
df
df
Returns:
  id      start        end
1  1 2019-01-01 2019-01-04
2  2 2019-01-01 2019-01-05
3  3 2019-01-10 2019-01-12
4  4 2019-01-05 2019-01-08
Then using the group_by function from dplyr:
library(dplyr)
df %>%
  group_by(start) %>%
  summarize(avg = mean(end - start)) %>%
  rename(date = start)
Returns:
date avg
<time> <time>
2019-01-01 3.5 days
2019-01-05 3.0 days
2019-01-10 2.0 days
Editing the answer as per comments.
Creating the df:
require(dplyr)
df <- data.frame(id = c(1, 2, 3),
                 start = c("2019-01-01", "2019-01-10", "2019-01-05"),
                 end = c("2019-02-01", "2019-01-15", "2019-01-10"))
df[, c("start", "end")] <- lapply(df[, c("start", "end")], as.Date)
Create dates for every start-end combination:
# Build the list of all dates within each start/end range and calculate the difference
datesList <- lapply(1:nrow(df), function(i) {
  tibble(date = seq.Date(from = df$start[i], to = df$end[i], by = 1),
         start = df$start[i]) %>%
    dplyr::mutate(diff = date - start)
})
Finally, group_by the date and find avg to give output exactly as the one in the question:
finalDf <- bind_rows(datesList) %>%
  dplyr::filter(diff != 0) %>%
  dplyr::group_by(date) %>%
  dplyr::summarise(avg = mean(diff, na.rm = TRUE))
The output thus becomes:
# A tibble: 31 x 2
date avg
<date> <time>
1 2019-01-02 1.0 days
2 2019-01-03 2.0 days
3 2019-01-04 3.0 days
4 2019-01-05 4.0 days
5 2019-01-06 3.0 days
6 2019-01-07 4.0 days
7 2019-01-08 5.0 days
8 2019-01-09 6.0 days
9 2019-01-10 7.0 days
10 2019-01-11 5.5 days
# … with 21 more rows
Let me know if it works.
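For what it's worth, the same expansion can be written more compactly with purrr and tidyr (a sketch assuming tidyr >= 1.0; not part of the original answer):

library(purrr)
library(tidyr)

df %>%
  mutate(date = map2(start, end, seq, by = "day")) %>%  # one date sequence per row
  unnest(date) %>%
  mutate(diff = date - start) %>%
  filter(diff != 0) %>%
  group_by(date) %>%
  summarise(avg = mean(diff, na.rm = TRUE))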

Calculate Log Difference for Each Day in R, Producing NA for the First Observation of Each Day

Problem: calculate the log difference within each day (grouping by day). The ideal result should produce NA for the first observation of each day.
library(dplyr)
library(tidyverse)
library(tibble)
library(lubridate)
df <- tibble(
  t = c("2019-10-01 09:30", "2019-10-01 09:35", "2019-10-01 09:40",
        "2019-10-02 09:30", "2019-10-02 09:35", "2019-10-02 09:40",
        "2019-10-03 09:30", "2019-10-03 09:35", "2019-10-03 09:40"),
  v = c(105.0061, 104.891, 104.8321, 104.5552, 104.4407, 104.5837,
        104.5534, 103.6992, 103.5851)
)
# My attempt
df %>%
  # create day
  mutate(day = day(t)) %>%
  # group by day
  group_by(day) %>%
  # calculate log difference and append column
  mutate(logdif = diff(log(df$v)))
The problem is
Error: Column `logdif` must be length 3 (the group size) or one, not 8
What I need:
[1] NA -0.0010967280 -0.0005616930 NA -0.0010957154
[6] 0.0013682615 NA -0.0082035450 -0.0011009036
Never use $ inside dplyr pipes; df$v refers to the whole ungrouped column, which is why the lengths don't match. You also need to prepend an NA to the diff() output:
library(dplyr)
df %>%
  mutate(day = lubridate::day(t)) %>%
  group_by(day) %>%
  mutate(logdif = c(NA, diff(log(v))))
# t v day logdif
# <chr> <dbl> <int> <dbl>
#1 2019-10-01 09:30 105. 1 NA
#2 2019-10-01 09:35 105. 1 -0.00110
#3 2019-10-01 09:40 105. 1 -0.000562
#4 2019-10-02 09:30 105. 2 NA
#5 2019-10-02 09:35 104. 2 -0.00110
#6 2019-10-02 09:40 105. 2 0.00137
#7 2019-10-03 09:30 105. 3 NA
#8 2019-10-03 09:35 104. 3 -0.00820
#9 2019-10-03 09:40 104. 3 -0.00110
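An equivalent sketch using lag() instead of diff(), which avoids the manual NA padding (an alternative I'm adding, not from the original answer):

df %>%
  mutate(day = lubridate::day(t)) %>%
  group_by(day) %>%
  mutate(logdif = log(v) - log(lag(v)))  # lag() is NA on each group's first row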

How to generate a unique ID for each group based on relative date interval in R using dplyr?

I have a cohort of data with multiple visits per person and want to group visits under a common ID based on person number and visit time. The condition: if an admission starts within 24 hours of the previous discharge, those visits should share the same ID.
Sample of what data looks like:
dat <- data.frame(
  Person_ID = c(1, 1, 1, 2, 3, 3, 3, 4, 4),
  Admit_Date_Time = as.POSIXct(c("2017-02-07 15:26:00", "2017-04-21 10:20:00",
                                 "2017-04-22 12:12:00", "2017-10-16 01:31:00",
                                 "2017-01-24 02:41:00", "2017-01-24 05:31:00",
                                 "2017-01-28 04:26:00", "2017-12-01 01:31:00",
                                 "2017-12-01 01:31:00"),
                               format = "%Y-%m-%d %H:%M"),
  Discharge_Date_Time = as.POSIXct(c("2017-03-01 11:42:00", "2017-04-22 05:56:00",
                                     "2017-04-26 21:01:00", "2017-10-18 20:11:00",
                                     "2017-01-27 22:15:00", "2017-01-26 15:35:00",
                                     "2017-01-28 09:25:00", "2017-12-05 18:33:00",
                                     "2017-12-04 16:41:00"),
                                   format = "%Y-%m-%d %H:%M"),
  Visit_ID = c(1:9))
This is what I tried to start with:

library(dplyr)
library(lubridate)

dat1 <- dat %>%
  arrange(Person_ID, Admit_Date_Time) %>%
  group_by(Person_ID) %>%
  mutate(Previous_Visit_Interval = difftime(lag(Discharge_Date_Time, 1),
                                            Admit_Date_Time, units = "hours")) %>%
  mutate(start = c(1, Previous_Visit_Interval[-1] < hours(-24)),
         run = cumsum(start))

dat1$ID <- as.numeric(as.factor(paste0(dat1$Person_ID, dat1$run)))
This is almost right, except it does not give the correct ID for visit 7 (person #3): that person has three visits, the second lies entirely within the first, and the third starts within 24 hours of the first discharge but not the second.
There's probably a way to shorten this, but here's an approach using tidyr::gather and spread. By gathering into long format, we can track the cumulative admissions inside each visit. A new visit is recorded whenever there's a new Person_ID or that Person_ID completed a visit (cumulative admissions went to zero) at least 24 hours prior.
library(tidyr)
library(dplyr)
library(lubridate)   # for ddays()

dat1 <- dat %>%
  # Gather into long format with event type in one column, timestamp in another
  gather(event, time, Admit_Date_Time:Discharge_Date_Time) %>%
  # I want discharges to have an effect up to 24 hours later. Sort using that.
  mutate(time_adj = if_else(event == "Discharge_Date_Time",
                            time + ddays(1),
                            time)) %>%
  arrange(Person_ID, time_adj) %>%
  # For each Person_ID, track cumulative admissions. 0 means a visit has completed.
  # (Because we sorted by time_adj, these reflect the 24 hr period after discharges.)
  group_by(Person_ID) %>%
  mutate(admissions = if_else(event == "Admit_Date_Time", 1, -1)) %>%
  mutate(admissions_count = cumsum(admissions)) %>%
  ungroup() %>%
  # Record a new Hosp_ID when either (a) a new Person, or (b) preceded by a
  # completed visit (i.e. admissions_count was zero).
  mutate(Hosp_ID_chg = 1 *
           (Person_ID != lag(Person_ID, default = 1) |   # (a)
              lag(admissions_count, default = 1) == 0),  # (b)
         Hosp_ID = cumsum(Hosp_ID_chg)) %>%
  # Spread back into original format
  select(-time_adj, -admissions, -admissions_count, -Hosp_ID_chg) %>%
  spread(event, time)
Results
> dat1
# A tibble: 9 x 5
Person_ID Visit_ID Hosp_ID Admit_Date_Time Discharge_Date_Time
<dbl> <int> <dbl> <dttm> <dttm>
1 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
2 1 2 2 2017-04-21 10:20:00 2017-04-22 05:56:00
3 1 3 2 2017-04-22 12:12:00 2017-04-26 21:01:00
4 2 4 3 2017-10-16 01:31:00 2017-10-18 20:11:00
5 3 5 4 2017-01-24 02:41:00 2017-01-27 22:15:00
6 3 6 4 2017-01-24 05:31:00 2017-01-26 15:35:00
7 3 7 4 2017-01-28 04:26:00 2017-01-28 09:25:00
8 4 8 5 2017-12-01 01:31:00 2017-12-05 18:33:00
9 4 9 5 2017-12-01 01:31:00 2017-12-04 16:41:00
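As a side note, gather() and spread() are superseded in tidyr >= 1.0; the two reshaping steps above could be swapped for their replacements (a sketch, not from the original answer):

# In place of the gather() step:
pivot_longer(dat, Admit_Date_Time:Discharge_Date_Time,
             names_to = "event", values_to = "time")
# ...and in place of the final spread(event, time) step:
# pivot_wider(names_from = event, values_from = time)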
Here's a data.table approach using an overlap-join
library( data.table )
library( lubridate )
setDT( dat )
setorder( dat, Person_ID, Admit_Date_Time )
#create a 1-day extension after each discharge
dt2 <- dat[, discharge_24h := Discharge_Date_Time %m+% days(1)][]
#now create id
setkey( dat, Admit_Date_Time, discharge_24h )
#create data-table with overlap-join, create groups based on overlapping ranges
dt2 <- setorder(
  foverlaps( dat, dat, mult = "first", type = "any", nomatch = 0L ),
  Visit_ID
)[, list( Visit_ID = i.Visit_ID,
          Hosp_ID = .GRP ),
  by = .( Visit_ID )][, Visit_ID := NULL]
#reorder the result
setorder( dt2[ dat, on = "Visit_ID" ][, discharge_24h := NULL], Visit_ID )[]
# Visit_ID Hosp_ID Person_ID Admit_Date_Time Discharge_Date_Time
# 1: 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
# 2: 2 2 1 2017-04-21 10:20:00 2017-04-22 05:56:00
# 3: 3 2 1 2017-04-22 12:12:00 2017-04-26 21:01:00
# 4: 4 3 2 2017-10-16 01:31:00 2017-10-18 20:11:00
# 5: 5 4 3 2017-01-24 02:41:00 2017-01-27 22:15:00
# 6: 6 4 3 2017-01-24 05:31:00 2017-01-26 15:35:00
# 7: 7 4 3 2017-01-28 04:26:00 2017-01-28 09:25:00
# 8: 8 5 4 2017-12-01 01:31:00 2017-12-05 18:33:00
# 9: 9 5 4 2017-12-01 01:31:00 2017-12-04 16:41:00
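A quick way to confirm the tricky case from the question (visit 7 sharing an ID with visits 5 and 6) is to assign the final result and inspect those rows (res is a hypothetical name; the Hosp_ID values come from the output above):

res <- setorder( dt2[ dat, on = "Visit_ID" ][, discharge_24h := NULL], Visit_ID )
res[ Visit_ID %in% 5:7, .( Visit_ID, Person_ID, Hosp_ID ) ]
#    Visit_ID Person_ID Hosp_ID
# 1:        5         3       4
# 2:        6         3       4
# 3:        7         3       4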
