how to calculate date difference with r [duplicate] - r

This question already has answers here:
subtract value from previous row by group
(3 answers)
Closed 2 years ago.
I would like to calculate the date difference between each entry. The data looks like this
> dt <- data.table(id = c("A", "A", "A", "B", "B", "B", "C", "C", "C"), date = as.Date(c("2017-01-01", "2017-02-01", "2017-05-01", "2017-01-01", "2017-05-01", "2017-10-01", "2017-01-01", "2017-02-01", "2017-02-15")))
> dt
id date
1: A 2017-01-01
2: A 2017-02-01
3: A 2017-05-01
4: B 2017-01-01
5: B 2017-05-01
6: B 2017-10-01
7: C 2017-01-01
8: C 2017-02-01
9: C 2017-02-15
What I want to get is something like the following. How should I construct the variable "Diff"?
Update:
I tried to solve this by using following codes:
> dt <- data.table(id = c("A", "A", "A", "B", "B", "B", "C", "C", "C"), date = as.Date(c("2017-01-01", "2017-02-01", "2017-05-01", "2017-01-01", "2017-05-01", "2017-10-01", "2017-01-01", "2017-02-01", "2017-02-15")))
> dt %>%
+ group_by(id) %>%
+ mutate(diff = date - lag(date))
# A tibble: 9 x 3
# Groups: id [3]
id date diff
<chr> <date> <drtn>
1 A 2017-01-01 NA days
2 A 2017-02-01 31 days
3 A 2017-05-01 89 days
4 B 2017-01-01 -120 days
5 B 2017-05-01 120 days
6 B 2017-10-01 153 days
7 C 2017-01-01 -273 days
8 C 2017-02-01 31 days
9 C 2017-02-15 14 days
I am not sure what I did wrong. Any idea?

how about this:
dt$diff <- unlist(lapply(split(dt,dt$id), function(x) c(0,diff(x$date)) ))
output:
> dt
id date diff
1: A 2017-01-01 0
2: A 2017-02-01 31
3: A 2017-05-01 89
4: B 2017-01-01 0
5: B 2017-05-01 120
6: B 2017-10-01 153
7: C 2017-01-01 0
8: C 2017-02-01 31
9: C 2017-02-15 14

Maybe not elegant, but maybe performant: use diff on the whole column and then overwrite the values at the boundaries between ids.
dt$diff <- c(NA, diff(dt$date))
dt$diff[c(TRUE, dt$id[-1] != dt$id[-nrow(dt)])] <- NA
dt
# id date diff
#1 A 2017-01-01 NA
#2 A 2017-02-01 31
#3 A 2017-05-01 89
#4 B 2017-01-01 NA
#5 B 2017-05-01 120
#6 B 2017-10-01 153
#7 C 2017-01-01 NA
#8 C 2017-02-01 31
#9 C 2017-02-15 14

Tidyverse solution:
library(tidyverse)
dt %>%
group_by(id) %>%
mutate(diff = date - lag(date))
id date diff
<chr> <date> <drtn>
1 A 2017-01-01 NA days
2 A 2017-02-01 31 days
3 A 2017-05-01 89 days
4 B 2017-01-01 NA days
5 B 2017-05-01 120 days
6 B 2017-10-01 153 days
7 C 2017-01-01 NA days
8 C 2017-02-01 31 days
9 C 2017-02-15 14 days

An option with data.table
library(data.table)
setDT(dt)[, diff := date - shift(date), id]

Related

How to combine two data.tables based on multiple criteria in R?

I have two data.tables, which I want to combine based on if a date in one table is in the given time range in the other table. In dt1 I have exit dates and I want to check in dt2 which values were valid at the exit date for each ID.
dt1 <- data.table (ID = 1:10,
exit = c("31/12/2010", "01/01/2021", "30/09/2010", "31/12/2015", "30/09/2010","31/10/2018", "01/02/2016", "01/05/2015", "01/09/2013", "01/01/2016"))
dt2 <- data.table (ID = c(1,2,2,2,3,5,6,6,7,8,8,9,10),
valid_from = c("01/01/2010", "01/01/2012", "01/01/2013", "01/12/2017", "01/05/2010", "01/04/2010", "01/05/2014", "01/11/2016", "01/01/2016", "15/04/2013", "01/01/2015", "15/02/2010", "01/04/2012"),
valid_until = c("01/01/2021", "31/12/2012", "30/11/2017", "01/01/2021", "01/01/2021", "01/01/2021", "31/10/2016", "01/01/2021", "01/01/2021", "31/12/2014", "01/05/2015", "01/01/2013", "01/01/2021"),
text1 = c("a", "a", "b", "c", "b", "b", "c", "a", "a", "b", "a", "c", "a"),
text2 = c("I", "I", "II", "I", "III", "I", "II", "III", "I", "II", "II", "I", "III" ))
ID exit
1: 1 31/12/2010
2: 2 01/01/2021
3: 3 30/09/2010
4: 4 31/12/2015
5: 5 30/09/2010
6: 6 31/10/2018
7: 7 01/02/2016
8: 8 01/05/2015
9: 9 01/09/2013
10: 10 01/01/2016
ID valid_from valid_until text1 text2
1: 1 01/01/2010 01/01/2021 a I
2: 2 01/01/2012 31/12/2012 a I
3: 2 01/01/2013 30/11/2017 b II
4: 2 01/12/2017 01/01/2021 c I
5: 3 01/05/2010 01/01/2021 b III
6: 5 01/04/2010 01/01/2021 b I
7: 6 01/05/2014 31/10/2016 c II
8: 6 01/11/2016 01/01/2021 a III
9: 7 01/01/2016 01/01/2021 a I
10: 8 15/04/2013 31/12/2014 b II
11: 8 01/01/2015 01/05/2015 a II
12: 9 15/02/2010 01/01/2013 c I
13: 10 01/04/2012 01/01/2021 a III
As a result I would like to return in dt1 the valid values to the exit dates.
If an ID is not found in dt2 (would be the case for ID 4 in the sample data), it should return NA.
ID exit text1 text2
1: 1 31/12/2010 a I
2: 2 01/01/2021 c I
3: 3 30/09/2010 b III
4: 4 31/12/2015 <NA> <NA>
5: 5 30/09/2010 b I
6: 6 31/10/2018 a III
7: 7 01/02/2016 a I
8: 8 01/05/2015 a II
9: 9 01/09/2013 c I
10: 10 01/01/2016 a III
Could anyone help me solve this?
As the input is a data.table, consider using data.table methods which are fast
library(data.table)
# // convert the date columns to `Date` class
dt1[, exit := as.IDate(exit, '%d/%m/%Y')]
dt2[, c('valid_from', 'valid_until') := .(as.IDate(valid_from, '%d/%m/%Y'),
as.IDate(valid_until, '%d/%m/%Y'))]
# // do a non-equi join
dt1[dt2, c('text1', 'text2') := .(i.text1, i.text2),
on = .(ID, exit >= valid_from, exit <= valid_until)]
-output
> dt1
ID exit text1 text2
1: 1 2010-12-31 a I
2: 2 2021-01-01 c I
3: 3 2010-09-30 b III
4: 4 2015-12-31 <NA> <NA>
5: 5 2010-09-30 b I
6: 6 2018-10-31 a III
7: 7 2016-02-01 a I
8: 8 2015-05-01 a II
9: 9 2013-09-01 <NA> <NA>
10: 10 2016-01-01 a III
Here is a dplyr solution that was created with the help of @akrun; see the related question "dates: Not yet implemented NAbounds=TRUE for this non-numeric and non-character type".
library(dplyr)
library(lubridate)

# Pair every dt1 row with all dt2 rows sharing its ID, then keep, per ID, the
# rows selected by the match flag computed from the validity window.
df1 <- left_join(dt1, dt2, by = "ID") %>%
  # parse the dd/mm/yyyy strings into Date values
  mutate(across(c(exit, valid_from, valid_until), dmy)) %>%
  rowwise() %>%
  # unary `+` turns the logical from between() into 0/1 (NA stays NA)
  mutate(match = +(dplyr::between(exit, valid_from, valid_until))) %>%
  group_by(ID) %>%
  # keep the rows carrying the highest match flag within each ID, plus rows
  # whose flag is NA (an ID with no partner row in dt2 has NA bounds)
  filter(match == max(match) | is.na(match)) %>%
  select(ID, exit, text1, text2) %>%
  ungroup()
output:
ID exit text1 text2
<dbl> <date> <chr> <chr>
1 1 2010-12-31 a I
2 2 2021-01-01 c I
3 3 2010-09-30 b III
4 4 2015-12-31 NA NA
5 5 2010-09-30 b I
6 6 2018-10-31 a III
7 7 2016-02-01 a I
8 8 2015-05-01 a II
9 9 2013-09-01 c I
10 10 2016-01-01 a III
You may use fuzzyjoin after changing the dates to Date class.
library(fuzzyjoin)
library(dplyr)
dt1 %>%
mutate(exit = as.Date(exit, '%d/%m/%Y')) %>%
fuzzy_left_join(dt2 %>%
mutate(across(starts_with('valid'), as.Date, '%d/%m/%Y')),
by = c('ID', 'exit' = 'valid_from', 'exit' = 'valid_until'),
match_fun = c(`==`, `>=`, `<=`)) %>%
select(ID = ID.x, exit, text1, text2)
# ID exit text1 text2
#1 1 2010-12-31 a I
#2 2 2021-01-01 c I
#3 3 2010-09-30 b III
#4 4 2015-12-31 <NA> <NA>
#5 5 2010-09-30 b I
#6 6 2018-10-31 a III
#7 7 2016-02-01 a I
#8 8 2015-05-01 a II
#9 9 2013-09-01 <NA> <NA>
#10 10 2016-01-01 a III

Calculate each overlapping date ranges from two independent databases in r

I have two independent databases. One contains follow-up data (start date and end date), as follows:
> data1 <- data.frame("ID" = c(1,1,1,1,2,2,2), "FUstart" = c("2019-01-01", "2019-04-01", "2019-07-01", "2019-10-01", "2019-04-01", "2019-07-01", "2019-10-01"), "FUend" = c("2019-03-31", "2019-06-30", "2019-09-30", "2019-12-31", "2019-06-30", "2019-09-30", "2019-12-31"))
> data1
ID FUstart FUend
1 1 2019-01-01 2019-03-31
2 1 2019-04-01 2019-06-30
3 1 2019-07-01 2019-09-30
4 1 2019-10-01 2019-12-31
5 2 2019-04-01 2019-06-30
6 2 2019-07-01 2019-09-30
7 2 2019-10-01 2019-12-31
Another contains drug use data (also start date and end date). As follows:
> data2 <- data.frame("ID" = c(1,1,1,2), "Drugstart" = c("2019-01-11", "2019-03-26", "2019-06-26", "2019-03-20"), "Drugend" = c("2019-01-20", "2019-04-05", "2019-10-05", "2019-10-10"))
> data2
ID Drugstart Drugend
1 1 2019-01-11 2019-01-20
2 1 2019-03-26 2019-04-05
3 1 2019-06-26 2019-10-05
4 2 2019-03-20 2019-10-10
The two databases are linked by "ID". The problem is that the rows for each ID may not be the same. I would like to calculate overlapping days and add it into the data1. I would expect to have the following results:
> data1
ID FUstart FUend Overlapping.Days
1 1 2019-01-01 2019-03-31 16
2 1 2019-04-01 2019-06-30 10
3 1 2019-07-01 2019-09-30 92
4 1 2019-10-01 2019-12-31 5
5 2 2019-04-01 2019-06-30 91
6 2 2019-07-01 2019-09-30 92
7 2 2019-10-01 2019-12-31 10
Note that data1 is the basic database. And adds data2's overlapping days into data1. Many many thanks for helping~~
An option using data.table::foverlaps:
foverlaps(data1, data2)[,
sum(1L + pmin(Drugend, FUend) - pmax(Drugstart, FUstart)),
.(ID, FUstart, FUend)]
output and I am also getting slightly diff numbers from OP's expected output:
ID FUstart FUend V1
1: 1 2019-01-01 2019-03-31 16
2: 1 2019-04-01 2019-06-30 10
3: 1 2019-07-01 2019-09-30 92
4: 1 2019-10-01 2019-12-31 5
5: 2 2019-04-01 2019-06-30 91
6: 2 2019-07-01 2019-09-30 92
7: 2 2019-10-01 2019-12-31 10
data:
library(data.table)
setDT(data1)
cols <- paste0("FU", c("start","end"))
data1[, (cols) := lapply(.SD, as.IDate, format="%Y-%m-%d"), .SDcols=cols]
setkeyv(data1, c("ID", cols))
#too lazy to generalize and hence copy paste
setDT(data2)
cols <- paste0("Drug", c("start","end"))
data2[, (cols) := lapply(.SD, as.IDate, format="%Y-%m-%d"), .SDcols=cols]
setkeyv(data2, c("ID", cols))

cumsum NAs and other condition R

I've seen lots of questions like this but can't figure this simple problem out. I don't want to collapse the dataset. Say I have this dataset:
library(tidyverse)
library(lubridate)
df <- data.frame(group = c("a", "a", "a", "a", "a", "b", "b", "b"),
starts = c("2011-09-18", NA, "2014-08-08", "2016-09-18", NA, "2013-08-08", "2015-08-08", NA),
ends = c(NA, "2013-03-06", "2015-08-08", NA, "2017-03-06", "2014-08-08", NA, "2016-08-08"))
df$starts <- parse_date_time(df$starts, "ymd")
df$ends <- parse_date_time(df$ends, "ymd")
df
group starts ends
1 a 2011-09-18 <NA>
2 a <NA> 2013-03-06
3 a 2014-08-08 2015-08-08
4 a 2016-09-18 <NA>
5 a <NA> 2017-03-06
6 b 2013-08-08 2014-08-08
7 b 2015-08-08 <NA>
8 b <NA> 2016-08-08
Desired output is:
group starts ends epi
1 a 2011-09-18 <NA> 1
2 a <NA> 2013-03-06 1
3 a 2014-08-08 2015-08-08 2
4 a 2016-09-18 <NA> 3
5 a <NA> 2017-03-06 3
6 b 2013-08-08 2014-08-08 1
7 b 2015-08-08 <NA> 2
8 b <NA> 2016-08-08 2
I was thinking something like this but obviously doesn't account for episodes where there is no NA
df <- df %>%
group_by(group) %>%
mutate(epi = cumsum(is.na(ends)))
df
I'm not sure how to incorporate cumsum(is.na) with condition if_else. Maybe I'm going at it the wrong way?
Any suggestions would be great.
A solution using dplyr. Assuming your data frame is well structured that each start always has an associated end record.
df2 <- df %>%
group_by(group) %>%
mutate(epi = cumsum(!is.na(starts))) %>%
ungroup()
df2
# # A tibble: 8 x 4
# group starts ends epi
# <fct> <dttm> <dttm> <int>
# 1 a 2011-09-18 00:00:00 NA 1
# 2 a NA 2013-03-06 00:00:00 1
# 3 a 2014-08-08 00:00:00 2015-08-08 00:00:00 2
# 4 a 2016-09-18 00:00:00 NA 3
# 5 a NA 2017-03-06 00:00:00 3
# 6 b 2013-08-08 00:00:00 2014-08-08 00:00:00 1
# 7 b 2015-08-08 00:00:00 NA 2
# 8 b NA 2016-08-08 00:00:00 2
An option is to get the rowSums of NA elements for columns 'starts', 'ends', grouped by 'group', get the rleid from the 'epi'
library(dplyr)
library(data.table)
df %>%
mutate(epi = rowSums(is.na(.[c("starts", "ends")]))) %>%
group_by(group) %>%
mutate(epi = rleid(epi))
# A tibble: 8 x 4
# Groups: group [2]
# group starts ends epi
# <fct> <dttm> <dttm> <int>
#1 a 2011-09-18 00:00:00 NA 1
#2 a NA 2013-03-06 00:00:00 1
#3 a 2014-08-08 00:00:00 2015-08-08 00:00:00 2
#4 a 2016-09-18 00:00:00 NA 3
#5 a NA 2017-03-06 00:00:00 3
#6 b 2013-08-08 00:00:00 2014-08-08 00:00:00 1
#7 b 2015-08-08 00:00:00 NA 2
#8 b NA 2016-08-08 00:00:00 2
If there are only two columns to consider
df %>%
group_by(group) %>%
mutate(epi = rleid(is.na(starts) + is.na(ends)))

How to generate a sequence of monthly dates from a data frame in R?

Consider the following data frame (df):
"id" "date_start" "date_end"
a 2012-03-11 2012-03-27
a 2012-05-17 2012-07-21
a 2012-06-09 2012-08-18
b 2015-06-21 2015-07-12
b 2015-06-27 2015-08-04
b 2015-07-02 2015-08-01
c 2017-10-11 2017-11-08
c 2017-11-27 2017-12-15
c 2018-01-02 2018-02-03
I am trying to create a new data frame with sequences of monthly dates, starting one month prior to the minimum value of "date_start" for each group in "id". The sequence also only includes dates from the first day of a month and ends at the maximum value of "date-end" for each group in "id".
This is a reproducible example for my data frame:
library(lubridate)
# Reproducible example: three (date_start, date_end) ranges for each of the
# ids "a", "b" and "c".
id <- c("a", "a", "a", "b", "b", "b", "c", "c", "c")
df <- data.frame(id)
df$date_start <- as.Date(c(
  "2012-03-11", "2012-05-17", "2012-06-09",
  "2015-06-21", "2015-06-27", "2015-07-02",
  "2017-10-11", "2017-11-27", "2018-01-02"
))
# The sixth end date was previously typed as "2015-08-012"; it is written
# here as the well-formed "2015-08-01" shown in the table of the question.
df$date_end <- as.Date(c(
  "2012-03-27", "2012-07-21", "2012-08-18",
  "2015-07-12", "2015-08-04", "2015-08-01",
  "2017-11-08", "2017-12-15", "2018-02-03"
))
What I have tried to do:
library(dplyr)
library(DescTools)  # package names are case-sensitive; AddMonths() lives here
library(timeDate)
# Attempt from the question. summarize() leaves one row per id; do() is then
# called on that multi-row result, so seq() is handed several start/end values
# at once -- this is the call that raises the error quoted in the question.
df2 <- df %>%
  group_by(id) %>%
  summarize(start = floor_date(AddMonths(min(date_start), -1), "month"),
            end = max(date_end)) %>%
  do(data.frame(id = .$id, date = seq(.$start, .$end, by = "1 month")))
The code works perfectly fine for an ungrouped data frame. Somehow, with the grouping by "id" it throws an error message:
Error in seq.default(.$date_start, .$date_end, by = "1 month") :
'from' must be of length 1
This is how the desired output looks like for the data frame given above:
"id" "date"
a 2012-02-01
a 2012-03-01
a 2012-04-01
a 2012-05-01
a 2012-06-01
a 2012-07-01
a 2012-08-01
b 2015-05-01
b 2015-06-01
b 2015-07-01
b 2015-08-01
c 2017-09-01
c 2017-10-01
c 2017-11-01
c 2017-12-01
c 2018-01-01
c 2018-02-01
Is there a way to alter the code to function with a grouped data frame? Is there an altogether different approach for this operation?
Another option using dplyr and lubridate is to first summarise a list of Date objects for each id and then unnest them to expand them into different rows.
library(dplyr)
library(lubridate)
df %>%
  group_by(id) %>%
  # one list element per id, holding that id's monthly sequence: from the
  # first day of the month before the earliest start to the first day of the
  # month of the latest end
  summarise(date = list(seq(floor_date(min(date_start), unit = "month") - months(1),
                            floor_date(max(date_end), unit = "month"),
                            by = "month"))) %>%
  # name the list-column explicitly; calling unnest() without `cols` is
  # deprecated in tidyr >= 1.0 and emits a warning
  tidyr::unnest(cols = date)
# id date
# <fct> <date>
# 1 a 2012-02-01
# 2 a 2012-03-01
# 3 a 2012-04-01
# 4 a 2012-05-01
# 5 a 2012-06-01
# 6 a 2012-07-01
# 7 a 2012-08-01
# 8 b 2015-05-01
# 9 b 2015-06-01
#10 b 2015-07-01
#11 b 2015-08-01
#12 c 2017-09-01
#13 c 2017-10-01
#14 c 2017-11-01
#15 c 2017-12-01
#16 c 2018-01-01
#17 c 2018-02-01
In your code, since there are duplicates in id, you could group by row_number and achieve the same results as below:
df %>%
group_by(id) %>%
summarize(start= floor_date(AddMonths(min(date_start),-1), "month"),end=max(date_end)) %>%
group_by(rn=row_number()) %>%
do(data.frame(id=.$id, date=seq(.$start, .$end, by="1 month"))) %>%
ungroup() %>%
select(-rn)
# A tibble: 17 x 2
id date
<fct> <date>
1 a 2012-02-01
2 a 2012-03-01
3 a 2012-04-01
4 a 2012-05-01
5 a 2012-06-01
6 a 2012-07-01
7 a 2012-08-01
8 b 2015-05-01
9 b 2015-06-01
10 b 2015-07-01
11 b 2015-08-01
12 c 2017-09-01
13 c 2017-10-01
14 c 2017-11-01
15 c 2017-12-01
16 c 2018-01-01
17 c 2018-02-01
Use as.yearmon to convert to year/month. Note that yearmon objects are represented internally as year + fraction where fraction is 0 for January, 1/12 for February, 2/12 for March and so on. Then use as.Date to convert that to Date class. do allows the group to change size.
library(dplyr)
library(zoo)
df %>%
group_by(id) %>%
do( data.frame(month = as.Date(seq(as.yearmon(min(.$date_start)) - 1/12,
as.yearmon(max(.$date_end)),
1/12) ))) %>%
ungroup
giving:
# A tibble: 17 x 2
id month
<fct> <date>
1 a 2012-02-01
2 a 2012-03-01
3 a 2012-04-01
4 a 2012-05-01
5 a 2012-06-01
6 a 2012-07-01
7 a 2012-08-01
8 b 2015-05-01
9 b 2015-06-01
10 b 2015-07-01
11 b 2015-08-01
12 c 2017-09-01
13 c 2017-10-01
14 c 2017-11-01
15 c 2017-12-01
16 c 2018-01-01
17 c 2018-02-01
This could also be written like this using the same library statements as above:
# Monthly Date sequence running from one month before the year/month of `st`
# through the year/month of `en` (yearmon steps of 1/12 are one month).
Seq <- function(st, en) {
  first_month <- as.yearmon(st) - 1/12
  last_month <- as.yearmon(en)
  as.Date(seq(first_month, last_month, 1/12))
}
df %>%
group_by(id) %>%
do( data.frame(month = Seq(min(.$date_start), max(.$date_end))) ) %>%
ungroup

Fill in missing cases till specific condition per group

I'm attempting to create a data frame that shows all of the in between months for my data set, by subject. Here is an example of what the data looks like:
dat <- data.frame(c(1, 1, 1, 2, 3, 3, 3, 4, 4, 4), c(rep(30, 2), rep(25, 5), rep(20, 3)), c('2017-01-01', '2017-02-01', '2017-04-01', '2017-02-01', '2017-01-01', '2017-02-01', '2017-03-01', '2017-01-01',
'2017-02-01', '2017-04-01'))
colnames(dat) <- c('id', 'value', 'date')
dat$Out.Of.Study <- c("", "", "Out", "Out", "", "", "Out", "", "", "Out")
dat
id value date Out.Of.Study
1 1 30 2017-01-01
2 1 30 2017-02-01
3 1 25 2017-04-01 Out
4 2 25 2017-02-01 Out
5 3 25 2017-01-01
6 3 25 2017-02-01
7 3 25 2017-03-01 Out
8 4 20 2017-01-01
9 4 20 2017-02-01
10 4 20 2017-04-01 Out
If I want to show the in between months where no data was collected (but the subject was still enrolled in the study) I can use the complete() function. However, the issue is that I get all missing months for each subject id based on the min and max month identified in the data set:
## Add Dates by Group
library(tidyr)
complete(dat, id, date)
id date value Out.Of.Study
1 1 2017-01-01 30
2 1 2017-02-01 30
3 1 2017-03-01 NA <NA>
4 1 2017-04-01 25 Out
5 2 2017-01-01 NA <NA>
6 2 2017-02-01 25 Out
7 2 2017-03-01 NA <NA>
8 2 2017-04-01 NA <NA>
9 3 2017-01-01 25
10 3 2017-02-01 25
11 3 2017-03-01 25 Out
12 3 2017-04-01 NA <NA>
13 4 2017-01-01 20
14 4 2017-02-01 20
15 4 2017-03-01 NA <NA>
16 4 2017-04-01 20 Out
The issue with this is that I don't want the missing months to exceed the subject's final observed month (essentially, I have subjects who are censored and would need to be removed from the study) or show up prior to the month a subject started the study. For example, subject 2 was only a participant in the month '2017-02-01'. Therefore, I'd like the data to represent that this was the only month they were in the study, and not have them represented by the extra months after and the extra month before, as shown above. The same is the case with subject 3, who has an extra month, even though they are out of the study.
Perhaps the complete() isn't the best way to go about this?
This can be solved by creating a sequence of months individually for each id and by joining the sequences with dat to complete the missing months.
1. data.table
(The question is tagged with tidyr. But as I am more acquainted with data.table I have tried this first.)
library(data.table)
# coerce date strings to class Date
setDT(dat)[, date := as.Date(date)]
# create sequence of months for each id
sdt <- dat[, .(date = seq(min(date), max(date), "month")), by = id]
# join
dat[sdt, on = .(id, date)]
id value date Out.Of.Study
1: 1 30 2017-01-01
2: 1 30 2017-02-01
3: 1 NA 2017-03-01 <NA>
4: 1 25 2017-04-01 Out
5: 2 25 2017-02-01 Out
6: 3 25 2017-01-01
7: 3 25 2017-02-01
8: 3 25 2017-03-01 Out
9: 4 20 2017-01-01
10: 4 20 2017-02-01
11: 4 NA 2017-03-01 <NA>
12: 4 20 2017-04-01 Out
Note that there is only one row for id == 2 as requested by the OP.
This approach requires to coerce date from factor to class Date to make sure that all missing months will be completed.
This is also safer than relying on the available date factor levels in the dataset. For illustration, let's assume that id == 4 is Out in month 2017-06-01 (June) instead of 2017-04-01 (April). Then, there would be no month 2017-05-01 (May) in the whole dataset and the final result would be incomplete.
Without creating the temporary variable sdt the code becomes
library(data.table)
setDT(dat)[, date := as.Date(date)][
dat[, .(date = seq(min(date), max(date), "month")), by = id], on = .(id, date)]
2. tidyr / dplyr
library(dplyr)
library(tidyr)
# coerce date strings to class Date
dat <- dat %>%
mutate(date = as.Date(date))
dat %>%
# create sequence of months for each id
group_by(id) %>%
expand(date = seq(min(date), max(date), "month")) %>%
# join to complete the missing month for each id
left_join(dat, by = c("id", "date"))
# A tibble: 12 x 4
# Groups: id [?]
id date value Out.Of.Study
<dbl> <date> <dbl> <chr>
1 1 2017-01-01 30 ""
2 1 2017-02-01 30 ""
3 1 2017-03-01 NA NA
4 1 2017-04-01 25 Out
5 2 2017-02-01 25 Out
6 3 2017-01-01 25 ""
7 3 2017-02-01 25 ""
8 3 2017-03-01 25 Out
9 4 2017-01-01 20 ""
10 4 2017-02-01 20 ""
11 4 2017-03-01 NA NA
12 4 2017-04-01 20 Out
There is a variant which does not update dat:
library(dplyr)
library(tidyr)
dat %>%
mutate(date = as.Date(date)) %>%
right_join(group_by(., id) %>%
expand(date = seq(min(date), max(date), "month")),
by = c("id", "date"))
I would still use complete (probably the right method to use here), but after it would subset rows that exceed row with "Out". You can do this with dplyr::between.
dat %>%
group_by(id) %>%
complete(date) %>%
# Filter rows that are between 1 and the one that has "Out"
filter(between(row_number(), 1, which(Out.Of.Study == "Out")))
id date value Out.Of.Study
<dbl> <fct> <dbl> <chr>
1 1 2017-01-01 30 ""
2 1 2017-02-01 30 ""
3 1 2017-03-01 NA NA
4 1 2017-04-01 25 Out
5 2 2017-01-01 NA NA
6 2 2017-02-01 25 Out
7 3 2017-01-01 25 ""
8 3 2017-02-01 25 ""
9 3 2017-03-01 25 Out
10 4 2017-01-01 20 ""
11 4 2017-02-01 20 ""
12 4 2017-03-01 NA NA
13 4 2017-04-01 20 Out

Resources