Complete data frame with missing date ranges for multiple parameters - r

I have the following data frame:
Date_from <- c("2013-02-01","2013-05-10","2013-08-13","2013-02-01","2013-05-10","2013-08-13","2013-02-01","2013-05-10","2013-08-13")
Date_to <- c("2013-05-07","2013-08-12","2013-11-18","2013-05-07","2013-08-12","2013-11-18","2013-05-07","2013-08-12","2013-11-18")
y <- data.frame(Date_from,Date_to)
y$concentration <- c("1.5","2.5","1.5","3.5","1.5","2.5","1.5","3.5","3")
y$Parameter<-c("A","A","A","B","B","B","C","C","C")
y$Date_from <- as.Date(y$Date_from)
y$Date_to <- as.Date(y$Date_to)
y$concentration <- as.numeric(y$concentration)
I will need to check the data frame if for EACH Parameter the date range begins at the first day of the year (2013-01-01) and ends at the last day of the year (2013-12-31). If not I will need to add an extra row at the beginning and at the end for each of the parameters to complete the date range to a full year for each parameter. The result should look like this:
Date_from Date_to concentration Parameter
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 1.5 A
2013-05-10 2013-08-12 2.5 A
2013-08-13 2013-11-18 1.5 A
2013-11-19 2013-12-31 NA NA
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 3.5 B
2013-05-10 2013-08-12 1.5 B
2013-08-13 2013-11-18 2.5 B
2013-11-19 2013-12-31 NA NA
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 1.5 C
2013-05-10 2013-08-12 3.5 C
2013-08-13 2013-11-18 3.0 C
2013-11-19 2013-12-31 NA NA
Please note: The date ranges are only equal in this example for simplification.
UPDATE: This is my original data snippet and code:
sm<-read.csv("https://www.dropbox.com/s/tft6inwcrjqujgt/Test_data.csv?dl=1",sep=";",header=TRUE)
cleaned_sm<-sm[,c(4,5,11,14)] ##Delete obsolete columns
colnames(cleaned_sm)<-c("Parameter","Concentration","Date_from","Date_to")
cleaned_sm$Date_from<-as.Date(cleaned_sm$Date_from, format ="%d.%m.%Y")
cleaned_sm$Date_to<-as.Date(cleaned_sm$Date_to, format ="%d.%m.%Y")
#detect comma decimal separator and replace with dot decimal separater as comma is not recognised as a number
cleaned_sm=lapply(cleaned_sm, function(x) gsub(",", ".", x))
cleaned_sm<-data.frame(cleaned_sm)
cleaned_sm$Concentration <- as.numeric(cleaned_sm$Concentration)
cleaned_sm$Date_from <- as.Date(cleaned_sm$Date_from)
cleaned_sm$Date_to <- as.Date(cleaned_sm$Date_to)
Added code based on #jasbner:
cleaned_sm %>%
group_by(Parameter) %>%
do(add_row(.,
Date_from = ymd(max(Date_to))+1 ,
Date_to = ymd(paste(year(max(Date_to)),"1231")),
Parameter = .$Parameter[1])) %>%
do(add_row(.,
Date_to = ymd(min(Date_from))-1,
Date_from = ymd(paste(year(min(Date_from)),"0101")) ,
Parameter = .$Parameter[1],
.before = 0)) %>%
filter(!duplicated(Date_from,fromLast = T),!duplicated(Date_to))

My attempt with dplyr and lubridate. Hacked together but I think it should work. Note this does not look for any gaps in the middle of the date ranges. Basically, for each group, you add a row before and after that particular group. Then if there are any cases where the date range starts at the beginning of the year or ends at the end of the year the added rows are filtered out.
library(dplyr)
library(lubridate)
cleaned_sm %>%
group_by(Parameter) %>%
do(add_row(.,
Date_from = ymd(max(.$Date_to))+1 ,
Date_to = ymd(paste(year(max(.$Date_to)),"1231")),
Parameter = .$Parameter[1])) %>%
do(add_row(.,
Date_to = ymd(min(.$Date_from))-1,
Date_from = ymd(paste(year(min(.$Date_from)),"0101")) ,
Parameter = .$Parameter[1],
.before = 0)) %>%
filter(!duplicated(Date_from,fromLast = T),!duplicated(Date_to))
# A tibble: 15 x 4
# Groups: Parameter [3]
# Date_from Date_to concentration Parameter
# <date> <date> <dbl> <chr>
# 1 2013-01-01 2013-01-31 NA A
# 2 2013-02-01 2013-05-07 1.50 A
# 3 2013-05-10 2013-08-12 2.50 A
# 4 2013-08-13 2013-11-18 1.50 A
# 5 2013-11-19 2013-12-31 NA A
# 6 2013-01-01 2013-01-31 NA B
# 7 2013-02-01 2013-05-07 3.50 B
# 8 2013-05-10 2013-08-12 1.50 B
# 9 2013-08-13 2013-11-18 2.50 B
# 10 2013-11-19 2013-12-31 NA B
# 11 2013-01-01 2013-01-31 NA C
# 12 2013-02-01 2013-05-07 1.50 C
# 13 2013-05-10 2013-08-12 3.50 C
# 14 2013-08-13 2013-11-18 3.00 C
# 15 2013-11-19 2013-12-31 NA C

This seems like it requires a combination of different packages to attack it. I am using tidyr, data.table, and I used lubridate.
date.start <- seq.Date(as.Date("2013-01-01"), as.Date("2013-12-31"), by = "day")
Date.Int <- data.frame(Date_from = date.start, Date_to = date.start)
y_wide <- y %>% spread(Parameter, concentration)
y_wide <- as.data.table(setkey(as.data.table(y_wide), Date_from, Date_to))
Date.Int <- as.data.table(setkey(as.data.table(Date.Int), Date_from, Date_to))
dats <- foverlaps(Date.Int, y_wide, nomatch = NA)
fin.dat <- dats %>%
mutate(A = ifelse(is.na(A), -5, A),
seqs = cumsum(!is.na(A) & A != lag(A, default = -5))) %>%
group_by(seqs) %>%
summarise(Date_from = first(i.Date_from),
Date_to = last(i.Date_to) ,
A = first(A),
B = first(B),
C = first(C)) %>%
mutate(A = ifelse(A == -5, NA, A)) %>%
ungroup()%>%
gather(Concentration, Parameter, A:C) %>%
mutate(Concentration = ifelse(is.na(Parameter), NA, Concentration))
Okay, so I created a vector of dates from a start point to an end point (date.start); then I turned into a data.frame with the same interval names and interval dates for Date.Int. This is because foverlaps needs to compare two intervals (same date start and end dates in Date.Int are now officially intervals). I then took your data you provided and spread, turning it from long format data to wide format data and turned that into a data.table. keying a data.table sets up how it should be arranged, and when using foverlaps you have to key the start dates and end dates (in that order). foverlaps determines if an interval falls within another interval of dates. If you print out dats, you will see a bunch of lines with NA for everything because they did not fall within an interval. So now we have to group these in some manner. I picked grouping by values of "A" in dats. The grouping variable is called seqs. But then I summarised the data, and then switched it back from wide format to long format and replaced the appropriate NA values.

Related

How to generate a unique ID for each group based on relative date interval in R using dplyr?

I have a cohort of data with multiple person visits and want to group visits with a common ID based on person # and the time of the visit. The condition is if an start is within 24 hours of a the previous exit, then I want those to have the same ID.
Sample of what data looks like:
dat <- data.frame(
Person_ID = c(1,1,1,2,3,3,3,4,4),
Admit_Date_Time = as.POSIXct(c("2017-02-07 15:26:00","2017-04-21 10:20:00",
"2017-04-22 12:12:00", "2017-10-16 01:31:00","2017-01-24 02:41:00","2017- 01-24 05:31:00", "2017-01-28 04:26:00", "2017-12-01 01:31:00","2017-12-01
01:31:00"), format = "%Y-%m-%d %H:%M"),
Discharge_Date_Time = as.POSIXct(c("2017-03-01 11:42:00","2017-04-22
05:56:00",
"2017-04-26 21:01:00",
"2017-10-18 20:11:00",
"2017-01-27 22:15:00",
"2017-01-26 15:35:00",
"2017-01-28 09:25:00",
"2017-12-05 18:33:00",
"2017-12-04 16:41:00"),format = "%Y-%m-%d %H:%M" ),
Visit_ID = c(1:9))
this is what I tried to start:
dat1 <-
dat %>%
arrange(Person_ID, Admit_Date_Time) %>%
group_by(Person_ID) %>%
mutate(Previous_Visit_Interval = difftime(lag(Discharge_Date_Time,
1),Admit_Date_Time, units = "hours")) %>%
mutate(start = c(1,Previous_Visit_Interval[-1] < hours(-24)), run =
cumsum(start))
dat1$ID = as.numeric(as.factor(paste0(dat1$Person_ID,dat1$run)))
Which is almost right, except it does not give the correct ID for visit 7 (person #3). Since there are three visits and the second visit is entirely within the first, and the third starts within 24 hours of the first but not the second.
There's probably a way to shorten this, but here's an approach using tidyr::gather and spread. By gathering into long format, we can track the cumulative admissions inside each visit. A new visit is recorded whenever there's a new Person_ID or that Person_ID completed a visit (cumulative admissions went to zero) at least 24 hours prior.
library(tidyr)
dat1 <- dat %>%
# Gather into long format with event type in one column, timestamp in another
gather(event, time, Admit_Date_Time:Discharge_Date_Time) %>%
# I want discharges to have an effect up to 24 hours later. Sort using that.
mutate(time_adj = if_else(event == "Discharge_Date_Time",
time + ddays(1),
time)) %>%
arrange(Person_ID, time_adj) %>%
# For each Person_ID, track cumulative admissions. 0 means a visit has completed.
# (b/c we sorted by time_adj, these reflect the 24hr period after discharges.)
group_by(Person_ID) %>%
mutate(admissions = if_else(event == "Admit_Date_Time", 1, -1)) %>%
mutate(admissions_count = cumsum(admissions)) %>%
ungroup() %>%
# Record a new Hosp_ID when either (a) a new Person, or (b) preceded by a
# completed visit (ie admissions_count was zero).
mutate(Hosp_ID_chg = 1 *
(Person_ID != lag(Person_ID, default = 1) | # (a)
lag(admissions_count, default = 1) == 0), # (b)
Hosp_ID = cumsum(Hosp_ID_chg)) %>%
# Spread back into original format
select(-time_adj, -admissions, -admissions_count, -Hosp_ID_chg) %>%
spread(event, time)
Results
> dat1
# A tibble: 9 x 5
Person_ID Visit_ID Hosp_ID Admit_Date_Time Discharge_Date_Time
<dbl> <int> <dbl> <dttm> <dttm>
1 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
2 1 2 2 2017-04-21 10:20:00 2017-04-22 05:56:00
3 1 3 2 2017-04-22 12:12:00 2017-04-26 21:01:00
4 2 4 3 2017-10-16 01:31:00 2017-10-18 20:11:00
5 3 5 4 2017-01-24 02:41:00 2017-01-27 22:15:00
6 3 6 4 2017-01-24 05:31:00 2017-01-26 15:35:00
7 3 7 4 2017-01-28 04:26:00 2017-01-28 09:25:00
8 4 8 5 2017-12-01 01:31:00 2017-12-05 18:33:00
9 4 9 5 2017-12-01 01:31:00 2017-12-04 16:41:00
Here's a data.table approach using an overlap-join
library( data.table )
library( lubridate )
setDT( dat )
setorder( dat, Person_ID, Admit_Date_Time )
#create a 1-day extension after each discharge
dt2 <- dat[, discharge_24h := Discharge_Date_Time %m+% days(1)][]
#now create id
setkey( dat, Admit_Date_Time, discharge_24h )
#create data-table with overlap-join, create groups based on overlapping ranges
dt2 <- setorder(
foverlaps( dat,
dat,
mult = "first",
type = "any",
nomatch = 0L
),
Visit_ID )[, list( Visit_ID = i.Visit_ID,
Hosp_ID = .GRP ),
by = .( Visit_ID )][, Visit_ID := NULL]
#reorder the result
setorder( dt2[ dat, on = "Visit_ID" ][, discharge_24h := NULL], Visit_ID )[]
# Visit_ID Hosp_ID Person_ID Admit_Date_Time Discharge_Date_Time
# 1: 1 1 1 2017-02-07 15:26:00 2017-03-01 11:42:00
# 2: 2 2 1 2017-04-21 10:20:00 2017-04-22 05:56:00
# 3: 3 2 1 2017-04-22 12:12:00 2017-04-26 21:01:00
# 4: 4 3 2 2017-10-16 01:31:00 2017-10-18 20:11:00
# 5: 5 4 3 2017-01-24 02:41:00 2017-01-27 22:15:00
# 6: 6 4 3 2017-01-24 05:31:00 2017-01-26 15:35:00
# 7: 7 4 3 2017-01-28 04:26:00 2017-01-28 09:25:00
# 8: 8 5 4 2017-12-01 01:31:00 2017-12-05 18:33:00
# 9: 9 5 4 2017-12-01 01:31:00 2017-12-04 16:41:00

For loop generating months between dates in R

I have a data frame , it has three columns employid , start date(ydm) and end date(ydm). my objective was to create another data frame which has two columns, one is employee ID and the other one is date. Second data frame would be built around first Data frame such that it will take ids from the first data frame, and the column date will take all the months between Start Date and end date of that employee. In simple words , i would expand the data in first data frame by months according to the employee start date and end date.
I actually successfully created the code, using for loop. Problem is, it is very slower, and some where I read that one is to avoid loops in r. is there a way that can do the same in a much quicker way ?
an example of my data frame and code is below:
# Creating Data frame
a<- data.frame(employeeid =c('a','b','c'), StartDate= c('2018-1-1','2018-1-5','2018-11-2'),
EndDate= c('2018-1-3','2018-1-9','2018-1-8'), stringsAsFactors = F)
a$StartDate <- ydm(a$StartDate)
a$EndDate <- ydm(a$EndDate)
#second empty data frame
a1 <-a
a1 <- a1[0,1:2]
#my code starts
r <- 1
r.1 <- 1
for (id in a$employeeid) {
#r.1 <- 1
for ( i in format(seq(a[r,2],a[r,3],by="month"), "%Y-%m-%d") ) {
a1[r.1,1] <- a[r,1]
a1[r.1,2] <- i
r.1 <- r.1 +1
}
r <- r+1
}
This results in this :
I want the same result, but a bit quicker
Almost a one-liner with tidyverse:
> result
# A tibble: 12 x 2
employeeid date
<chr> <date>
1 a 2018-01-01
2 a 2018-02-01
3 a 2018-03-01
4 b 2018-05-01
5 b 2018-06-01
6 b 2018-07-01
7 b 2018-08-01
8 b 2018-09-01
9 c 2018-11-01
10 c 2018-12-01
11 c 2019-01-01
12 c 2019-02-01
Code
result <- df %>%
group_by(employeeid) %>%
summarise(date = list(seq(StartDate,
EndDate,
by = "month"))) %>%
unnest()
Data
library(tidyverse)
library(lubridate)
df <- data.frame(employeeid = c('a', 'b', 'c'),
StartDate = ymd(c('2018-1-1', '2018-5-1', '2018-11-1')),
EndDate = ymd(c('2018-3-1', '2018-9-1', '2019-02-1')),
stringsAsFactors = FALSE)
I'd try to solve this with by using apply and a custom function, that calculates the difference of end and start.
Im not sure how your desired output looks like, but in the function of the following example all month in between start and end are pasted in a string.
library(lubridate)
# Creating Data frame
a<- data.frame(employeeid =c('a','b','c'), StartDate= c('2018-1-1','2018-1-5','2018-11-2'),
EndDate= c('2018-2-3','2019-1-9','2020-1-8'), stringsAsFactors = F)
a$StartDate <- ymd(a$StartDate)
a$EndDate <- ymd(a$EndDate)
# create month-name month nummeric value mapping
month_names = month.abb[1:12]
month_dif = function(dates) # function to calc the dif. it expects a 2 units vector to be passed over
{
start = dates[1] # first unit of the vector is expected to be the start date
end = dates[2] # second unit is expected to be the end date
start_month = month(start)
end_month = month(end)
start_year = year(start)
end_year = year(end)
year_dif = end_year - start_year
if(year_dif == 0){ #if start and end both are in the same year month is start till end
return(paste(month_names[start_month:end_month], collapse= ", " ))
} else { #if there is an overlap, mont is start till dezember and jan till end (with x full year in between)
paste(c(month_names[start_month:12],
rep(month_names, year_dif-1),
month_names[1:end_month]), collapse = ", ")
}
}
apply(a[2:3], 1, month_dif)
output:
> apply(a[2:3], 1, month_dif)
[1] "Jan, Feb"
[2] "Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec, Jan"
[3] "Nov, Dec, Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec, Jan"
You can use a combination of apply and do.call:
out_apply_list <- apply(X=a, MARGIN=1,
FUN=function(x) {
data.frame(id= x[1],
date=seq(from = as.Date(x[2], "%Y-%d-%m"),
to = as.Date(x[3], "%Y-%d-%m"),
by = "month"),
row.names = NULL)
})
df <- do.call(what = rbind, args = out_apply_list)
which gives you the following output:
> df
id date
1 a 2018-01-01
2 a 2018-02-01
3 a 2018-03-01
4 b 2018-05-01
5 b 2018-06-01
6 b 2018-07-01
7 b 2018-08-01
8 b 2018-09-01
9 c 2018-02-11
10 c 2018-03-11
11 c 2018-04-11
12 c 2018-05-11
13 c 2018-06-11
14 c 2018-07-11
For the sake of completeness, here is a concise one-line with data.table:
library(data.table)
setDT(a)[, .(StartDate = seq(StartDate, EndDate, by = "month")), by = employeeid]
employeeid StartDate
1: a 2018-01-01
2: a 2018-02-01
3: a 2018-03-01
4: b 2018-05-01
5: b 2018-06-01
6: b 2018-07-01
7: b 2018-08-01
8: b 2018-09-01
9: c 2018-02-11
10: c 2018-03-11
11: c 2018-04-11
12: c 2018-05-11
13: c 2018-06-11
14: c 2018-07-11

Build datetime column in R

I have 2 columns
one is date :
2011-04-13
2013-07-29
2010-11-23
the other is time :
3
22
15
I want to make a new column contains date time
it will be like this
2011-04-13 3:00:00
2013-07-29 22:00:00
2010-11-23 15:00:00
I managed to combine them as string
but when i convert them to datetime i get only date the time disappears
any idea how to get date and time in one column?
my script
data <- read.csv("d:\\__r\\hour.csv")
data$date <- as.POSIXct(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "))
as example you can use ymd_hm function from lubridate:
a <- c("2014-09-08", "2014-09-08", "2014-09-08")
b <- c(3, 4, 5)
library(lubridate)
library(tidyverse)
tibble(a, b) %>%
mutate(time = paste0(a, " ", b, "-0"),
time = ymd_hm(time))
output would be:
# A tibble: 3 x 3
a b time
<chr> <dbl> <dttm>
1 2014-09-08 3 2014-09-08 03:00:00
2 2014-09-08 4 2014-09-08 04:00:00
3 2014-09-08 5 2014-09-08 05:00:00
found this fixed the problem
data$date <- as.POSIXct(strptime(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "), "%Y-%m-%d %H:%M:%S"))

Complete list of date ranges to full year

I have the following example data frame:
Date_from <- c("2013-01-01","2013-05-10","2013-08-13","2013-11-19")
Date_to <- c("2013-05-07","2013-08-12","2013-11-18","2013-12-25")
y <- data.frame(Date_from,Date_to)
y$concentration <- c("1.5","2.5","1.5","3.5")
y$Date_from <- as.Date(y$Date_from)
y$Date_to <- as.Date(y$Date_to)
y$concentration <- as.numeric(y$concentration)
I use the following code to detect gaps in date ranges and add the missing date ranges into the data frame and asign NA to the missing concentration:
adding<-data.frame(Date_from=y$Date_to[-nrow(y)]+1,Date_to=y$Date_from[-1]-1,concentration=NA)
adding<-adding[ adding$Date_from<adding$Date_to,]
res<-rbind(y,adding)
res[order(res$Date_from),]
This results in:
Date_from Date_to concentration
2013-01-01 2013-05-07 1.5
2013-05-08 2013-05-09 NA
2013-05-10 2013-08-12 2.5
2013-08-13 2013-11-18 1.5
2013-11-19 2013-12-25 3.5
The problem now is that the data frame ends at 2013-12-25 and not 2013-12-31. How can I do the following:
Detect the ending date of the last date range in the data frame, e.g. 2013-12-25
Add one additional line and calculate new date range up to the last day of the year and add NA for concentration
The results should look like this:
Date_from Date_to concentration
2013-01-01 2013-05-07 1.5
2013-05-08 2013-05-09 NA
2013-05-10 2013-08-12 2.5
2013-08-13 2013-11-18 1.5
2013-11-19 2013-12-25 3.5
2013-12-26 2013-12-31 NA
Don't you just want this?
df <- read.table(text = "
Date_from Date_to concentration
2013-01-01 2013-05-07 1.5
2013-05-08 2013-05-09 NA
2013-05-10 2013-08-12 2.5
2013-08-13 2013-11-18 1.5
2013-11-19 2013-12-25 3.5", h = T, stringsAsFactors = F)
rbind(df, c(as.character(max(as.Date(df$Date_to))+1), paste0(substr(max(as.Date(df$Date_to)), 1, 4),"-12-31") , NA))
Date_from Date_to concentration
1 2013-01-01 2013-05-07 1.5
2 2013-05-08 2013-05-09 <NA>
3 2013-05-10 2013-08-12 2.5
4 2013-08-13 2013-11-18 1.5
5 2013-11-19 2013-12-25 3.5
6 2013-12-26 2013-12-31 <NA>
You can use this explicit function
date_order<-function(dt){
for(i in 1:(nrow(dt)-1)){
if(dt[[1]][i+1] - dt[[2]][i] > 1){
pre<-dt[[2]][i] + 1
post<-dt[[1]][(i+1)] - 1
add<-data.frame("Date_from" = pre,"Date_to" = post,"concentration" = NA)
dt2<-rbind.data.frame(dt,add)
}
}
if(exists("dt2") == F){
dt2<-dt
}
dt2<-dt2[order(dt2$Date_from),]
yr<-substr(x = dt[[2]][nrow(dt)],start = 1,stop = 4)
last_day<-as.Date(paste(yr,"12-31",sep = "-"),format = "%Y-%m-%d")
if(dt[[2]][nrow(dt)] != last_day){
add2<-data.frame("Date_from" = dt[[2]][nrow(dt)] + 1,"Date_to" = last_day,"concentration" = NA)
dt2<-rbind.data.frame(dt2,add2)
}
return(dt2)
}
Using this function with your data returns this:
> date_order(y)
Date_from Date_to concentration
1 2013-01-01 2013-05-07 1.5
5 2013-05-08 2013-05-09 NA
2 2013-05-10 2013-08-12 2.5
3 2013-08-13 2013-11-18 1.5
4 2013-11-19 2013-12-25 3.5
11 2013-12-26 2013-12-31 NA
Hope that's what you were looking for.
My suggestion is to join y with a dataframe that contains all possible periods (either explicitely given or the "remainder") within the year. The solution below is using data.table syntax and the floor_date() and ceiling_date() functions from the lubridate package. This ensures that the solution will work even if the given periods span multiple years.
library(data.table)
library(magrittr)
# coerce character dates to numeric dates
cols <- c("Date_from", "Date_to")
setDT(y, key = cols)[, (cols) := lapply(.SD, as.IDate), .SDcols = cols]
# create sequence of starting points of all periods
breaks <- y[, c(Date_from, Date_to + 1L)] %>%
# append start and end of year
c(lubridate::floor_date(min(.), "year"),
lubridate:: ceiling_date(max(.), "year")) %>%
sort() %>%
unique() %T>%
print()
[1] "2013-01-01" "2013-05-08" "2013-05-10" "2013-08-13" "2013-11-19" "2013-12-26" "2014-01-01"
# create periods
x <- data.table(from = head(breaks, -1L), to = tail(breaks, -1L) - 1L,
key = c("from", "to"))
x
from to
1: 2013-01-01 2013-05-07
2: 2013-05-08 2013-05-09
3: 2013-05-10 2013-08-12
4: 2013-08-13 2013-11-18
5: 2013-11-19 2013-12-25
6: 2013-12-26 2013-12-31
# right join to create the expected result
y[x]
Date_from Date_to concentration
1: 2013-01-01 2013-05-07 1.5
2: 2013-05-08 2013-05-09 NA
3: 2013-05-10 2013-08-12 2.5
4: 2013-08-13 2013-11-18 1.5
5: 2013-11-19 2013-12-25 3.5
6: 2013-12-26 2013-12-31 NA

R: sequence of days between dates

I have the following dataframes:
AllDays
2012-01-01
2012-01-02
2012-01-03
...
2015-08-18
Leases
StartDate EndDate
2012-01-01 2013-01-01
2012-05-07 2013-05-06
2013-09-05 2013-12-01
What I want to do is, for each date in the allDays dataframe, calculate the number of leases that are in effect. e.g. if there are 4 leases with start date <= 2015-01-01 and end date >= 2015-01-01, then I would like to place a 4 in that dataframe.
I have the following code
for (i in 1:nrow(leases))
{
occupied = seq(leases$StartDate[i],leases$EndDate[i],by="days")
occupied = occupied[occupied < dateOfInt]
matching = match(occupied,allDays$Date)
allDays$Occupancy[matching] = allDays$Occupancy[matching] + 1
}
which works, but as I have about 5000 leases, it takes about 1.1 seconds. Does anyone have a more efficient method that would require less computation time?
Date of interest is just the current date and is used simply to ensure that it doesn't count lease dates in the future.
Using seq is almost surely inefficient--imagine you had a lease in your data that's 10000 years long. seq will take forever and return 10000*365-1 days that don't matter to us. We then have to use %in% which also makes the same number of unnecessary comparisons.
I'm not sure the following is the best approach (I'm convinced there's a fully vectorized solution) but it gets closer to the heart of the problem.
Data
set.seed(102349)
days<-data.frame(AllDays=seq(as.Date("2012-01-01"),
as.Date("2015-08-18"),"day"))
leases<-data.frame(StartDate=sample(days$AllDays,5000L,T))
leases$EndDate<-leases$StartDate+round(rnorm(5000,mean=365,sd=100))
Approach
Use data.table and sapply:
library(data.table)
setDT(leases); setDT(days)
days[,lease_count:=
sapply(AllDays,function(x)
leases[StartDate<=x&EndDate>=x,.N])][]
AllDays lease_count
1: 2012-01-01 5
2: 2012-01-02 8
3: 2012-01-03 11
4: 2012-01-04 16
5: 2012-01-05 18
---
1322: 2015-08-14 1358
1323: 2015-08-15 1358
1324: 2015-08-16 1360
1325: 2015-08-17 1363
1326: 2015-08-18 1359
This is exactly the problem where foverlaps shines: subsetting a data.frame based upon another data.frame (foverlaps seems to be tailored for that purpose).
Based on #MichaelChirico's data.
setkey(days[, AllDays1:=AllDays,], AllDays, AllDays1)
setkey(leases, StartDate, EndDate)
foverlaps(leases, days)[, .(lease_count=.N), AllDays]
# user system elapsed
# 0.114 0.018 0.136
# #MichaelChirico's approach
# user system elapsed
# 0.909 0.000 0.907
Here is a brief explanation on how it works by #Arun, which got me started with the data.table.
Without your data, I can't test whether or not this is faster, but it gets the job done with less code:
for (i in 1:nrow(AllDays)) AllDays$tally[i] = sum(AllDays$AllDays[i] >= Leases$Start.Date & AllDays$AllDays[i] <= Leases$End.Date)
I used the following to test it; note that the relevant columns in both data frames are formatted as dates:
AllDays = data.frame(AllDays = seq(from=as.Date("2012-01-01"), to=as.Date("2015-08-18"), by=1))
Leases = data.frame(Start.Date = as.Date(c("2013-01-01", "2012-08-20", "2014-06-01")), End.Date = as.Date(c("2013-12-31", "2014-12-31", "2015-05-31")))
An alternative approach, but I'm not sure it's faster.
library(lubridate)
library(dplyr)
AllDays = data.frame(dates = c("2012-02-01","2012-03-02","2012-04-03"))
Lease = data.frame(start = c("2012-01-03","2012-03-01","2012-04-02"),
end = c("2012-02-05","2012-04-15","2012-07-11"))
# transform to dates
AllDays$dates = ymd(AllDays$dates)
Lease$start = ymd(Lease$start)
Lease$end = ymd(Lease$end)
# create the range id
Lease$id = 1:nrow(Lease)
AllDays
# dates
# 1 2012-02-01
# 2 2012-03-02
# 3 2012-04-03
Lease
# start end id
# 1 2012-01-03 2012-02-05 1
# 2 2012-03-01 2012-04-15 2
# 3 2012-04-02 2012-07-11 3
data.frame(expand.grid(AllDays$dates,Lease$id)) %>% # create combinations of dates and ranges
select(dates=Var1, id=Var2) %>%
inner_join(Lease, by="id") %>% # join information
rowwise %>%
do(data.frame(dates=.$dates,
flag = ifelse(.$dates %in% seq(.$start,.$end,by="1 day"),1,0))) %>% # create ranges and check if the date is in there
ungroup %>%
group_by(dates) %>%
summarise(N=sum(flag))
# dates N
# 1 2012-02-01 1
# 2 2012-03-02 1
# 3 2012-04-03 2
Try the lubridate package. Create an interval for each lease. Then count the lease intervals which each date falls in.
# make some data
AllDays <- data.frame("Days" = seq.Date(as.Date("2012-01-01"), as.Date("2012-02-01"), by = 1))
Leases <- data.frame("StartDate" = as.Date(c("2012-01-01", "2012-01-08")),
"EndDate" = as.Date(c("2012-01-10", "2012-01-21")))
library(lubridate)
x <- new_interval(Leases$StartDate, Leases$EndDate, tzone = "UTC")
AllDays$NumberInEffect <- sapply(AllDays$Days, function(a){sum(a %within% x)})
The Output
head(AllDays)
Days NumberInEffect
1 2012-01-01 1
2 2012-01-02 1
3 2012-01-03 1
4 2012-01-04 1
5 2012-01-05 1
6 2012-01-06 1

Resources