I need to remove rows with overlapping dates and keep the maximum x value among the overlapping dates. Here is a data frame:
data.frame(time_left = c("2011-08-05",
"2011-07-25",
"2017-08-20",
"2017-08-20",
"2017-10-09",
"2019-06-01"),
time_right= c("2011-09-14",
"2011-09-01",
"2017-09-12",
"2017-09-26",
"2017-10-15",
"2019-11-05"),
x = c(114,20,10,1,5,100) ) -> df
so my input is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
3 2017-08-20 2017-09-12 10
4 2017-08-20 2017-09-26 1
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
and my desired output is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
4 2017-08-20 2017-09-26 10
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
I appreciate any help.
@Maël brought this issue to my attention over on the ivs issue page https://github.com/DavisVaughan/ivs/issues/20.
I think this can be very elegantly and efficiently solved with ivs, but it is a bit hard to come up with the solution, so I'll probably add a helper to do this more easily.
This solution works with "recursive" containers too, i.e. where range A contains range B, but then range C also contains range A, so you really only want to list range C. I've described this in more detail with examples here https://github.com/DavisVaughan/ivs/issues/20#issuecomment-1234479783.
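Before the full solution, here is a quick toy illustration (my own example, not from the linked issue) of that recursive case:
library(ivs)
# Three nested ranges: [0, 5) contains [1, 4), which contains [2, 3),
# so only the outermost range should survive.
x <- iv(c(0, 1, 2), c(5, 4, 3))
iv_locate_overlaps(x, x, type = "contains")
# needle 1 pairs with haystacks 1, 2, and 3; needles 2 and 3 are themselves
# contained by needle 1, so only range 1 is a "top" container.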
library(ivs)
library(dplyr)
library(vctrs)
df <- tibble(
time_left = as.Date(c(
"2011-08-05", "2011-07-25", "2017-08-20",
"2017-08-20", "2017-10-09", "2019-06-01"
)),
time_right = as.Date(c(
"2011-09-14", "2011-09-01", "2017-09-12",
"2017-09-26", "2017-10-15", "2019-11-05"
)),
x = c(114, 20, 10, 1, 5, 100)
)
df <- df %>%
mutate(range = iv(time_left, time_right), .keep = "unused")
df
#> # A tibble: 6 × 2
#> x range
#> <dbl> <iv<date>>
#> 1 114 [2011-08-05, 2011-09-14)
#> 2 20 [2011-07-25, 2011-09-01)
#> 3 10 [2017-08-20, 2017-09-12)
#> 4 1 [2017-08-20, 2017-09-26)
#> 5 5 [2017-10-09, 2017-10-15)
#> 6 100 [2019-06-01, 2019-11-05)
iv_locate_max_containment <- function(x) {
# Find all locations where the range "contains" any other range
# (including itself)
locs <- iv_locate_overlaps(x, x, type = "contains")
# Find the "top" ranges, i.e. the containers that aren't contained
# by any other containers
top <- !vec_duplicate_detect(locs$haystack)
top <- vec_slice(locs$haystack, top)
top <- vec_in(locs$needles, top)
locs <- vec_slice(locs, top)
locs
}
# i.e. row 4 "contains" rows 3 and 4
locs <- iv_locate_max_containment(df$range)
locs
#> needles haystack
#> 1 1 1
#> 2 2 2
#> 3 4 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
iv_align(df$range, df$x, locations = locs) %>%
rename(range = needles) %>%
group_by(range) %>%
summarise(x = max(haystack))
#> # A tibble: 5 × 2
#> range x
#> <iv<date>> <dbl>
#> 1 [2011-07-25, 2011-09-01) 20
#> 2 [2011-08-05, 2011-09-14) 114
#> 3 [2017-08-20, 2017-09-26) 10
#> 4 [2017-10-09, 2017-10-15) 5
#> 5 [2019-06-01, 2019-11-05) 100
Created on 2022-09-01 with reprex v2.0.2
This may be a little verbose; however, it could also be a solution:
First we identify the observations that potentially overlap.
Then we group the similar ones together.
In each group we choose the minimum time_left and the maximum time_right and x (the %within% test this relies on is illustrated just below).
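The key test in the pipeline is lubridate's %within%, which for two intervals checks whether the first lies entirely inside the second; a quick illustration:
library(lubridate)
a <- interval(ymd("2017-08-20"), ymd("2017-09-12"))
b <- interval(ymd("2017-08-20"), ymd("2017-09-26"))
a %within% b # TRUE: a lies entirely inside b
b %within% a # FALSE: b extends past the end of a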
library(tidyverse)
library(lubridate) # for ymd(), interval(), and %within% (not attached by older tidyverse versions)
df %>%
mutate(across(starts_with('time'), ymd),
intv = interval(time_left, time_right),
id = row_number()) %>%
mutate(id2 = map2(intv, id, ~ if (any(.x %within% intv[intv != .x])) {
id[which(.x %within% intv[intv != .x]) + 1]
} else {
.y
})) %>%
group_by(id2) %>%
summarise(time_left = min(time_left),
across(c(time_right, x), max)) %>%
select(!(id2))
# A tibble: 4 × 3
time_left time_right x
<date> <date> <dbl>
1 2011-08-05 2011-09-14 114
2 2017-08-20 2017-09-26 10
3 2017-10-09 2017-10-15 5
4 2019-06-01 2019-11-05 100
I combined Anoushiravan's solution with this
How do I determine in R if a date interval overlaps another date interval for the same individual in a data frame?
and I think it is working now.
df %>%
mutate(id = row_number(), days = as.numeric(as.Date(time_right) - as.Date(time_left)) ) %>%
mutate(Int = interval(time_left, time_right),
within = map(seq_along(Int), function(x){
y = setdiff(seq_along(Int), x)
if(any(id[which((Int[x] %within% Int[y]))+1])){
return(id[days == max(days[which((Int[x] %within% Int[y]))+1])])
}else{ return(0)}
})
) %>%
mutate(within = ifelse(within > 0 , within, id)) %>%
group_by(within) %>%
summarise(time_left = min(time_left), time_right = max(time_right), x = max(x)) %>%
select(!within)
But it still has some bugs. For the following df, this code will not work unless I change the order of the records.
df = data.frame(time_left = c("2014-01-01", "2014-01-01", "2014-12-01", "2014-12-26"),
time_right = c("2014-04-23", "2014-12-31", "2014-12-31", "2014-12-31"),
x = c(10,100,200,20))
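For what it's worth, the ivs approach from the answer above is order-independent. A related sketch that merges any overlapping ranges into their union with iv_identify_group() and then takes the maximum x per merged group (note this is union-merging, not the strict containment logic above, so it may differ when ranges overlap without containing each other):
library(ivs)
library(dplyr)
df %>%
  mutate(range = iv(as.Date(time_left), as.Date(time_right)), .keep = "unused") %>%
  # replace each range with the merged group of overlapping ranges it belongs to
  mutate(range = iv_identify_group(range)) %>%
  group_by(range) %>%
  summarise(x = max(x))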
I have a data frame of observations with a start and end date for each observation indicating the period it was active.
The duration active varies by observation, and can spread across multiple weeks.
Some observations are still active and do not have an end date.
For a given date range, how can I count the number of observations that were active during a week within that date range, including those still active?
I have a crude method that works, but is pretty slow. It seems like there has to be a more efficient and simpler way to do this.
EDIT: My first approach was similar to Ronak's solution, which is definitely better than mine for smaller data sets, but my real data set has more observations and longer date ranges, so I run into memory constraints.
#I'm primarily using tidyverse/lubridate, but definitely open to other solutions.
library(tidyverse)
library(lubridate)
# sample data frame of observations with start and end dates:
df_obs <- tibble(
observation = c(1:10),
date_start = as_date(c("2020-03-17", "2020-01-20", "2020-02-06", "2020-01-04", "2020-01-06", "2020-01-24", "2020-01-09", "2020-02-11", "2020-03-13", "2020-02-07")),
date_end = as_date(c("2020-03-27", "2020-03-20", NA, "2020-03-04", "2020-01-16", "2020-02-24", NA, "2020-02-19", NA, "2020-02-27"))
)
# to account for observations that are still active, NAs are converted to today's date:
df_obs <- mutate(df_obs, date_end = if_else(is.na(date_end), Sys.Date(), date_end))
# create a data frame of weeks by start and end date to count the active observations in a given week
# for this example I'm just using date ranges from the sample data:
df_weeks <-
seq(min(df_obs$date_start), max(df_obs$date_start), by = 'day') %>%
enframe(NULL, 'week_start') %>%
mutate(week_start = as_date(cut(week_start, "week"))) %>%
mutate(week_end = week_start + 6) %>%
distinct()
# create a function that filters the observations data frame based on start and end dates:
check_active <- function(d, s, e){
d %>%
filter(date_start <= e) %>%
filter(date_end >= s) %>%
nrow()
}
# applying that function to each week in the date range data frame gives the expected results:
df_weeks %>%
rowwise() %>%
mutate(total_active = check_active(df_obs, week_start, week_end)) %>%
select(-week_end) %>%
ungroup()
# A tibble: 12 x 2
week_start total_active
<date> <int>
1 2019-12-30 1
2 2020-01-06 3
3 2020-01-13 3
4 2020-01-20 4
5 2020-01-27 4
6 2020-02-03 6
7 2020-02-10 7
8 2020-02-17 7
9 2020-02-24 6
10 2020-03-02 4
11 2020-03-09 4
12 2020-03-16 5
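As an aside, one way to speed up the crude method without expanding anything is to vectorize check_active(): each week becomes a single logical comparison over all observations instead of a rowwise filter. A sketch reusing df_obs and df_weeks from above:
library(purrr)
df_weeks %>%
  mutate(total_active = map2_int(
    week_start, week_end,
    # an observation is active in the week iff it starts on or before the
    # week's end and ends on or after the week's start
    ~ sum(df_obs$date_start <= .y & df_obs$date_end >= .x)
  )) %>%
  select(-week_end)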
Here is one way:
library(tidyverse)
df_obs %>%
#Replace NA with today's date
#Create sequence between start and end date
mutate(date_end = replace(date_end, is.na(date_end), Sys.Date()),
date = map2(date_start, date_end, seq, "day")) %>%
#Get data in long format
unnest(date) %>%
#Drop start and end date columns
select(-date_start, -date_end) %>%
#Cut data by week
mutate(date = cut(date, "week")) %>%
#Get unique rows for observation and date
distinct(observation, date) %>%
#Count number of observation in each week
count(date)
which returns:
# A tibble: 14 x 2
# value n
# <fct> <int>
# 1 2019-12-30 1
# 2 2020-01-06 3
# 3 2020-01-13 3
# 4 2020-01-20 4
# 5 2020-01-27 4
# 6 2020-02-03 6
# 7 2020-02-10 7
# 8 2020-02-17 7
# 9 2020-02-24 6
#10 2020-03-02 4
#11 2020-03-09 4
#12 2020-03-16 5
#13 2020-03-23 4
#14 2020-03-30 3
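Given the memory constraint mentioned in the question's EDIT, an interval-overlap join that never expands to one row per day may also be worth trying; a sketch with data.table's foverlaps(), assuming df_obs (with NAs already replaced) and the df_weeks helper from the question:
library(data.table)
obs <- as.data.table(df_obs)
setkey(obs, date_start, date_end)   # foverlaps() requires y keyed on its interval columns
weeks <- as.data.table(df_weeks)    # still has both week_start and week_end
ov <- foverlaps(weeks, obs, by.x = c("week_start", "week_end"), type = "any")
# weeks with no match get NA observation rows, so count only real matches
ov[, .(total_active = sum(!is.na(observation))), by = week_start]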
When dealing with time series problems in R, I sometimes have multiple observations for one timestamp. How can I replace the value for that timestamp with the mean of all the observations for it, and delete all the duplicated timestamp rows?
For example, I have a time series like this:
Date = {2016-3-1, 2016-4-1, 2016-4-1, 2016-4-1, 2016-5-1, 2015-5-1, 2017-6-1}.
Price={111,122,124,155,142,177,134}
My expected results are like this:
Date={2016-3-1,2016-4-1,2016-5-1,2017-6-1},
Price={111,133.67,159.5,134 }
(133.67=mean(122,124,155), 159.5=mean(142,177))
Is this what you're after?
Date=c("2016-3-1", "2016-4-1", "2016-4-1", "2016-4-1","2016-5-1","2015-5-1", "2017-6-1")
Price=c(111,122,124,155,142,177,134)
library(dplyr)
library(lubridate)
df <- data.frame(Date = ymd(Date), Price = Price)
df %>%
group_by(ymd(Date)) %>%
summarise(mean = sprintf("%0.2f",mean(Price, na.rm = TRUE)))
# # A tibble: 5 x 2
# `ymd(Date)` mean
# <date> <chr>
# 1 2015-05-01 177.00
# 2 2016-03-01 111.00
# 3 2016-04-01 133.67
# 4 2016-05-01 142.00
# 5 2017-06-01 134.00
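If Price should stay numeric rather than the formatted character column produced by sprintf(), the formatting step can simply be dropped (grouping by Date directly also works, since Date is already a Date column):
df %>%
  group_by(Date) %>%
  summarise(Price = mean(Price, na.rm = TRUE))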
If you want base R, this also works:
print(aggregate(list(Mean=df$Price), list(Date = df$Date), mean), digits = 5)
# Date Mean
# 1 2015-05-01 177.00
# 2 2016-03-01 111.00
# 3 2016-04-01 133.67
# 4 2016-05-01 142.00
# 5 2017-06-01 134.00
Given a table
id start end
1 22/03/2016 05/06/2016
2 17/08/2016 29/08/2016
3 22/09/2017 25/12/2017
I'm trying to split by calendar month, as in the following table:
id start end
1 22/03/2016 31/03/2016
1 01/04/2016 30/04/2016
1 01/05/2016 05/06/2016
2 17/08/2016 29/08/2016
3 22/09/2017 30/09/2017
3 01/10/2017 31/10/2017
3 01/11/2017 30/11/2017
3 01/12/2017 25/12/2017
I'm trying to modify a code extract from how to split rows of a dataframe in multiple rows based on start date and end date?, but I have not been able to modify the code correctly. The problem generally occurs in months with 30 days; it may be easy, but I am still not familiar with regular expressions.
library(dplyr)
library(lubridate) # for year() below
#sample data
df <- data.frame("starting_date" = as.Date(c("2016-03-22", "2016-08-17", "2017-09-12")),
"end_date" = as.Date(c("2016-06-05", "2016-08-29", "2017-12-25")),
col3=c('1','2', '3'))
df1 <- df[,1:2] %>%
rowwise() %>%
do(rbind(data.frame(matrix(as.character(c(
.$starting_date,
seq(.$starting_date, .$end_date, by=1)[grep("\\d{4}-\\d{2}-31|\\d{4}-\\d{2}-01", seq(.$starting_date, .$end_date, by=1))],
.$end_date)), ncol=2, byrow=T))
)
) %>%
data.frame() %>%
`colnames<-`(c("starting_date", "end_date")) %>%
mutate(starting_date= as.Date(starting_date, format= "%Y-%m-%d"),
end_date= as.Date(end_date, format= "%Y-%m-%d"))
#add temporary columns to the original and expanded date column dataframes
df$row_idx <- seq(1:nrow(df))
df$temp_col <- (year(df$end_date) - year(df$starting_date)) +1
df1 <- cbind(df1,row_idx = rep(df$row_idx,df$temp_col))
#join both dataframes to get the final result
final_df <- left_join(df1,df[,3:(ncol(df)-1)],by="row_idx") %>%
select(-row_idx)
final_df
If anyone knows how to modify the code, or a better way to do it, I will be very grateful.
We assume there is an error in the sample output in the question since the third row spans parts of two months and so should be split into two rows.
Define Seq which given one start and end Date variables produces a data.frame of start and end columns and then run it on each id using group_by:
library(dplyr)
library(zoo)
Seq <- function(start, end) {
ym <- seq(as.yearmon(start), as.yearmon(end), 1/12)
starts <- pmax(start, as.Date(ym, frac = 0))
ends <- pmin(end, as.Date(ym, frac = 1))
unique(data.frame(start = starts, end = ends))
}
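The frac argument is what pins each yearmon back to the first or last day of its month; a quick illustration:
library(zoo)
ym <- as.yearmon(as.Date("2016-03-22"))
as.Date(ym, frac = 0) # "2016-03-01": start of the month
as.Date(ym, frac = 1) # "2016-03-31": end of the month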
fmt <- "%d/%m/%Y"
DF %>%
mutate(start = as.Date(start, fmt), end = as.Date(end, fmt)) %>%
group_by(id) %>%
do(Seq(.$start, .$end)) %>%
ungroup
giving:
# A tibble: 9 x 3
id start end
<int> <date> <date>
1 1 2016-03-22 2016-03-31
2 1 2016-04-01 2016-04-30
3 1 2016-05-01 2016-05-31
4 1 2016-06-01 2016-06-05
5 2 2016-08-17 2016-08-29
6 3 2017-09-22 2017-09-30
7 3 2017-10-01 2017-10-31
8 3 2017-11-01 2017-11-30
9 3 2017-12-01 2017-12-25
Note
The input DF in reproducible form:
Lines <- "
id start end
1 22/03/2016 05/06/2016
2 17/08/2016 29/08/2016
3 22/09/2017 25/12/2017"
DF <- read.table(text = Lines, header = TRUE)
So there's probably a more elegant way to accomplish this, and I feel like I've seen similar questions but could not find a duplicate quickly, so here goes...
SETUP
library(tidyverse)
library(lubridate)
df <- data.frame(
id = c('1', '2', '3'),
starting_date = as.Date(c("2016-03-22", "2016-08-17", "2017-09-12")),
end_date = as.Date(c("2016-06-05", "2016-08-29", "2017-12-25")),
stringsAsFactors = FALSE
)
df
#> id starting_date end_date
#> 1 1 2016-03-22 2016-06-05
#> 2 2 2016-08-17 2016-08-29
#> 3 3 2017-09-12 2017-12-25
SOLUTION
df %>%
group_by(id) %>%
mutate(
date_seq = list(seq.Date(starting_date, end_date, by = "month") %>% ceiling_date("month") - 1)
) %>%
unnest() %>%
mutate(row = row_number()) %>%
mutate(
new_end_date = if_else(row == max(row), end_date, date_seq),
new_start_date = if_else(row == min(row), starting_date, floor_date(new_end_date, "month"))
) %>%
select(
id, new_start_date, new_end_date
)
#> # A tibble: 8 x 3
#> # Groups: id [3]
#> id new_start_date new_end_date
#> <chr> <date> <date>
#> 1 1 2016-03-22 2016-03-31
#> 2 1 2016-04-01 2016-04-30
#> 3 1 2016-06-01 2016-06-05
#> 4 2 2016-08-17 2016-08-29
#> 5 3 2017-09-12 2017-09-30
#> 6 3 2017-10-01 2017-10-31
#> 7 3 2017-11-01 2017-11-30
#> 8 3 2017-12-01 2017-12-25
EXPLANATION
Much of what's going on here takes place in the first mutate call which creates date_seq. To understand it, consider the following:
seq.Date(ymd("2016-03-22"), ymd("2016-06-05"), by = "month")
# [1] "2016-03-22" "2016-04-22" "2016-05-22"
seq.Date(ymd("2016-03-22"), ymd("2016-06-05"), by = "month") %>%
ceiling_date("month")
# [1] "2016-04-01" "2016-05-01" "2016-06-01"
seq.Date(ymd("2016-03-22"), ymd("2016-06-05"), by = "month") %>%
ceiling_date("month") - 1
# [1] "2016-03-31" "2016-04-30" "2016-05-31"
So basically, we create a sequence of "end-of-month" dates between the original start and end dates. Putting this in a list-column lets us organize by id so that we unnest appropriately. Check out the output after the unnest():
df %>%
group_by(id) %>%
mutate(
date_seq = list(seq.Date(starting_date, end_date, by = "month") %>% ceiling_date("month") - 1)
) %>%
unnest()
From there I hope things are relatively straightforward. The row_number() call probably could have been replaced with something fancier like first()/last(), but I thought this might be easier to follow.
I have 2 datasets in "R".
The first DB contains specific dates:
Value Date
# 20 2017-10-19
# 19 2017-10-23
# 19 2017-11-03
# 20 2017-11-10
And the second contains the level of a stock index from the last 5 years:
Date Index
# 2017-11-10 13.206,35
# 2017-11-03 13.378,96
# 2017-10-25 13.404,58
# 2017-10-19 13.517,98
Now I want to merge them by searching for the dates from the first dataset "DB" and adding the correct Index value for each date from the second dataset "Hist".
What I did is using the left_join function:
DB <- left_join(DB, Hist, by = "Date")
The problem is that some dates in the first dataset are public holidays for which no data is available in the second dataset "Hist", so I get some NAs.
Value Date Index
# 20 2017-10-19 13.517,98
# 19 2017-10-23 NA
# 19 2017-11-03 13.378,96
# 20 2017-11-10 13.206,35
What I'm looking for is to take the value of the next available date instead of adding NA.
Example: instead of adding NA, take the index of 2017-10-25 (2 days later):
Value Date Index
# 20 2017-10-19 13.517,98
# 19 2017-10-23 13.404,58
# 19 2017-11-03 13.378,96
# 20 2017-11-10 13.206,35
Does anybody have an idea? Thanks in advance!
Original Request
The following is an option. It uses full_join, and then the fill function to impute the missing value.
library(tidyverse)
DB_final <- DB %>%
full_join(Hist, by = "Date") %>%
arrange(Date) %>%
fill(Index, .direction = "up") %>%
filter(!is.na(Value))
DB_final
# Value Date Index
# 1 20 2017-10-19 13.517,98
# 2 19 2017-10-23 13.404,58
# 3 19 2017-11-03 13.378,96
# 4 20 2017-11-10 13.206,35
However, the user needs to know the fill direction (up or down) in advance, which may not always be the case.
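If the direction is unknown in advance, newer tidyr versions (>= 1.0) also accept .direction = "updown", which fills upward first and then falls back to downward, so every gap gets a value; a sketch:
library(tidyverse)
DB %>%
  full_join(Hist, by = "Date") %>%
  arrange(Date) %>%
  fill(Index, .direction = "updown") %>% # prefer the next available date, fall back to the previous
  filter(!is.na(Value))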
Impute Missing Value based on the Nearest Date
Here is another option, which I think is more robust. It will impute the missing value using the Index from the nearest date.
Step 1: Find the Nearest Date
# Collect all dates
Date_vec <- sort(unique(c(DB$Date, Hist$Date)))
# Create a distance matrix based on dates, then convert it to a data frame
dt <- Date_vec %>%
dist() %>%
as.matrix() %>%
as.data.frame() %>%
rowid_to_column(var = "ID") %>%
gather(ID2, Value, -ID) %>%
mutate(ID2 = as.integer(ID2)) %>%
filter(ID != ID2) %>%
arrange(ID, Value) %>%
group_by(ID) %>%
slice(1) %>%
select(-Value)
dt$ID <- Date_vec[dt$ID]
dt$ID2 <- Date_vec[dt$ID2]
names(dt) <- c("Date1", "Date2")
dt
# # A tibble: 5 x 2
# # Groups: ID [5]
# Date1 Date2
# <date> <date>
# 1 2017-10-19 2017-10-23
# 2 2017-10-23 2017-10-25
# 3 2017-10-25 2017-10-23
# 4 2017-11-03 2017-11-10
# 5 2017-11-10 2017-11-03
dt shows the nearest date for each of the dates.
Step 2: Perform multiple joins
Join DB and dt, and then join Hist twice based on different date columns.
DB2 <- DB %>% left_join(dt, by = c("Date" = "Date1"))
DB3 <- DB2 %>%
left_join(Hist, by = "Date") %>%
left_join(Hist, by = c("Date2" = "Date"))
DB3
# Value Date Date2 Index.x Index.y
# 1 20 2017-10-19 2017-10-23 13.517,98 <NA>
# 2 19 2017-10-23 2017-10-25 <NA> 13.404,58
# 3 19 2017-11-03 2017-11-10 13.378,96 13.206,35
# 4 20 2017-11-10 2017-11-03 13.206,35 13.378,96
Step 3: Finalize the Index
If there are values in Index.x, use that, otherwise, use the values in Index.y.
DB4 <- DB3 %>%
mutate(Index = ifelse(is.na(Index.x), Index.y, Index.x)) %>%
select(Value, Date, Index)
DB4
# Value Date Index
# 1 20 2017-10-19 13.517,98
# 2 19 2017-10-23 13.404,58
# 3 19 2017-11-03 13.378,96
# 4 20 2017-11-10 13.206,35
DB4 is the final output.
DATA
DB <- structure(list(Value = c(20L, 19L, 19L, 20L), Date = structure(c(17458,
17462, 17473, 17480), class = "Date")), class = "data.frame", .Names = c("Value",
"Date"), row.names = c(NA, -4L))
Hist <- structure(list(Date = structure(c(17480, 17473, 17464, 17458), class = "Date"),
Index = c("13.206,35", "13.378,96", "13.404,58", "13.517,98"
)), class = "data.frame", .Names = c("Date", "Index"), row.names = c(NA,
-4L))
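As a footnote, a data.table rolling join can express this kind of matching in one line; a sketch using the DB and Hist data frames from the DATA block above (roll = "nearest" picks the closest date, while roll = -Inf would pick the next available date, as originally requested):
library(data.table)
db <- as.data.table(DB)
hist_dt <- as.data.table(Hist)
# for each DB date, pull the Index from the nearest Hist date
hist_dt[db, on = "Date", roll = "nearest"]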
A solution could be:
library(dplyr)
library(rlang)
clean_df <- function(df) {
ix <- which(is.na(df$Index))
df$Index[ix] <- df$Index[ix + 1]
filter(df, !is.na(.data$Value))
}
full_join(DB, Hist) %>%
arrange(Date) %>%
clean_df()
What you have done, plus as.Date() to parse the dates:
library(data.table)
library(dplyr)
DB = data.table(
Value = c(20,19,19,29),
Date = c("2017-10-19","2017-10-23","2017-11-03","2017-11-10")
)
Hist = data.table(
Date = c("2017-11-10","2017-11-03","2017-10-25","2017-10-19"),
Index = c("13.206,35","13.378,96","13.404,58","13.517,98")
)
DB[, Date := as.Date(Date)]
Hist[, Date := as.Date(Date)]
DB <- left_join(DB,Hist,by="Date") %>% as.data.table()
Now perform the steps below:
# Get rows which are missing an Index.
DB_na <- DB[is.na(Index),]
DB <- DB[!is.na(Index),]
# Build function to find appropriate Index, given an na_date.
get_na_index <- function(na_date) {
bigger_dates = DB[Date>na_date,]
index = bigger_dates[which.min(Date - na_date), Index]
return(index)
}
# Use apply() to perform row-wise operation.
DB_na$Index <- apply(matrix(DB_na$Date), 1, get_na_index)
# Combine rows
DB <- rbind(DB, DB_na) %>% arrange(Date)
The Output:
DB
Value Date Index
1 20 2017-10-19 13.517,98
2 19 2017-10-23 13.378,96
3 19 2017-11-03 13.378,96
4 29 2017-11-10 13.206,35