Joining dates are changed to certain key in data.table - r

I have the following dataframes df and df_dates (dput below):
> df
group start end
1 A 2022-12-01 2022-12-04
2 A 2022-12-04 2022-12-07
3 A 2022-12-07 2022-12-10
4 A 2022-12-10 2022-12-13
5 A 2022-12-13 2022-12-16
6 A 2022-12-16 2022-12-19
7 B 2022-12-01 2022-12-04
8 B 2022-12-04 2022-12-07
9 B 2022-12-07 2022-12-10
10 B 2022-12-10 2022-12-13
11 B 2022-12-13 2022-12-16
12 B 2022-12-16 2022-12-19
> df_dates
group date value
1 A 2022-12-02 1
2 A 2022-12-14 3
3 B 2022-12-06 2
4 B 2022-12-13 4
I would like to join the rows of df_dates by group where the date column is between the column start and end of df. When I join these two dataframes, the dates of the date column return the same as the dates from the start column of df. Here is the code with output:
df <- data.frame(group = rep(c('A', 'B'), each = 6),
start = c(seq.Date(as.Date('2022-12-01'), as.Date('2022-12-16'), '3 days')),
end = c(seq.Date(as.Date('2022-12-04'), as.Date('2022-12-19'), '3 days')))
df_dates <- data.frame(group = c('A', 'A', 'B', 'B'),
date = as.Date(c('2022-12-02', '2022-12-14', '2022-12-06', '2022-12-13')),
value = c(1,3,2,4))
library(data.table)
setDT(df)
setDT(df_dates)
df_dates[df,
.(group, date, start, end, value),
on = .(group, date >= start, date <= end)]
#> group date start end value
#> 1: A 2022-12-01 2022-12-01 2022-12-04 1
#> 2: A 2022-12-04 2022-12-04 2022-12-07 NA
#> 3: A 2022-12-07 2022-12-07 2022-12-10 NA
#> 4: A 2022-12-10 2022-12-10 2022-12-13 NA
#> 5: A 2022-12-13 2022-12-13 2022-12-16 3
#> 6: A 2022-12-16 2022-12-16 2022-12-19 NA
#> 7: B 2022-12-01 2022-12-01 2022-12-04 NA
#> 8: B 2022-12-04 2022-12-04 2022-12-07 2
#> 9: B 2022-12-07 2022-12-07 2022-12-10 NA
#> 10: B 2022-12-10 2022-12-10 2022-12-13 4
#> 11: B 2022-12-13 2022-12-13 2022-12-16 4
#> 12: B 2022-12-16 2022-12-16 2022-12-19 NA
Created on 2022-12-12 with reprex v2.0.2
As you can see, the dates of the date column are now changed to the dates of the start column, while I want them to be the same as in df_dates dataframe. The desired output should look like this:
#> group date start end value
#> 1: A 2022-12-02 2022-12-01 2022-12-04 1
#> 2: A NA 2022-12-04 2022-12-07 NA
#> 3: A NA 2022-12-07 2022-12-10 NA
#> 4: A NA 2022-12-10 2022-12-13 NA
#> 5: A 2022-12-14 2022-12-13 2022-12-16 3
#> 6: A NA 2022-12-16 2022-12-19 NA
#> 7: B NA 2022-12-01 2022-12-04 NA
#> 8: B 2022-12-06 2022-12-04 2022-12-07 2
#> 9: B NA 2022-12-07 2022-12-10 NA
#> 10: B 2022-12-13 2022-12-10 2022-12-13 4
#> 11: B 2022-12-13 2022-12-13 2022-12-16 4
#> 12: B NA 2022-12-16 2022-12-19 NA
So I was wondering if anyone knows how to join these two dataframes in the right way using data.table?
dput of df and df_dates:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), start = structure(c(19327, 19330, 19333,
19336, 19339, 19342, 19327, 19330, 19333, 19336, 19339, 19342
), class = "Date"), end = structure(c(19330, 19333, 19336, 19339,
19342, 19345, 19330, 19333, 19336, 19339, 19342, 19345), class = "Date")), class = "data.frame", row.names = c(NA,
-12L))
df_dates <- structure(list(group = c("A", "A", "B", "B"), date = structure(c(19328,
19340, 19332, 19339), class = "Date"), value = c(1, 3, 2, 4)), class = "data.frame", row.names = c(NA,
-4L))

library(data.table)
setDT(df)
setDT(df_dates)
Just specifying you want to the original date (you can use prefix x. or i. (to reference df in this case) )
df_dates[df,
.(group, x.date, start, end, value),
on = .(group, date >= start, date <= end)]
Or modifying the original:
df[, c("date", "value") :=
df_dates[.SD, on = .(group, date >= start, date <= end), .(x.date, value)]]
# group start end date value
# <char> <Date> <Date> <Date> <num>
# 1: A 2022-12-01 2022-12-04 2022-12-02 1
# 2: A 2022-12-04 2022-12-07 <NA> NA
# 3: A 2022-12-07 2022-12-10 <NA> NA
# 4: A 2022-12-10 2022-12-13 <NA> NA
# 5: A 2022-12-13 2022-12-16 2022-12-14 3
# 6: A 2022-12-16 2022-12-19 <NA> NA
# 7: B 2022-12-01 2022-12-04 <NA> NA
# 8: B 2022-12-04 2022-12-07 2022-12-06 2
# 9: B 2022-12-07 2022-12-10 <NA> NA
# 10: B 2022-12-10 2022-12-13 2022-12-13 4
# 11: B 2022-12-13 2022-12-16 2022-12-13 4
# 12: B 2022-12-16 2022-12-19 <NA> NA

Related

Custom function: update old variables and create new variables/ non standard evaluation

I would use some help with a tidyverse solution for a function that I've custom written. I have a dataset with a binary phenotype and an associated diagnosis date, as well as 2 other dates I hope to use to update and create new variables.
I want to:
update the value of the supplied variables to NA if vardt < other_dt
generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt
Here's my go at a function; I know that it likely requires some non-standard evaluation techniques, so I've tried to use assign() and eval(substitute()) around the names to no avail. Any tips? Thanks in advance for the help.
# load lib
library(tidyverse)
library(lubridate)
rdate <- function(x,
min = paste0(format(Sys.Date(), '%Y'), '-01-01'),
max = paste0(format(Sys.Date(), '%Y'), '-12-31'),
sort = TRUE) {
dates <- sample(seq(as.Date(min), as.Date(max), by = "day"), x, replace = TRUE)
if (sort == TRUE) {
sort(dates)
} else {
dates
}
}
# set seed for reproducibility
set.seed(42)
# Beginning dataset
das <- data.frame(id = rep(letters[1:3], each = 5),
pheno = rbinom(n=15, size = 1, prob = 0.30),
pheno_dt = rdate(15),
baseline_dt = rdate(15),
other_dt = rdate(15))
update_pheno <- function(var, vardt){
outds <- das %>%
mutate(eval(substitute(var)) = ifelse(var == 1 & pheno_dt < other_dt, NA, var),
# update vardt to NA if var value is NA
vardt = ifelse(is.na(var), NA, vardt))
# create incidence variable based on nomenclature of variable
paste0(var, "_incid") = ifelse(var == 1 & vardt < baseline_dt, NA, var),
# create associated dt variable
paste0(var, "_incid_dt" = ifelse(is.na(paste0(var, "_incid")), NA, vardt)))
return(outds)
}
test <- update_pheno(var = pheno, vardt = phenodt)
Limitations, Assumptions, and Simplifications
# Since we're talking *tidyverse*, let's make this a tibble:
das <- as_tibble( das )
das
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a 0 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
Update the value of the supplied variables to NA if vardt < other_dt
# Do this directly:
das[ das$pheno_dt < das$other_dt , "pheno" ] <- NA
das
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
Generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt
# What are the names of these new variables?
potential_new_names <- paste0( das$id, '_incid' )
potential_new_names
[1] "a_incid" "a_incid" "a_incid" "a_incid" "a_incid" "b_incid" "b_incid" "b_incid" "b_incid" "b_incid" "c_incid"
[12] "c_incid" "c_incid" "c_incid" "c_incid"
# To which rows does this apply?
these_rows <- which( das$pheno_dt < das$baseline_dt )
these_rows
[1] 2 3 4 5 6 7 8 10 11 12 13
# Remove duplicates
new_value_variables <- unique( potential_new_names[ these_rows ] )
# Create corresponding date variables
new_date_variables <- paste0( new_value_variables, "_dt" )
# Combine value variables and date variables
new_column_names <- c( new_value_variables, new_date_variables )
new_column_names
[1] "a_incid" "b_incid" "c_incid" "a_incid_dt" "b_incid_dt" "c_incid_dt"
code_to_make_new_columns <- sprintf(
'das %%>%% mutate( %s )'
, paste0( new_column_names, "=NA", collapse="," )
)
code_to_make_new_columns
[1] "das %>% mutate( a_incid=NA,b_incid=NA,c_incid=NA,a_incid_dt=NA,b_incid_dt=NA,c_incid_dt=NA )"
new_das <- eval( parse( text = code_to_make_new_columns ))
new_das
# A tibble: 15 × 11
id pheno pheno_dt baseline_dt other_dt a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
<chr> <int> <date> <date> <date> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 a 1 2022-01-05 2022-01-04 2022-01-03 NA NA NA NA NA NA
2 a 1 2022-01-20 2022-04-19 2022-01-05 NA NA NA NA NA NA
3 a NA 2022-01-24 2022-05-16 2022-02-02 NA NA NA NA NA NA
4 a 1 2022-03-30 2022-05-26 2022-02-09 NA NA NA NA NA NA
5 a 0 2022-04-19 2022-06-07 2022-04-13 NA NA NA NA NA NA
6 b 0 2022-04-20 2022-07-16 2022-04-19 NA NA NA NA NA NA
7 b 1 2022-06-14 2022-08-03 2022-04-24 NA NA NA NA NA NA
8 b 0 2022-07-31 2022-08-14 2022-05-10 NA NA NA NA NA NA
9 b 0 2022-09-16 2022-09-02 2022-05-18 NA NA NA NA NA NA
10 b 1 2022-10-10 2022-10-19 2022-07-05 NA NA NA NA NA NA
11 c 0 2022-10-24 2022-10-26 2022-08-16 NA NA NA NA NA NA
12 c 1 2022-10-25 2022-11-10 2022-09-15 NA NA NA NA NA NA
13 c 1 2022-11-10 2022-11-20 2022-09-19 NA NA NA NA NA NA
14 c 0 2022-12-14 2022-12-14 2022-11-25 NA NA NA NA NA NA
15 c 0 2022-12-26 2022-12-21 2022-12-24 NA NA NA NA NA NA
Now update the values for the new variables
incident_value_columns <- grep( pattern = "incid$" , names( new_das ))
incident_date_columns <- grep( pattern = "incid_dt$", names( new_das ))
rows_to_update <- das$pheno_dt >= das$baseline_dt
new_das[ rows_to_update, incident_value_columns ] <- new_das[ rows_to_update, 'pheno' ]
new_das[ rows_to_update, incident_date_columns ] <- new_das[ rows_to_update, 'pheno_dt' ]
new_das
# A tibble: 15 × 11
id pheno pheno_dt baseline_dt other_dt a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
<chr> <int> <date> <date> <date> <int> <int> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03 1 1 1 2022-01-05 2022-01-05 2022-01-05
2 a 1 2022-01-20 2022-04-19 2022-01-05 NA NA NA NA NA NA
3 a NA 2022-01-24 2022-05-16 2022-02-02 NA NA NA NA NA NA
4 a 1 2022-03-30 2022-05-26 2022-02-09 NA NA NA NA NA NA
5 a 0 2022-04-19 2022-06-07 2022-04-13 NA NA NA NA NA NA
6 b 0 2022-04-20 2022-07-16 2022-04-19 NA NA NA NA NA NA
7 b 1 2022-06-14 2022-08-03 2022-04-24 NA NA NA NA NA NA
8 b 0 2022-07-31 2022-08-14 2022-05-10 NA NA NA NA NA NA
9 b 0 2022-09-16 2022-09-02 2022-05-18 0 0 0 2022-09-16 2022-09-16 2022-09-16
10 b 1 2022-10-10 2022-10-19 2022-07-05 NA NA NA NA NA NA
11 c 0 2022-10-24 2022-10-26 2022-08-16 NA NA NA NA NA NA
12 c 1 2022-10-25 2022-11-10 2022-09-15 NA NA NA NA NA NA
13 c 1 2022-11-10 2022-11-20 2022-09-19 NA NA NA NA NA NA
14 c 0 2022-12-14 2022-12-14 2022-11-25 0 0 0 2022-12-14 2022-12-14 2022-12-14
15 c 0 2022-12-26 2022-12-21 2022-12-24 0 0 0 2022-12-26 2022-12-26 2022-12-26
The non-standard-evaluation part
When you need to access something referenced by a combination of names and values of the parameters passed to a function, eval and sym can be used as follows:
example_within_a_function <- function(
the_data
, var_column_name
, var_datestamp_column_name
, baseline_column_name
, other_column_name
){
# Skip the first argument, which is the function, itself,
# and get all the rest of the arguments,
# which are passed parameters
arguments <- match.call()[ -1 ] %>% as.list
# Extract the value passed to each argument
values <- seq( arguments ) %>% map_chr( ~rlang::as_string( arguments[[.]] ))
# Return the names of the arguments, their values,
# the data table (using non-standard evaluation), and
# the data table (using a straight-forward reference).
list(
labels_within_function = names( arguments )
, labels_in_parent_env = values
, data = eval( sym( values[[ 1 ]] ))
, also_data = the_data
)
}
example_within_a_function(
the_data = das
, var_column_name = pheno
, var_datestamp_column_name = pheno_dt
, baseline_column_name = baseline_dt
, other_column_name = other_dt
)
$labels_within_function
[1] "the_data" "var_column_name" "var_datestamp_column_name" "baseline_column_name"
[5] "other_column_name"
$labels_in_parent_env
[1] "das" "pheno" "pheno_dt" "baseline_dt" "other_dt"
$data
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
$also_data
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24

Return first matched row using date lookup to a date range lookup table

I have 2 dataframes, one with list of dates (df1) and another one with date ranges by ID (df2). I would like to add a column from df2 if a date in df1 sits within the date range in df2.
## This is my base data
df1 <-
structure(list(Date = structure(c(18848, 18848, 18849, 18890,
18892, 18901, 18904, 18904, 18906, 18906, 18907, 18911, 18911,
18922, 18923, 18925, 18926, 18927, 18929), class = "Date"), Id = c(4,
6, 4, 6, 4, 4, 4, 6, 4, 6, 4, 4, 6, 4, 4, 4, 4, 6, 6)), row.names = c(NA,
19L), class = "data.frame")
## This is my date range table
df2 <- structure(list(Date.Start = structure(c(18898, 18897, 18848,
18898, 18897), class = "Date"), Date.End = structure(c(18924,
18924, 18903, 18924, 18924), class = "Date"), Id = c(6, 6, 4,
4, 4), Return.Value = c(1, 2, 3, 4, 5)), row.names = c(NA, 5L
), class = "data.frame")
So for the first row of df1, Date = '2021-08-09' sits within date range '2021-08-09' to '2021-10-03' (row 3 of df2) so I want to add a Return Value column to df1 with a value of 3.
There is no match for row 2 in df1 so this will return nothing.
My expected output is:
I tried this link Check if a date is in range of lookup table but I want to add a column from df2 instead of returning logical vector.
data.table version using the mult="first" option on a non-equi join allowing >= and <= comparisons:
library(data.table)
setDT(df1)
setDT(df2)
df1[, Return.Value := df2[
df1, on=c("Id", "Date.Start<=Date", "Date.End>=Date"), Return.Value, mult="first"]]
df1
# Date Id Return.Value
# 1: 2021-08-09 4 3
# 2: 2021-08-09 6 NA
# 3: 2021-08-10 4 3
# 4: 2021-09-20 6 NA
# 5: 2021-09-22 4 3
# 6: 2021-10-01 4 3
# 7: 2021-10-04 4 4
# 8: 2021-10-04 6 1
# 9: 2021-10-06 4 4
#10: 2021-10-06 6 1
#11: 2021-10-07 4 4
#12: 2021-10-11 4 4
#13: 2021-10-11 6 1
#14: 2021-10-22 4 4
#15: 2021-10-23 4 4
#16: 2021-10-25 4 NA
#17: 2021-10-26 4 NA
#18: 2021-10-27 6 NA
#19: 2021-10-29 6 NA
It's pretty messy...
library(dplyr)
df1 %>%
left_join(df2, by = 'Id') %>%
rowwise %>%
mutate(Return.Value = ifelse(between(Date, Date.Start, Date.End), Return.Value, NA)) %>%
select(-Date.Start, -Date.End) %>%
group_by(Date, Id) %>%
filter({if (sum(!is.na(Return.Value) >0)) !is.na(Return.Value) else row_number() == 1}) %>%
filter(row_number() == 1)
Date Id Return.Value
<date> <dbl> <dbl>
1 2021-08-09 4 3
2 2021-08-09 6 NA
3 2021-08-10 4 3
4 2021-09-20 6 NA
5 2021-09-22 4 3
6 2021-10-01 4 3
7 2021-10-04 4 4
8 2021-10-04 6 1
9 2021-10-06 4 4
10 2021-10-06 6 1
11 2021-10-07 4 4
12 2021-10-11 4 4
13 2021-10-11 6 1
14 2021-10-22 4 4
15 2021-10-23 4 4
16 2021-10-25 4 NA
17 2021-10-26 4 NA
18 2021-10-27 6 NA
19 2021-10-29 6 NA
We could do this also with match_fun() function from fuzzyjoin package:
library(dplyr)
library(fuzzyjoin)
fuzzy_left_join(
df1, df2,
by = c(
"Id" = "Id",
"Date" = "Date.Start",
"Date" = "Date.End"
),
match_fun = list(`==`, `>=`, `<=`)) %>%
group_by(Date, Id.x) %>%
slice(1) %>%
dplyr::select(Date, Id = Id.x, Return.Value)
Date Id Return.Value
<date> <dbl> <dbl>
1 2021-08-09 4 3
2 2021-08-09 6 NA
3 2021-08-10 4 3
4 2021-09-20 6 NA
5 2021-09-22 4 3
6 2021-10-01 4 3
7 2021-10-04 4 4
8 2021-10-04 6 1
9 2021-10-06 4 4
10 2021-10-06 6 1
11 2021-10-07 4 4
12 2021-10-11 4 4
13 2021-10-11 6 1
14 2021-10-22 4 4
15 2021-10-23 4 4
16 2021-10-25 4 NA
17 2021-10-26 4 NA
18 2021-10-27 6 NA
19 2021-10-29 6 NA

R: Create new variable based on date in other variable

I have a data frame that looks somewhat like this:
a = c(seq(as.Date("2020-08-01"), as.Date("2020-11-01"), by="months"), seq(as.Date("2021-08-01"), as.Date("2021-11-01"), by="months"),
seq(as.Date("2022-08-01"), as.Date("2022-11-01"), by="months"))
b = rep(LETTERS[1:3], each = 4)
df = data_frame(ID = b, Date = a)
> df
ID Date
<chr> <date>
1 A 2020-08-01
2 A 2020-09-01
3 A 2020-10-01
4 A 2020-11-01
5 B 2021-08-01
6 B 2021-09-01
7 B 2021-10-01
8 B 2021-11-01
9 C 2022-08-01
10 C 2022-09-01
11 C 2022-10-01
12 C 2022-11-01
And I want to create a new variable that replaces Date with the smallest value in Date for each ID, the resulting data frame should look like this:
c = c(rep(as.Date("2020-08-01"), each = 4), rep(as.Date("2021-08-01"), each = 4), rep(as.Date("2022-08-01"), each = 4))
df$NewDate = c
> df
# A tibble: 12 × 3
ID Date NewDate
<chr> <date> <date>
1 A 2020-08-01 2020-08-01
2 A 2020-09-01 2020-08-01
3 A 2020-10-01 2020-08-01
4 A 2020-11-01 2020-08-01
5 B 2021-08-01 2021-08-01
6 B 2021-09-01 2021-08-01
7 B 2021-10-01 2021-08-01
8 B 2021-11-01 2021-08-01
9 C 2022-08-01 2022-08-01
10 C 2022-09-01 2022-08-01
11 C 2022-10-01 2022-08-01
12 C 2022-11-01 2022-08-01
Can someone please help me do it? Thank you very much in advance.
Frist group, then mutate & min:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(NewDate = min(Date)) %>%
ungroup()
#> # A tibble: 12 × 3
#> ID Date NewDate
#> <chr> <date> <date>
#> 1 A 2020-08-01 2020-08-01
#> 2 A 2020-09-01 2020-08-01
#> 3 A 2020-10-01 2020-08-01
#> 4 A 2020-11-01 2020-08-01
#> 5 B 2021-08-01 2021-08-01
#> 6 B 2021-09-01 2021-08-01
#> 7 B 2021-10-01 2021-08-01
#> 8 B 2021-11-01 2021-08-01
#> 9 C 2022-08-01 2022-08-01
#> 10 C 2022-09-01 2022-08-01
#> 11 C 2022-10-01 2022-08-01
#> 12 C 2022-11-01 2022-08-01

How to search upwards a column for a value based on whether another column is NA or not?

I need to find the previous date for which value is not NA and then also use the value on that row. I have tried to use shift, but I have met a problem because shift works well for row 9 but not for when there are consecutive non-NAs on type, such as on rows 5,6.
dtihave = data.table(date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-03", "2020-04-02", "2020-05-09", "2020-06-10", "2020-07-18", "2020-08-23", "2020-09-09")),
type = c(1,1,NA,NA,1,1,NA,NA,1),
value = c(7,NA,6,8,NA,NA,5,9,NA))
> dtihave
date type value
1: 2020-01-01 1 7
2: 2020-02-01 1 NA
3: 2020-03-03 NA 6
4: 2020-04-02 NA 8
5: 2020-05-09 1 NA
6: 2020-06-10 1 NA
7: 2020-07-18 NA 5
8: 2020-08-23 NA 9
9: 2020-09-09 1 NA
dtiwant = data.table(date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-03", "2020-04-02", "2020-05-09", "2020-06-10", "2020-07-18", "2020-08-23", "2020-09-09")),
type = c(1,1,NA,NA,1,1,NA,NA,1),
value = c(7,NA,6,8,NA,NA,5,9,NA),
iwantdate = c(NA, as.Date("2020-01-01"), NA, NA, as.Date("2020-04-02"), as.Date("2020-04-02"), NA, NA, as.Date("2020-08-23")),
iwantvalue = c(NA,7,NA,NA,8,8,NA,NA,9))
dtiwant[, iwantdate := as.Date(iwantdate, origin = "1970-01-01")]
> dtiwant
date type value iwantdate iwantvalue
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-04-02 8
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9
My current progress using shift, but I need row 6's iwantdate = "2020-04-02". The number of shifts I need to make is unknown, so I can not just use n=2 in shift.
dtprogress = copy(dtihave)
dtprogress[, iwantdate := ifelse(!is.na(type) & is.na(value), shift(date), NA)]
dtprogress[, iwantdate := ifelse(!is.na(type) & !is.na(value), date, iwantdate)]
dtprogress[, iwantdate := as.Date(iwantdate, origin = "1970-01-01")]
> dtprogress
date type value iwantdate
1: 2020-01-01 1 7 2020-01-01
2: 2020-02-01 1 NA 2020-01-01
3: 2020-03-03 NA 6 <NA>
4: 2020-04-02 NA 8 <NA>
5: 2020-05-09 1 NA 2020-04-02
6: 2020-06-10 1 NA 2020-05-09
7: 2020-07-18 NA 5 <NA>
8: 2020-08-23 NA 9 <NA>
9: 2020-09-09 1 NA 2020-08-23
You could do:
dtihave[, idx := cummax((!is.na(value)) * .I) * NA^!is.na(value)][,
c('want_date', 'want_value') := lapply(.SD, '[', idx),
.SDcols = c('date', 'value')][, idx:=NULL]
dtihave
date type value want_date want_value
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-04-02 8
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9
with tidyverse. Hopefully this solves the grouping. ie just add %>%group_by(...) before mutate and you are good to go
dtihave %>%
mutate(val_na = !is.na(value),
idx = nafill(na_if(row_number() * val_na, 0), "locf"),
idx = idx * NA ^ val_na,
date1 = date[idx], value1 = value[idx],
val_na = NULL, idx = NULL)
You can use lag to get previous values, e.g.
library(dplyr)
dtihave %>%
mutate(iwantdate = ifelse(is.na(value), lag(date), NA) %>% as.Date(., origin = "1970-01-01"),
iwantvalue = ifelse(is.na(value), lag(value), NA))
date type value iwantdate iwantvalue
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-05-09 NA
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9

r - filter by date with group by condition

In R, using dplyr I want to filter greater than a date in for each group.
Below gives me the results, but I am wondering if there is a more elegant way to get the same thing. Is it possible to filter without using mutate?
max_dates <- data.frame(col_1 = c('a', 'b', 'c'), max_date = c('2021-08-23', '2021-07-19', '2021-07-02'))
df <- data.frame(col_1 = c(rep('a', 10), rep('b', 10), rep('c', 10)),
date = rep(seq(as.Date('2021-07-01'), by = 'week', length.out = 10), 3))
desired_df <- df %>%
left_join(max_dates, by = 'col_1') %>%
mutate(greater_than = ifelse(date >= max_date, T, F)) %>%
filter(greater_than)
You don't need the mutate argument; move the conditional to the filter argument...
library(dplyr)
df %>%
left_join(max_dates, by = 'col_1') %>%
filter(date >= max_date)
#> col_1 date max_date
#> 1 a 2021-08-26 2021-08-23
#> 2 a 2021-09-02 2021-08-23
#> 3 b 2021-07-22 2021-07-19
#> 4 b 2021-07-29 2021-07-19
#> 5 b 2021-08-05 2021-07-19
#> 6 b 2021-08-12 2021-07-19
#> 7 b 2021-08-19 2021-07-19
#> 8 b 2021-08-26 2021-07-19
#> 9 b 2021-09-02 2021-07-19
#> 10 c 2021-07-08 2021-07-02
#> 11 c 2021-07-15 2021-07-02
#> 12 c 2021-07-22 2021-07-02
#> 13 c 2021-07-29 2021-07-02
#> 14 c 2021-08-05 2021-07-02
#> 15 c 2021-08-12 2021-07-02
#> 16 c 2021-08-19 2021-07-02
#> 17 c 2021-08-26 2021-07-02
#> 18 c 2021-09-02 2021-07-02
Created on 2021-08-31 by the reprex package (v2.0.0)
We may use non-equi join
library(data.table)
setDT(df)[, date1 := date][max_dates, on = .(col_1, date1 >= max_date)]
-output
col_1 date date1
1: a 2021-08-26 2021-08-23
2: a 2021-09-02 2021-08-23
3: b 2021-07-22 2021-07-19
4: b 2021-07-29 2021-07-19
5: b 2021-08-05 2021-07-19
6: b 2021-08-12 2021-07-19
7: b 2021-08-19 2021-07-19
8: b 2021-08-26 2021-07-19
9: b 2021-09-02 2021-07-19
10: c 2021-07-08 2021-07-02
11: c 2021-07-15 2021-07-02
12: c 2021-07-22 2021-07-02
13: c 2021-07-29 2021-07-02
14: c 2021-08-05 2021-07-02
15: c 2021-08-12 2021-07-02
16: c 2021-08-19 2021-07-02
17: c 2021-08-26 2021-07-02
18: c 2021-09-02 2021-07-02

Resources