R fuzzyjoin on most recent previous record - r

I want to join two tables A and B by ID and, for each row of A, find in B the most recent date that is earlier than A's date.
After some searching, it seems that fuzzyjoin allows joining on date ranges:
# Fuzzy left join of A to B: ID is compared with "==", date with "<".
library(fuzzyjoin)
fuzzy_left_join(A, B,
by = c("ID" = "ID",
"date" = "date"),
# NOTE(review): the operators are given as strings here, while the answer
# further down passes backticked functions (`==`, `>`) - confirm which form
# fuzzyjoin accepts and which direction of comparison is intended.
match_fun = list("==","<"))
The problem is that this returns many records (when several matches exist in B), whereas I only want the most recent one.
Any idea how to proceed?
EDIT :
# Sample data: A holds the reference dates, B the dated values to look up.
A <- data.frame(
  ID = c(1, 2, 3),
  date = c("2019-04-03", "2019-05-13", "2019-05-27")
)
B <- data.frame(
  ID = c(1, 1, 2, 3, 4),
  date = c("2018-01-01", "2019-01-01", "2019-02-20", "2019-06-01",
           "2019-01-01"),
  value = c(1, 1.5, 1.2, 3.7, 4)
)
> A
ID date
1 1 2019-04-03
2 2 2019-05-13
3 3 2019-05-27
> B
ID date value
1 1 2018-01-01 1.0
2 1 2019-01-01 1.5
3 2 2019-02-20 1.2
4 3 2019-06-01 3.7
5 4 2019-01-01 4.0
Expected output :
ID date value
1 1 2019-04-03 1.5
2 2 2019-05-13 1.2
3 3 2019-05-27 NA

Another option using rolling join from data.table:
library(data.table)
# Turn both tables into data.tables by reference and parse the date strings.
setDT(A)
setDT(B)
A[, date := as.Date(date)]
B[, date := as.Date(date)]
# For every row of A, look up B on ID and roll the last B record forward to
# A's date (roll = Inf puts no limit on how far back that record may be).
B[A, on = c("ID", "date"), roll = Inf]
output:
ID date value
1: 1 2019-04-03 1.5
2: 2 2019-05-13 1.2
3: 3 2019-05-27 NA

We can group the join result by A's ID and then keep the last row of each group:
library(fuzzyjoin)
library(dplyr)

# For each row of A keep only the most recent earlier record of B.
fuzzy_left_join(A, B,
                by = c("ID" = "ID", "date" = "date"),
                match_fun = list(`==`, `>`)) %>%  # same ID, A date after B date
  # order the matches so that the last row of each group is the latest B date
  arrange(ID.x, date.y) %>%
  group_by(ID.x) %>%
  slice(n()) %>%
  # drop B's key columns; match the full ".y" suffix, not any name ending in "y"
  select(-ends_with(".y")) %>%
  # strip the ".x" suffix; as a bare regex '.x' would match any character
  # followed by "x", so the dot is escaped and the pattern anchored at the end
  rename_at(vars(ends_with(".x")), ~ sub("\\.x$", "", .))
# A tibble: 3 x 3
# Groups: ID.x [3]
ID date value
<dbl> <date> <dbl>
1 1 2019-04-03 1.5
2 2 2019-05-13 1.2
3 3 2019-05-27 NA

Related

How to add a column with most recent recurring observation within a group, but within a certain time period, in R

If I had:
person_ID visit date
1 2/25/2001
1 2/27/2001
1 4/2/2001
2 3/18/2004
3 9/22/2004
3 10/27/2004
3 5/15/2008
and I wanted another column to indicate the earliest recurring observation within 90 days, grouped by patient ID, with the desired output:
person_ID visit date date
1 2/25/2001 2/27/2001
1 2/27/2001 4/2/2001
1 4/2/2001 NA
2 3/18/2004 NA
3 9/22/2004 10/27/2004
3 10/27/2004 NA
3 5/15/2008 NA
Thank you!
We convert 'visit_date' to Date class and group by 'person_ID'. We then create a binary column that is 1 if the difference between the next and the current visit_date is less than 90 days and 0 otherwise, and use this column to get the corresponding next 'visit_date' where the value is 1.
library(dplyr)
library(lubridate)
library(tidyr)
df1 %>%
  mutate(visit_date = mdy(visit_date)) %>%
  group_by(person_ID) %>%
  mutate(
    # following visit of the same person (NA on the person's last row)
    next_visit = lead(visit_date),
    # TRUE when the following visit is fewer than 90 days away
    within_90 = next_visit - visit_date < 90,
    # keep the following visit only where the flag is TRUE; otherwise NA
    date = case_when(replace_na(within_90, FALSE) ~ next_visit),
    # drop the helper columns
    next_visit = NULL,
    within_90 = NULL
  ) %>%
  ungroup()
-output
# A tibble: 7 x 3
# person_ID visit_date date
# <int> <date> <date>
#1 1 2001-02-25 2001-02-27
#2 1 2001-02-27 2001-04-02
#3 1 2001-04-02 NA
#4 2 2004-03-18 NA
#5 3 2004-09-22 2004-10-27
#6 3 2004-10-27 NA
#7 3 2008-05-15 NA

Update the new date column using the existing date column with -1 day

I have a set of patient IDs and a Date column. I want to fill a Date1 column with the next Date of the same ID minus one day. For example:
ID Date Date1
1 23-10-2017 23-09-2018
1 24-09-2018 28-08-2019
1 29-08-2019 -
2 30-05-2016 11-06-2017
2 12-06-2017 12-07-2018
2 13-07-2018 -
I'm not sure I understand what you want, but if you just want the date minus one day, here is the code:
# Sample data from the question (dates are day-month-year).
x <- data.frame(
  ID = c(1, 1, 1, 2, 2, 2),
  Date = as.Date(
    c("23-10-2017", "24-09-2018", "29-08-2019",
      "30-05-2016", "12-06-2017", "13-07-2018"),
    format = "%d-%m-%Y"
  )
)
# Date1 is the same row's Date moved back by one day.
x$Date1 <- x$Date - 1
Shift by one row by group, then subtract one day:
library(data.table)
dt1 <- fread("
ID Date
1 23-10-2017
1 24-09-2018
1 29-08-2019
2 30-05-2016
2 12-06-2017
2 13-07-2018")
# convert to Date; the years have four digits, so the format needs %Y
# (with %y only "20" is read as the year and every date lands in 2020)
dt1[, Date := as.Date(Date, format = "%d-%m-%Y")]
# take the next row's date within each ID, then subtract one day
dt1[, Date1 := shift(Date, type = "lead") - 1, by = ID]
dt1
# ID Date Date1
# 1: 1 2017-10-23 2018-09-23
# 2: 1 2018-09-24 2019-08-28
# 3: 1 2019-08-29 <NA>
# 4: 2 2016-05-30 2017-06-11
# 5: 2 2017-06-12 2018-07-12
# 6: 2 2018-07-13 <NA>
Try using lead:
library(dplyr)
# Within each ID, Date1 is the following row's Date minus one day
# (NA on the last row of each ID).
df %>%
  group_by(ID) %>%
  mutate(Date1 = lead(Date, n = 1L) - 1)
# A tibble: 6 x 3
# Groups: ID [2]
ID Date Date1
<int> <date> <date>
1 1 2017-10-23 2018-09-23
2 1 2018-09-24 2019-08-28
3 1 2019-08-29 NA
4 2 2016-05-30 2017-06-11
5 2 2017-06-12 2018-07-12
6 2 2018-07-13 NA

Count consecutive prior dates per group

My sample data.frame (date format d/m/y), recording the dates a customer was active:
customer date
1 10/1/20
1 9/1/20
1 6/1/20
2 10/1/20
2 8/1/20
2 7/1/20
2 6/1/20
I would like to make a column "n_consecutive_days" like so:
customer date n_consecutive_days
1 10/1/20 2
1 9/1/20 1
1 6/1/20 N/A
2 10/1/20 1
2 8/1/20 3
2 7/1/20 2
2 6/1/20 N/A
The new column counts the number of previous consecutive dates per customer. I would like the customer's first date to be N/A as it makes no sense to talk about previous consecutive days if it is the first one.
Any help would be appreciated. I can calculate the difference between dates, but not the number of consecutive days as desired.
One way would be:
library(dplyr)
df %>%
# idx is a run id: it increases whenever the date difference to the previous
# row is not -1 day (dates are parsed as day/month/two-digit year).
# NOTE(review): this relies on the rows being in reverse chronological order
# within each customer, as in the sample data - confirm for other inputs.
group_by(customer, idx = cumsum(as.integer(c(0, diff(as.Date(date, '%d/%m/%y')))) != -1)) %>%
# count down within each run: n, n - 1, ..., 1
mutate(n_consecutive_days = rev(sequence(n()))) %>% ungroup() %>%
group_by(customer) %>%
# set the last row of each customer to NA and drop the helper column idx
mutate(n_consecutive_days = replace(n_consecutive_days, row_number() == n(), NA), idx = NULL)
Output:
# A tibble: 7 x 3
# Groups: customer [2]
customer date n_consecutive_days
<int> <fct> <int>
1 1 10/1/20 2
2 1 9/1/20 1
3 1 6/1/20 NA
4 2 10/1/20 1
5 2 8/1/20 3
6 2 7/1/20 2
7 2 6/1/20 NA
An option using data.table:
# ensure that the data is sorted by customer and in reverse chronological order
setorder(DT, customer, -date)
# group by customer and by run of consecutive dates (a new run starts whenever
# the difference to the previous row is not -1), then number each run from .N down to 1
DT[, ncd := .N:1L, .(customer, cumsum(c(0L, diff(date)!=-1L)))]
# set ncd to NA on the last row of each customer, i.e. the customer's earliest
# date after the sort above
DT[DT[, .I[.N], customer]$V1, ncd := NA]
output:
customer date ncd
1: 1 2020-01-10 2
2: 1 2020-01-09 1
3: 1 2020-01-06 NA
4: 2 2020-01-10 1
5: 2 2020-01-08 3
6: 2 2020-01-07 2
7: 2 2020-01-06 NA
data:
library(data.table)
# Sample data from the question, read from an inline string.
DT <- fread("customer date
1 10/1/20
1 9/1/20
1 6/1/20
2 10/1/20
2 8/1/20
2 7/1/20
2 6/1/20")
# parse the day/month/two-digit-year strings into IDate
DT[, date := as.IDate(date, format="%d/%m/%y")]

How to keep only those rows of a data.frame if whole group fulfills condition

I am fairly new to R and I have a question on how to keep only certain values based on an ID and a date. I have a (quite big) dataset that looks like the following example:
ID Type Date
1 OUT 2016-06-18
1 OUT 2016-06-18
1 OUT 2016-06-18
1 IN 2016-06-25
1 OUT 2016-06-25
2 IN 2016-07-03
2 OUT 2016-07-03
My question is: how can I find the dates that contain ONLY one of the types (IN or OUT) and remove them from the data? I would, however, like to keep a date if both types occur (IN and OUT) for the same ID value.
Is there a way to do this in R?
If I understood your requirement correctly, here's a simple way using dplyr package -
# Keep only the ID / Date groups that contain at least two distinct Types.
df %>%
  group_by(ID, Date) %>%
  filter(n_distinct(Type) >= 2)
# A tibble: 4 x 3
# Groups: ID, Date [2]
ID Type Date
<int> <chr> <chr>
1 1 IN 2016-06-25
2 1 OUT 2016-06-25
3 2 IN 2016-07-03
4 2 OUT 2016-07-03
Another way using ave() from base R -
# Count the distinct Types per ID / Date and keep the rows of groups with more
# than one. The count is taken on integer codes so the result is numeric and
# the filter works whether Type is a character or a factor column.
df[with(df, ave(as.integer(factor(Type)), ID, Date,
                FUN = function(x) length(unique(x)))) > 1, ]
ID Type Date
4 1 IN 2016-06-25
5 1 OUT 2016-06-25
6 2 IN 2016-07-03
7 2 OUT 2016-07-03
Here's a way to do this with dplyr. This looks for all ID + Date combos that have at least one of each In and Out.
# Keep only the rows of df1 whose ID / Date combination has exactly two
# distinct Types (IN and OUT, presumably).
has_both <- df1 %>%
  distinct(ID, Date, Type) %>%          # one row per ID / Date / Type combination
  # number of distinct Types per ID / Date; naming the column explicitly keeps
  # the pipeline independent of how a repeated count() names or weights it
  count(ID, Date, name = "nn") %>%
  filter(nn == 2) %>%                   # only keep where 2 types are present
  left_join(df1, by = c("ID", "Date"))  # bring back the matching original rows
Output
has_both
# A tibble: 4 x 4
ID Date nn Type
<int> <chr> <int> <chr>
1 1 2016-06-25 2 IN
2 1 2016-06-25 2 OUT
3 2 2016-07-03 2 IN
4 2 2016-07-03 2 OUT
For the sake of completeness, here are also some data.table solutions:
library(data.table)
# For each ID / Date group, return the group's rows (.SD) only when the group
# has more than one distinct Type.
setDT(df)[, if (uniqueN(Type) > 1) .SD, by = .(ID, Date)]
ID Date Type
1: 1 2016-06-25 IN
2: 1 2016-06-25 OUT
3: 2 2016-07-03 IN
4: 2 2016-07-03 OUT
Within each ID, Date group only those subsets of df are returned for which there is more than one distinct Type.
This can also be written as:
# Same filter, written as a logical subset of .SD within each ID / Date group.
setDT(df)[, .SD[uniqueN(Type) > 1], by = .(ID, Date)]
There is also a variant which finds ID and Date combinations which fulfill the requirement and subsets df by joining:
# Inner query: distinct Types per ID / Date (column V1), kept where V1 > 1;
# those combinations are then joined back onto df on ID and Date.
setDT(df)[df[, uniqueN(Type), by = .(ID, Date)][V1 > 1], on = .(ID, Date), .SD]
ID Type Date
1: 1 IN 2016-06-25
2: 1 OUT 2016-06-25
3: 2 IN 2016-07-03
4: 2 OUT 2016-07-03
Data
# Sample data from the question, read from an inline space-delimited string.
df <-readr::read_delim(
"ID Type Date
1 OUT 2016-06-18
1 OUT 2016-06-18
1 OUT 2016-06-18
1 IN 2016-06-25
1 OUT 2016-06-25
2 IN 2016-07-03
2 OUT 2016-07-03",
delim = " ", trim_ws = TRUE)

Performing in group operations in R

I have a table sf with two fields: Customer id and Buy_date. Each Buy_date is unique within a customer, but a customer can have more than 3 different Buy_date values. I want to calculate the difference between consecutive Buy_dates for each customer, and the mean of those differences. How can I do this?
Example
Customer Buy_date
1 2018/03/01
1 2018/03/19
1 2018/04/3
1 2018/05/10
2 2018/01/02
2 2018/02/10
2 2018/04/13
I want the results for each customer in the format
Customer mean
Here's a dplyr solution.
Your data:
# Sample data from the question: four purchases for customer 1, three for customer 2.
df <- data.frame(
  Customer = rep(c(1, 2), times = c(4, 3)),
  Buy_date = c(
    "2018/03/01", "2018/03/19", "2018/04/3", "2018/05/10",
    "2018/01/02", "2018/02/10", "2018/04/13"
  )
)
Grouping, mean Buy_date calculation and summarising:
library(dplyr)
# Mean purchase timestamp per customer: attach the mean to every row, then
# collapse to one row per Customer / mean combination.
df %>%
  group_by(Customer) %>%
  mutate(mean = mean(as.POSIXct(Buy_date))) %>%
  group_by(Customer, mean) %>%
  summarise()
Output:
# A tibble: 2 x 2
# Groups: Customer [?]
Customer mean
<dbl> <dttm>
1 1 2018-03-31 06:30:00
2 2 2018-02-17 15:40:00
Or, as @r2evans points out in his comment, for the mean number of days between consecutive Buy_dates:
# Mean gap between consecutive purchases per customer, collapsed to one row
# per Customer / mean combination.
# NOTE(review): the gaps are taken on POSIXct timestamps; the fractional means
# in the printed output look like a daylight-saving shift - confirm whether
# whole-day differences (as.Date) are wanted instead.
df %>%
  group_by(Customer) %>%
  mutate(mean = mean(diff(as.POSIXct(Buy_date)))) %>%
  group_by(Customer, mean) %>%
  summarise()
Output:
# A tibble: 2 x 2
# Groups: Customer [?]
Customer mean
<dbl> <time>
1 1 23.3194444444444
2 2 50.4791666666667
I am not exactly sure of the desired output but this what I think you want.
library(dplyr)
library(zoo)
# Sample data from the question; TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
dat <- read.table(text =
"Customer Buy_date
1 2018/03/01
1 2018/03/19
1 2018/04/3
1 2018/05/10
2 2018/01/02
2 2018/02/10
2 2018/04/13", header = TRUE, stringsAsFactors = FALSE)
dat$Buy_date <- as.Date(dat$Buy_date)
# Per customer: days since the previous purchase (NA on the first row) and the
# mean of those gaps, repeated on every row of the customer.
dat %>%
  group_by(Customer) %>%
  mutate(diff_between = as.vector(diff(zoo(Buy_date), na.pad = TRUE)),
         mean_days = mean(diff_between, na.rm = TRUE))
This produces:
Customer Buy_date diff_between mean_days
<int> <date> <dbl> <dbl>
1 1 2018-03-01 NA 23.3
2 1 2018-03-19 18 23.3
3 1 2018-04-03 15 23.3
4 1 2018-05-10 37 23.3
5 2 2018-01-02 NA 50.5
6 2 2018-02-10 39 50.5
7 2 2018-04-13 62 50.5
EDITED BASED ON USER COMMENTS:
Because you said that you have factors and not characters just convert them by doing the following:
# The columns are factors (per the user's comment), so convert them via
# character before parsing the dates.
dat[["Buy_date"]] <- as.Date(as.character(dat[["Buy_date"]]))
dat[["Customer"]] <- as.character(dat[["Customer"]])

Resources