Left Join with Multiple Criteria - r

I have two datas and I'd like to do a left join with 2 criteria.
Criteria 1 : Same Title
Criteria 2 : Date between StartDate & EndDate
Data B
Title Date
A 2018-07-01
B 2019-12-30
Data A
Title StartDate EndDate Score
A 2018-01-01 2018-05-18 0
A 2018-05-19 2019-01-01 1
B 2019-10-01 2020-02-01 4
B 2020-02-02 2020-10-01 7
This is what I want to get
Title Score
A 1
B 4
Thanks in Advance!

We can use a non-equi join with data.table
library(data.table)
setDT(DataA)[DataB, .(Title, Score), on =
.(Title, StartDate < Date, EndDate >= Date)]
# Title Score
#1: A 1
#2: B 4
data
DataA <- structure(list(Title = c("A", "A", "B", "B"),
StartDate = structure(c(17532,
17670, 18170, 18294), class = "Date"), EndDate = structure(c(17669,
17897, 18293, 18536), class = "Date"), Score = c(0L, 1L, 4L,
7L)), row.names = c(NA, -4L), class = "data.frame")
DataB <- structure(list(Title = c("A", "B"), Date = structure(c(17713,
18260), class = "Date")), row.names = c(NA, -2L), class = "data.frame")

Related

how to filter groups matching more than one values?

id drug_name med_start med_end
<dbl> <chr> <date> <date>
1 pembrolizumab 2018-02-07 2018-02-07
1 pembrolizumab 2018-02-28 2018-02-28
2 pembrolizumab 2018-01-05 2018-01-05
2 nivolumab 2018-09-20 2018-09-20
2 nivolumab 2018-10-03 2018-10-03
2 nivolumab 2018-11-01 2018-11-01
I am trying to get ids who have both pembrolizumab and nivolumab in drug_name. Can I do a group_by over id? And then filter with both conditions?
For above table, id 2 has both drug_names. I might have situation where I will be filtering more than 2 drug_names.
I am also trying to find to see if the gap between two med_start is greater than x days. Let's say 30 days. Basically filter ids who have gap of 30 days between med_start.
Here is the code for above data
data <- structure(list(id = structure(c(1, 1, 2, 2, 2, 2), class = "int"),
drug_name = c("pembrolizumab", "pembrolizumab", "pembrolizumab",
"nivolumab", "nivolumab", "nivolumab"), med_start = structure(c(17569,
17590, 17536, 17794, 17807, 17836), class = "Date"), med_end = structure(c(17569,
17590, 17536, 17794, 17807, 17836), class = "Date")), row.names = c(NA,
-6L), groups = structure(list(patient_id = structure(c(1.49283861796358e-314,
1.6423825257779e-313), class = "integer64"), .rows = structure(list(
1:2, 3:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
We group by 'id', and filter where all the drugs of interest are %in% the 'drug_name' column, and extract the unique 'id'
library(dplyr)
data %>%
group_by(id) %>%
filter(all(c("pembrolizumab", "nivolumab") %in% drug_name)) %>%
ungroup %>%
pull(id)%>%
unique
-output
[1] 2
Here are some base R options
for the first question
> unique(
+ subset(
+ data,
+ ave(match(drug_name, c("pembrolizumab", "nivolumab")), id, FUN = var) > 0,
+ select = id
+ )
+ )
# A tibble: 1 x 1
id
<int>
1 2
for the second question
> subset(
+ data,
+ ave(as.integer(med_start), id, FUN = function(x) max(diff(x))) <= 30
+ )
# A tibble: 2 x 4
id drug_name med_start med_end
<int> <chr> <date> <date>
1 1 pembrolizumab 2018-02-07 2018-02-07
2 1 pembrolizumab 2018-02-28 2018-02-28

Solution for repeating values in a given date range

Error in seq.Date(as.Date(retail$Valid_from), as.Date(retail$Valid_to), :
'from' must be of length 1
I have tried both the methods as mentioned in the question :
How should I deal with 'from' must be of length 1 error?
I basically want to repeat the quantity for each day in a given date range :
HSD_RSP Valid_from Valid_to
70 1/1/2018 15/1/2018
80 1/16/2018 1/31/2018
.
.
.
Method 1 :
byDay = ddply(retail, .(HSD_RSP), transform,
day=seq(as.Date(retail$Valid_from), as.Date(retail$Valid_to), by="day"))
Method 2 :
dt <- data.table(retail)
dt <- dt[,seq(as.Date(Valid_from),as.Date(Valid_to),by="day"),
by=list(HSD_RSP)]
HSD_RSP final_date
70 1/1/2018
70 2/1/2018
70 3/1/2018
70 4/1/2018
.
.
.
output of
dput(head(retail))
structure(list(HSD_RSP = c(61.68, 62.96, 63.14, 60.51, 60.34,
61.63), Valid_from = structure(c(1483315200, 1484524800, 1487116800,
1491004800, 1491523200, 1492300800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Valid_to = structure(c(1484438400, 1487030400,
1490918400, 1491436800, 1492214400, 1493510400), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Convert to date, create a sequence of dates between Valid_from and Valid_to and unnest
library(tidyverse)
df %>%
mutate_at(vars(starts_with("Valid")), as.Date, "%m/%d/%Y") %>%
mutate(Date = map2(Valid_from, Valid_to, seq, by = "1 day")) %>%
unnest(Date) %>%
select(-Valid_from, -Valid_to)
# HSD_RSP Date
# <int> <date>
# 1 70 2018-01-01
# 2 70 2018-01-02
# 3 70 2018-01-03
# 4 70 2018-01-04
# 5 70 2018-01-05
# 6 70 2018-01-06
# 7 70 2018-01-07
# 8 70 2018-01-08
# 9 70 2018-01-09
#10 70 2018-01-10
# … with 21 more rows
data
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))
Using Ronak Shah's data structure, using data.table:
library(data.table)
dt <- as.data.table(df1)
dt[, .(final_date = seq(as.Date(Valid_from, "%m/%d/%Y"), as.Date(Valid_to, "%m/%d/%Y"), by = "day")),
by = HSD_RSP]
HSD_RSP final_date
1: 70 2018-01-01
2: 70 2018-01-02
3: 70 2018-01-03
4: 70 2018-01-04
....
data:
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))

r ifelse condition for the calculation on multiple dataframes

I have 3 data frames, df1 = a time interval, df2 = list of IDs, df3 = list of IDs with associated date.
df1 <- structure(list(season = structure(c(2L, 1L), .Label = c("summer",
"winter"), class = "factor"), mindate = structure(c(1420088400,
1433131200), class = c("POSIXct", "POSIXt")), maxdate = structure(c(1433131140,
1448945940), class = c("POSIXct", "POSIXt")), diff = structure(c(150.957638888889,
183.040972222222), units = "days", class = "difftime")), .Names = c("season",
"mindate", "maxdate", "diff"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(ID = c(23796, 23796, 23796)), .Names = "ID", row.names = c(NA,
-3L), class = "data.frame")
df3 <- structure(list(ID = c("23796", "123456", "12134"), time = structure(c(1420909920,
1444504500, 1444504500), class = c("POSIXct", "POSIXt"), tzone = "US/Eastern")), .Names = c("ID",
"time"), row.names = c(NA, -3L), class = "data.frame")
The code should compare if df2$ID == df3$ID. If true, and if df3$time >= df1$mindate and df3$time <= df1$maxdate, then df1$maxdate - df3$time, else df1$maxdate - df1$mindate. I tried using the ifelse function. This works when i manually specify specific cells, but this is not what i want as I have many more (uneven rows) for each of the dfs.
df1$result <- ifelse(df2[1,1] == df3[1,1] & df3[1,2] >= df1$mindate & df3[1,2] <= df1$maxdate,
difftime(df1$maxdate,df3[1,2],units="days"),
difftime(df1$maxdate,df1$mindate,units="days")
EDIT: The desired output is (when removing last row of df2):
season mindate maxdate diff result
1 winter 2015-01-01 2015-05-31 23:59:00 150.9576 days 141.9576
2 summer 2015-06-01 2015-11-30 23:59:00 183.0410 days 183.0410
Any ideas? I don't see how I could merge dfs to make them of the same length. Note that df2 can be of any row length and not affect the code. Issues arise when df1 and df3 differ in # of rows.
The > and < are vectorized:
transform(df1,result=ifelse(df3$ID%in%df2$ID & df3$time>mindate & df3$time <maxdate, difftime(maxdate,df3$time),difftime(maxdate,mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410
You can also use the between function from data.table library
library(data.table)
transform(df1,result=ifelse(df3$ID%in%df2$ID&df3$time%between%df1[2:3],
difftime(maxdate,df3$time),difftime(maxdate,mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410

Column from one data.table ordered by column from other data.table

I have two data.tables A:
contract.name contract.start contract.end price
Q1-2019 2019-01-01 2019-04-01 10
Q2-2019 2019-04-01 2019-07-01 12
Q3-2019 2019-07-01 2019-10-01 11
Q4-2019 2019-10-01 2020-01-01 13
and B:
contract delivery.begin delivery.end bid ask
Q2-2018 2018-04-01 2018-06-30 9.8 10.5
Q3-2018 2018-07-01 2018-09-30 11.5 12.1
Q4-2018 2018-10-01 2018-12-31 10.5 11.3
Q1-2019 2019-01-01 2019-03-31 12.8 13.5
I want a vector with the bid values from B ordered by the contract.name values from A like so:
bid = c(12.8, 0, 0, 0)
df1 %>%
left_join(df2, by=c("contract.name"="contract")) %>%
select(bid) %>%
replace_na(list(bid=0)) %>%
as.character()
Output is:
"c(12.8, 0, 0, 0)"
Sample data:
df1 <- structure(list(contract.name = c("Q1-2019", "Q2-2019", "Q3-2019",
"Q4-2019"), contract.start = c("2019-01-01", "2019-04-01", "2019-07-01",
"2019-10-01"), contract.end = c("2019-04-01", "2019-07-01", "2019-10-01",
"2020-01-01"), price = c(10L, 12L, 11L, 13L)), .Names = c("contract.name",
"contract.start", "contract.end", "price"), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(contract = c("Q2-2018", "Q3-2018", "Q4-2018",
"Q1-2019"), delivery.begin = c("2018-04-01", "2018-07-01", "2018-10-01",
"2019-01-01"), delivery.end = c("2018-06-30", "2018-09-30", "2018-12-31",
"2019-03-31"), bid = c(9.8, 11.5, 10.5, 12.8), ask = c(10.5,
12.1, 11.3, 13.5)), .Names = c("contract", "delivery.begin",
"delivery.end", "bid", "ask"), class = "data.frame", row.names = c(NA,
-4L))
library(data.table)
DT.A <- data.table(structure(list(contract.name = structure(1:4, .Label = c("Q1-2019",
"Q2-2019", "Q3-2019", "Q4-2019"), class = "factor"), contract.start = structure(1:4, .Label = c("2019-01-01",
"2019-04-01", "2019-07-01", "2019-10-01"), class = "factor"),
contract.end = structure(1:4, .Label = c("2019-04-01", "2019-07-01",
"2019-10-01", "2020-01-01"), class = "factor"), price = c(10L,
12L, 11L, 13L)), .Names = c("contract.name", "contract.start",
"contract.end", "price"), class = "data.frame", row.names = c(NA,
-4L)))
DT.B <- data.table(structure(list(contract = structure(c(2L, 3L, 4L, 1L), .Label = c("Q1-2019",
"Q2-2018", "Q3-2018", "Q4-2018"), class = "factor"), delivery.begin = structure(1:4, .Label = c("2018-04-01",
"2018-07-01", "2018-10-01", "2019-01-01"), class = "factor"),
delivery.end = structure(1:4, .Label = c("2018-06-30", "2018-09-30",
"2018-12-31", "2019-03-31"), class = "factor"), bid = c(9.8,
11.5, 10.5, 12.8), ask = c(10.5, 12.1, 11.3, 13.5)), .Names = c("contract",
"delivery.begin", "delivery.end", "bid", "ask"), class = "data.frame", row.names = c(NA,
-4L)))
# Get vector of contract names
orderVals <- DT.A$contract.name
# Key table B by contract
setkey(DT.B, contract)
# Extract rows from table B with the specified key values
output <- DT.B[.(orderVals)]
# Change the values where there was no match from NA to 0
output[is.na(bid), bid := 0]
# Get desired vector
output$bid
You can do:
library("data.table")
A <- fread(
"contract.name contract.start contract.end price
Q1-2019 2019-01-01 2019-04-01 10
Q2-2019 2019-04-01 2019-07-01 12
Q3-2019 2019-07-01 2019-10-01 11
Q4-2019 2019-10-01 2020-01-01 13")
B <- fread(
"contract delivery.begin delivery.end bid ask
Q2-2018 2018-04-01 2018-06-30 9.8 10.5
Q3-2018 2018-07-01 2018-09-30 11.5 12.1
Q4-2018 2018-10-01 2018-12-31 10.5 11.3
Q1-2019 2019-01-01 2019-03-31 12.8 13.5")
setnames(B, "contract", "contract.name")
A[B, on="contract.name", bid:=bid][, ifelse(is.na(bid), 0, bid)]
# > A[B, on="contract.name", bid:=bid][, ifelse(is.na(bid), 0, bid)]
# [1] 12.8 0.0 0.0 0.0
or (a variant without ifelse()):
setnames(B, "contract", "contract.name")
A[B, on="contract.name", bid:=bid]
A[is.na(bid), bid:=0][, bid]

R data.table comparing dates between tables and counting records

I have two data tables: a and b
a = structure(list(id = c(86246, 86252, 12262064), brand = c(3718L,
13474L, 17286L), offerdate = structure(c(15454, 15791, 15883), class = "Date")), .Names = c("id",
"brand", "offerdate"), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x041c24a0>)
b = structure(list(id = c(86246, 86246, 86246), brand = c(3718, 3718,
875), date = structure(c(15408, 15430, 15434), class = "Date")), .Names = c("id",
"brand", "date"), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x041c24a0>)
> a
id brand offerdate
1: 86246 3718 2012-04-24
2: 86252 13474 2013-03-27
3: 12262064 17286 2013-06-27
> b
id brand date
1: 86246 3718 2012-03-09
2: 86246 3718 2012-03-31
3: 86246 875 2012-04-04
Now I would like, for each id in a, to count the number of rows in b for the same id and brand, with a date less than 30 days before the a.offerdate.
The outcome I wish to have is an updated a:
> a
id brand offerdate nbTrans_last_30_days
1: 86246 3718 2013-04-24 1
2: 86252 13474 2013-03-27 0
3: 12262064 17286 2013-06-27 0
I can do the job with subset, but I am looking for a fast solution.
The subset version would be to do (for each line of a):
subset(b, (id == 86246) & (brand == 3718) & (date > as.Date("2012-03-24")) )
with the date depending on a.offerdate.
I manage to count the total rows in b:
> setkey(a,id, brand)
> setkey(b,id, brand)
> a = a[b[a, .N]]
> setnames(a, "N", "nbTrans")
> a
id brand offerdate nbTrans
1: 86246 3718 2012-04-24 2
2: 86252 13474 2013-03-27 0
3: 12262064 17286 2013-06-27 0
but I do not know how to handle the comparison of dates between the two tables.
The answer below works for the original small data set, but somehow did not work for my real data.
I tried to reproduce the problem with two new variables: a2 and b2
a2=structure(list(id = c(86246, 86252, 12262064), brand = structure(c(3L,
+ 9L, 12L), .Label = c("875", "1322", "3718", "4294", "5072", "6732",
+ "6926", "7668", "13474", "13791", "15889", "17286", "17311",
+ "26189", "26456", "28840", "64486", "93904", "102504"), class = "factor"),
+ offerdate = structure(c(15819, 15791, 15883), class = "Date")), .Names = c("id",
+ "brand", "offerdate"), row.names = c(NA, -3L), class = c("data.table",
+ "data.frame"))
b2=structure(list(id = c(86246, 86246, 86246, 86246, 86246, 86246,
+ 86246, 86246), brand = c(3718L, 3718L, 3718L, 3718L, 3718L, 3718L,
+ 3718L, 3718L), date = structure(c(15423, 15724, 15752, 15767,
+ 15782, 15786, 15788, 15811), class = "Date")), .Names = c("id",
+ "brand", "date"), sorted = c("id", "brand"), class = c("data.table",
+ "data.frame"))
> setkey(a2,id,brand)
> setkey(b2,id,brand)
> merge(a2, b2, all.x = TRUE, allow.cartesian = TRUE)
id brand offerdate date
1: 86246 3718 2013-04-24 <NA>
2: 86252 13474 2013-03-27 <NA>
3: 12262064 17286 2013-06-27 <NA>
The problem is that the merge does not keep the b2.date information.
The trick is to use allow.cartesian argument in merge:
setkey(a, id, brand)
setkey(b, id, brand)
c <- merge(a, b, all.x = T, allow.cartesian = T)
c[, Trans := (offerdate - date) <= 30]
c[, list(nbTrans_last_30_days = sum(Trans, na.rm = T)),
keyby = list(id, brand, offerdate)]

Resources