dplyr join with OR condition? - r

I am wondering whether there is any way, preferably in the tidyverse, to join two dataframes based on OR conditions.
There are two dataframes: df_obs and df_event.
a) The join should happen if there is a match between
the obs_id and event_id; and obs_date or event_date, or both are NA.
OR
b) the obs_date and event_date are identical; and obs_id, event_id, or both are NA.
The match should not happen if obs_id is not identical to event_id (if both are not NA) OR
obs_date and event_date are not identical (both being not NA).
The result should look like df_res below. The column 'event' from the df_event is added to the df_obs.
I have seen the answer to this question, but maybe there is a way around SQL?
df_obs <- tibble::tribble(
~obs, ~obs_date, ~obs_id,
"a1", NA, 10L,
"a2", "01/01/2000", NA,
"b", "02/01/2000", NA,
"a3", "03/01/2000", 10L
)
df_obs
#> # A tibble: 4 × 3
#> obs obs_date obs_id
#> <chr> <chr> <int>
#> 1 a1 <NA> 10
#> 2 a2 01/01/2000 NA
#> 3 b 02/01/2000 NA
#> 4 a3 03/01/2000 10
df_event <- tibble::tribble(
~event, ~event_date, ~event_id,
"A", "01/01/2000", 10L,
"B", "02/01/2000", NA
)
df_event
#> # A tibble: 2 × 3
#> event event_date event_id
#> <chr> <chr> <int>
#> 1 A 01/01/2000 10
#> 2 B 02/01/2000 NA
df_res <- tibble::tribble(
~obs, ~obs_date, ~obs_id, ~event,
"a1", NA, 10L, "A",
"a2", "01/01/2000", NA, "A",
"b", "02/01/2000", NA, "B",
"a3", "03/01/2000", 10L, NA
)
df_res
#> # A tibble: 4 × 4
#> obs obs_date obs_id event
#> <chr> <chr> <int> <chr>
#> 1 a1 <NA> 10 A
#> 2 a2 01/01/2000 NA A
#> 3 b 02/01/2000 NA B
#> 4 a3 03/01/2000 10 <NA>
Created on 2022-09-13 with reprex v2.0.2

I can come up only with this solution:
# Two-pass join: first match on id, then on date, and combine the results.
# Pass 1: match on id only; NA ids are never treated as equal to each other.
df_obs %>%
left_join(df_event, by = c("obs_id" = "event_id"), na_matches='never') %>%
# Discard the id-based match when both dates are present and disagree.
# An NA on either side counts as "compatible", so the match is kept.
mutate(event = ifelse(!(is.na(obs_date)|is.na(event_date)|obs_date == event_date), NA, event)) %>%
select(-event_date) %>%
# Pass 2: match on date only. `event` now exists on both sides, so the join
# result carries event.x (from pass 1) and event.y (from this pass).
left_join(df_event, by = c("obs_date" = "event_date"), na_matches='never') %>%
# Discard the date-based match when both ids are present and disagree.
mutate(event.y = ifelse(!(is.na(obs_id)|is.na(event_id)|obs_id == event_id), NA, event.y)) %>%
select(-event_id) %>%
# Prefer the id-based match and fall back to the date-based one.
mutate(event = ifelse(is.na(event.x), event.y, event.x)) %>%
select(-c(event.x, event.y))

Related

Using coalesce function with many variables

I have two datasets with similar variables. dataset2 has values of variables that were not captured in dataset1. My aim is to use the dataset2 variables to fill the corresponding values in variables in dataset1. Is there a way to achieve this? It is possible to use coalesce, but listing all the variables is a bit cumbersome.
library(dplyr)
dat1 <- tibble(
id = c("soo1", "soo2", "soo3", "soo4"),
a1= c("Test", "Tested", "Testing", NA),
a2= c("Math", "Eng", NA, "French"),
a3= c("Science", NA, "Biology", "Chem"))
dat2 <- tibble(
id = c("soo1", "soo2", "soo3", "soo4"),
a1= c(NA, NA, NA, "Tested"),
a2= c("Math", NA, "UK", NA),
a3= c("Science", "Physic", NA, NA))
dat1 %>%
inner_join(dat2, by = "id") %>%
mutate(a1 = coalesce(a1.x, a1.y),
a2 = coalesce(a2.x, a2.y))
Another possible solution, based on powerjoin:
library(powerjoin)
library(tibble)
power_inner_join(dat1, dat2, by = "id", conflict = coalesce_xy)
#> # A tibble: 4 × 4
#> id a1 a2 a3
#> <chr> <chr> <chr> <chr>
#> 1 soo1 Test Math Science
#> 2 soo2 Tested Eng Physic
#> 3 soo3 Testing UK Biology
#> 4 soo4 Tested French Chem
You could also fill your values "downup" per group for every column like this:
library(dplyr)
library(tidyr)
dat1 <- tibble(
id = c("soo1", "soo2", "soo3", "soo4"),
a1= c("Test", "Tested", "Testing", NA),
a2= c("Math", "Eng", NA, "French"),
a3= c("Science", NA, "Biology", "Chem"))
dat2 <- tibble(
id = c("soo1", "soo2", "soo3", "soo4"),
a1= c(NA, NA, NA, "Tested"),
a2= c("Math", NA, "UK", NA),
a3= c("Science", "Physic", NA, NA))
dat1 %>%
bind_rows(dat2) %>%
group_by(id) %>%
fill(everything(), .direction = "downup") %>%
slice(1)
#> # A tibble: 4 × 4
#> # Groups: id [4]
#> id a1 a2 a3
#> <chr> <chr> <chr> <chr>
#> 1 soo1 Test Math Science
#> 2 soo2 Tested Eng Physic
#> 3 soo3 Testing UK Biology
#> 4 soo4 Tested French Chem
Created on 2022-07-18 by the reprex package (v2.0.1)
In dplyr, we may use rows_patch
library(dplyr)
rows_patch(dat1, dat2, by = 'id')
-output
# A tibble: 4 × 4
id a1 a2 a3
<chr> <chr> <chr> <chr>
1 soo1 Test Math Science
2 soo2 Tested Eng Physic
3 soo3 Testing UK Biology
4 soo4 Tested French Chem

Join similar observations within a data.frame with R

I want to mix several observations in a data.frame using as a reference one constantly repeated variable.
Example:
id var1 var2 var3
a 1 na na
a na 2 na
a na na 3
b 1 na
b na 2 na
b na na na
c na na 3
c na 2 na
c 1 na na
Expected result:
id var1 var2 var3
a 1 2 3
b 1 2 na
c 1 2 3
A possible solution (replacing "na" by NA with na_if):
library(tidyverse)
# Turn the literal string "na" into a real NA, one column at a time.
# na_if() operates on vectors: since dplyr 1.1.0 it no longer accepts a whole
# data frame as its first argument, so it is applied through across().
# Only character columns can contain the string "na", so only those are touched.
df %>%
  mutate(across(where(is.character), ~ na_if(.x, "na"))) %>%
  group_by(id) %>%
  # sort() drops NA, so element 1 is the smallest non-missing value in the
  # group, or NA when every value in the group is missing.
  summarize(across(var1:var3, ~ sort(.x)[1]))
#> # A tibble: 3 × 4
#> id var1 var2 var3
#> <chr> <chr> <chr> <chr>
#> 1 a 1 2 3
#> 2 b 1 2 <NA>
#> 3 c 1 2 3
Assumptions:
"na" above is really R's native NA (not a string);
b's first row, var2 should be NA instead of an empty string ""
perhaps from the above, var1:var3 should be numbers
either you will never have a group where there is more than one non-NA in a group/column, or you don't care about anything other than the first and want the remaining discarded
library(dplyr)
dat %>%
group_by(id) %>%
summarize(across(everything(), ~ na.omit(.)[1]))
# # A tibble: 3 x 4
# id var1 var2 var3
# <chr> <int> <int> <int>
# 1 a 1 2 3
# 2 b 1 2 NA
# 3 c 1 2 3
Data
dat <- structure(list(id = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), var1 = c(1L, NA, NA, 1L, NA, NA, NA, NA, 1L), var2 = c(NA, 2L, NA, NA, 2L, NA, NA, 2L, NA), var3 = c(NA, NA, 3L, NA, NA, NA, 3L, NA, NA)), class = "data.frame", row.names = c(NA, -9L))
Assuming that your data has NA, you can use the following base R option using the Data from @r2evans (thanks!):
aggregate(.~id, dat, mean, na.rm = TRUE, na.action=NULL)
Output:
id var1 var2 var3
1 a 1 2 3
2 b 1 2 NaN
3 c 1 2 3

Filter by group and conditions

I have this type of data, where Sequ is a grouping variable:
df <- data.frame(
Sequ = c(1,1,1,
2,2,2,
3,3,
4,4),
Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
PP_by = c(rep("A",5), rep("B",5)),
pp = c(0.1,0.2,0.3, 1, NA, NA, NA, NA, NA, NA)
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the second condition (ii):
library(dplyr)
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
df %>%
group_by(Sequ) %>%
filter(
first(Answerer) != first(PP_by)
|
all(!is.na(pp))
)
Here's another way:
df %>%
group_by(Sequ) %>%
filter(!(
any(Answerer == PP_by, na.rm = TRUE) &
any(is.na(pp))
))
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA

tidyverse: Simulating random sample with nested factor

I want to simulate a random sample with a nested factor. The factor Dept has two levels, A and B. Level A has two nested levels, A1 and A2. Level B has three nested levels, B1, B2 and B3. I want to simulate a random sample from 2022-01-01 to 2022-01-31 using some R code. Part of the desired output is given below (from 2022-01-01 to 2022-01-03 only, for reference).
library(tibble)
set.seed(12345)
df1 <-
tibble(
Date = c(rep("2022-01-01", 5), rep("2022-01-02", 4), rep("2022-01-03", 4))
, Dept = c("A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "B", "B")
, Prog = c("A1", "A2", "B1", "B2", "B3", "A1", "B1", "B2", "B3", "A1", "A2", "B2", "B3")
, Amount = runif(n = 13, min = 50000, max = 100000)
)
df1
#> # A tibble: 13 x 4
#> Date Dept Prog Amount
#> <chr> <chr> <chr> <dbl>
#> 1 2022-01-01 A A1 86045.
#> 2 2022-01-01 A A2 93789.
#> 3 2022-01-01 B B1 88049.
#> 4 2022-01-01 B B2 94306.
#> 5 2022-01-01 B B3 72824.
#> 6 2022-01-02 A A1 58319.
#> 7 2022-01-02 B B1 66255.
#> 8 2022-01-02 B B2 75461.
#> 9 2022-01-02 B B3 86385.
#> 10 2022-01-03 A A1 99487.
#> 11 2022-01-03 A A2 51727.
#> 12 2022-01-03 B B2 57619.
#> 13 2022-01-03 B B3 86784.
If we want to sample randomly, create the expanded data with crossing and then filter/slice to return random rows for each 'date'
library(dplyr)
library(tidyr)
library(stringr)
# Build every Date x Dept x Prog combination for January 2022.
crossing(Date = seq(as.Date("2022-01-01"), as.Date("2022-01-31"),
by = "1 day"), Dept = c("A", "B"), Prog = 1:3) %>%
# Turn the numeric program index into the nested label, e.g. "A1" or "B3".
mutate(Prog = str_c(Dept, Prog)) %>%
# Dept A has only two nested levels, so drop the surplus "A3" combination.
filter(Prog != "A3") %>%
# One uniform draw per remaining row.
mutate(Amount = runif(n = n(), min = 50000, max = 100000)) %>%
group_by(Date) %>%
# For each date, draw one row index k at random and keep rows 1..k.
# NOTE(review): this keeps a leading run of each date's rows rather than an
# arbitrary subset, so a later row can appear only together with all the rows
# before it -- confirm that this is the intended sampling scheme.
slice(seq_len(sample(row_number(), 1))) %>%
ungroup
-output
# A tibble: 102 × 4
Date Dept Prog Amount
<date> <chr> <chr> <dbl>
1 2022-01-01 A A1 83964.
2 2022-01-01 A A2 93428.
3 2022-01-01 B B1 85187.
4 2022-01-01 B B2 79144.
5 2022-01-01 B B3 65784.
6 2022-01-02 A A1 86014.
7 2022-01-03 A A1 76060.
8 2022-01-03 A A2 56412.
9 2022-01-03 B B1 87365.
10 2022-01-03 B B2 66169.
# … with 92 more rows

Join data frames without creating duplicate rows while concatenating unique entries under one column

I'm trying to merge two data frames together which are related to each other via a specific variable named patient. The second data frame has multiple entries for the same patient column. I don't want to create duplicate patient entries upon merging, but I want to retain unique information in the second data frame by concatenating the values under one column.
I tried manually concatenating certain variables using group_by which works. I have several variables, however, and manually specifying all of them is not feasible
I can also concatenate every variable in the data frame by using dplyr as seen below. The problem in the second case is that duplicate values are also concatenated making the data frame unnecessarily big and difficult to deal with. Please see the reprex below.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 <- data.frame(patient=c("a", "b", "c"),
var1 = 1:3,
var2=11:13)
df1
#> patient var1 var2
#> 1 a 1 11
#> 2 b 2 12
#> 3 c 3 13
df2 <- data.frame(patient=c("a","a", "b", "b", "c", "c" ),
treatment= rep(c("drug1", "drug2"), 3),
time= rep(c("time1", "time2"), 3),
var3= "constant")
df2
#> patient treatment time var3
#> 1 a drug1 time1 constant
#> 2 a drug2 time2 constant
#> 3 b drug1 time1 constant
#> 4 b drug2 time2 constant
#> 5 c drug1 time1 constant
#> 6 c drug2 time2 constant
df_merged <- left_join(df1, df2)
#> Joining, by = "patient"
# Don't want duplicates like this
df_merged
#> patient var1 var2 treatment time var3
#> 1 a 1 11 drug1 time1 constant
#> 2 a 1 11 drug2 time2 constant
#> 3 b 2 12 drug1 time1 constant
#> 4 b 2 12 drug2 time2 constant
#> 5 c 3 13 drug1 time1 constant
#> 6 c 3 13 drug2 time2 constant
df_merged2 <- df_merged %>%
group_by(patient) %>%
mutate(treatment = paste(treatment, collapse = "_"),
time=paste(time, collapse = "_")) %>%
filter(!duplicated(patient))
# I can manually edit a few variables like this
df_merged2
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <int> <int> <chr> <chr> <fct>
#> 1 a 1 11 drug1_drug2 time1_time2 constant
#> 2 b 2 12 drug1_drug2 time1_time2 constant
#> 3 c 3 13 drug1_drug2 time1_time2 constant
df_merged3 <- df_merged %>%
group_by(patient) %>%
mutate_at(vars(-group_cols()), .funs = ~paste(., collapse ="_")) %>%
filter(!duplicated(patient))
# I have many variables I can't specify manually
# I can create this merged data frame, but I don't want to
# concatenate duplicated values such as var1, var2, and var3
df_merged3
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <chr> <chr> <chr> <chr> <chr>
#> 1 a 1_1 11_11 drug1_drug2 time1_time2 constant_constant
#> 2 b 2_2 12_12 drug1_drug2 time1_time2 constant_constant
#> 3 c 3_3 13_13 drug1_drug2 time1_time2 constant_constant
Created on 2019-10-23 by the reprex package (v0.3.0)
I'd like to see if there is a way of concatenating variables containing only unique values to retain information from the second data frame without duplicating the rows in the df_merged.
I would be happy to hear if you have recommendations other than dplyr. A data.table solution may also be suitable for me as well, since my real data frames are quite large.
Thanks!
We can use summarise_at and unique
library(dplyr)
# Collapse each patient's rows into a single row: for every non-grouping
# column, join its distinct values with "_". A column that is constant within
# a patient therefore stays a single value instead of being repeated.
# across() replaces the superseded summarise_at(); inside summarise(),
# everything() already excludes the grouping column.
df_merged %>%
  group_by(patient) %>%
  summarise(across(everything(), ~ paste(unique(.x), collapse = "_")))
Or we can do the merge/join directly instead of adding/altering the Global Env with an intermediate dataframe.
# Collapse df2 to one row per patient first (distinct values joined with "_"),
# then join, so df1 never fans out into duplicate patient rows.
# The join key is left implicit: the only shared column is `patient`.
left_join(df1,
  df2 %>%
    group_by(patient) %>%
    # across() replaces the superseded summarise_at(); everything() excludes
    # the grouping column inside summarise().
    summarise(across(everything(), ~ paste(unique(.x), collapse = "_"))) %>%
    ungroup()
)
Joining, by = "patient"
patient var1 var2 treatment time var3
1 a 1 11 drug1_drug2 time1_time2 constant
2 b 2 12 drug1_drug2 time1_time2 constant
3 c 3 13 drug1_drug2 time1_time2 constant
Update
#Here a toy example to experiment with, uncomment browser to see how it works inside Reduce,
#also see ?Reduce for more info
# Collapse a vector into one "_"-separated string.
#
# Every NA is written as "." and is always kept, so gaps stay visible in the
# result. Each non-missing value is kept only the first time it appears.
#
# Values are compared as whole strings, not as regular expressions or
# substrings, so "drug1" and "drug10" are distinct and values containing
# regex metacharacters such as "(" or "+" are handled safely.
#
# x: an atomic vector (character, factor, ...); NA marks a missing entry.
# Returns a length-one character string, or NULL when `x` is empty (the same
# result Reduce() gives for empty input).
paste_mod <- function(x) {
  if (length(x) == 0L) {
    return(NULL)
  }
  # as.character() keeps factor labels rather than their integer codes.
  tokens <- as.character(x)
  tokens[is.na(tokens)] <- "."
  # "." placeholders are always kept; real values only on first appearance.
  keep <- tokens == "." | !duplicated(tokens)
  paste(tokens[keep], collapse = "_")
}
paste_mod(c("drug1",NA,NA,"drug2","drug1","drug2"))
[1] "drug1_._._drug2"
paste_mod(c(NA,NA,"drug2","drug1","drug2"))
[1] "._._drug2_drug1"
# Replace NA with "." and then collapse each patient's values with paste_mod()
# (defined above), instead of repeating its reducer inline.
df2 %>%
  # Factors must become character first, otherwise "." is not a valid level.
  mutate(across(where(is.factor), as.character)) %>%
  mutate(across(everything(), ~ replace(.x, is.na(.x), "."))) %>%
  group_by(patient) %>%
  # everything() excludes the grouping column inside summarise().
  summarise(across(everything(), paste_mod)) %>%
  ungroup()
# A tibble: 2 x 4
patient treatment time var3
<chr> <chr> <chr> <chr>
1 a drug1_._._drug2 time1_time2 constant
2 c drug1_drug2 time1_time2 constant
New df2 for testing the updated solution
df2 <- structure(list(patient = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("a",
"c"), class = "factor"), treatment = structure(c(1L, NA, NA,
2L, 1L, 2L), .Label = c("drug1", "drug2"), class = "factor"),
time = structure(c(1L, 2L, 1L, 2L, 1L, 2L), .Label = c("time1",
"time2"), class = "factor"), var3 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), class = "factor", .Label = "constant")), class = "data.frame", row.names = c(NA,
-6L))

Resources