Overwrite left_join dplyr to update data - r

My question is similar to this one, however I have additional columns in the LHS that should be kept: https://stackoverflow.com/a/35642948/9285732
y is a subset of x with updated values for val1. In x, I want to overwrite the relevant values but keep the rest.
Sample data:
library(tidyverse)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
            location = c(1, 1, 1, 2, 2, 3),
            val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
            location = c(2, 2, 1),
            val1 = 10)
> x
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 4
5 dieter 2 5 5 5
6 alf 3 6 6 6
> y
# A tibble: 3 x 3
name location val1
<chr> <dbl> <dbl>
1 hans 2 10
2 dieter 2 10
3 hans 1 10
> # desired output
> out
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <dbl> <int> <int>
1 hans 1 10 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 10 4 4
5 dieter 2 10 5 5
6 alf 3 6 6 6
I wrote a function that is doing what I want, however it's quite cumbersome. I wonder if there's a more elegant way or even a dplyr function that I'm unaware of.
overwrite_join <- function(x, y, by = NULL){
  bycols     <- which(colnames(x) %in% by)
  commoncols <- which(colnames(x) %in% colnames(y))
  extracols  <- which(!(colnames(x) %in% colnames(y)))

  x1 <- anti_join(x, y, by = by) %>%
    bind_rows(y) %>%
    select(commoncols) %>%
    left_join(x %>% select(bycols, extracols), by = by)

  out <- x %>%
    select(by) %>%
    left_join(x1, by = by)

  return(out)
}
overwrite_join(x, y, by = c("name", "location"))
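As an aside, if a newer dplyr (>= 1.0.0) is available, dplyr::rows_update() appears to cover this exact "overwrite matching rows" pattern; a minimal sketch, assuming the join keys uniquely identify the rows in y:
library(dplyr)
# As in the safejoin answer further down, make the val1 types line up first.
y$val1 <- as.integer(y$val1)
# rows_update() overwrites the matching rows of x with the values from y,
# matched on the key columns; all other columns of x stay untouched.
x %>%
  rows_update(y, by = c("name", "location"))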

You could do something along the lines of
> x %>%
    left_join(y = y, by = c("name", "location")) %>%
    within(., val1.x <- ifelse(!is.na(val1.y), val1.y, val1.x)) %>%
    select(-val1.y)
# # A tibble: 6 x 5
# name location val1.x val2 val3
# <chr> <dbl> <dbl> <int> <int>
# 1 hans 1 10 1 1
# 2 dieter 1 2 2 2
# 3 bohlen 1 3 3 3
# 4 hans 2 10 4 4
# 5 dieter 2 10 5 5
# 6 alf 3 6 6 6
and then rename val1.x back to val1.
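The rename step, appended to the same pipe, might look like this (a sketch of the line above):
x %>%
  left_join(y = y, by = c("name", "location")) %>%
  within(., val1.x <- ifelse(!is.na(val1.y), val1.y, val1.x)) %>%
  select(-val1.y) %>%
  rename(val1 = val1.x)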

My package safejoin might help. It is only available on GitHub so far, but it has a feature designed for exactly this.
The conflict argument below must be fed a function or lambda to deal with conflicting columns when joining. Here we want to give priority to values from the y data frame, so we can use dplyr::coalesce() there. Note that we must first coerce y$val1 to integer, since in your example it is double while x$val1 is integer; your real case might not need this step.
# remotes::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
y$val1 <- as.integer(y$val1)
safe_left_join(x, y, by = c("name", "location"), conflict = ~coalesce(.y, .x))
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <int> <int> <int>
#> 1 hans 1 10 1 1
#> 2 dieter 1 2 2 2
#> 3 bohlen 1 3 3 3
#> 4 hans 2 10 4 4
#> 5 dieter 2 10 5 5
#> 6 alf 3 6 6 6
Edit: inspired by your own solution, here's a 100% dplyr option that you might like better; like your option, though, it's not a proper join!
bind_rows(y, x) %>%
  group_by(name, location) %>%
  summarize_all(~ na.omit(.x)[[1]]) %>%
  ungroup()
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <dbl> <int> <int>
#> 1 alf 3 6 6 6
#> 2 bohlen 1 3 3 3
#> 3 dieter 1 2 2 2
#> 4 dieter 2 10 5 5
#> 5 hans 1 10 1 1
#> 6 hans 2 10 4 4

Try dplyr::coalesce
x %>%
  left_join(y, by = c("name", "location")) %>%
  mutate(val1 = coalesce(val1.y, val1.x)) %>%
  select(-val1.x, -val1.y)
# A tibble: 6 x 5
name location val2 val3 val1
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 10
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 10
5 dieter 2 5 5 10
6 alf 3 6 6 6

This is the idiom I now use. It does not preserve the row or column order of x, if that is important.
I like it because I can run the pipeline up to just before the bind_rows(), do a visual inspection, and, if I like what I see, put the fixed rows back onto the base data frame.
library(dplyr)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
            location = c(1, 1, 1, 2, 2, 3),
            val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
            location = c(2, 2, 1),
            val1 = 10)

keys <- c("name", "location")
out <- x %>%
  semi_join(y, keys) %>%
  select(-matches(setdiff(names(y), keys))) %>%
  left_join(y, keys) %>%
  bind_rows(x %>% anti_join(y, keys))
out %>%
print()
#> # A tibble: 6 x 5
#> name location val2 val3 val1
#> <chr> <dbl> <int> <int> <dbl>
#> 1 hans 1 1 1 10
#> 2 hans 2 4 4 10
#> 3 dieter 2 5 5 10
#> 4 dieter 1 2 2 2
#> 5 bohlen 1 3 3 3
#> 6 alf 3 6 6 6
Created on 2019-12-12 by the reprex package (v0.3.0)
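If the original row and column order do matter, a small follow-up (not part of the answer above, just a sketch) is to re-attach the result to x's keys and reselect x's columns:
# Sketch: x's key columns drive the row order; the final select() restores
# the original column order.
x %>%
  select(name, location) %>%
  left_join(out, by = keys) %>%
  select(all_of(names(x)))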

Related

iterative functions in R

I’m trying to create multiple new score columns based on other columns. I’d like to use a function to minimize copy pasting large blocks of code.
I’m trying to do something like:
Myfunction <- function(column){
  Column_df <- old_df %>%
    mutate(column.score = if_else(column == 1, "yes", "no"))
}

Score_df <- Myfunction(c(math, reading, science))
But I’m getting an error saying object math is not found
Starting with an example data frame as below
df <- purrr::map_dfc(c('math', 'reading', 'science', 'history'),
                     ~ rlang::list2(!!.x := sample(1:3, 10, TRUE)))
df
#> # A tibble: 10 × 4
#> math reading science history
#> <int> <int> <int> <int>
#> 1 2 1 3 1
#> 2 3 2 3 1
#> 3 2 2 2 2
#> 4 2 3 1 2
#> 5 3 3 1 2
#> 6 1 2 3 2
#> 7 3 3 2 1
#> 8 3 3 3 2
#> 9 1 2 2 1
#> 10 2 2 2 3
You can create new "score" columns with a function by passing your columns argument to across() inside {{ }}, and using the .names argument to add ".score" to each name.
If you want only the "score" columns in the output, rather than adding them to the existing columns, use transmute instead of mutate.
library(dplyr, warn.conflicts = FALSE)
Myfunction <- function(df, columns){
  df %>%
    mutate(across({{ columns }}, ~ if_else(. == 1, 'yes', 'no'),
                  .names = '{.col}.score'))
}
df %>%
Myfunction(c(math, reading, science))
#> # A tibble: 10 × 7
#> math reading science history math.score reading.score science.score
#> <int> <int> <int> <int> <chr> <chr> <chr>
#> 1 2 1 3 1 no yes no
#> 2 3 2 3 1 no no no
#> 3 2 2 2 2 no no no
#> 4 2 3 1 2 no no yes
#> 5 3 3 1 2 no no yes
#> 6 1 2 3 2 yes no no
#> 7 3 3 2 1 no no no
#> 8 3 3 3 2 no no no
#> 9 1 2 2 1 yes no no
#> 10 2 2 2 3 no no no
Created on 2022-01-18 by the reprex package (v2.0.1)
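As mentioned above, a transmute variant keeps only the new score columns; a sketch of the same function with that single change (the function name here is just illustrative):
# Same idea as Myfunction above, but returning only the ".score" columns.
Myfunction_scores_only <- function(df, columns){
  df %>%
    transmute(across({{ columns }}, ~ if_else(. == 1, 'yes', 'no'),
                     .names = '{.col}.score'))
}

df %>%
  Myfunction_scores_only(c(math, reading, science))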

Is there a way to get subdataframes with purrr in magrittr pipes workflow without using data.frame name?

That is, I was interested in doing the same as in the example, but with purrr functions.
tibble(a, b = a * 2, c = 1) %>%
{lapply(X = names(.), FUN = function(.x) select(., 1:.x))}
[[1]]
# A tibble: 5 x 1
a
<int>
1 1
2 2
3 3
4 4
5 5
[[2]]
# A tibble: 5 x 2
a b
<int> <dbl>
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
[[3]]
# A tibble: 5 x 3
a b c
<int> <dbl> <dbl>
1 1 2 1
2 2 4 1
3 3 6 1
4 4 8 1
5 5 10 1
I could only do it if I assigned foo <- tibble(a, b = a * 2, c = 1) and then used select(foo, ...) inside map, but I wanted to avoid that, since I want to keep working on the data frame inside the pipe workflow.
Thank you!
You can use map in the following way:
library(dplyr)
library(purrr)

tibble(a = 1:5, b = a * 2, c = 1) %>%
  {map(names(.), function(.x) select(., 1:.x))}
Based on your actual use case, you can also use imap, which passes the column value (.x) along with its name (.y).
tibble(a = 1:5, b = a * 2, c = 1) %>%
  imap(function(.x, .y) select(., 1:.y))
#$a
# A tibble: 5 x 1
# a
# <int>
#1 1
#2 2
#3 3
#4 4
#5 5
#$b
# A tibble: 5 x 2
# a b
# <int> <dbl>
#1 1 2
#2 2 4
#3 3 6
#4 4 8
#5 5 10
#$c
# A tibble: 5 x 3
# a b c
# <int> <dbl> <dbl>
#1 1 2 1
#2 2 4 1
#3 3 6 1
#4 4 8 1
#5 5 10 1
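For what it's worth, the same pattern also works with column positions instead of names; a sketch along the same lines, still keeping everything inside the pipe:
library(dplyr)
library(purrr)

# Same idea, mapping over column positions; the braces keep the magrittr
# dot available inside the anonymous function, as in the answer above.
tibble(a = 1:5, b = a * 2, c = 1) %>%
  {map(seq_along(.), function(i) select(., 1:i))}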

Finding the differences of paired-columns using dplyr

set.seed(3)
library(dplyr)
dat <- tibble(Measure = c("Height", "Weight", "Width", "Length"),
              AD1_1 = rpois(4, 10),
              AD1_2 = rpois(4, 9),
              AD2_1 = rpois(4, 10),
              AD2_2 = rpois(4, 9),
              AD3_1 = rpois(4, 10),
              AD3_2 = rpois(4, 9),
              AD4_1 = rpois(4, 10),
              AD4_2 = rpois(4, 9),
              AD5_1 = rpois(4, 10),
              AD5_2 = rpois(4, 9),
              AD6_1 = rpois(4, 10),
              AD6_2 = rpois(4, 9))
Suppose I have data that looks like this. I wish to calculate the difference for each AD pair (the columns that share a prefix and differ only in the underscored number), i.e. AD1diff, AD2diff, AD3diff, and so on.
Instead of writing
dat %>%
  mutate(AD1diff = AD1_1 - AD1_2,
         AD2diff = AD2_1 - AD2_2,
         ...)
what would be an efficient way to write this?
One dplyr option could be:
dat %>%
  mutate(across(ends_with("_1"), .names = "{col}_diff") - across(ends_with("_2"))) %>%
  rename_with(~ sub("_\\d+", "", .), ends_with("_diff"))
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1 AD6_2 AD1_diff AD2_diff AD3_diff AD4_diff AD5_diff AD6_diff
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Height 6 10 10 3 12 8 7 5 7 5 8 9 -4 7 4 2 2 -1
2 Weight 8 9 13 6 14 7 8 7 13 11 10 9 -1 7 7 1 2 1
3 Width 10 9 11 5 12 8 7 11 9 5 5 6 1 6 4 -4 4 -1
4 Length 8 9 8 7 8 13 8 7 6 11 14 6 -1 1 -5 1 -5 8
The "tidy" way to do this would be to convert your data from wide to long, do a grouped subtraction, and then go back to wide format:
library(tidyr)
dat_long = dat %>% pivot_longer(
  cols = starts_with("AD"),
  names_sep = "_",
  names_to = c("group", "obs")
)
dat_long %>% head
# # A tibble: 48 x 4
# Measure group obs value
# <chr> <chr> <chr> <int>
# 1 Height AD1 1 6
# 2 Height AD1 2 10
# 3 Height AD2 1 10
# 4 Height AD2 2 3
# 5 Height AD3 1 12
# 6 Height AD3 2 8
dat_long %>%
  group_by(Measure, group) %>%
  summarize(diff = value[obs == 1] - value[obs == 2]) %>%
  pivot_wider(names_from = "group", values_from = "diff") %>%
  rename_with(.fn = ~ paste0(., "diff"), .cols = starts_with("AD"))
# # A tibble: 4 x 7
# # Groups: Measure [4]
# Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
# <chr> <int> <int> <int> <int> <int> <int>
# 1 Height -4 7 4 2 2 -1
# 2 Length -1 1 -5 1 -5 8
# 3 Weight -1 7 7 1 2 1
# 4 Width 1 6 4 -4 4 -1
Here is a data.table option
setDT(dat)[
  ,
  paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),
    "diff"
  ) := lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
    function(x) do.call("-", x)
  )
]
which gives
> dat
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1
1: Height 6 10 10 3 12 8 7 5 7 5 8
2: Weight 8 9 13 6 14 7 8 7 13 11 10
3: Width 10 9 11 5 12 8 7 11 9 5 5
4: Length 8 9 8 7 8 13 8 7 6 11 14
AD6_2 AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: 9 -4 7 4 2 2 -1
2: 9 -1 7 7 1 2 1
3: 6 1 6 4 -4 4 -1
4: 6 -1 1 -5 1 -5 8
or
setDT(dat)[
  ,
  c(.(Measure = Measure), setNames(lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
    function(x) do.call("-", x)
  ), paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),
    "diff"
  )))
]
gives
Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: Height -4 7 4 2 2 -1
2: Weight -1 7 7 1 2 1
3: Width 1 6 4 -4 4 -1
4: Length -1 1 -5 1 -5 8
Use the tidyverse package tidyr to rearrange your data before mutating.
require(dplyr)
require(tidyr)
#> Loading required package: tidyr
First, tidyr::pivot_longer the data frame so that each AD column value gets its own row:
new_dat <-
  pivot_longer(dat, cols = starts_with("AD"),  # for columns whose names start with 'AD'...
               names_sep = "_",                # ...separate column names on '_'
               names_to = c("AD_number", "observation")) %>%
  arrange(AD_number, Measure, observation)
head(new_dat, 9)
#> # A tibble: 9 x 4
#> Measure AD_number observation value
#> <chr> <chr> <chr> <int>
#> 1 Height AD1 1 6
#> 2 Height AD1 2 10
#> 3 Length AD1 1 8
#> 4 Length AD1 2 9
#> 5 Weight AD1 1 8
#> 6 Weight AD1 2 9
#> 7 Width AD1 1 10
#> 8 Width AD1 2 9
#> 9 Height AD2 1 10
Then, use tidyr::pivot_wider (the functional opposite of pivot_longer) to make a separate column for each value in observation. This will be very compatible with the upcoming mutate operation.
new_dat <-
  pivot_wider(new_dat,
              names_from = observation,
              values_from = value,
              names_prefix = "value_")
head(new_dat, 5)
#> # A tibble: 5 x 4
#> Measure AD_number value_1 value_2
#> <chr> <chr> <int> <int>
#> 1 Height AD1 6 10
#> 2 Length AD1 8 9
#> 3 Weight AD1 8 9
#> 4 Width AD1 10 9
#> 5 Height AD2 10 3
Finally, mutate the data:
new_dat <-
mutate(new_dat, diff = value_1 - value_2)
head(new_dat, 4)
#> # A tibble: 4 x 5
#> Measure AD_number value_1 value_2 diff
#> <chr> <chr> <int> <int> <int>
#> 1 Height AD1 6 10 -4
#> 2 Length AD1 8 9 -1
#> 3 Weight AD1 8 9 -1
#> 4 Width AD1 10 9 1
Created on 2021-01-22 by the reprex package (v0.3.0)
Getting back to your original data format is possible, but it might not make the data any easier to work with:
rename(new_dat,
       c(`1` = "value_1", `2` = "value_2")) %>%
  pivot_wider(names_from = AD_number,
              values_from = c(`1`, `2`, diff),
              names_glue = "{AD_number}_{.value}") %>%
  {.[, order(names(.))]} %>%
  relocate(Measure)

Finding rowwise minimum and column index in a tibble

I have the following tibble:
> df <- tibble(
    ID = LETTERS[1:4],
    a = c(1, 5, 9, 8),
    b = c(5, 9, 8, 2),
    c = c(5, 4, 5, 5)
  )
> df
# A tibble: 4 x 4
ID a b c
<chr> <dbl> <dbl> <dbl>
1 A 1 5 5
2 B 5 9 4
3 C 9 8 5
4 D 8 2 5
>
What I want is to get the rowwise minimum of columns a:c and also the column index of this minimum.
The output table should look like this:
# A tibble: 4 x 6
ID a b c Min Col_Index
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
I don't want to use rowwise()!
Thank you!
You could use pmin with do.call to get the rowwise minimum, and negate the values for max.col to get the column index of the minimum.
library(dplyr)
library(purrr)
df %>%
  mutate(Min = do.call(pmin, select(., a:c)),
         Col_Index = max.col(-select(., a:c)))
# ID a b c Min Col_Index
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
#1 A 1 5 5 1 1
#2 B 5 9 4 4 3
#3 C 9 8 5 5 3
#4 D 8 2 5 2 2
Using purrr's pmap_dbl:
df %>%
  mutate(Min = pmap_dbl(select(., a:c), ~ min(c(...))),
         Col_Index = pmap_dbl(select(., a:c), ~ which.min(c(...))))
One option could be:
df %>%
  rowwise() %>%
  mutate(min = min(c_across(a:c)),
         min_index = which.min(c_across(a:c)))
ID a b c min min_index
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
Base R solution:
setNames(cbind(df, t(apply(df[, vapply(df, is.numeric, logical(1))], 1, function(row) {
  cbind(min(row), which.min(row))
}))), c(names(df), "min", "col_index"))
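The same base R idea can be written a little more directly by pulling out the numeric columns once; a sketch (not from the original answer) that reuses the pmin/max.col trick shown earlier:
# Sketch: grab the numeric columns once, then vectorised minimum and index of minimum.
num <- df[vapply(df, is.numeric, logical(1))]
df$Min <- do.call(pmin, num)
df$Col_Index <- max.col(-as.matrix(num))
df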

bootstrap by group in tibble

Suppose I have a tibble tbl_
tbl_ <- tibble(id = c(1,1,2,2,3,3), dta = 1:6)
tbl_
# A tibble: 6 x 2
id dta
<dbl> <int>
1 1 1
2 1 2
3 2 3
4 2 4
5 3 5
6 3 6
There are 3 id groups. I want to resample entire id groups 3 times with replacement. For example the resulting tibble can be:
id dta
<dbl> <int>
1 1 1
2 1 2
3 1 1
4 1 2
5 3 5
6 3 6
but not
id dta
<dbl> <int>
1 1 1
2 1 2
3 1 1
4 2 4
5 3 5
6 3 6
or
id dta
<dbl> <int>
1 1 1
2 1 1
3 2 3
4 2 4
5 3 5
6 3 6
Here is one option with sample_n and distinct
library(tidyverse)
distinct(tbl_, id) %>%
  sample_n(nrow(.), replace = TRUE) %>%
  pull(id) %>%
  map_df(~ tbl_ %>%
           filter(id == .x)) %>%
  arrange(id)
# A tibble: 6 x 2
# id dta
# <dbl> <int>
#1 1.00 1
#2 1.00 2
#3 1.00 1
#4 1.00 2
#5 3.00 5
#6 3.00 6
Another option is to get the minimum row number for each id. Those row numbers are then sampled with replace = TRUE to build the bootstrap index.
library(dplyr)
tbl_ %>%
  mutate(rn = row_number()) %>%
  group_by(id) %>%
  summarise(minrow = min(rn)) -> min_row

indx <- rep(sample(min_row$minrow, nrow(min_row), replace = TRUE), each = 2) +
  rep(c(0, 1), 3)
tbl_[indx, ]
# # A tibble: 6 x 2
# id dta
# <dbl> <int>
# 1 1.00 1
# 2 1.00 2
# 3 3.00 5
# 4 3.00 6
# 5 2.00 3
# 6 2.00 4
Note: the above assumes each id has exactly 2 rows (it can handle any number of ids, though). The hard-coded each = 2 and c(0, 1) would need to be modified to handle groups with more than 2 rows per id; see the sketch below.
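One hedged way to avoid the hard-coded offsets altogether (a sketch, resampling whole id groups regardless of their size):
library(dplyr)

# Sketch: split into one data frame per id, resample whole groups with
# replacement, then stack them back together; group sizes may differ.
tbl_ %>%
  split(.$id) %>%
  sample(length(.), replace = TRUE) %>%
  bind_rows()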
