Finding the differences of paired columns using dplyr

set.seed(3)
library(dplyr)
dat <- tibble(Measure = c("Height", "Weight", "Width", "Length"),
              AD1_1 = rpois(4, 10),
              AD1_2 = rpois(4, 9),
              AD2_1 = rpois(4, 10),
              AD2_2 = rpois(4, 9),
              AD3_1 = rpois(4, 10),
              AD3_2 = rpois(4, 9),
              AD4_1 = rpois(4, 10),
              AD4_2 = rpois(4, 9),
              AD5_1 = rpois(4, 10),
              AD5_2 = rpois(4, 9),
              AD6_1 = rpois(4, 10),
              AD6_2 = rpois(4, 9))
Suppose I have data that looks like this. For each AD prefix, I wish to calculate the difference between its paired columns (the _1 and _2 suffixes), i.e., AD1diff, AD2diff, AD3diff, and so on.
Instead of writing
dat %>%
  mutate(AD1diff = AD1_1 - AD1_2,
         AD2diff = AD2_1 - AD2_2,
         ...)
what would be an efficient way to write this?

One dplyr option could be:
dat %>%
  mutate(across(ends_with("_1"), .names = "{col}_diff") - across(ends_with("_2"))) %>%
  rename_with(~ sub("_\\d+", "", .), ends_with("_diff"))
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1 AD6_2 AD1_diff AD2_diff AD3_diff AD4_diff AD5_diff AD6_diff
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Height 6 10 10 3 12 8 7 5 7 5 8 9 -4 7 4 2 2 -1
2 Weight 8 9 13 6 14 7 8 7 13 11 10 9 -1 7 7 1 2 1
3 Width 10 9 11 5 12 8 7 11 9 5 5 6 1 6 4 -4 4 -1
4 Length 8 9 8 7 8 13 8 7 6 11 14 6 -1 1 -5 1 -5 8
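Why this works: each across() call returns a tibble, and arithmetic between two data frames of equal dimensions is applied column by column, so every "_1" column gets its positional "_2" partner subtracted from it; mutate() then splices the resulting (renamed) columns back in. A minimal illustration of the data-frame arithmetic (the result takes its names from the left-hand side):
tibble(a = 1:3) - tibble(b = 4:6)
#>    a
#> 1 -3
#> 2 -3
#> 3 -3
Since the pairing is positional, this assumes the _1 and _2 columns appear in the same order.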

The "tidy" way to do this would be to convert your data from wide to long, do a grouped subtraction, and then go back to wide format:
library(tidyr)
dat_long = dat %>% pivot_longer(
  cols = starts_with("AD"),
  names_sep = "_",
  names_to = c("group", "obs")
)
dat_long %>% head
# # A tibble: 48 x 4
# Measure group obs value
# <chr> <chr> <chr> <int>
# 1 Height AD1 1 6
# 2 Height AD1 2 10
# 3 Height AD2 1 10
# 4 Height AD2 2 3
# 5 Height AD3 1 12
# 6 Height AD3 2 8
dat_long %>%
  group_by(Measure, group) %>%
  summarize(diff = value[obs == 1] - value[obs == 2]) %>%
  pivot_wider(names_from = "group", values_from = "diff") %>%
  rename_with(.fn = ~ paste0(., "diff"), .cols = starts_with("AD"))
# # A tibble: 4 x 7
# # Groups: Measure [4]
# Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
# <chr> <int> <int> <int> <int> <int> <int>
# 1 Height -4 7 4 2 2 -1
# 2 Length -1 1 -5 1 -5 8
# 3 Weight -1 7 7 1 2 1
# 4 Width 1 6 4 -4 4 -1
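The rename step could also be folded into pivot_wider() itself via its names_glue argument (available in recent tidyr versions), a sketch:
dat_long %>%
  group_by(Measure, group) %>%
  summarize(diff = value[obs == 1] - value[obs == 2]) %>%
  pivot_wider(names_from = "group", values_from = "diff",
              names_glue = "{group}diff")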

Here is a data.table option
library(data.table)
setDT(dat)[
  ,
  paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),  # strip the _1/_2 suffixes -> AD1, AD2, ...
    "diff"
  ) := lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),  # pair up the columns by prefix
    function(x) do.call("-", x)                                   # first minus second of each pair
  )
]
which gives
> dat
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1
1: Height 6 10 10 3 12 8 7 5 7 5 8
2: Weight 8 9 13 6 14 7 8 7 13 11 10
3: Width 10 9 11 5 12 8 7 11 9 5 5
4: Length 8 9 8 7 8 13 8 7 6 11 14
AD6_2 AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: 9 -4 7 4 2 2 -1
2: 9 -1 7 7 1 2 1
3: 6 1 6 4 -4 4 -1
4: 6 -1 1 -5 1 -5 8
or
setDT(dat)[
  ,
  c(.(Measure = Measure), setNames(lapply(
    split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
    function(x) do.call("-", x)
  ), paste0(
    unique(gsub("_\\d+", "", names(dat)[-1])),
    "diff"
  )))
]
gives
Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: Height -4 7 4 2 2 -1
2: Weight -1 7 7 1 2 1
3: Width 1 6 4 -4 4 -1
4: Length -1 1 -5 1 -5 8
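A minimal sketch of the core step, assuming dat is still in its original, unmodified form: split.default() groups the value columns by their stripped prefix, and do.call("-", x) then subtracts the second column of each pair from the first:
pairs <- split.default(dat[, -1], gsub("_\\d+", "", names(dat)[-1]))
pairs[["AD1"]]                # a two-column table: AD1_1, AD1_2
do.call("-", pairs[["AD1"]])  # AD1_1 - AD1_2
#> [1] -4 -1  1 -1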

Use the tidyverse package tidyr to rearrange your data before mutating:
require(dplyr)
require(tidyr)
#> Loading required package: tidyr
First, tidyr::pivot_longer the data frame so that there's a separate row for every AD value:
new_dat <-
  pivot_longer(dat, cols = starts_with("AD"), # For columns whose names start with 'AD'...
               names_sep = "_",               # separate names using '_' in colname
               names_to = c("AD_number", "observation")) %>%
  arrange(AD_number, Measure, observation)
head(new_dat, 9)
#> # A tibble: 9 x 4
#> Measure AD_number observation value
#> <chr> <chr> <chr> <int>
#> 1 Height AD1 1 6
#> 2 Height AD1 2 10
#> 3 Length AD1 1 8
#> 4 Length AD1 2 9
#> 5 Weight AD1 1 8
#> 6 Weight AD1 2 9
#> 7 Width AD1 1 10
#> 8 Width AD1 2 9
#> 9 Height AD2 1 10
Then, use tidyr::pivot_wider (the functional opposite of pivot_longer) to make a separate column for each value in observation. This will be very compatible with the upcoming mutate operation.
new_dat <-
  pivot_wider(new_dat,
              names_from = observation,
              values_from = value,
              names_prefix = "value_")
head(new_dat, 5)
#> # A tibble: 5 x 4
#> Measure AD_number value_1 value_2
#> <chr> <chr> <int> <int>
#> 1 Height AD1 6 10
#> 2 Length AD1 8 9
#> 3 Weight AD1 8 9
#> 4 Width AD1 10 9
#> 5 Height AD2 10 3
Finally, mutate the data:
new_dat <-
mutate(new_dat, diff = value_1 - value_2)
head(new_dat, 4)
#> # A tibble: 4 x 5
#> Measure AD_number value_1 value_2 diff
#> <chr> <chr> <int> <int> <int>
#> 1 Height AD1 6 10 -4
#> 2 Length AD1 8 9 -1
#> 3 Weight AD1 8 9 -1
#> 4 Width AD1 10 9 1
Created on 2021-01-22 by the reprex package (v0.3.0)
Getting back to your original data format is possible, but it might not make the data any easier to work with:
rename(new_dat,
       c(`1` = "value_1", `2` = "value_2")) %>%
  pivot_wider(names_from = AD_number,
              values_from = c(`1`, `2`, diff),
              names_glue = "{AD_number}_{.value}") %>%
  {.[, order(names(.))]} %>% # put the columns in alphabetical order
  relocate(Measure)


Subset row if column value for any of multiple columns equals value in a list

I have a data frame with ten columns, but five columns of concern: A, B, C, D, E. I also have a list of values. What's the best way to subset the rows whose value in column A, B, C, D, or E is included in the list of values?
If I were only concerned with a single column, I know I can use left_join(list_of_values, df$A) but I'm not sure how to do something similar with multiple columns.
The key here is if_any.
library(tidyverse)
set.seed(26)
sample_df <- tibble(col = rep(LETTERS[1:8], each = 5),
                    val = sample(1:10, 40, replace = TRUE),
                    ID = rep(1:5, 8)) |>
  pivot_wider(names_from = col, values_from = val)
sample_df
#> # A tibble: 5 x 9
#> ID A B C D E F G H
#> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 8 4 10 7 2 7 4 3
#> 2 2 3 2 3 3 4 10 2 3
#> 3 3 9 6 6 8 2 10 10 3
#> 4 4 7 6 8 9 3 5 8 3
#> 5 5 6 3 4 1 9 7 9 1
vals <- c(1, 7)
#solution
sample_df |>
  filter(if_any(A:E, ~ . %in% vals))
#> # A tibble: 3 x 9
#> ID A B C D E F G H
#> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 8 4 10 7 2 7 4 3
#> 2 4 7 6 8 9 3 5 8 3
#> 3 5 6 3 4 1 9 7 9 1
Or any() and apply() with base R:
#base solution
indx <- apply(sample_df[, which(colnames(sample_df) %in% LETTERS[1:5])], 1,
              \(x) any(x %in% vals))
sample_df[indx, ]
#> # A tibble: 3 x 9
#> ID A B C D E F G H
#> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 8 4 10 7 2 7 4 3
#> 2 4 7 6 8 9 3 5 8 3
#> 3 5 6 3 4 1 9 7 9 1
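For the complementary requirement (keep a row only if all of A through E match), if_all() has the same interface:
sample_df |>
  filter(if_all(A:E, ~ . %in% vals))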

Parse one column of json and bind with other column to make dataframe

I have data that takes the format:
have <- structure(list(V1 = c(4L, 28L, 2L),
V2 = c("[{\"group\":1,\"topic\":\"A\"},{\"group\":1,\"topic\":\"B\"},{\"group\":2,\"topic\":\"C\"},{\"group\":2,\"topic\":\"T\"},{\"group\":2,\"topic\":\"U\"},{\"group\":3,\"topic\":\"V\"},{\"group\":3,\"topic\":\"D\"},{\"group\":3,\"topic\":\"R\"},{\"group\":4,\"topic\":\"A\"},{\"group\":4,\"topic\":\"Q\"},{\"group\":4,\"topic\":\"S\"},{\"group\":4,\"topic\":\"W\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"F\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"H\"},{\"group\":6,\"topic\":\"I\"},{\"group\":6,\"topic\":\"J\"},{\"group\":6,\"topic\":\"K\"},{\"group\":6,\"topic\":\"L\"},{\"group\":6,\"topic\":\"M\"},{\"group\":6,\"topic\":\"N\"}]",
"[]",
"[{\"group\":2,\"topic\":\"C\"},{\"group\":3,\"topic\":\"D\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"M\"}]")
),
row.names = c(NA, 3L),
class = "data.frame")
The contents of V2 are nested groupings for each row like [{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]
I want to get a wide dataframe that has an indicator (1/0) for each combination of group+topic (see also_have) for each row. Something like this:
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
The first step is to parse the json.
I can use purrr::map(have$V2, jsonlite::fromJSON) to unnest into a list, but I'm not sure how to bind the V1 column (that we might rename to id) to each element of the resulting list (note that list element two is empty because V1==28 is empty). Here's a snippet of what the first element might look like with the id (V1) added.
[[1]]
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
...
Alternatively, I think purrr::map_df(have$V2, jsonlite::fromJSON) would get me closer to what I ultimately need, but here too I'm not sure how to add the row id (V1).
df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df
What I get:
group topic
1 1 A
2 1 B
3 2 C
4 2 T
...
What I want (notice `V1==28` does not appear):
group topic id
1 1 A 4
2 1 B 4
3 2 C 4
4 2 T 4
5 2 U 4
6 3 V 4
7 3 D 4
8 3 R 4
9 4 A 4
10 4 Q 4
11 4 S 4
12 4 W 4
13 6 O 4
14 6 P 4
15 6 E 4
16 6 F 4
17 6 G 4
18 6 H 4
19 6 I 4
20 6 J 4
21 6 K 4
22 6 L 4
23 6 M 4
24 6 N 4
25 2 C 2
26 3 D 2
27 6 O 2
28 6 P 2
29 6 E 2
30 6 G 2
31 6 M 2
I think if I can get the above dataframe with id, I can get the rest of the way. The ultimate goal is to join this info with also_have and then pivot wide.
# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
  mutate(topic_id = 1:n()) %>%
  magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
  select(topic_id, group, topic)
# pivot wide
# A tibble: 3 x 4
id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
<dbl> <dbl> <dbl> <dbl>
1 4 1 1 0
2 28 0 0 0
3 2 0 0 0
Update:
Applying @akrun's solution:
purrr::map_dfr(setNames(have$V2, have$V1),
               jsonlite::fromJSON,
               .id = 'V1') %>%
  rename(id = V1) %>%
  left_join(also_have, by = c("group", "topic")) %>%
  select(-group, -topic) %>%
  mutate(value = 1) %>%
  pivot_wider(id_cols = id,
              names_from = topic_id,
              names_prefix = "topic_id",
              values_from = value,
              values_fill = 0) %>%
  full_join(tibble(id = as.character(have$V1))) %>%
  replace(is.na(.), 0)
# A tibble: 3 x 25
id topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 1 1 1 1 1 1 1
2 2 0 0 1 0 0 0 1
3 28 0 0 0 0 0 0 0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
# topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
# topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
# topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
# topic_id143 <dbl>, topic_id144 <dbl>
We could pass a named vector and then use .id in map_dfr:
purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')
Output:
id group topic
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
11 4 4 S
12 4 4 W
...
Or this can be done within the dplyr framework itself after using rowwise (fromJSON here needs jsonlite loaded):
library(tidyr)
library(jsonlite)
have %>%
  rowwise %>%
  transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
  ungroup %>%
  unnest(c(V2), keep_empty = TRUE) %>%
  select(-V2)
# A tibble: 32 x 3
ID group topic
<int> <int> <chr>
1 4 1 A
2 4 1 B
3 4 2 C
4 4 2 T
5 4 2 U
6 4 3 V
7 4 3 D
8 4 3 R
9 4 4 A
10 4 4 Q
# … with 22 more rows
For the second step, do a join:
out <- have %>%
  rowwise %>%
  transmute(ID = V1, V2 = list(fromJSON(V2))) %>%
  ungroup %>%
  unnest(c(V2), keep_empty = TRUE) %>%
  select(-V2) %>%
  right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
ID group topic topic_id
<int> <int> <chr> <int>
1 4 1 A 1
2 4 1 B 2
3 4 2 C 29
4 4 2 T 46
5 4 2 U 47
6 4 3 V 74
7 4 3 D 56
8 4 3 R 70
9 4 4 A 79
10 4 4 Q 95
# … with 153 more rows

Compute the maximum value by group and by a time "window"

For the following panel data (tracking the Value for each unit "ID" over "Time"):
ID=c(1,1,1,1,1,2,2,2,2,2)
Time=c(1,2,3,4,5,1,2,3,4,5)
Value=c(1,9,4,8,5,2,5,9,7,6)
I would like to create a vector giving, for each "ID", the maximum value over the last two days (assuming that the unit of Time is a day).
Output vector "Max_Value" would be as follows:
Max_Value=c(1,9,9,8,8,2,5,9,9,7)
To clarify, here's how Max_Value is computed for ID "1":
At Time 1 the maximum is 1, the maximum of {1}.
At Time 2 it is 9, the maximum of {1,9}.
At Time 3 it is 9, the maximum of {9,4}.
At Time 4 it is 8, the maximum of {4,8}.
At Time 5 it is 8, the maximum of {8,5}.
If you just have vectors and Time is complete and sorted, slide + ave could work well for you:
ave(Value, ID, FUN = function(x) slider::slide_dbl(x, max, .before=1))
#> [1] 1 9 9 8 8 2 5 9 9 7
Or even a full Base R solution:
Value[ave(Value, ID, FUN = function(x) c(0, -(diff(x)<0))) + seq_along(Value)]
#> [1] 1 9 9 8 8 2 5 9 9 7
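The one-liner works by computing, per ID, an offset of -1 wherever the value dropped relative to the previous day; adding the offsets to the row indices then picks whichever of the current and previous element is larger. An illustrative breakdown:
offs <- ave(Value, ID, FUN = function(x) c(0, -(diff(x) < 0)))
offs
#> [1]  0  0 -1  0 -1  0  0  0 -1 -1
Value[seq_along(Value) + offs]
#> [1] 1 9 9 8 8 2 5 9 9 7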
Otherwise you can solve it with dplyr + slider:
library(dplyr)
data.frame(ID, Time, Value) %>%
  group_by(ID) %>%
  mutate(Max_Value = slider::slide_index_dbl(Value, Time, max, .before = 1)) %>%
  ungroup()
#> # A tibble: 10 x 4
#> ID Time Value Max_Value
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 1
#> 2 1 2 9 9
#> 3 1 3 4 9
#> 4 1 4 8 8
#> 5 1 5 5 8
#> 6 2 1 2 2
#> 7 2 2 5 5
#> 8 2 3 9 9
#> 9 2 4 7 9
#> 10 2 5 6 7
Try this:
library(data.table)
dt <- data.table(ID = c(1,1,1,1,1,2,2,2,2,2),
                 Time = c(1,2,3,4,5,1,2,3,4,5),
                 Value = c(1,9,4,8,5,2,5,9,7,6))
# for each row, take the max Value of the same ID within the last two days
max_v <- function(x) max(dt[ID == x$ID & Time <= x$Time & Time > (x$Time - 2), Value])
sapply(split(dt, 1:nrow(dt)), max_v)
I believe you can use a rollapply() style function from zoo setting a width of 2:
library(dplyr)
library(tidyr)
library(zoo)
#Data
df <- data.frame(ID,Time,Value)
#Code
newdf <- df %>% group_by(ID) %>%
  mutate(Max = rollapply(Value, width = 2, FUN = function(x) max(x, na.rm = TRUE),
                         by = 1, by.column = TRUE, partial = TRUE, fill = NA, align = "right"))
Output:
# A tibble: 10 x 4
# Groups: ID [2]
ID Time Value Max
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 2 9 9
3 1 3 4 9
4 1 4 8 8
5 1 5 5 8
6 2 1 2 2
7 2 2 5 5
8 2 3 9 9
9 2 4 7 9
10 2 5 6 7
With data.table you can also try frollapply (fast rolling function). Note that fill is set to first(Value) to handle the initial row of each ID group, where only one element is available instead of two.
dt <- data.frame(ID,Time,Value)
setDT(dt)
dt[, ValueMax := frollapply(x = Value,
n = 2,
max,
fill = first(Value),
align = "right",
na.rm = TRUE),
by = ID]
Output
ID Time Value ValueMax
1: 1 1 1 1
2: 1 2 9 9
3: 1 3 4 9
4: 1 4 8 8
5: 1 5 5 8
6: 2 1 2 2
7: 2 2 5 5
8: 2 3 9 9
9: 2 4 7 9
10: 2 5 6 7
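Assuming Time is complete and sorted within each ID, a two-day window can also be computed without rolling machinery at all, by taking pmax() of each value and its lag (Max2 is just an illustrative column name):
dt[, Max2 := pmax(Value, shift(Value), na.rm = TRUE), by = ID]
This gives the same result: the first row of each group compares Value against the NA produced by shift(), and na.rm = TRUE drops the NA.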

How to balance a dataset in `dplyr` using `sample_n` automatically to the size of the smallest class?

I have a dataset like:
df <- tibble(
  id = 1:18,
  class = rep(c(rep(1, 3), rep(2, 2), 3), 3),
  var_a = rep(c("a", "b"), 9)
)
# A tibble: 18 x 3
id class var_a
<int> <dbl> <chr>
1 1 1 a
2 2 1 b
3 3 1 a
4 4 2 b
5 5 2 a
6 6 3 b
7 7 1 a
8 8 1 b
9 9 1 a
10 10 2 b
11 11 2 a
12 12 3 b
13 13 1 a
14 14 1 b
15 15 1 a
16 16 2 b
17 17 2 a
18 18 3 b
That dataset contains a number of observations in several classes. The classes are not balanced. In the sample above we can see that only 3 observations are of class 3, while there are 6 observations of class 2 and 9 observations of class 1.
Now I want to automatically balance that dataset so that all classes are of the same size. So I want a dataset of 9 rows, 3 rows in each class. I can use the sample_n function from dplyr to do such a sampling.
I managed to do so by first calculating the smallest class size...
min_length <- as.numeric(df %>%
  group_by(class) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  summarise(min = min(n)))
...and then applying the sample_n function:
set.seed(1)
df %>% group_by(class) %>% sample_n(min_length)
# A tibble: 9 x 3
# Groups: class [3]
id class var_a
<int> <dbl> <chr>
1 15 1 a
2 7 1 a
3 13 1 a
4 4 2 b
5 5 2 a
6 17 2 a
7 18 3 b
8 6 3 b
9 12 3 b
I wondered if it's possible to do that (calculating the smallest class size and then sampling) in one go?
You can do it in one step, but it is cheating a little:
set.seed(42)
df %>%
  group_by(class) %>%
  sample_n(min(table(df$class))) %>%
  ungroup()
# # A tibble: 9 x 3
# id class var_a
# <int> <dbl> <chr>
# 1 1 1 a
# 2 8 1 b
# 3 15 1 a
# 4 4 2 b
# 5 5 2 a
# 6 11 2 a
# 7 12 3 b
# 8 18 3 b
# 9 6 3 b
I say "cheating" because normally you would not want to reference df$ from within the pipe. However, because they property we're looking for is of the whole frame but the table function only sees one group at a time, we need to side-step that a little.
One could do
df %>%
  mutate(mn = min(table(class))) %>%
  group_by(class) %>%
  sample_n(mn[1]) %>%
  ungroup()
# # A tibble: 9 x 4
# id class var_a mn
# <int> <dbl> <chr> <int>
# 1 14 1 b 3
# 2 13 1 a 3
# 3 7 1 a 3
# 4 4 2 b 3
# 5 16 2 b 3
# 6 5 2 a 3
# 7 12 3 b 3
# 8 18 3 b 3
# 9 6 3 b 3
Though I don't think that is any more elegant or readable.
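A variant of the same idea that stays entirely inside the pipe is to attach the class sizes with add_count() first (the column names class_n and mn are arbitrary):
df %>%
  add_count(class, name = "class_n") %>%
  mutate(mn = min(class_n)) %>%
  group_by(class) %>%
  sample_n(mn[1]) %>%
  ungroup() %>%
  select(-class_n, -mn)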

Overwrite left_join dplyr to update data

My question is similar to this one, however I have additional columns in the LHS that should be kept: https://stackoverflow.com/a/35642948/9285732
y is a subset of x with updated values for val1. In x I want to overwrite the relevant values but keep the rest.
Sample data:
library(tidyverse)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
            location = c(1, 1, 1, 2, 2, 3),
            val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
            location = c(2, 2, 1),
            val1 = 10)
> x
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 4
5 dieter 2 5 5 5
6 alf 3 6 6 6
> y
# A tibble: 3 x 3
name location val1
<chr> <dbl> <dbl>
1 hans 2 10
2 dieter 2 10
3 hans 1 10
> # desired output
> out
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <dbl> <int> <int>
1 hans 1 10 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 10 4 4
5 dieter 2 10 5 5
6 alf 3 6 6 6
I wrote a function that does what I want; however, it's quite cumbersome. I wonder if there's a more elegant way, or even a dplyr function that I'm unaware of.
overwrite_join <- function(x, y, by = NULL){
  bycols <- which(colnames(x) %in% by)
  commoncols <- which(colnames(x) %in% colnames(y))
  extracols <- which(!(colnames(x) %in% colnames(y)))
  x1 <- anti_join(x, y, by = by) %>%
    bind_rows(y) %>%
    select(commoncols) %>%
    left_join(x %>% select(bycols, extracols), by = by)
  out <- x %>% select(by) %>%
    left_join(x1, by = by)
  return(out)
}
overwrite_join(x, y, by = c("name", "location"))
You could do something along the lines of
x %>%
  left_join(y = y, by = c("name", "location")) %>%
  within(., val1.x <- ifelse(!is.na(val1.y), val1.y, val1.x)) %>%
  select(-val1.y)
# # A tibble: 6 x 5
# name location val1.x val2 val3
# <chr> <dbl> <dbl> <int> <int>
# 1 hans 1 10 1 1
# 2 dieter 1 2 2 2
# 3 bohlen 1 3 3 3
# 4 hans 2 10 4 4
# 5 dieter 2 10 5 5
# 6 alf 3 6 6 6
and then rename val1.x.
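For completeness, the full pipeline with that rename appended:
x %>%
  left_join(y = y, by = c("name", "location")) %>%
  within(., val1.x <- ifelse(!is.na(val1.y), val1.y, val1.x)) %>%
  select(-val1.y) %>%
  rename(val1 = val1.x)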
My package safejoin might help. It's only available on GitHub so far but has a feature designed just for this.
The conflict argument below must be fed a function or lambda to deal with conflicting columns when joining. Here we want, in priority, the value from the y data frame, so we can use dplyr::coalesce() there. Note that we must first coerce y$val1 to integer, since in your example it's double while x$val1 is integer; your real case might not need this step.
# remotes::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
y$val1 <- as.integer(y$val1)
safe_left_join(x, y, by = c("name", "location"), conflict = ~coalesce(.y, .x))
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <int> <int> <int>
#> 1 hans 1 10 1 1
#> 2 dieter 1 2 2 2
#> 3 bohlen 1 3 3 3
#> 4 hans 2 10 4 4
#> 5 dieter 2 10 5 5
#> 6 alf 3 6 6 6
Edit: inspired by your own solution, here's a 100% dplyr option that you might like better. Just like your option, though, it's not a proper join!
bind_rows(y, x) %>%
  group_by(name, location) %>%
  summarize_all(~ na.omit(.x)[[1]]) %>%
  ungroup()
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <dbl> <int> <int>
#> 1 alf 3 6 6 6
#> 2 bohlen 1 3 3 3
#> 3 dieter 1 2 2 2
#> 4 dieter 2 10 5 5
#> 5 hans 1 10 1 1
#> 6 hans 2 10 4 4
Try dplyr::coalesce
x %>%
  left_join(y, by = c("name", "location")) %>%
  mutate(val1 = coalesce(val1.y, val1.x)) %>%
  select(-val1.x, -val1.y)
# A tibble: 6 x 5
name location val2 val3 val1
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 10
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 10
5 dieter 2 5 5 10
6 alf 3 6 6 6
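Note that coalesce() leaves val1 as the last column here; if the original column order matters, relocate() (dplyr >= 1.0) can restore it, e.g. by appending:
relocate(val1, .after = location)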
This is the idiom I now use. It does not preserve the row or column order in x, if that is important.
I like it because I can evaluate the pipeline up to just before the bind_rows(), do a visual inspection, and, if I like the result, put the fixed rows back onto the base dataframe.
library(dplyr)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
            location = c(1, 1, 1, 2, 2, 3),
            val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
            location = c(2, 2, 1),
            val1 = 10)
keys <- c("name", "location")
out <- x %>%
  semi_join(y, keys) %>%
  select(-matches(setdiff(names(y), keys))) %>%
  left_join(y, keys) %>%
  bind_rows(x %>% anti_join(y, keys))
out %>%
print()
#> # A tibble: 6 x 5
#> name location val2 val3 val1
#> <chr> <dbl> <int> <int> <dbl>
#> 1 hans 1 1 1 10
#> 2 hans 2 4 4 10
#> 3 dieter 2 5 5 10
#> 4 dieter 1 2 2 2
#> 5 bohlen 1 3 3 3
#> 6 alf 3 6 6 6
Created on 2019-12-12 by the reprex package (v0.3.0)
