Join two data frames with partially common and unequal columns

Is there a better way to leverage the power of merge and join in R? merge() loses the unique rows, and a join creates duplicate, partially filled columns.
Dataframe1
Key Col1 Col2 Col3
A 1 2 3
B 2 4 6
Dataframe2
Key Col1 Col2 Col4
A 1 2 4
C 3 6 12
D 4 8 20
Merged Dataframe
Key Col1 Col2 Col3 Col4
A 1 2 3 4
B 2 4 6 <NA>
C 3 6 <NA> 12
D 4 8 <NA> 20
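For reference, the behaviour described in the question can be reproduced with the example data given further below (a quick sketch, not part of the original post): merge() with its defaults keeps only the matching rows, while a plain full_join() on Key keeps every row but duplicates the shared columns.
library(dplyr)
# default merge() joins on all common columns (Key, Col1, Col2) and
# silently drops the rows without a match, i.e. B, C and D
merge(df1, df2)
# full_join() on Key alone keeps all rows, but the overlapping columns
# come back twice as Col1.x/Col1.y and Col2.x/Col2.y, each partially filled
full_join(df1, df2, by = "Key")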

We could bind the datasets with bind_rows() and then do a group_by() with summarise() or reframe() to return the first non-NA value per column.
library(dplyr)
bind_rows(df1, df2) %>%
  group_by(Key) %>%
  reframe(across(everything(), ~ .x[!is.na(.x)][1]))
-output
# A tibble: 4 × 5
Key Col1 Col2 Col3 Col4
<chr> <int> <int> <int> <int>
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20
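The summarise() variant mentioned above works the same way, since each across() call returns a single non-NA value per group (a sketch):
library(dplyr)
bind_rows(df1, df2) %>%
  group_by(Key) %>%
  summarise(across(everything(), ~ .x[!is.na(.x)][1]), .groups = "drop")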
Or we may use powerjoin
library(powerjoin)
power_full_join(df1, df2, by = "Key", conflict = coalesce_xy) %>%
  select(Key, order(names(.)[-1]) + 1)
-output
Key Col1 Col2 Col3 Col4
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20
data
df1 <- structure(list(Key = c("A", "B"), Col1 = 1:2, Col2 = c(2L, 4L
), Col3 = c(3L, 6L)), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(Key = c("A", "C", "D"), Col1 = c(1L, 3L, 4L),
Col2 = c(2L, 6L, 8L), Col4 = c(4L, 12L, 20L)),
class = "data.frame", row.names = c(NA,
-3L))

Here is an option with full_join combined with coalesce:
library(dplyr)
full_join(df1, df2, by = "Key") %>%
  mutate(Col1 = coalesce(Col1.x, Col1.y),
         Col2 = coalesce(Col2.x, Col2.y), .before = "Col3") %>%
  select(-contains("."))
Key Col1 Col2 Col3 Col4
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20
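If many columns overlapped, writing one coalesce() per column would get tedious; the same idea can be applied programmatically over every shared column (a sketch, assuming Key is the only join key):
library(dplyr)
joined <- full_join(df1, df2, by = "Key")
common <- setdiff(intersect(names(df1), names(df2)), "Key")
# coalesce each pair of suffixed columns back into a single column
for (nm in common) {
  joined[[nm]] <- coalesce(joined[[paste0(nm, ".x")]], joined[[paste0(nm, ".y")]])
}
joined %>%
  select(-ends_with(".x"), -ends_with(".y")) %>%
  select(Key, order(names(.)[-1]) + 1)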

Related

How to remove rows if values from a specified column in data set 1 do not match the values of the same column from data set 2 using dplyr

I have 2 data sets, both of which include an ID column with the same IDs. I have already removed rows from the first data set. For the second data set, I would like to use dplyr to remove any rows whose IDs do not appear in the first data set.
Meaning whatever is in DF2 must be in DF1; if it is not, it must be removed from DF2.
For example:
DF1
ID X Y Z
1 1 1 1
2 2 2 2
3 3 3 3
5 5 5 5
6 6 6 6
DF2
ID A B C
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
DF2 once rows have been removed
ID A B C
1 1 1 1
2 2 2 2
3 3 3 3
5 5 5 5
6 6 6 6
I used anti_join(), which shows me the rows that differ, but I cannot figure out how to remove the rows whose IDs do not match the first data set using dplyr.
Try with paste
i1 <- do.call(paste, DF2) %in% do.call(paste, DF1)
# if it is only to compare the 'ID' columns
i1 <- DF2$ID %in% DF1$ID
DF3 <- DF2[i1,]
DF3
ID A B C
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 5 5 5 5
5 6 6 6 6
DF4 <- DF2[!i1,]
DF4
ID A B C
4 4 4 4 4
7 7 7 7 7
data
DF1 <- structure(list(ID = c(1L, 2L, 3L, 5L, 6L), X = c(1L, 2L, 3L,
5L, 6L), Y = c(1L, 2L, 3L, 5L, 6L), Z = c(1L, 2L, 3L, 5L, 6L)), class = "data.frame", row.names = c(NA,
-5L))
DF2 <- structure(list(ID = 1:7, A = 1:7, B = 1:7, C = 1:7), class = "data.frame", row.names = c(NA,
-7L))
# Load package
library(dplyr)
# Load dataframes
df1 <- data.frame(
  ID = 1:6,
  X = 1:6,
  Y = 1:6,
  Z = 1:6
)
df2 <- data.frame(
  ID = 1:7,
  X = 1:7,
  Y = 1:7,
  Z = 1:7
)
# Include all rows in df1
df1 %>%
  left_join(df2)
Joining, by = c("ID", "X", "Y", "Z")
ID X Y Z
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
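Since the question asks specifically for dplyr, the filtering joins are also worth noting (a sketch, not part of the original answers): semi_join() keeps the rows of DF2 whose ID also occurs in DF1, and anti_join() returns the rows that get dropped.
library(dplyr)
# keep only DF2 rows whose ID is present in DF1
semi_join(DF2, DF1, by = "ID")
# the complement: DF2 rows that would be removed
anti_join(DF2, DF1, by = "ID")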

Create a column that returns the min/max of certain rows

I have data like these:
col1 col2 col3 col4 col5
1 3 1 7 3
4 2 8 2 5
3 1 5 1 4
I want to add columns that show the minimum and maximum by row, but only for certain columns (2 - 4, for example):
col1 col2 col3 col4 col5 min max
1 3 1 7 3 1 7
4 2 8 2 5 2 8
3 1 5 1 4 1 5
I know I could use select to subset those columns, calculate the min/max, and then use cbind to merge the result back onto the original data, but I feel like there is a better approach. Thanks!
Data
df <- structure(list(col1 = c(1L, 4L, 3L), col2 = 3:1, col3 = c(1L, 8L, 5L),
col4 = c(7L, 2L, 1L), col5 = c(3L, 5L, 4L)),
class = "data.frame", row.names = c(NA, -3L))
We could use pmin/pmax after selecting the columns
df$min <- do.call(pmin, c(df[2:4], na.rm = TRUE))
df$max <- do.call(pmax, c(df[2:4], na.rm = TRUE))
-output
> df
col1 col2 col3 col4 col5 min max
1 1 3 1 7 3 1 7
2 4 2 8 2 5 2 8
3 3 1 5 1 4 1 5
Or using tidyverse, we can do
library(dplyr)
df %>%
  mutate(min = rlang::exec(pmin, !!! rlang::syms(names(.)[2:4]), na.rm = TRUE),
         max = rlang::exec(pmax, !!! rlang::syms(names(.)[2:4]), na.rm = TRUE))
-output
col1 col2 col3 col4 col5 min max
1 1 3 1 7 3 1 7
2 4 2 8 2 5 2 8
3 3 1 5 1 4 1 5
With dplyr, you could use across + pmin/pmax:
library(dplyr)
df %>%
  mutate(min = do.call(pmin, c(across(col2:col4), na.rm = TRUE)),
         max = do.call(pmax, c(across(col2:col4), na.rm = TRUE)))
# # A tibble: 3 × 7
# col1 col2 col3 col4 col5 min max
# <int> <int> <int> <int> <int> <int> <int>
# 1 1 3 1 7 3 1 7
# 2 4 2 8 2 5 2 8
# 3 3 1 5 1 4 1 5
or c_across + min/max:
df %>%
  rowwise() %>%
  mutate(min = min(c_across(col2:col4), na.rm = TRUE),
         max = max(c_across(col2:col4), na.rm = TRUE)) %>%
  ungroup()
Because you tagged the question with dplyr, here is a dplyr solution:
library(dplyr)
mt2 <- mtcars %>%
  mutate(pmax = pmax(cyl, carb),
         pmin = pmin(cyl, carb))
Here is one with rowwise() combined with c_across():
library(dplyr)
df %>%
  rowwise() %>%
  mutate(min = min(c_across(col1:col5)),
         max = max(c_across(col1:col5)))
col1 col2 col3 col4 col5 min max
<int> <int> <int> <int> <int> <int> <int>
1 1 3 1 7 3 1 7
2 4 2 8 2 5 2 8
3 3 1 5 1 4 1 5
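For purely numeric data, a matrix-based route is also common (a sketch using matrixStats, which is not loaded in the answers above):
library(matrixStats)
df$min <- rowMins(as.matrix(df[2:4]), na.rm = TRUE)
df$max <- rowMaxs(as.matrix(df[2:4]), na.rm = TRUE)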

Change subsequent row values to the previous maximum up to that point if they are lower than the previous one, ignoring NAs

For example it looks like this now:
Sample Col1 Col2 Col3 Col4 Col5
A      1    NA   2    1    3
B      1    2    NA   1    5
C      0    1    5    NA   3
I want it to look like this:
Sample Col1 Col2 Col3 Col4 Col5
A      1    NA   2    2    3
B      1    2    NA   2    5
C      0    1    5    NA   5
One option is matrixStats::rowCummaxs, temporarily replacing the NAs with -Inf and restoring them afterwards:
df1 <- df
df1[is.na(df1)] <- -Inf
df1[-1] <- matrixStats::rowCummaxs(as.matrix(df1[-1])) * NA^is.na(df[-1])
df1
Sample Col1 Col2 Col3 Col4 Col5
1 A 1 NA 2 2 3
2 B 1 2 NA 2 5
3 C 0 1 5 NA 5
or even:
df1 <- df
df1[is.na(df1)] <- -Inf
df1[-1] <- matrixStats::rowCummaxs(as.matrix(df1[-1]))
is.na(df1) <- is.na(df)
df1
Sample Col1 Col2 Col3 Col4 Col5
1 A 1 NA 2 2 3
2 B 1 2 NA 2 5
3 C 0 1 5 NA 5
We may use cummax from base R: loop over the numeric columns of the dataset ([-1]) by row with apply (MARGIN = 1), replace the non-NA elements with the cumulative max of the values, and assign the result back.
df[-1] <- t(apply(df[-1], 1, FUN = function(x) {
  i1 <- !is.na(x)
  x[i1] <- cummax(x[i1])
  x
}))
-output
> df
Sample Col1 Col2 Col3 Col4 Col5
1 A 1 NA 2 2 3
2 B 1 2 NA 2 5
3 C 0 1 5 NA 5
data
df <- structure(list(Sample = c("A", "B", "C"), Col1 = c(1L, 1L, 0L
), Col2 = c(NA, 2L, 1L), Col3 = c(2L, NA, 5L), Col4 = c(1L, 1L,
NA), Col5 = c(3L, 5L, 3L)), class = "data.frame", row.names = c(NA,
-3L))
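For completeness, a long-format tidyverse variant of the same idea (a sketch, not from the original answers) reshapes to long, takes the cumulative max within each Sample while keeping the NA positions, and reshapes back:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(-Sample, names_to = "col") %>%
  group_by(Sample) %>%
  # cummax over the values with NA treated as -Inf, then restore the NAs
  mutate(value = replace(cummax(coalesce(value, -Inf)), is.na(value), NA)) %>%
  ungroup() %>%
  pivot_wider(names_from = col, values_from = value)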

Using complete to fill groups with NA to have same length as the maximum group

I have this dataframe:
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 3L), var = c("A", "B",
"C", "B", "C", "C")), class = "data.frame", row.names = c(NA,
-6L))
id var
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 3 C
I would like to get this dataframe:
id var
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
I would like to learn how to use complete or expand.grid in this situation
I have tried several ways but was not successful. One of my tries:
df %>%
  complete(id, var, fill = list(NA))
Create a duplicate of the 'var' column, do the complete() on that copy (which produces the NAs in the original 'var' column), and then remove the duplicate column:
library(dplyr)
library(tidyr)
df %>%
  mutate(var1 = var) %>%
  complete(id, var1) %>%
  select(-var1)
-output
# A tibble: 9 × 2
id var
<int> <chr>
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
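As a side note, the reason the plain complete(id, var) attempt in the question returns no NAs is that var itself is one of the variables being completed, so every padded row receives an actual var level; the duplicated helper column sidesteps that (a quick check, not part of the original answer):
library(tidyr)
# all 9 id/var combinations are generated, so var is never NA
complete(df, id, var)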

Transpose Rows in batches to Columns in R

My data.frame df looks like this:
A 1
A 2
A 5
B 2
B 3
B 4
C 3
C 7
C 9
I want it to look like this:
A B C
1 2 3
2 3 7
5 4 9
I have tried spread() but probably not in the right way. Any ideas?
We can use unstack from base R
unstack(df1, col2 ~ col1)
# A B C
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or with split
data.frame(split(df1$col2, df1$col1))
Or if we use spread or pivot_wider, make sure to create a sequence column
library(dplyr)
library(tidyr)
df1 %>%
  group_by(col1) %>%
  mutate(rn = row_number()) %>%
  ungroup %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  # or use
  # spread(col1, col2) %>%
  select(-rn)
# A tibble: 3 x 3
# A B C
# <int> <int> <int>
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or using dcast
library(data.table)
dcast(setDT(df1), rowid(col1) ~ col1)[, .(A, B, C)]
data
df1 <- structure(list(col1 = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), col2 = c(1L, 2L, 5L, 2L, 3L, 4L, 3L, 7L, 9L)),
class = "data.frame", row.names = c(NA,
-9L))
In data.table, we can use dcast:
library(data.table)
dcast(setDT(df), rowid(col1)~col1, value.var = 'col2')[, col1 := NULL][]
# A B C
#1: 1 2 3
#2: 2 3 7
#3: 5 4 9
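Since every group here has the same number of rows, a plain base R matrix fold also works (a sketch; it assumes df1 is sorted by col1):
# fill a 3-row matrix column by column, one column per group
as.data.frame(matrix(df1$col2, nrow = 3,
                     dimnames = list(NULL, unique(df1$col1))))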
