Master view of multiple dataframes with common columns - r

I have three dataframes like below:
df3 <- data.frame(col1=c('A','C','E'),col2=c(4,8,2))
df2 <- data.frame(col1=c('A','B','C','E','I'),col2=c(4,6,8,2,9))
df1 <- data.frame(col1=c('A','D','C','E','I'),col2=c(4,7,8,2,9))
The differences between any two dataframes could be as below:
anti_join(df2, df3)
# Joining, by = c("col1", "col2")
# col1 col2
# 1 B 6
# 2 I 9
anti_join(df3, df2)
# Joining, by = c("col1", "col2")
# [1] col1 col2
# <0 rows> (or 0-length row.names)
anti_join(df1, df2)
# Joining, by = c("col1", "col2")
# col1 col2
# 1 D 7
anti_join(df2, df1)
# Joining, by = c("col1", "col2")
# col1 col2
# 1 B 6
I would like to create a master dataframe with all the values in col1 and col2 specific to each dataframe. If there is no such value present, it should populate NA.
col1 df1_col2 df2_col2 df3_col2
1 A 4 4 4
2 B NA 6 NA
3 C 8 8 8
4 E 2 2 2
5 I 9 9 NA
6 D 7 NA NA
The essence of the above output could be established from the above anti_join commands. However, it does not provide the complete picture at once. Any thoughts on how to achieve this?
Edit: For multiple values in col2 for col1, the output is a little messier. For example, A has values 4, 3.
df3 <- data.frame(col1=c('A','C','E'),col2=c(4,8,2))
df2 <- data.frame(col1=c('A','A','B','C','E','I'),col2=c(4,3,6,8,2,9))
df1 <- data.frame(col1=c('A','A','D','C','E','I'),col2=c(4,3,7,8,2,9))
lst_of_frames <- list(df1 = df1, df2 = df2, df3 = df3)
lst_of_frames %>%
imap(~ rename_at(.x, -1, function(z) paste(.y, z, sep = "_"))) %>%
reduce(full_join, by = "col1")
It gives the below output.
# col1 df1_col2 df2_col2 df3_col2
# 1 A 4 4 4
# 2 A 4 3 4
# 3 A 3 4 4
# 4 A 3 3 4
# 5 D 7 NA NA
# 6 C 8 8 8
# 7 E 2 2 2
# 8 I 9 9 NA
# 9 B NA 6 NA
The interesting part of the output is:
# col1 df1_col2 df2_col2 df3_col2
# 1 A 4 4 4
# 2 A 4 3 4
# 3 A 3 4 4
# 4 A 3 3 4
whereas the expected output is:
# col1 df1_col2 df2_col2 df3_col2
# 1 A 4 4 4
# 2 A 3 3 NA

You may use the full_join function from the dplyr package.
df_master <- df1 %>%
full_join(df2, by = "col1") %>%
full_join(df3, by = "col1") %>%
select(col1, df1_col2 = col2.x,
df2_col2 = col2.y,
df3_col2 = col2)
col1 df1_col2 df2_col2 df3_col2
1 A 4 4 4
2 D 7 NA NA
3 C 8 8 8
4 E 2 2 2
5 I 9 9 NA
6 B NA 6 NA

Similar to @tamtam's answer, but a little more programmatic if you have a dynamic list of frames.
lst_of_frames <- list(df1 = df1, df2 = df2, df3 = df3)
# lst_of_frames <- tibble::lst(df1, df2, df3) # thanks, #user63230
library(dplyr)
library(purrr) # imap, reduce
lst_of_frames %>%
imap(~ rename_at(.x, -1, function(z) paste(.y, z, sep = "_"))) %>%
reduce(full_join, by = "col1")
# col1 df1_col2 df2_col2 df3_col2
# 1 A 4 4 4
# 2 D 7 NA NA
# 3 C 8 8 8
# 4 E 2 2 2
# 5 I 9 9 NA
# 6 B NA 6 NA
It's important (for automatically renaming the columns) that the list-of-frames be a named list; my assumption was the name of the frame variable list(df1=df1), but it could just as easily be list(A=df1) to produce a column named A_col2 in the end.

Related

Using bind_rows in a pipe

Following a question I came across today, I would like to know how I can use bind_rows function in a pipe while avoiding duplication and NA values. Consider I have the following simple tibble:
df <- tibble(
col1 = c(3, 4, 5),
col2 = c(5, 3, 1),
col3 = c(6, 4, 9),
col4 = c(9, 6, 5)
)
I would like to bind col1 & col2 row-wise with col3 & col4 so that I have a tibble with 2 columns and 6 observations, and finally rename the columns to newcol1 and newcol2.
But when I use bind_rows I got the following output with a lot of duplications and NA values.
df %>%
bind_rows(
select(., 1:2),
select(., 3:4)
)
# A tibble: 9 x 4
col1 col2 col3 col4
<dbl> <dbl> <dbl> <dbl>
1 3 5 6 9
2 4 3 4 6
3 5 1 9 5
4 3 5 NA NA
5 4 3 NA NA
6 5 1 NA NA
7 NA NA 6 9
8 NA NA 4 6
9 NA NA 9 5
# My desired output would be something like this:
# Stack columns 3-4 of `x` underneath columns 1-2.
#
# The four columns are first labelled newcol1, newcol2, newcol1, newcol2 so
# that both halves carry the same pair of names before they are row-bound.
# For a four-column input the result has two columns: newcol1 and newcol2.
f1 <- function(x) {
  relabelled <- set_names(x, nm = rep(c("newcol1", "newcol2"), 2))
  top_half <- relabelled[, 1:2]
  bottom_half <- relabelled[, 3:4]
  bind_rows(top_half, bottom_half)
}
f1(df)
# A tibble: 6 x 2
newcol1 newcol2
<dbl> <dbl>
1 3 5
2 4 3
3 5 1
4 6 9
5 4 6
6 9 5
I can get the desired output without a pipe. First, I would like to know how I could use bind_rows in a pipe without getting NA values and duplications; second, whether I could use the select function inside bind_rows, as I remember Hadley Wickham once used the filter function wrapped by bind_rows.
I would appreciate any explanation to this problem and thank you in advance.
Select the first two columns, then bind_rows onto them the result of transmute, which relabels col3 and col4 as col1 and col2:
df1 <- df %>%
select(col1, col2) %>%
bind_rows(
df %>%
transmute(col1 = col3, col2 = col4)
)
Results:
# A tibble: 6 x 2
col1 col2
<dbl> <dbl>
1 3 5
2 4 3
3 5 1
4 6 9
5 4 6
6 9 5

Nested full_join with suffixes for more than 2 data.frames

I want to merge several data.frames with some common columns and append a suffix to the column names to keep track of where the data for each column comes from.
I can do it easily with the suffix term in the first full_join, but when I do the second join, no suffixes are added. I can rename the third data.frame so it has suffixes, but I wanted to know if there is another way of doing it using the suffix term.
Here is an example code:
x = data.frame(col1 = c("a","b","c"), col2 = 1:3, col3 = 1:3)
y = data.frame(col1 = c("b","c","d"), col2 = 4:6, col3 = 1:3)
z = data.frame(col1 = c("c","d","a"), col2 = 7:9, col3 = 1:3)
> df = full_join(x, y, by = "col1", suffix = c("_x","_y")) %>%
full_join(z, by = "col1", suffix = c("","_z"))
> df
col1 col2_x col3_x col2_y col3_y col2 col3
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
I was expecting that col2 and col3 from data.frame z would have a "_z" suffix. I have tried using empty suffixes while merging two data.frames and it works.
I can work around by renaming the columns in z before doing the second full_join, but in my real data I have several common columns, and if I wanted to merge more data.frames it would complicate the code. This is my expected output.
> colnames(z) = paste0(colnames(z),"_z")
> df = full_join(x, y, by = "col1", suffix = c("_x","_y")) %>%
full_join(z, by = c("col1"="col1_z"))
> df
col1 col2_x col3_x col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
I have seen other similar problems in which an extra column is added to keep track of the source data.frame, but I was wondering why the suffix term does not work with multiple joins.
PS: If I keep the first suffix empty, I can add suffixes in the second join, but that will leave col2 and col3 from x without a suffix.
> df = full_join(x, y, by = "col1", suffix = c("","_y")) %>%
full_join(z, by = "col1", suffix = c("","_z"))
> df
col1 col2 col3 col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
You can do it like this:
full_join(x, y, by = "col1", suffix = c("","_y")) %>%
full_join(z, by = "col1", suffix = c("_x","_z"))
col1 col2_x col3_x col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
Adding the suffix for x at the last join should do the trick.

Replacement of column values based on a named vector

Consider the following named vector vec and tibble df:
vec <- c("1" = "a", "2" = "b", "3" = "c")
df <- tibble(col = rep(1:3, c(4, 2, 5)))
df
# # A tibble: 11 x 1
# col
# <int>
# 1 1
# 2 1
# 3 1
# 4 1
# 5 2
# 6 2
# 7 3
# 8 3
# 9 3
# 10 3
# 11 3
I would like to replace the values in the col column with the corresponding named values in vec.
I'm looking for a tidyverse approach, that doesn't involve converting vec as a tibble.
I tried the following, without success:
df %>%
mutate(col = map(
vec,
~ str_replace(col, names(.x), .x)
))
Expected output:
# A tibble: 11 x 1
col
<chr>
1 a
2 a
3 a
4 a
5 b
6 b
7 c
8 c
9 c
10 c
11 c
You could index vec by col, converted to character so that it matches the names of vec:
df$col1 <- vec[as.character(df$col)]
Or in mutate :
library(dplyr)
df %>% mutate(col1 = vec[as.character(col)])
# col col1
# <int> <chr>
# 1 1 a
# 2 1 a
# 3 1 a
# 4 1 a
# 5 2 b
# 6 2 b
# 7 3 c
# 8 3 c
# 9 3 c
#10 3 c
#11 3 c
We can also use data.table
library(data.table)
setDT(df)[, col1 := vec[as.character(col)]]

How to include exception when using fill everything?

I'm merging two data frames as follows:
data_merged <- full_join(df1, df2, by=c("col1","col2")) %>%
fill(everything(), .direction = 'down')
However, there is a column in the new merged data frame that I don't want to fill (say, col3). This column needs to retain its NA values. I've tried doing this with select but failed, and also thought of maybe working around it by making part of it a tibble, but I can't capitalize on the idea.
Does anybody have any ideas?
Try this:
data.frame(col1 = 1:10, col2 = c(1, NA), col3 = c(2,NA))%>%
fill(!col3, .direction = 'down')
# col1 col2 col3
# 1 1 1 2
# 2 2 1 NA
# 3 3 1 2
# 4 4 1 NA
# 5 5 1 2
# 6 6 1 NA
# 7 7 1 2
# 8 8 1 NA
# 9 9 1 2
# 10 10 1 NA
We can also use na.locf0 from zoo
library(zoo)
df1$col3 <- na.locf0(df1$col3)
data
df1 <- data.frame(col1 = 1:10, col2 = c(1, NA), col3 = c(2,NA))

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2). df1 as
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
# Column names of each frame, and the names they have in common.
nm1 <- names(df1)
nm2 <- names(df2)
nm <- intersect(nm1, nm2)
if (length(nm) > 0) {
  # Some names are shared: sum the shared columns element-wise and keep the
  # columns unique to either frame unchanged on each side of the sums.
  cbind(
    df1[!(nm1 %in% nm)], # columns only in df1
    df1[nm] + df2[nm],   # shared columns, added together
    df2[!(nm2 %in% nm)]  # columns only in df2
  )
} else {
  # Nothing is shared, so the frames are simply placed side by side.
  cbind(df1, df2)
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

Resources