Binding dataframes with different column names by row - r

I imported this excel sheet as a list of dataframes. I want to merge the list into one dataframe. bind_rows() allow me to easily add together the dataframes, but the issue is that I have a variable/column that has different names in each dataframe. bind_row() will by default create two separate columns, with empty values for the data from the other data frames. How can I join these columns?
Sample code:
# Sample dataframes
df1 <- tibble(A = c(1,2,3),
B = c("X","Y","Z"),
C = c(T,F,F)
)
df2 <- tibble(A = c(3,4,5),
B = c("U","V","W"),
D = c(T,T,F)
)
# List of dataframes
my_ls <- list(df1, df2)
my_ls
[[1]]
# A tibble: 3 x 3
A B C
<dbl> <chr> <lgl>
1 1 X TRUE
2 2 Y FALSE
3 3 Z FALSE
[[2]]
# A tibble: 3 x 3
A B D
<dbl> <chr> <lgl>
1 3 U TRUE
2 4 V TRUE
3 5 W FALSE
# Creating joined dataframe:
my_df <- bind_rows(my_ls)
my_df
# Current outcome: A tibble: 6 x 4
A B C D
<dbl> <chr> <lgl> <lgl>
1 1 X TRUE NA
2 2 Y FALSE NA
3 3 Z FALSE NA
4 3 U NA TRUE
5 4 V NA TRUE
6 5 W NA FALSE
The desired outcome:
# Desired outcome: A tibble: 6 x 3
A B C
<dbl> <chr> <lgl>
1 1 X TRUE
2 2 Y FALSE
3 3 Z FALSE
4 3 U TRUE
5 4 V TRUE
6 5 W FALSE
Currently, I've been using mutate() with case_when(), where I check which column is not empty (!is.na()). This works, but I can't help but think there must be an easier way.
# Example using mutate
my_df <- my_df %>%
mutate(
C = case_when(is.na(C) & !is.na(D) ~ D,
!is.na(C) & is.na(D) ~ C,
# The lines below may be a bit redundant for my purpose, since the dataframes either have the C or D variable.
!is.na(C) & !is.na(D) ~ C, # Better would be to return that variable has overlapping information
is.na(C) & is.na(D) ~ NA
)
) %>%
select(-D)
my_df
# A tibble: 6 x 3
A B C
<dbl> <chr> <lgl>
1 1 X TRUE
2 2 Y FALSE
3 3 Z FALSE
4 3 U TRUE
5 4 V TRUE
6 5 W FALSE

You can bind_rows and then select non-NA value using coalesce :
library(dplyr)
bind_rows(my_ls) %>% mutate(C = coalesce(C, D)) %>% select(A:C)
# A B C
# <dbl> <chr> <lgl>
#1 1 X TRUE
#2 2 Y FALSE
#3 3 Z FALSE
#4 3 U TRUE
#5 4 V TRUE
#6 5 W FALSE

Following the comment by #KarthikS you can rename your columns before binding. My approach using rename_with does not require the columns to be in a specific order. To illusrate this I used somewhat different example dataframes:
library(purrr)
library(dplyr)
d1 <- data.frame(A = 1, B = 2, C = 3)
d2 <- data.frame(A = 4, B = 5, D = 6)
d3 <- data.frame(D = 7, A = 8, B = 9)
d <- list(d1, d2, d3)
map(d, ~ rename_with(.x, ~ "C", matches("^D$"))) %>%
bind_rows()
#> A B C
#> 1 1 2 3
#> 2 4 5 6
#> 3 8 9 7
And now four your dataset:
d <- list(df1, df2)
map(d, ~ rename_with(.x, ~ "C", matches("^D$"))) %>%
bind_rows()
#> # A tibble: 6 x 3
#> A B C
#> <dbl> <chr> <lgl>
#> 1 1 X TRUE
#> 2 2 Y FALSE
#> 3 3 Z FALSE
#> 4 3 U TRUE
#> 5 4 V TRUE
#> 6 5 W FALSE
And if we add an addtional one with a different order:
df3 <- tibble(D = c(T,T,F),
A = c(7,8,9),
B = c("A","B","C"))
d <- list(df1, df2, df3)
map(d, ~ rename_with(.x, ~ "C", matches("^D$"))) %>%
bind_rows()
#> # A tibble: 9 x 3
#> A B C
#> <dbl> <chr> <lgl>
#> 1 1 X TRUE
#> 2 2 Y FALSE
#> 3 3 Z FALSE
#> 4 3 U TRUE
#> 5 4 V TRUE
#> 6 5 W FALSE
#> 7 7 A TRUE
#> 8 8 B TRUE
#> 9 9 C FALSE
Created on 2020-10-16 by the reprex package (v0.3.0)

Apologize for breaking out of the tidyverse for a quick answer
expl <- read.table(text= " A B C D
1 1 X TRUE NA
2 2 Y FALSE NA
3 3 Z FALSE NA
4 3 U NA TRUE
5 4 V NA TRUE
6 5 W NA FALSE")
expl$E <- ifelse(is.na(expl$C), expl$D, expl$C)
print(expl)
or maybe
expl[,c("C", "D")] %>% rowMeans(na.rm = TRUE) %>% as.logical()
EDIT: Translated the latter to tidy:
expl %>% select("C", "D") %>% rowMeans(na.rm = TRUE) %>% as.logical()
EDIT after first comment:
If you want more control you should probably write the things you want to do in each case in a function similar to the following example:
library(magrittr)
expl <- read.table(text= " A B C D
1 1 X TRUE NA
2 2 Y FALSE NA
3 3 Z FALSE NA
4 3 U NA TRUE
5 4 V NA TRUE
6 5 W NA FALSE
7 7 I NA NA
8 9 J TRUE TRUE")
myfun <- function(a, b){
if(is.na(a) & is.na(b))
return(NA)
if(!is.na(a) & !is.na(b)) {
warning("too much information, a and b set!")
return(NaN)
}
return(max(a, b, na.rm=TRUE))
}
myfun = Vectorize(myfun)
myfun(expl$C, expl$D) %>% as.logical()

Related

Generating True/False column in one dataframe based on multiple conditions in another dataframe in R

I have two dataframes of different length. The first looks like this and is the dataframe I want to add the True/False column to:
chr_snp loc_snp ma_snp
1 184319928 T
1 276998062 A
1 278255864 G
2 243012470 G
2 123072103 T
3 526785124 A
The second data frame is the reference dataframe that is smaller:
chr_QTL loc_QTL ma_QTL
1 281788173 G
1 203085725 C
2 241577141 C
For each row in dataframe 1 (df1), I want to first check if the value of df1$chr_snp matches a value in df2$chr_QTL. If this match is true, then I want to determine if the value in df1$loc_snp is within 10 million units (these are DNA base-pairs) above OR below any values based on the first condition in df2$loc_QTL. Now, what is tricky is that for the first three rows of df1, there are three possible row matches in df2 (rows 1 and 2) based on the first criteria alone. However, only two match based on the second criteria (10M base-pairs greater than OR less than value in df2$loc_QTL). Note: df1$ma_snp and df2$ma_QTL can be totally ignored. So, based on these criteria, df1 should now look like:
chr_snp loc_snp ma_snp Match
1 184319928 T FALSE
1 276998062 A TRUE
1 278255864 G TRUE
2 243012470 G TRUE
2 123072103 T FALSE
3 526785124 A FALSE
Here is one option. Join the data then look for any instance of < 10,000,000.
library(tidyverse)
left_join(df1 |>
mutate(rw_id = row_number()),
df2, by = c("chr_snp" = "chr_QTL")) |>
mutate(less = abs(loc_snp -loc_QTL) < 10e6) |>
group_by(rw_id)|>
summarise(across(contains(colnames(df1)), ~.[[1]]),
Match = any(less),
Match = ifelse(is.na(Match), FALSE, Match))
#> # A tibble: 6 x 5
#> rw_id chr_snp loc_snp ma_snp Match
#> <int> <dbl> <dbl> <chr> <lgl>
#> 1 1 1 184319928 T FALSE
#> 2 2 1 276998062 A TRUE
#> 3 3 1 278255864 G TRUE
#> 4 4 2 243012470 G TRUE
#> 5 5 2 123072103 T FALSE
#> 6 6 3 526785124 A FALSE
or another option:
library(tidyverse)
df1 |>
mutate(Match = map2_lgl(chr_snp, loc_snp,
\(x, y){
(x %in% df2$chr_QTL) &&
any(abs(df2[df2$chr_QTL == x, 'loc_QTL']-y) < 10e6)
}))
#> # A tibble: 6 x 4
#> chr_snp loc_snp ma_snp Match
#> <dbl> <dbl> <chr> <lgl>
#> 1 1 184319928 T FALSE
#> 2 1 276998062 A TRUE
#> 3 1 278255864 G TRUE
#> 4 2 243012470 G TRUE
#> 5 2 123072103 T FALSE
#> 6 3 526785124 A FALSE
just for fun. Here is a third option with base:
df1$Match <- apply(outer(df1$loc_snp, df2$loc_QTL, \(x,y) abs(x-y) < 10e6) &
outer(df1$chr_snp, df2$chr_QTL, `==`), 1, any)
df1
#> # A tibble: 6 x 4
#> chr_snp loc_snp ma_snp Match
#> <dbl> <dbl> <chr> <lgl>
#> 1 1 184319928 T FALSE
#> 2 1 276998062 A TRUE
#> 3 1 278255864 G TRUE
#> 4 2 243012470 G TRUE
#> 5 2 123072103 T FALSE
#> 6 3 526785124 A FALSE
One way:
split(df1, 1:NROW(df1)) <- lapply(split(df1, 1:NROW(df1)), function(x)
x$Match <- any(df2$chr_QTL==x$chr_snp & abs(df2$loc_QTL - x$loc_snp) < 1e7))

Replacement of column values based on a named vector

Consider the following named vector vec and tibble df:
vec <- c("1" = "a", "2" = "b", "3" = "c")
df <- tibble(col = rep(1:3, c(4, 2, 5)))
df
# # A tibble: 11 x 1
# col
# <int>
# 1 1
# 2 1
# 3 1
# 4 1
# 5 2
# 6 2
# 7 3
# 8 3
# 9 3
# 10 3
# 11 3
I would like to replace the values in the col column with the corresponding named values in vec.
I'm looking for a tidyverse approach, that doesn't involve converting vec as a tibble.
I tried the following, without success:
df %>%
mutate(col = map(
vec,
~ str_replace(col, names(.x), .x)
))
Expected output:
# A tibble: 11 x 1
col
<chr>
1 a
2 a
3 a
4 a
5 b
6 b
7 c
8 c
9 c
10 c
11 c
You could use col :
df$col1 <- vec[as.character(df$col)]
Or in mutate :
library(dplyr)
df %>% mutate(col1 = vec[as.character(col)])
# col col1
# <int> <chr>
# 1 1 a
# 2 1 a
# 3 1 a
# 4 1 a
# 5 2 b
# 6 2 b
# 7 3 c
# 8 3 c
# 9 3 c
#10 3 c
#11 3 c
We can also use data.table
library(data.table)
setDT(df)[, col1 := vec[as.character(col)]]

Value matching with NA - missing values - using mutate

I am somewhat stuck. Is there a better way than the below to do value matching considering NAs as "real values" within mutate?
library(dplyr)
data_foo <- data.frame(A= c(1:2, NA, 4, NA), B = c(1, 3, NA, NA, 4))
Not the desired output:
data_foo %>% mutate(irr = A==B)
#> A B irr
#> 1 1 1 TRUE
#> 2 2 3 FALSE
#> 3 NA NA NA
#> 4 4 NA NA
#> 5 NA 4 NA
data_foo %>% rowwise() %>% mutate(irr = A%in%B)
#> Source: local data frame [5 x 3]
#> Groups: <by row>
#>
#> # A tibble: 5 x 3
#> A B irr
#> <dbl> <dbl> <lgl>
#> 1 1 1 TRUE
#> 2 2 3 FALSE
#> 3 NA NA FALSE
#> 4 4 NA FALSE
#> 5 NA 4 FALSE
Desired output: The below shows the desired column, irr. I am using this somewhat cumbersome helper columns. Is there a shorter way?
data_foo %>%
mutate(NA_A = is.na(A),
NA_B = is.na(B),
irr = if_else(is.na(A)|is.na(B), NA_A == NA_B, A == B))
#> A B NA_A NA_B irr
#> 1 1 1 FALSE FALSE TRUE
#> 2 2 3 FALSE FALSE FALSE
#> 3 NA NA TRUE TRUE TRUE
#> 4 4 NA FALSE TRUE FALSE
#> 5 NA 4 TRUE FALSE FALSE
Using map2
library(tidyverse)
data_foo %>%
mutate(irr = map2_lgl(A, B, `%in%`))
# A B irr
#1 1 1 TRUE
#2 2 3 FALSE
#3 NA NA TRUE
#4 4 NA FALSE
#5 NA 4 FALSE
Or with setequal
data_foo %>%
rowwise %>%
mutate(irr = setequal(A, B))
The above method is concise, but it is also loopy. We can replace the NA with a different value and then do the ==
data_foo %>%
mutate_all(list(new = ~ replace_na(., -999))) %>%
transmute(A, B, irr = A_new == B_new)
# A B irr
#1 1 1 TRUE
#2 2 3 FALSE
#3 NA NA TRUE
#4 4 NA FALSE
#5 NA 4 FALSE
Or with bind_cols and reduce
data_foo %>%
mutate_all(replace_na, -999) %>%
reduce(`==`) %>%
bind_cols(data_foo, irr = .)
Maybe simpler than akrun's answer?
Any of the two ways below will produce the expected result. Note that as.character won't do it, because the return value of as.character(NA) is NA_character_.
data_foo %>%
mutate(irr = paste(A) == paste(B))
data_foo %>%
mutate(irr = sQuote(A) == sQuote(B))
#Source: local data frame [5 x 3]
#Groups: <by row>
#
## A tibble: 5 x 3
# A B irr
# <dbl> <dbl> <lgl>
#1 1 1 TRUE
#2 2 3 FALSE
#3 NA NA TRUE
#4 4 NA FALSE
#5 NA 4 FALSE
Edit.
Following the comments below I have updated the code and it now follows akrun's suggestion.
There is also the excellent idea in tmfmnk's answer. I use a similar one in yet another way of solving the question's problem.
The documentation of all.equal says that
Do not use all.equal directly in if expressions—either use
isTRUE(all.equal(....)) or identical if appropriate.
Though there is no if expression in mutate, I believe that it is more stable than identical and has the same effect if the values being compared are (sort of/in fact) equal.
data_foo %>%
mutate(irr = isTRUE(all.equal(A, B)))
Could also be a possibility:
data_foo %>%
rowwise() %>%
mutate(irr = identical(A, B)) %>%
ungroup()
A B irr
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 2 3 FALSE
3 NA NA TRUE
4 4 NA FALSE
5 NA 4 FALSE
The coalesce function is useful if you want to perform an action when a value is NA
data_foo %>%
mutate(irr = coalesce(A == B, is.na(A) & is.na(B)))
# A B irr
# 1 1 1 TRUE
# 2 2 3 FALSE
# 3 NA NA TRUE
# 4 4 NA FALSE
# 5 NA 4 FALSE
Same thing for > 2 columns
data_foo %>%
mutate(irr = coalesce(reduce(., `==`), rowMeans(is.na(.)) == 1))

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

ddply multifactorial all pairwise t-tests

How to perform a multifactorial t-test for all possible pairs of groups with a minimal number of coding lines.
My example:
3x features : 1,2,3
4x groups: : A,B,C,D
Aim: For each feature test all pairs of groups:
1(A-B,A-C,A-D,B-C,B-D,C-D)
2(A-B,A-C,A-D,B-C,B-D,C-D)
3(A-B,A-C,A-D,B-C,B-D,C-D)
= 18 T-tests
At the moment I am using ddply and inside lapply :
library(plyr)
groupVector <- c(rep("A",10),rep("B",10),rep("C",10),rep("D",10))
featureVector <- rep(1:3,each=40)
mydata <- data.frame(feature=factorVector,group=groupVector,value=rnorm(120,0,1))
ddply(mydata,.(feature),function(x){
grid <- combn(unique(x$group),2, simplify = FALSE)
df <- lapply(grid,function(p){
sub <- subset(x,group %in% p)
pval <- t.test(sub$value ~ sub$group)$p.value
data.frame(groupA=p[1],groupB=p[2],pval=pval)
})
res <- do.call("rbind",df)
return(res)
})
Here's my take, although it's arguable whether it's 'better'
split.data <- split(mydata, mydata$feature)
pairs <- as.data.frame(matrix(combn(unique(mydata$group), 2), nrow=2))
library(tidyverse)
map_df(split.data, function(x) map_df(pairs, function(y) tibble(groupA = y[1], groupB = y[2],
pval = t.test(value ~ group, data = x, subset = which(x$group %in% y))$p.value)), .id="feature")
Output
# # A tibble: 18 x 4
# feature groupA groupB pval
# <chr> <chr> <chr> <dbl>
# 1 1 A B 0.28452419
# 2 1 A C 0.65114472
# 3 1 A D 0.77746420
# 4 1 B C 0.42546791
# 5 1 B D 0.39876582
# 6 1 C D 0.88079645
# 7 2 A B 0.57843592
# 8 2 A C 0.30726571
# 9 2 A D 0.55457986
# 10 2 B C 0.74871464
# 11 2 B D 0.24017130
# 12 2 C D 0.04252878
# 13 3 A B 0.01355117
# 14 3 A C 0.08746756
# 15 3 A D 0.24527519
# 16 3 B C 0.15130684
# 17 3 B D 0.09172577
# 18 3 C D 0.64206517

Resources