Merging dataframes by a vector of dataframe names - r

I am trying to merge approximately 30 dataframes.
I have saved their names from the global environment as a vector, as below:
df_names <- c("df1", "df2", "df3", "df4")
Now I am trying to merge all of these dataframes:
total <- merge(df_names, by = 'ID')
But I am getting an error:
Error in as.data.frame(y) : argument "y" is missing, with no default

Converting comments to an answer: merge() only joins two data frames at a time (x and y), which is why calling it with a single object produces the 'argument "y" is missing' error. For many data frames you're probably looking for a combination of mget and Reduce along with merge.
Demo:
df1 <- data.frame(ID = 1:3, var = c("a", "b", "c"))
df2 <- data.frame(ID = c(1, 3, 4), var = c("A", "B", "X"))
df3 <- data.frame(ID = c(2, 3, 4, 5), var = c("X", "Y", "Z", "A"))
df4 <- data.frame(ID = 1:5, var = letters[1:5])
Reduce(function(x, y) merge(x, y, by = "ID", all = TRUE), mget(paste0("df", 1:4)))
#   ID var.x var.y var.x var.y
# 1  1     a     A  <NA>     a
# 2  2     b  <NA>     X     b
# 3  3     c     B     Y     c
# 4  4  <NA>     X     Z     d
# 5  5  <NA>  <NA>     A     e
# Warning message:
# In merge.data.frame(x, y, by = "ID", all = TRUE) :
#   column names ‘var.x’, ‘var.y’ are duplicated in the result
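To avoid the duplicated-column-name warning, one option (a sketch, not part of the original answer) is to rename each value column after its data frame before reducing:
dfs <- mget(paste0("df", 1:4))
dfs <- Map(function(d, nm) setNames(d, c("ID", paste(names(d)[-1], nm, sep = "."))),
           dfs, names(dfs))
Reduce(function(x, y) merge(x, y, by = "ID", all = TRUE), dfs)
# ID var.df1 var.df2 var.df3 var.df4   (same rows as above, but no warning)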

Related

R script to generate all combinatorics of two identical lists including incomplete lists

I think this problem can be solved in many different ways, but I basically want a function that gives me a dataframe with every combination of values from a list in its columns, including the incomplete sets and excluding some, but not all, redundant combinations (order isn't important for now).
So I might start out with a list like this:
List = c("A","B","C")
and I want to get a dataframe that looks like
C1 = c("A","B","C","A","A","B","A")
C2 = c("","","","B","C","C","B")
C3 = c("","","","","","","C")
df <- cbind(C1, C2, C3)
row.names(df) <- c("A", "B", "C", "AB", "AC", "BC", "ABC")
colnames(df) <- c("First_Item", "Second_Item","Third_Item")
And then it fills in each cell with the corresponding letter.
E.g. position A1 in the df would be "A", and positions A2 and A3 would be empty.
Any idea how to do this?
I tried with tidyr's crossing():
library(tidyr)
list_1 = c("A", "B", "C", "NA")
list_2 = c("A", "B", "C", "NA")
list_3 = c("A", "B", "C", "NA")
list_4 = c("A", "B", "C", "NA")
test <- crossing(list_1, list_2,list_3,list_4)
test <- test[apply(test, MARGIN = 1, FUN = function(x) !any(duplicated(x))), ]
But I want to keep all the values with multiple NAs in them, so this doesn't quite work.
expand.grid has the same problem
expand.grid(list_1 = c("A", "B", "C", "NA"),list_2 = c("A", "B", "C", "NA"),list_3 = c("A", "B", "C", "NA"),list_4 = c("A", "B", "C", "NA"))
That's basically Roland's answer:
library(magrittr) # just for the pipe-operator
List %>%
  seq_along() %>%
  lapply(combn, x = List, simplify = FALSE) %>%
  unlist(recursive = FALSE) %>%
  sapply(`length<-`, length(List)) %>%
  t() %>%
  data.frame()
returns
  X1   X2   X3
1  A <NA> <NA>
2  B <NA> <NA>
3  C <NA> <NA>
4  A    B <NA>
5  A    C <NA>
6  B    C <NA>
7  A    B    C
Furthermore, you could use the dplyr and tidyr packages to replace the NAs. Just add one more step to the pipe:
mutate(across(everything(), ~ replace_na(.x, "")))
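For completeness, a sketch of the whole pipeline with that extra step added (assuming R >= 4.0 so the columns are character, and dplyr >= 1.0 for across()):
library(dplyr)
library(tidyr)
List <- c("A", "B", "C")
List %>%
  seq_along() %>%
  lapply(combn, x = List, simplify = FALSE) %>%
  unlist(recursive = FALSE) %>%
  sapply(`length<-`, length(List)) %>%
  t() %>%
  data.frame() %>%
  mutate(across(everything(), ~ replace_na(.x, "")))
#   X1 X2 X3
# 1  A
# 2  B
# 3  C
# 4  A  B
# 5  A  C
# 6  B  C
# 7  A  B  C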
Here is my approach:
library(purrr)
List <- c("xA", "xB", "xC") # arbitrary as per request in comments
seq_along(List) %>% # h/t #MartinGal
  map(~ combn(List, m = .x) %>%
        apply(2, paste, collapse = "<!>")) %>%
  unlist() %>%
  tibble::tibble() %>%
  tidyr::separate(1, into = c("First_Item", "Second_Item", "Third_Item"),
                  sep = "<!>")
Returns:
# A tibble: 7 x 3
  First_Item Second_Item Third_Item
  <chr>      <chr>       <chr>
1 xA         NA          NA
2 xB         NA          NA
3 xC         NA          NA
4 xA         xB          NA
5 xA         xC          NA
6 xB         xC          NA
7 xA         xB          xC

How to get NA for values which are not found in a dataframe

I have a vector of values and a dataframe. I can find each item of the vector in a specific column of the dataframe with the following command:
lapply(l, function(x) df[which(df$col1 == x), "col2"])
How can I get NA for values which are not available in my dataframe?
For example:
df:
  col1 col2
     1    a
     1    b
     2    c
l = c(1, 3)
output:
  col1 col2
     1  a,b
     3   NA
Using data.table you could achieve this efficiently by running a binary join to l (your vector)
library(data.table)
setDT(df)[.(l), # join between `df` & `l`
on = .(col1), # using `col1`
.(col2 = toString(col2)), # paste the values in `col2` (you can add `unique`)
by = .EACHI] # do this per each value in `l`
# col1 col2
# 1: 1 a, b
# 2: 3 NA
DATA:
df <- structure(list(col1 = c(1L, 1L, 2L), col2 = c("a", "b", "c")), .Names = c("col1","col2"), class = "data.frame", row.names = c(NA, -3L))
l <- c(1, 3)
CODE:
library(magrittr)
lapply(l, function(x){
  res <- df[[2]][df[[1]] == x] %>% paste(collapse = ",")
  if (res == "") res <- NA
  return(cbind(x, res))
}) %>% do.call(rbind, .)
Result:
x res
[1,] "1" "a,b"
[2,] "3" NA
A function which gives TRUE if something is NOT integer(0), character(0), etc.
(what they have in common is that their length is zero):
non.zero.vec <- function(x) length(x) > 0
Any list with such zero-length elements can be converted to NA using
zero2na <- function(vec) sapply(vec, function(x) ifelse(non.zero.vec(x), x, NA))
## e.g.
zero2na(list(1, 2, integer(0))) ## [1] 1 2 NA
Finally, this function does exactly what you want:
lookup <- function(df, key.col, val.col, keys) {
  idxs <- lapply(keys, function(x) which(df[, key.col] == x))
  lookups <- lapply(idxs, function(vec) if (length(vec) > 0) df[vec, val.col] else NA)
  lookupstrings <- unlist(lapply(lookups,
    function(v) if (all(is.na(v))) "NA" else paste(v, collapse = ", ")))
  res.df <- data.frame(unlist(keys), lookupstrings)
  colnames(res.df) <- c(key.col, val.col)
  res.df
}
df <- data.frame(col1 = c(1,1,2), col2 = c("a", "b", "c"))
lookup(df, "col1", "col2", c(1, 2, 3))
## output:
  col1 col2
1    1 a, b
2    2    c
3    3   NA
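Another sketch in base R (not one of the answers above, just an assumption about what you want): aggregate col2 per key, then merge against the lookup vector so that missing keys come back as NA:
df <- data.frame(col1 = c(1, 1, 2), col2 = c("a", "b", "c"))
l <- c(1, 3)
agg <- aggregate(col2 ~ col1, data = df, FUN = toString)
merge(data.frame(col1 = l), agg, by = "col1", all.x = TRUE)
#   col1 col2
# 1    1 a, b
# 2    3 <NA>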

Get elements by position from one data frame to another

Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
  A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
  V1 V2 V3
1  1  4  7
2  2  5  8
3  3  6  9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
require(dplyr)
toVec <- function(df) df %>% as.matrix %>% as.vector
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in the correct order:
> matchdf(c("c", "i", "h"), df1, df2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
[1] 1
# Should output 1 1 1
Feel free to use data.table and/or dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
matchdf <- function(values, dfin, dfout){
  unlist(sapply(values,
                function(val) dfout[dfin == val],
                USE.NAMES = FALSE))
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1
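A possible variant (my own sketch, not part of the answer above) uses match(), which keeps one output element per input value and returns NA for values that are not found:
matchdf2 <- function(values, dfin, dfout) {
  unlist(dfout, use.names = FALSE)[match(values, unlist(dfin, use.names = FALSE))]
}
matchdf2(c("c", "i", "h"), df1, df2)
# [1] 3 9 8
matchdf2(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
# [1] NA NA  1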

How to join multiple data frames using dplyr?

I want to left_join multiple data frames:
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Reduce(left_join, dfs)
# a b c d
# 1 1 a 4 NA
# 2 2 b NA 7
# 3 3 c 5 8
This works because they all have the same b column, but Reduce doesn't let me specify additional arguments that I can pass to left_join. Is there a workaround for something like this?
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, d = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Update
This kind of works: Reduce(function(...) left_join(..., by = c("b" = "d")), dfs) but when by is more than one element it gives this error: Error: cannot join on columns 'b' x 'd': index out of bounds
I know it's late... today I got introduced to the unanswered questions section. Sorry to bother.
Using left_join():
library(dplyr)
dfs <- list(
  df1 = data.frame(b = c("a", "b", "c"), a = 1:3),
  df2 = data.frame(d = c("a", "c", "d"), c = 4:6),
  df3 = data.frame(b = c("b", "c", "e"), d = 7:9)
)
func <- function(...){
  df1 <- list(...)[[1]]
  df2 <- list(...)[[2]]
  col1 <- colnames(df1)[1]
  col2 <- colnames(df2)[1]
  xxx <- left_join(..., by = setNames(col2, col1))
  return(xxx)
}
Reduce(func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Using merge():
func <- function(...){
  df1 <- list(...)[[1]]
  df2 <- list(...)[[2]]
  col1 <- colnames(df1)[1]
  col2 <- colnames(df2)[1]
  xxx <- merge(..., by.x = col1, by.y = col2, all.x = TRUE)
  return(xxx)
}
Reduce(func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Would this work for you?
jnd.tbl <- df1 %>%
left_join(df2, by='b') %>%
left_join(df3, by='d')
Yet another solution:
library(purrr)
library(dplyr)
dfs = list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
purrr::reduce(dfs, dplyr::left_join, by = 'b')
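For what it's worth, the same reduce pattern also accepts a multi-column by; here is a minimal sketch with hypothetical two-key data (not from the question):
dfs2 <- list(
  data.frame(b = c("a", "b"), yr = c(1, 2), x = 1:2),
  data.frame(b = c("a", "b"), yr = c(1, 2), y = 3:4),
  data.frame(b = c("a", "b"), yr = c(1, 2), z = 5:6)
)
purrr::reduce(dfs2, dplyr::left_join, by = c("b", "yr"))
#   b yr x y z
# 1 a  1 1 3 5
# 2 b  2 2 4 6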

join matching columns in a data.frame or data.table

I have the following data.frames:
a <- data.frame(id = 1:3, v1 = c('a', NA, NA), v2 = c(NA, 'b', 'c'))
b <- data.frame(id = 1:3, v1 = c(NA, 'B', 'C'), v2 = c("A", NA, NA))
> a
id v1 v2
1 1 a <NA>
2 2 <NA> b
3 3 <NA> c
> b
id v1 v2
1 1 <NA> A
2 2 B <NA>
3 3 C <NA>
Note: there are no ids for which v1 or v2 are defined in both tables; there is only a single unique non-NA value in each column for each id value.
I would like to merge these data frames on matching values of "id":
ab <- merge(a, b, by = "id")
but I would also like to combine the two columns v1 and v2, so that the data.frame ab will look like this:
ab <- data.frame(id = 1:3, v1 = c("a", "B", "C"), v2 = c("A", "b", "c"))
> ab
id v1 v2
1 1 a A
2 2 B b
3 3 C c
instead, I get this:
> merge(a, b, by = "id")
id v1.x v2.x v1.y v2.y
1 1 a <NA> <NA> A
2 2 <NA> b B <NA>
3 3 <NA> c C <NA>
It would be helpful to have examples using both data.frame and data.table, so here are the data.table versions of the above:
library(data.table)
A <- data.table(a, key = 'id')
B <- data.table(b, key = 'id')
A[B]
The type of merge you specify probably won't be possible using merge (with data frames), although saying that usually invites being proved wrong.
You also omit some details: will there always be a single unique non-NA value in each column for each id value? If so, this will work:
library(plyr)
ab <- rbind(a, b)
colFun <- function(x){ x[which(!is.na(x))] }
ddply(ab, .(id), function(x){ colwise(colFun)(x) })
  id v1 v2
1  1  a  A
2  2  B  b
3  3  C  c
A similar strategy should work with data.tables as well:
abDT <- data.table(ab, key = "id")
abDT[, list(colFun(v1), colFun(v2)), by = id]
     id V1 V2
[1,]  1  a  A
[2,]  2  B  b
[3,]  3  C  c
If your data is as simple as it is above, joran's answer is likely the simplest way. Here's my approach in base R:
a <- data.frame(id = 1:3, v1 = c('a', NA, NA), v2 = c(NA, 'b', 'c'))
b <- data.frame(id = 1:3, v1 = c(NA, 'B', 'C'), v2 = c("A", NA, NA))
decider <- function(x, y) factor(ifelse(is.na(x), as.character(y), as.character(x)))
data.frame(mapply(a, b, FUN = decider))
If your data has different ids (some overlap and some do not), then here's a different approach:
a <- data.frame(id = c(1,2,4,5), v1 = c('a', NA, "q", NA), v2 = c(NA, 'b', 'c', "e"))
b <- data.frame(id = 1:4, v1 = c(NA, "A", "C", 'B'), v2 = c("A", NA, "D", NA))
decider <- function(x, y) factor(ifelse(is.na(x), as.character(y), as.character(x)))
DF <- data.frame(mapply(a, b, FUN = decider))
DF2 <- rbind(b[!b$id %in% DF$id , ], DF)
DF2 <- DF2[order(DF2$id), ]
rownames(DF2) <- 1:nrow(DF2)
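For the simple same-id case at the top of this question, a hedged dplyr sketch (not one of the original answers) is to merge first and then coalesce the paired columns:
library(dplyr)
a <- data.frame(id = 1:3, v1 = c("a", NA, NA), v2 = c(NA, "b", "c"))
b <- data.frame(id = 1:3, v1 = c(NA, "B", "C"), v2 = c("A", NA, NA))
merge(a, b, by = "id") %>%
  transmute(id,
            v1 = coalesce(as.character(v1.x), as.character(v1.y)),
            v2 = coalesce(as.character(v2.x), as.character(v2.y)))
#   id v1 v2
# 1  1  a  A
# 2  2  B  b
# 3  3  C  c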
