How to join multiple data frames using dplyr? - r

I want to left_join multiple data frames:
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Reduce(left_join, dfs)
# a b c d
# 1 1 a 4 NA
# 2 2 b NA 7
# 3 3 c 5 8
This works because they all have the same b column, but Reduce doesn't let me specify additional arguments to pass on to left_join. Is there a workaround for something like this?
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, d = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Update
This kind of works: Reduce(function(...) left_join(..., by = c("b" = "d")), dfs) but when by is more than one element it gives this error: Error: cannot join on columns 'b' x 'd': index out of bounds

I know this answer comes late; I only came across the question today in the unanswered questions section.
Using left_join()
dfs <- list(
df1 = data.frame(b = c("a", "b", "c"), a = 1:3),
df2 = data.frame(d = c("a", "c", "d"), c = 4:6),
df3 = data.frame(b = c("b", "c", "e"), d = 7:9)
)
# Reducer for Reduce(): left-joins the two tables received through `...`,
# matching the first column of the left table against the first column of
# the right table. Returns the joined table.
func <- function(...) {
  tables <- list(...)
  left_key <- colnames(tables[[1]])[1]
  right_key <- colnames(tables[[2]])[1]
  # A named character vector tells left_join(): name = column in the left
  # table, value = column in the right table.
  join_spec <- setNames(right_key, left_key)
  left_join(..., by = join_spec)
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Using merge() :
# Reducer for Reduce(): left-merges the two data frames received through
# `...`, matching the first column of the left table against the first
# column of the right table.
#
# Returns the merged data frame; every row of the left table is kept
# (all.x = TRUE) and unmatched right-hand columns are filled with NA.
func <- function(...) {
  tables <- list(...)
  left_key <- colnames(tables[[1]])[1]
  right_key <- colnames(tables[[2]])[1]
  # No stray empty argument here: an empty positional slot would be handed
  # to merge() as a missing `by`, which only works by accident.
  merge(..., by.x = left_key, by.y = right_key, all.x = TRUE)
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8

Would this work for you?
# For the second example (df2 carries the key in column `d`, df3 in `b`):
# match df1$b against df2$d first; the result keeps the key under the name
# `b`, so df3 can then be joined on the shared `b` column.
jnd.tbl <- df1 %>%
  left_join(df2, by = c("b" = "d")) %>%
  left_join(df3, by = "b")

Yet another solution:
library(purrr)
library(dplyr)
dfs = list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
purrr::reduce(dfs, dplyr::left_join, by = 'b')

Related

Transposing a single row (top row) within a dataset to its own column

I have the following dataset
df <- structure(list(
X1 = c("A", "B", "C", "D"),
X2 = c("NA", "B", "C", "D"),
X3 = c("NA", "B", "C", "D"),
X4 = c("NA", "B", "C", "D")),
class = "data.frame", row.names = c(NA, -4L))
I need to transpose the top row into its own column such that the data looks like
df <- structure(list(
X1 = c("A", "A", "A"),
X2 = c("B", "C", "D"),
X3 = c("B", "C", "D"),
X4 = c("B", "C", "D")),
class = "data.frame", row.names = c(NA, -3L))
I have thought about just subsetting and taking the top row, then transposing only that one and then merging it back to the original dataset.
I am wondering if there is a more elegant solution.
Edit, Thanks for everyone's help.
The next step is to take this and apply it to a list of tibbles that were split via group_split (code thanks to @LMc).
data %>%
group_by(split_on = cumsum(is.na(Company) & is.na(lag(Company)))) %>%
group_split(.keep = F) %>%
`names<-`({.} %>%
map(~ .[1,1])%>%
unlist())
"[<-"("["(df, -1, ), ,1,df[1,1])
:-)
df[,1] <- df[1,1]
df[-1,]
X1 X2 X3 X4
2 A B B B
3 A C C C
4 A D D D
We could use
library(dplyr)
df %>%
mutate(X1 = first(X1)) %>%
slice(-1)
# X1 X2 X3 X4
#1 A B B B
#2 A C C C
#3 A D D D
Or in base R (R 4.1.0)
df |>
transform(X1 = X1[1]) |>
subset( seq_along(X1) > 1)
# X1 X2 X3 X4
#2 A B B B
#3 A C C C
#4 A D D D

Merging same column from a dataset onto all of the columns of another in R?

I'm trying to do multiple merges/joins onto different columns in the same dataset, but when I do so the output is entirely wrong.
df1
P1 P2 P3 P4
A B C NA
A B NA NA
E F G H
E NA NA NA
df2
P Output
C 1
B 2
H 3
E 4
I'm trying to merge df2 onto df1 and the output I would like to get would look like
df3
P1 P2 P3 P4 Output
A B C NA 1
A B NA NA 2
E F G H 3
E NA NA NA 4
I've tried
df3<- merge(df1,df2, by.x = "P1", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P2", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P3", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P4", by.y = "P", all.x = T, all.y = T)
however it doesn't work the way I think it should. Is there an easier function that can cleanly merge like this that I am not aware of?
Based on the output shown, it seems that for each row we need to take the last non-NA element and match it against the 'P' column of the second data.frame to get the corresponding 'Output'. If that is the logic,
df3 <- df1
df3$Output <- apply(df1, 1, function(x)
setNames(df2$Output, df2$P)[tail(x[!is.na(x)], 1)])
Or with tidyverse
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, values_drop_na = TRUE) %>%
group_by(rn) %>%
slice(n()) %>%
ungroup %>%
left_join(df2, by = c('value' = 'P')) %>%
select(Output) %>%
bind_cols(df1, .)
data
df1 <- structure(list(P1 = c("A", "A", "E", "E"), P2 = c("B", "B", "F",
NA), P3 = c("C", NA, "G", NA), P4 = c(NA, NA, "H", NA)), class = "data.frame",
row.names = c(NA,
-4L))
df2 <- structure(list(P = c("C", "B", "H", "E"), Output = 1:4),
class = "data.frame", row.names = c(NA,
-4L))
You can use coalesce from the dplyr package to create a new field in df1 which will be the key between the two datasets.
library(dplyr)
# Create key column P: coalesce() returns, per row, the first non-NA value
# among its arguments. Because the columns are passed in the order
# P4, P3, P2, P1, this is the right-most (last) non-NA entry of each row.
df1$P <- coalesce(df1$P4,df1$P3,df1$P2,df1$P1)
# Join the two data frames on the key column P
df3 <- inner_join(df1, df2, by='P')
# Remove the helper key column P from the result
df3$P <- NULL
>> df3
P1 P2 P3 P4 Output
1 A B C <NA> 1
2 A B <NA> <NA> 2
3 E F G H 3
4 E <NA> <NA> <NA> 4

Calling a list index within a loop using names()

I have a list aa which references the index names of another list bb as well as containing one other element (call it cm). List bb items contain strings. I have a loop that goes through bb and, for every item which matches a string I've specified, adds it to a new row in a dataframe. What I need is to also add the cm value to that row.
Example:
library("tidyverse")
aa <- list(c(123, 1), c(234, 1), c(345, 2), c(456, 3))
bb <- list("123" = c("a", "b", "c"), "234" = c("b", "c", "d"), "345" = c("c", "d", "e"), "456" = c("f", "g", "h"))
cc <- c("a", "b", "c")
tbl <- NULL
for (a in aa){
for (b in bb) {
if (any(cc %in% b)) {
tb <- tibble(cm=a[2],n1=b[1],n2=b[2],n3=b[3])
tbl <- bind_rows(tbl,tb)
}
}
}
This iterates over every possible combination of aa and bb and pairs each matching bb item with every cm, which is no good. My output should look something like this:
output <- tibble(cm = c(1, 1, 2), n1 = c("a", "b", "c"),
n2 = c("b", "c", "d"), n3 = c("c", "d", "e"))
> output
# A tibble: 3 x 4
cm n1 n2 n3
<dbl> <chr> <chr> <chr>
1 1 a b c
2 1 b c d
3 2 c d e
I thought maybe something like this would work, as at least then I could loop through tbl later and use nm to replace it with the appropriate cm values:
tbl <- NULL
for (a in aa){
for (b in bb) {
if (any(cc %in% b)) {
tb <- tibble(nm = names(bb)[b], n1=b[1],n2=b[2],n3=b[3])
tbl <- bind_rows(tbl,tb)
}
}
}
I don't really understand why this doesn't work, because names(bb)[1] returns 123 so I figured it would work the same in a loop with names(bb)[b].
If you're happy with a base R solution without the explicit loops, would this work?
# generate data
aa <- list(c(123, 1), c(234, 1), c(345, 2), c(456, 3))
# cm is an element of bb
bb <- list("123" = c("a", "b", "c"), "234" = c("b", "c", "d"),
           "345" = c("c", "d", "e"), "456" = c("f", "g", "h"),
           cm = c(1, 1, 2))
cc <- c("a", "b", "c")
tbl <- data.frame(
  bb[["cm"]],
  # apply to each element of aa
  do.call(rbind, lapply(aa, function(x, y, c) { # function takes 3 args
    # only elements of bb whose names are in aa[[x]]
    names_y <- as.character(intersect(x, names(y)))
    # turn subset of bb into data.frame
    out <- as.data.frame(do.call(rbind, y[names_y]))
    # flag the rows for which any row element is %in% cc
    keep <- apply(out, 1, function(x, c) any(x %in% c), c)
    # Subset ROWS explicitly: out[keep] (no comma) would select columns of
    # the data frame; drop = FALSE keeps a data frame even for 0 or 1 rows.
    out[keep, , drop = FALSE]
  }, bb, cc))) # pass bb and cc as args to the function in lapply()
# first column is cm, the remaining ones are n1, n2, ...
names(tbl) <- c("cm", paste0("n", seq_len(ncol(tbl) - 1)))
gives
> tbl
cm n1 n2 n3
123 1 a b c
234 1 b c d
345 2 c d e

Column names into data frame

How can I create a data frame which contains the column names of all the data frames in the global environment?
For example, suppose these three data frames are the only objects in the global environment:
chocolate <- data.frame(a = 1, b = 2, c = 3)
banana <- data.frame(a = 2, d = 4, c = 3)
pear <- data.frame(d = 1, e = 4)
Desired output
output <- data.frame(id = c("chocolate","banana", "pear"),
v2 = c("a", "a", NA),
v3 = c("b", NA, NA),
v4 = c("c", "c", NA),
v5 = c(NA, "d", "d"),
v6 = c(NA, NA, "e"))
output
We can try
library(data.table)
# Collect the objects named df1, df2 and df3 into a list.
# NOTE(review): the question's data frames are called chocolate, banana and
# pear -- pass those names to mget() instead when running this on that data.
lst <- mget(paste0("df", 1:3))
# Name the list elements 1..n, overwrite the cells of each data frame with
# its own column names, stack the results with fill = TRUE, and rename
# columns 2:6 to V1..V5. The list names end up in the 'id' column (see the
# printed output below); the trailing [] makes the result print.
setnames(rbindlist(lapply(setNames(lst, seq_along(lst)), function(x) {
# presumably relies on each data frame having a single row, as in the
# question's example -- TODO confirm for multi-row inputs
x[] <- names(x)
x}), fill = TRUE, idcol = 'id'), 2:6, paste0("V", 1:5))[]
# id V1 V2 V3 V4 V5
#1: 1 a b c NA NA
#2: 2 a NA c d NA
#3: 3 NA NA NA d e

Get elements by position from one data frame to another

Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
# Asker's first attempt, kept as posted apart from the misspelled
# package-loading call (which failed before anything else could run).
library(dplyr)
# Flatten a data frame into a plain vector via as.matrix() and as.vector().
toVec <- function(df) df %>% as.matrix %>% as.vector
# `%in%` builds a mask over the flattened dfin, so the result follows dfin's
# order rather than the order of `values`, and a repeated value yields a
# single match -- the two problems demonstrated below.
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
# Look up elements by position across two equally-shaped data frames.
#
# values: vector of elements to search for in `dfin`.
# dfin:   data frame in which each value is located.
# dfout:  data frame with the same dimensions as `dfin`; the elements at the
#         matching positions are returned.
#
# Returns a flat vector ordered like `values`: repeated values give repeated
# results, and values that do not occur in `dfin` contribute nothing.
matchdf <- function(values, dfin, dfout) {
  # lapply() always yields a list, so unlist() always yields a plain vector.
  # sapply() would simplify to a matrix whenever every value matches the
  # same number (> 1) of cells, and unlist() leaves a matrix untouched.
  hits <- lapply(values, function(val) dfout[dfin == val])
  unlist(hits, use.names = FALSE)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1

Resources