Conditional merging based on full join - r

I would like to conditionally merge two datasets such that the values in dataframe2 replace the values in dataframe1, unless the value in dataframe2 is missing. This should behave like a full join, so that rows from both data frames are preserved.
This question is inspired by Conditional merge/replacement in R (whose answers seem to work only for an inner join).
# Base table: ids 1-4 with their original labels.
df1 <- data.frame(
  x1 = 1:4,
  x2 = letters[1:4],
  stringsAsFactors = FALSE
)
# Update table: ids 2-5; the NA for id 4 means "no replacement available".
df2 <- data.frame(
  x1 = 2:5,
  x2 = c("zz", "qq", NA, "qy"),
  stringsAsFactors = FALSE
)
I would like the following result:
x1 x2
1 1 a
2 2 zz
3 3 qq
4 4 d
5 5 qy
I tried the following code, but it returns NA in the 4th row. I would like the original value from df1 to be preserved there, since df2 contains a missing value for x1 = 4.
df3 <- anti_join(df1, df2, by = "x1")
rbind(df3, df2)
x1 x2
1 1 a
2 2 zz
3 3 qq
4 4 <NA>
5 5 qy

It can be done with dplyr.
library(dplyr)
# Full join keeps every x1 from either table; the clashing x2 columns come
# back as x2.x (from df1) and x2.y (from df2). coalesce() takes df2's value
# first and falls back to df1's value wherever df2's is NA.
df1 %>%
  full_join(df2, by = "x1") %>%
  transmute(x1, x2 = coalesce(x2.y, x2.x))
x1 x2
1 1 a
2 2 zz
3 3 qq
4 4 d
5 5 qy

Related

Take sum of rows for every 3 columns in a dataframe

I have searched high and low and also tried multiple options to solve this but did not get the desired output as mentioned below:
I have a dataframe df3 whose column headers are dates and whose values are 0 or 1, as shown below:
# Six 0/1 indicator columns, one per date (no seed is set, so values vary).
df <- data.frame(replicate(6, sample(0:1, 6, replace = TRUE)))
names(df) <- paste0("1/", 1:6, "/2018")
# Customer identifiers A-F in their own one-column data frame.
df2 <- data.frame(LETTERS[1:6])
names(df2) <- "CUST_ID"
# Identifier column first, then the six date columns.
df3 <- cbind(df2, df)
Now I need df4 in which sum of first 3 columns in series will form one column. This will be repeated in series for rest of the columns dynamically.
df4
Options I tried:
a) rbind.data.frame(apply(matrix(df3, nrow = n - 1), 1,sum))
b) col_list <- list(c("1/1/2018","1/2/2018","1/3/2018"), c("1/4/2018","1/5/2018","1/6/2018"))
lapply(col_list, function(x)sum(df3[,x])) %>% data.frame
One way would be to split df3 every 3 columns using split.default. To split the data we generate a sequence using rep, then for each dataframe we take rowSums and finally cbind the result together.
# Label the value columns 1, 1, 1, 2, 2, 2, ... so each run of three columns
# shares a group id, split the columns by that label, and sum every group
# row-wise. The identifier column (df3[1]) is put back in front.
cbind(
  df3[1],
  sapply(
    split.default(
      df3[-1],
      rep(seq_len(ncol(df3)), each = 3, length.out = ncol(df3) - 1)
    ),
    rowSums
  )
)
# CUST_ID 1 2
#1 A 1 1
#2 B 2 0
#3 C 2 1
#4 D 1 1
#5 E 2 2
#6 F 2 2
FYI, the sequence generated from rep is
rep(1:ncol(df3), each = 3, length.out = (ncol(df3) -1))
#[1] 1 1 1 2 2 2
This makes it possible to split every 3 columns.
The results are different because OP used sample without set.seed.
If rep seems too long then we can generate the same sequence of columns using gl
gl(ncol(df3[-1])/3, 3)
#[1] 1 1 1 2 2 2
#Levels: 1 2
So the final code, would be
# Group the value columns in runs of three and sum each group row-wise.
# ceiling() plus `length =` keep the grouping correct when the number of value
# columns is not a multiple of 3: the last group is simply shorter. Without
# them gl() would recycle its labels and put the trailing columns in group 1.
cbind(
  df3[1],
  sapply(
    split.default(
      df3[-1],
      gl(ceiling(ncol(df3[-1]) / 3), 3, length = ncol(df3[-1]))
    ),
    rowSums
  )
)
We can use seq to create the starting index of each group, get the subset of columns as a list, Reduce it by taking the sum, and create new columns
# Sum the value columns of df3 (everything after the first column) in
# consecutive groups of three; a shorter final group is allowed.
group_starts <- seq(2, ncol(df3), by = 3)
group_cols <- lapply(group_starts, function(i) i:min(i + 2, ncol(df3)))
# Name each new column after the positions of the value columns it sums
# (e.g. "col123", "col456") instead of hard-coding the names, so the code
# keeps working when df3 has more or fewer columns.
new_names <- vapply(
  group_cols,
  function(idx) paste0("col", paste(idx - 1, collapse = "")),
  character(1)
)
df4 <- df3[1]
df4[new_names] <- lapply(group_cols, function(idx) Reduce(`+`, df3[idx]))
df4
# CUST_ID col123 col456
#1 A 2 2
#2 B 3 3
#3 C 1 3
#4 D 2 3
#5 E 2 1
#6 F 0 1
data
# Reproducible version of the question's data: fix the seed before sampling.
set.seed(123)
df <- data.frame(replicate(6, sample(0:1, 6, replace = TRUE)))
names(df) <- paste0("1/", 1:6, "/2018")
# Customer identifiers A-F in their own one-column data frame.
df2 <- data.frame(LETTERS[1:6])
names(df2) <- "CUST_ID"
# Identifier column first, then the six date columns.
df3 <- cbind(df2, df)

merging on multiple columns R

I'm surprised if this isn't a duplicate, but I couldn't find the answer anywhere else.
I have two data frames, data1 and data2, that differ in one column, but the rest of the columns are the same. I would like to merge them on a unique identifying column, id. However, in the event an ID from data2 does not have a match in data1, I want the entry in data2 to be appended at the bottom, similar to plyr::rbind.fill() rather than renaming all the corresponding columns in data2 as column1.x and column1.y. I realize this isn't the clearest explanation, maybe I shouldn't be working on a Saturday. Here is code to create the two dataframes, and the desired output:
# Species, trait value and id vectors for the two tables.
spp1 <- c("A", "B", "C")
spp2 <- c("B", "C", "D")
trait.1 <- rep(1.1, times = length(spp1))
trait.2 <- rep(2.0, times = length(spp2))
id_1 <- c(1, 2, 3)
id_2 <- c(2, 9, 7)
# Both tables share the columns spp and id; they differ in the trait column.
data1 <- data.frame(spp = spp1, trait.1 = trait.1, id = id_1)
data2 <- data.frame(spp = spp2, trait.2 = trait.2, id = id_2)
Desired output:
spp trait.1 trait.2 id
1 A 1.1 NA 1
2 B 1.1 2 2
3 C 1.1 NA 3
4 C NA 2 9
5 D NA 2 7
Try this:
library(dplyr)
full_join(data1, data2, by = c("id", "spp"))
Output:
spp trait.1 id trait.2
1 A 1.1 1 NA
2 B 1.1 2 2
3 C 1.1 3 NA
4 C NA 9 2
5 D NA 7 2
Alternatively, also merge would work:
merge(data1, data2, by = c("id", "spp"), all = TRUE)

Merge on x1, or if no match x2, or if no match x3

I'm trying to merge 2 datasets on a key, but if there is no match then I want to try another key, and so on.
# Left table: `a` is the value to look up in df2.
df1 <- data.frame(
  a = c(5, 1, 7, 3),
  b = c("T", "T", "T", "F"),
  c = c("F", "T", "F", "F")
)
# Right table: x1 is the preferred key, x2 the fallback key, x3 the payload.
df2 <- data.frame(
  x1 = c(4, 5, 3, 9),
  x2 = c(7, 8, 1, 2),
  x3 = c("g", "w", "t", "o")
)
df1
a b c
1 5 T F
2 1 T T
3 7 T F
4 3 F F
df2
x1 x2 x3 ..
1 4 7 g ..
2 5 8 w ..
3 3 1 t ..
4 9 2 o ..
The desired output is something like
a b c x3 ..
1 5 T F w ..
2 1 T T t ..
3 7 T F g ..
4 3 F F t ..
I tried something along the lines of
dfm <- merge(df1,df2, by.x = "a", by.y = "x1", all.x = TRUE)
dfm <- merge(dfm,df2, by.x = "a", by.y = "x2", all.x = TRUE)
but that isn't quite right.
This really isn't a standard sort of merge. You can make it more standard by reshaping df2 so you have just one field to merge on
# Stack df2 once per candidate key column so that the values of x1 and x2
# both end up in a single key column `a`; the remaining (non-key) columns of
# df2 are carried along with each copy.
df2long <- do.call(
  rbind,
  lapply(c("x1", "x2"), function(key) {
    data.frame(a = df2[[key]], df2[, -(1:2), drop = FALSE])
  })
)
# Ordinary left join on the single key; df1 rows without a match get NA.
dfm <- merge(df1, df2long, by = "a", all.x = TRUE)
You could do something like this:
# For every candidate key column of df2, look up each value of df1$a in that
# column: the result holds the matching row position, or NA where there is
# no match.
# This works with any number of key columns; just extend the selection,
# e.g. df2[, paste0("x", 1:100)].
matches <- lapply(df2[, c("x1", "x2")], match, x = df1$a)
matches
$x1
[1] 2 NA NA 3
$x2
[1] NA 3 1 NA
# Combine the per-column matches on a "first come, first served" basis:
# keep the position found in an earlier key column and fall back to the next
# column only where nothing has been found so far.
combo <- Reduce(
  function(found, candidate) {
    gaps <- is.na(found)
    found[gaps] <- candidate[gaps]
    found
  },
  matches
)
combo
[1] 2 3 1 3
cbind(df1, df2[combo,])
a b c x1 x2 x3
2 5 T F 5 8 w
3 1 T T 3 1 t
1 7 T F 4 7 g
3.1 3 F F 3 1 t
If I understand correctly, the OP has requested to try a match of a with x1 first, then - if failed - to try to match a with x2. So any match of a with x1 should take precedence over a match of a with x2.
Unfortunately, the sample data set provided by the OP does not include a use case to prove this. Therefore, I have modified the sample dataset accordingly (see Data section).
The approach suggested here is to reshape df2 from wide to long format (as in MrFlick's answer) but to use a data.table join with parameter mult = "first".
The columns of df2 to be considered as key columns and the precedence can be controlled by the measure.vars parameter to melt(). After reshaping, melt() arranges the rows in the column order given in measure.vars:
library(data.table)
# define the columns of df2 to use as keys, in order of precedence
# (a match in an earlier column wins over a match in a later one)
key_cols <- c("x1", "x2")
# reshape df2 from wide to long format: the values of all key columns are
# stacked into the single column "a"; "variable" records the source column
# NOTE: setDT() is applied to df2 itself, so df2 becomes a data.table here
long <- melt(setDT(df2), measure.vars = key_cols, value.name = "a")
# join long with df1 (one result row per row of df1); mult = "first" keeps
# only the first matching row of long, which is what gives earlier key
# columns precedence
result <- long[setDT(df1), on = "a", mult = "first"]
# clean up: put df1's columns first and drop the helper column from melt()
setcolorder(result, names(df1))
result[, variable := NULL]
result
a b c x3
1: 5 T F w
2: 1 T T t
3: 7 T F g
4: 3 F F t
5: 0 F F <NA>
Please, note that the original row order of df1 has been preserved.
Also, note that the code works for an arbitrary number of key columns. The precedence of key columns can be easily changed. E.g., if the order is reversed, i.e., key_cols <- c("x2", "x1") matches of a with x2 will be picked first.
Data
Enhanced sample datasets:
df1 has an additional row with no match in df2.
# Same as the question's df1 plus a fifth row (a = 0) that has no match in df2.
df1 <- data.frame(
  a = c(5, 1, 7, 3, 0),
  b = c("T", "T", "T", "F", "F"),
  c = c("F", "T", "F", "F", "F")
)
df1
a b c
1: 5 T F
2: 1 T T
3: 7 T F
4: 3 F F
5: 0 F F
df2 has an additional row to prove that a match in x1 takes precedence over a match in x2. The value 5 appears twice: In row 2 of column x1 and in row 5 of column x2.
# Same as the question's df2 plus a fifth row whose x2 value (5) also occurs
# in x1 (row 2), so precedence between the key columns can be observed.
df2 <- data.frame(
  x1 = c(4, 5, 3, 9, 6),
  x2 = c(7, 8, 1, 2, 5),
  x3 = c("g", "w", "t", "o", "n")
)
df2
x1 x2 x3
1: 4 7 g
2: 5 8 w
3: 3 1 t
4: 9 2 o
5: 6 5 n
Not sure I understood your question, but rather than merging repeatedly I'd compare the keys of each potential merge: if the number of shared key values is > 0, then you have a match. If you want to merge on the first column with a match, you can try this:
library(tidyr)
library(purrr)
(df1 <- data.frame(
  a = c(5, 1, 7, 3),
  b = c("T", "T", "T", "F"),
  c = c("F", "T", "F", "F")
))
(df2 <- data.frame(
  x1 = c(4, 5, 3, 9),
  x2 = c(7, 8, 1, 2),
  x3 = c("g", "w", "t", "o")
))
# Position of the first column of df2 that shares at least one value with
# df1$a. detect_index() walks the columns in order and returns 0 when no
# column qualifies.
FirstColMatch <- detect_index(
  df2,
  function(col) length(intersect(df1$a, col)) > 0
)
# Fail with a clear message instead of letting merge() choke on an empty
# `by.y` when nothing matches.
if (FirstColMatch == 0) {
  stop("No column of df2 shares any value with df1$a", call. = FALSE)
}
# Inner join of df1 with df2 on that single column.
NewDF <- merge(df1, df2, by.x = "a", by.y = names(df2)[FirstColMatch])

Why are there differences in using merge and %in%?

I have two datasets that I'd like to merge via two identifying variables (up and ver_u):
df1 looks like this:
up ver_u
257001 1
1010 1
101010 1
100316 1
df2 looks like this:
up ver_u code_uc quantity
500116 1 395884 1
100116 1 36761 2
160116 1 81308 3
100116 1 76146 1
113216 1 6338 1
101116 1 33887 1
What I would like to do is to take out a subset of df2 where their up and ver_u matches with those in df1. I did this in two different ways and I got different answers.
First method:
pur <- merge(df2, df1,by=c("up","ver_u"))
Second method:
test <- df2[(df2$up %in% df1$up) & (df2$ver_u %in% df1$ver_u),]
They give me different numbers of observations, and I don't see why they differ.
When I used merge on dataframe test with the following code, I got the same number of observations, but the two resulting dataframes I got are still different.
pur1 = merge(test, df1,by=c("up","ver_u"))
Is there a systematic difference between using merge and %in%?
Would greatly appreciate any insight on this.
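Because merge compares the key columns jointly: a row is kept only if its combination of values across both columns also occurs as a row of the other data frame. %in% instead tests each column independently against all values of the corresponding column, so a row passes whenever each of its values occurs somewhere in the other data frame, not necessarily in the same row. Example: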
# dummy data
df1 <- data.frame(
  x = c(1, 2, 3),
  y = c(2, 3, 4)
)
df1
#   x y
# 1 1 2
# 2 2 3
# 3 3 4
df2 <- data.frame(
  x = c(2, 3, 1, 3),
  y = c(3, 1, 4, 1)
)
df2
#   x y
# 1 2 3
# 2 3 1
# 3 1 4
# 4 3 1
# using merge: only the (x, y) pair (2, 3) occurs as a row in both data frames
merge(df1, df2, by = c("x", "y"))
#   x y
# 1 2 3
# using %in%: each column is tested on its own, so row 3 (x = 3, y = 4) is
# kept although the pair (3, 4) is not a row of df2
df1[df1$x %in% df2$x & df1$y %in% df2$y, ]
#   x y
# 2 2 3
# 3 3 4

Sum of hybrid data frames depending on multiple conditions in R

This is a more complex follow-up to my previous question. The answer there was to use a matrix, but that doesn't work with data frames having values of different modes.
I want to combine data frames of different sizes, with character and integer columns, and calculate their sum depending on multiple conditions.
Conditions
sums are only calculated for those rows that have a matching "Name"-value
sums are calculated for matching column names only
if a cell in df4 is not 0 and not NA, the sum should be df3 + df4
else the sum should be df1 + df2 + df3
Example
> df1 <- data.frame(Name=c("Joe","Ann","Lee","Dan"), "1"=c(0,1,5,2), "2"=c(3,1,0,0), "3"=c(2,0,2,2), "4"=c(2,1,3,4))
> df1
Name X1 X2 X3 X4
1 Joe 0 3 2 2
2 Ann 1 1 0 1
3 Lee 5 0 2 3
4 Dan 2 0 2 4
> df2 <- data.frame(Name=c("Joe","Ann","Ken"), "1"=c(3,4,1), "2"=c(2,3,0), "3"=c(2,4,3))
> df2
Name X1 X2 X3
1 Joe 3 2 2
2 Ann 4 3 4
3 Ken 1 0 3
> df3 <- data.frame(Name=c("Lee","Ben"), "1"=c(1,3), "2"=c(3,4), "3"=c(4,3))
> df3
Name X1 X2 X3
1 Lee 1 3 4
2 Ben 3 4 3
The condition depends on this frame:
> df4 <- data.frame(Name=c("Lee","Ann","Dan"), "1"=c(6,0,NA), "2"=c(0,0,4), "3"=c(0,NA,0))
> df4
Name X1 X2 X3
1 Lee 6 0 0
2 Ann 0 0 NA
3 Dan NA 4 0
With the above examples, this is the expected result (* values depend on df4):
> dfsum
Name X1 X2 X3 X4
1 Joe 3 5 4 2
2 Ann 5 4 4 1
3 Lee 7* 3 6 3
4 Dan 2 4* 2 4
5 Ken 1 0 3 NA
6 Ben 3 4 3 NA
Possible steps?
First expand df1, df2, df3, df4 to 5 columns and 6 rows, fill missing data with NA.
Then for each data frame:
sort rows by "Name"
separate "Name" column from "X1"..."X4"
transform "X1"..."X4" columns to matrix
calculate sums of the matrices like in the answer to my other question but with the additional condition 1
transform result matrix to data frame
cbind the "Name" column with the result data frame
How can this be done in R?
Solution
#Ricardo Saporta's solution works with little changes:
Add , padValue=NA) in the four addCols().
As answered here, replace the definitions of sumD3D4 and dtsum with:
plus <- function(x) {
  # Sum of x with NAs ignored. When x holds no non-missing value at all
  # (every element is NA, or x is empty) a single NA of x's own type is
  # returned instead of the 0 that sum(na.rm = TRUE) would give.
  observed <- !is.na(x)
  if (any(observed)) {
    return(sum(x, na.rm = TRUE))
  }
  c(x[0], NA)
}
sumD3D4 <- setkey(rbind(dt3, dt4)[,lapply(.SD, plus), by = Name], "Name")
dtsum <- setkey(rbind(dt1, dt2, dt3)[, lapply(.SD, plus), by=Name], "Name")
If you use data.table instead of data.frame, you could use its by=xxxx feature, to add by name.
The code below should give you your expected results.
Please note that I am padding the data.tables with extra empty columns. However, condTrue is computed before that padding takes place.
library(data.table)
# Conditional sum of df1..df4 by Name:
#   default result    = dt1 + dt2 + dt3 (per Name, per column)
#   where dt4's cell is neither NA nor 0, the result is dt3 + dt4 instead.
# Work on data.table copies of the four input data frames.
dt1 <- data.table(df1)
dt2 <- data.table(df2)
dt3 <- data.table(df3)
dt4 <- data.table(df4)
# make sure all dt's have the same columns
#-----------------------------------------#
# identify which cells of dt4 satisfy the condition (neither NA nor 0);
# arr.ind = TRUE yields one (row, col) pair per qualifying cell
condTrue <- as.data.table(which(!(is.na(dt4) | dt4==0), arr.ind=TRUE))
# ignore column "Name" from dt4 (it is column 1)
condTrue <- condTrue[col>1]
# convert from (row, col) index to ("Name", columnName), keyed by Name so
# that condTrue[.(Nam)] can be used as a lookup below
condTrue <- data.table(Name=dt4[condTrue$row, Name], colm=names(dt4)[condTrue$col], key="Name")
# First make a list of all the unique column names
allColumnNames <- unique(c(names(dt1), names(dt2), names(dt3), names(dt4)))
# add columns as necessary, using addCols (defined below)
# NOTE: addCols must already exist when these lines run, so its definition
# has to be evaluated first when executing this script top to bottom
addCols(dt1, allColumnNames)
addCols(dt2, allColumnNames)
addCols(dt3, allColumnNames)
addCols(dt4, allColumnNames)
# per-Name column sums of dt3 + dt4 (used where the dt4 condition holds) ...
sumD3D4 <- setkey(rbind(dt3, dt4)[, lapply(.SD, sum), by=Name], "Name")
# ... and of dt1 + dt2 + dt3 (the default result)
dtsum <- setkey(rbind(dt1, dt2, dt3)[, lapply(.SD, sum), by=Name], "Name")
# for every Name with at least one qualifying dt4 cell, overwrite the
# affected columns of dtsum with the corresponding dt3 + dt4 sums
for (Nam in condTrue$Name) {
colsRepl <- condTrue[.(Nam)]$colm
valsRepl <- unlist(sumD3D4[.(Nam), c(colsRepl), with=FALSE])
dtsum[.(Nam), c(colsRepl) := as.list(valsRepl)]
}
dtsum
# Name 1 2 3 4
# 1: Ann 5 4 4 1
# 2: Ben 3 4 3 0
# 3: Dan 2 4 2 4
# 4: Joe 3 5 4 2
# 5: Ken 1 0 3 0
# 6: Lee 7 3 6 3
addCols <- function(x, cols, padValue = 0) {
  # Adds to the data.table `x` every column named in `cols` that `x` does
  # not have yet, filling the new columns with `padValue`.
  #
  # x        : a data.table; it is modified by reference
  # cols     : character vector of column names that should exist in `x`
  # padValue : value used to fill newly created columns (default 0)
  #
  # Returns TRUE if columns were added
  #         FALSE if no columns added
  colsMissing <- setdiff(cols, names(x))
  if (length(colsMissing) == 0) {
    return(FALSE)
  }
  # `:=` updates the table `x` refers to in place, so the caller's object is
  # changed directly. There is no need to recover the caller's variable name
  # via match.call()/get(), which only worked when `x` was passed as a bare
  # symbol living in the immediate calling frame.
  x[, (colsMissing) := padValue]
  TRUE
}

Resources