merge multiple data.frame by row in R - r

I would like to merge multiple data.frame in R using row.names, doing a full outer join. For this I was hoping to do the following:
x = as.data.frame(t(data.frame(a=10, b=13, c=14)))
y = as.data.frame(t(data.frame(a=1, b=2)))
z = as.data.frame(t(data.frame(a=3, b=4, c=3, d=11)))
res = Reduce(function(a,b) merge(a,b,by="row.names",all=T), list(x,y,z))
Warning message:
In merge.data.frame(a, b, by = "row.names", all = T) :
column name ‘Row.names’ is duplicated in the result
> res
Row.names Row.names V1.x V1.y V1
1 1 a 10 1 NA
2 2 b 13 2 NA
3 3 c 14 NA NA
4 a <NA> NA NA 3
5 b <NA> NA NA 4
6 c <NA> NA NA 3
7 d <NA> NA NA 11
What I was hoping to get would be:
V1 V2 V3
a 10 1 3
b 13 2 4
c 14 NA 3
d NA NA 11

The following works (up to some final column renaming):
res <- Reduce(function(a,b){
ans <- merge(a,b,by="row.names",all=T)
row.names(ans) <- ans[,"Row.names"]
ans[,!names(ans) %in% "Row.names"]
}, list(x,y,z))
Indeed:
> res
V1.x V1.y V1
a 10 1 3
b 13 2 4
c 14 NA 3
d NA NA 11
What happens with a row join is that a column with the original rownames is added in the answer, which in turn does not contain row names:
> merge(x,y,by="row.names",all=T)
Row.names V1.x V1.y
1 a 10 1
2 b 13 2
3 c 14 NA
This behavior is documented in ?merge (under Value)
If the matching involved row names, an extra character column called
Row.names is added at the left, and in all cases the result has
‘automatic’ row names.
When Reduce tries to merge again, it doesn't find any match unless the names are cleaned up manually.

For continuity, this is not a clean solution but a workaround, I transform the list argument of 'Reduce' using sapply.
Reduce(function(a,b) merge(a,b,by=0,all=T),
sapply(list(x,y,z),rbind))[,-c(1,2)]
x y.x y.y
1 10 1 3
2 13 2 4
3 14 NA 3
4 NA NA 11
Warning message:
In merge.data.frame(a, b, by = 0, all = T) :
column name ‘Row.names’ is duplicated in the result

For some reason I did not have much success with Reduce. given a list of data.frames (df.lst) and a list of suffixes (suff.lst) to change the names of identical columns, this is my solution (it's loop, I know it's ugly for R standards, but it works):
df.merg <- as.data.frame(df.lst[1])
colnames(df.merg)[-1] <- paste(colnames(df.merg)[-1],suff.lst[[1]],sep="")
for (i in 2:length(df.lst)) {
df.i <- as.data.frame(df.lst[i])
colnames(df.i)[-1] <- paste(colnames(df.i)[-1],suff.lst[[i]],sep="")
df.merg <- merge(df.merg, df.i, by.x="",by.y="", all=T)
}

Related

Comparing two columns in R based on matching values in a common column

So, I have two datasets, such that all the columns in one dataset is present in the other, along with some extra columns. What I want to do is to create a new dataset of the differences between the entries in common columns, on the basis of matching a common identifier column, present in both the datasets. How can I do that in R?
If it were a single column, I could have used the sqldf function as
sqldf("select a.v1 - b.v1 from ds1 a left join ds2 b on a.identifier=b.identifier")
But there are 900 common columns between both the datasets.
You can do this by simply joining the frames on identifier, and then subtracting the one frame from the other.
Here is an example of the approach using data.table
# load the library
library(data.table)
# do a left join on identifier
merged = setDT(ds2)[setDT(ds1), on="identifier"]
# get common column names, and remove "identifier" from that vector
cols = intersect(names(ds1), names(ds2))
cols = cols[cols!="identifier"]
# column-bind the identifier column with a substraction of the two sub-frames
cbind(
merged[,.(identifier)],
setnames(merged[,.SD, .SDcols = paste0("i.",cols)] - merged[,.SD, .SDcols = cols],paste0("diff_",cols))
)
Here is the same approach using dplyr:
library(dplyr)
merged = left_join(ds1,ds2, by="identifier")
cols = intersect(names(ds1), names(ds2))
cols = cols[cols!="identifier"]
bind_cols(
select(merged,identifier),
(select(merged, all_of(paste0(cols, ".x"))) - select(merged, all_of(paste0(cols, ".y")))) %>%
rename_with(~paste0("diff_",cols), everything())
)
Output (same under either approach):
identifier diff_v1 diff_v2 diff_v3
<char> <num> <num> <num>
1: O 0.5028498 0.7573174 -1.00630610
2: S -2.5631238 -0.7041228 1.33877932
3: N NA NA NA
4: C NA NA NA
5: J NA NA NA
6: R NA NA NA
7: K NA NA NA
8: E NA NA NA
9: X -0.1830764 0.2924459 -0.01860763
10: Y NA NA NA
11: W NA NA NA
12: T -0.4912840 -2.8126285 -1.33661044
13: I NA NA NA
14: L NA NA NA
15: U NA NA NA
16: M -0.3130889 1.1590316 -0.44551660
17: P NA NA NA
18: H NA NA NA
19: B NA NA NA
20: G -2.2817049 2.4156583 -0.34393988
21: Z NA NA NA
22: A -0.1654816 -0.8807393 -1.08534789
23: F NA NA NA
24: V NA NA NA
25: D 1.4653655 0.2604109 -0.17733840
26: Q NA NA NA
identifier diff_v1 diff_v2 diff_v3
Input:
set.seed(123)
ds1 = data.frame(identifier = sample(LETTERS),v1 = rnorm(26),v2 = rnorm(26),v3 = rnorm(26))
ds2 = data.frame(identifier = sample(LETTERS,8),v1 = rnorm(8),v2 = rnorm(8),v3 = rnorm(8))
Using S1 and S2 as an example (see Note at end) we find the common column names using intersect. In the example data in the Note at the end there is only one numeric common column that is not ID but the same code should work even if there are many.
Then using Filter extract the names of the numeric common columns. We have assumed that if a common column in S1 is numeric then it is also numeric in S2 so we only have to check S1. The Filter line could be omitted if we knew that all common columns were numeric.
Next ensure that the ID column is excluded using setdiff. If ID is non numeric the Filter line would have already removed it in which case we could omit the setdiff line.
Now construct the select clause. sprintf creates a character vector of the elements of the select clause and toString collapses it to a comma separated string giving the final select string. Finally run the SQL statement. Note that fn$sqldf turns on string interpolation in the SQL statement and $sel inserts the contents of the sel variable into the SQL string.
library (sqldf)
nms <- intersect(names(S1), names(S2))
nms <- names(Filter(is.numeric, S1[nms]))
nms <- setdiff(nms, "ID")
sel <- toString(sprintf("a.[%s] - b.[%s] as [%s]", nms, nms, nms))
fn$sqldf("select ID, $sel  
from S1 a  
left join S2 b using(ID)")
## ID extra
## 1 1 0
## 2 2 0
## 3 3 0
## 4 4 0
## 5 5 0
## 6 6 0
Pipe
The nms<- lines above could alternately be written in terms of pipes:
nms <- names(S1) |>
intersect(names(S2)) |>
subset(S1, select = _) |> 
Filter(f = is.numeric) |>
names() |>
setdiff("ID")
Note
The data frame sleep comes with R. S1 and S2 are used as an example.
S1 <- head(sleep)
S2 <- S1[-2]
S1
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
S2
## extra ID
## 1 0.7 1
## 2 -1.6 2
## 3 -0.2 3
## 4 -1.2 4
## 5 -0.1 5
## 6 3.4 6

How to cbind vectors of different length in R?

I need to combine some named numeric vectors in R into a data frame. I tried cbind.na as suggestet in another question, but it would not take names into account. Example:
v1 <- c(1,5,6,7)
names(v1) <- c("milk", "flour", "eggs", "sugar")
v2 <- c(2,3)
names(v2) <- c("fish", "chips")
v3 <- c(5,7,4)
names(v3) <- c("chips", "milk", "sugar")
The data frame should look like this
v1 v2 v3
milk 1 NA 7
flour 5 NA NA
eggs 6 NA NA
sugar 7 NA 4
fish NA 2 NA
chips NA 3 5
I can't figure out how to solve this in R.
This is a join, best done with data.table or other add-ins, but (especially for smallish arrays) can readily be performed in base R by creating an array of all the names and using it to index into the input arrays:
s <- unique(names(c(v1,v2,v3)))
x <- cbind(v1=v1[s], v2=v2[s], v3=v3[s])
rownames(x) <- s
print(x)
v1 v2 v3
milk 1 NA 7
flour 5 NA NA
eggs 6 NA NA
sugar 7 NA 4
fish NA 2 NA
chips NA 3 5
# get vectors into one list
v <- mget(paste0('v', 1:3))
# convert vectors to data frames
l <- lapply(v, stack)
# merge them all sequentially
out <- Reduce(function(x, y) merge(x, y, by = 'ind', all = T), l)
# name the columns according to the original vector names
setNames(out, c('ind', names(v)))
# ind v1 v2 v3
# 1 milk 1 NA 7
# 2 flour 5 NA NA
# 3 eggs 6 NA NA
# 4 sugar 7 NA 4
# 5 fish NA 2 NA
# 6 chips NA 3 5
Edit:
I think this is worse than whuber's solution because it requires creating a bunch of intermediate tables, both in the lapply step and in the Reduce step. Haven't done any benchmarks though.

R DataFrame leading NA's shift

I've searched longer than I'd like to admit for shifting leading NA's to the end.
Got close with a few stack questions "Cut out outer NAs in R","Rotate a Matrix in R","na.locf remove leading NAs, keep others [closed]" as well as looking over na.trim function in zoo package. Essentially I want to turn this:
D <- matrix(c(1:9), 3)
D[2,1]<- NA
D[3,1]<- NA
D[3,2]<- NA
D <- as.data.frame(D)
into this:
D1 <- data.frame(V1 = c(1,5,9),
V2 = c(4,8,NA),
V3 = c(7,NA,NA))
Any help is as always, much appreciated!
Thanks,
You can use sort(..., na.last = T) within row-wise apply:
as.data.frame(t(apply(D, 1, sort, na.last = T)))
# V1 V2 V3
#1 1 4 7
#2 5 8 NA
#3 9 NA NA
Update
To avoid ordering non-NA entries, you can do:
# Revised sample data
D <- matrix(c(1:9), 3)
D[2,1]<- NA
D[3,1]<- NA
D[3,2]<- NA
D <- as.data.frame(D)
D[2,2:3] <- c(8, 5);
D;
# V1 V2 V3
#1 1 4 7
#2 NA 8 5
#3 NA NA 9
as.data.frame(t(apply(D, 1, function(x) c(x[!is.na(x)], x[is.na(x)]))))
#V1 V2 V3
#1 1 4 7
#2 8 5 NA
#3 9 NA NA

Conditional replacement of NAs in two dataframes R

Probably simple but tricky question especially for larger data sets. Given two dataframes (df1,df2) of equal dimensions as below:
head(df1)
a b c
1 0.8569720 0.45839112 NA
2 0.7789126 0.36591578 NA
3 0.6901663 0.88095485 NA
4 0.7705756 0.54775807 NA
5 0.1743111 0.89087819 NA
6 0.5812786 0.04361905 NA
and
head(df2)
a b c
1 0.21210312 0.7670091 NA
2 0.19767464 0.3050934 1
3 0.08982958 0.4453491 2
4 0.75196925 0.6745908 3
5 0.73216793 0.6418483 4
6 0.73640209 0.7448011 5
How can one find all columns where if(all(is.na(df1)), in this case c, go to df2and set all values in matching column (c) to NAs.
Desired output
head(df3)
a b c
1 0.21210312 0.7670091 NA
2 0.19767464 0.3050934 NA
3 0.08982958 0.4453491 NA
4 0.75196925 0.6745908 NA
5 0.73216793 0.6418483 NA
6 0.73640209 0.7448011 NA
My actual dataframes have more than 140000 columns.
We can use colSums on the negated logical matrix (is.na(df1)), negate (!) thevector` so that 0 non-NA elements becomes TRUE and all others FALSE, use this to subset the columns of 'df2' and assign it to NA.
df2[!colSums(!is.na(df1))] <- NA
df2
# a b c
#1 0.21210312 0.7670091 NA
#2 0.19767464 0.3050934 NA
#3 0.08982958 0.4453491 NA
#4 0.75196925 0.6745908 NA
#5 0.73216793 0.6418483 NA
#6 0.73640209 0.7448011 NA
Or another option is to loop over the columns and check whether all the elements are NA to create a logical vector for subsetting the columns of 'df2' and assigning it to NA
df2[sapply(df1, function(x) all(is.na(x)))] <- NA
If these are big datasets, another option would be set from data.table (should be more efficient as this does the assignment in place)
library(data.table)
setDT(df2)
j1 <- which(sapply(df1, function(x) all(is.na(x))))
for(j in j1){
set(df2, i = NULL, j = j, value = NA)
}

Build a vector/frame by combining regmatches results

I have a regular expression that parses a bunch of text, an when doing regmatches(myText,myRegex) it returns a list which looks like:
[[1]]
[1] "a=1" "b=3" "a=9" "c=2" "b=4"
...
I'd like to build a data.frame or table - whatever suits best - to finally have something like:
a b c
1 3 2
9 4 ...
Is it possible to make this in a simple fashion? What are your suggestions?
Thanks in advance.
Its not entirely clear what the general case is here but this works on the data provided.
Assuming this input:
x <- c("a=1", "b=3", "a=9", "c=2", "b=4")
split the values by the names producing s and massage into a data.frame:
s <- split(as.numeric(sub(".*=", "", x)), sub("=.*", "", x))
as.data.frame(do.call(cbind, lapply(s, ts)))
giving:
a b c
1 1 3 2
2 9 4 NA
No packages needed.
You can either use base R methods
d1 <- read.table(text=gsub("[[:punct:]]", " " , unlist(lst)))
d2 <- transform(d1, indx=ave(seq_along(V1), V1, FUN=seq_along))
res <- reshape(d2, timevar='V1', idvar='indx', direction='wide')[,-1]
colnames(res) <- gsub(".*\\.", "", colnames(res))
res
# a b c
#1 1 3 2
#3 9 4 2
#6 4 5 NA
#9 9 NA NA
Or using dcast from reshape2 on d2
library(reshape2)
dcast(d2,indx~V1, value.var='V2')[,-1]
# a b c
#1 1 3 2
#2 9 4 2
#3 4 5 NA
#4 9 NA NA
data
lst <- list(c('a=1', 'b=3', 'a=9', 'c=2', 'b=4'),
c('a=4', 'c=2', 'b=5', 'a=9'))
Using rex may make this type of extraction task a little simpler.
x <- c("a=1", "b=3", "a=9", "c=2", "b=4", "a=2")
First extract the names and values from the strings.
library(rex)
matches <- re_matches(x,
rex(
capture(name="name", letter),
"=",
capture(name="value", digit)
))
#> name value
#>1 a 1
#>2 b 3
#>3 a 9
#>4 c 2
#>5 b 4
#>6 a 2
Then tally the groups using split().
groups <- split(as.numeric(matches$value), matches$name)
#>$a
#>[1] 1 9 2
#>
#>$b
#>[1] 3 4
#>
#>$c
#>[1] 2
If we try to convert directly to a data.frame from split() the groups with fewer members will have their members recycled rather than NA, so instead explicitly fill with NA.
largest_group <- max(sapply(groups, length))
#>[1] 3
groups <- lapply(groups, function(group) {
if (length(group) < largest_group) {
group[largest_group] <- NA
}
group
})
#>$a
#>[1] 1 9 2
#>
#>$b
#>[1] 3 4 NA
#>
#>$c
#>[1] 2 NA NA
Finally we can create the data.frame
do.call('data.frame', groups)
#> a b c
#>1 1 3 2
#>2 9 4 NA
#>3 2 NA NA
Here's an approach using tools from my "splitstackshape" package:
library(splitstackshape)
dcast.data.table( ## Makes the long data wide
getanID( ## Adds an ID variable for dcast
## create a single column data.table and split it by the "="
cSplit(as.data.table(unlist(lst)), "V1", "="), "V1_1"),
.id ~ V1_1, value.var = "V1_2")
# .id a b c
# 1: 1 1 3 2
# 2: 2 9 4 2
# 3: 3 4 5 NA
# 4: 4 9 NA NA
This uses #akrun's sample data:
lst <- list(c('a=1', 'b=3', 'a=9', 'c=2', 'b=4'),
c('a=4', 'c=2', 'b=5', 'a=9'))

Resources