This question already has an answer here:
Merging a lot of data.frames [duplicate]
(1 answer)
Closed 3 years ago.
The problem attempts to join matrices in a set, by a column identifier.
We can express the problem in the following form:
Setup
mat1 <- data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1)))
mat2 <- data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1)))
mat3 <- data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1)))
mat4 <- data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1)))
colnames(mat1) = colnames(mat2) = colnames(mat3) = colnames(mat4) <- c("Code", "x1", "x2")
mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)
mat_set <- c(mat1, mat2, mat3, mat4)
> mat1
Code x1 x2
1 1 0.6425172 -1.9404704
2 2 -0.1278021 0.8485476
3 3 -0.5525808 -0.9060624
4 4 -1.3013592 0.7350129
> mat2
Code x1 x2
1 2 -0.06543585 -1.1244444
2 3 0.03773743 -0.8124004
3 4 3.53421807 -0.4935844
4 5 0.56686927 0.3433276
5 6 0.41849489 0.8782866
> mat3
Code x1 x2
1 6 1.0821070 0.08006585
2 7 0.1038577 0.61057716
3 8 2.7002036 0.19693561
> mat1
Code x1 x2
1 1 0.6425172 -1.9404704
2 2 -0.1278021 0.8485476
3 3 -0.5525808 -0.9060624
4 4 -1.3013592 0.7350129
> mat2
Code x1 x2
1 2 -0.06543585 -1.1244444
2 3 0.03773743 -0.8124004
3 4 3.53421807 -0.4935844
4 5 0.56686927 0.3433276
5 6 0.41849489 0.8782866
> mat3
Code x1 x2
1 6 1.0821070 0.08006585
2 7 0.1038577 0.61057716
3 8 2.7002036 0.19693561
> mat4
Code x1 x2
1 1 -0.1188262 0.6338566
2 2 0.6128098 1.3759910
3 3 -1.3504901 -0.2830859
4 4 -1.2153638 -1.1611660
5 5 -1.7420065 0.2470048
6 6 -0.9786468 -1.2214594
I then want to bind by column all matrices in the set by "Code". Preserve the ordering. This will yield output of the form:
output <- data.frame(matrix(nrow = 8, ncol =9))
output[,1] <- c(1,2,3,4,5,6,7,8)
output[,2] <- c(mat1$x1, NA, NA,NA,NA)
output[,3] <- c(mat1$x2, NA,NA,NA,NA)
output[,4] <- c(NA, mat2$x1, NA, NA)
output[,5] <- c(NA, mat2$x2, NA, NA)
output[,6] <- c(NA,NA,NA,NA,NA,mat3$x1)
output[,7] <- c(NA,NA,NA,NA,NA,mat3$x2)
output[,8] <- c(mat4$x1, NA,NA)
output[,9] <- c(mat4$x2, NA,NA)
output
X1 X2 X3 X4 X5 X6 X7 X8 X9
1 1 0.6425172 -1.9404704 NA NA NA NA -0.1188262 0.6338566
2 2 -0.1278021 0.8485476 -0.06543585 -1.1244444 NA NA 0.6128098 1.3759910
3 3 -0.5525808 -0.9060624 0.03773743 -0.8124004 NA NA -1.3504901 -0.2830859
4 4 -1.3013592 0.7350129 3.53421807 -0.4935844 NA NA -1.2153638 -1.1611660
5 5 NA NA 0.56686927 0.3433276 NA NA -1.7420065 0.2470048
6 6 NA NA 0.41849489 0.8782866 1.0821070 0.08006585 -0.9786468 -1.2214594
7 7 NA NA NA NA 0.1038577 0.61057716 NA NA
8 8 NA NA NA NA 2.7002036 0.19693561 NA NA
>
A final point is that the code must be replicable over a large set of matrices. Thanks!
You can use merge in Reduce
out <- Reduce(function(x, y) merge(x, y, by = "Code", all = TRUE), mat_set)
colnames(out) <- paste0("x", seq_along(out))
out
# x1 x2 x3 x4 x5 x6 x7 x8 x9
#1 1 0.4291247 -0.5644520 NA NA NA NA -0.8553646 -0.5238281
#2 2 0.5060559 -0.8900378 -0.9111954 -0.4405479 NA NA -0.2806230 -0.4968500
#3 3 -0.5747400 -0.4771927 -0.8371717 0.4595894 NA NA -0.9943401 -1.8060313
#4 4 -0.5466319 -0.9983864 2.4158352 -0.6937202 NA NA -0.9685143 -0.5820759
#5 5 NA NA 0.1340882 -1.4482049 NA NA -1.1073182 -1.1088896
#6 6 NA NA -0.4906859 0.5747557 1.1022975 -0.5012581 -1.2519859 -1.0149620
#7 7 NA NA NA NA -0.4755931 -1.6290935 NA NA
#8 8 NA NA NA NA -0.7094400 -1.1676193 NA NA
data
set.seed(1234)
mat1 <- data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1)))
mat2 <- data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1)))
mat3 <- data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1)))
mat4 <- data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1)))
colnames(mat1) = colnames(mat2) = colnames(mat3) = colnames(mat4) <- c("Code", "x1", "x2")
mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)
mat_set <- list(mat1, mat2, mat3, mat4)
As noted by #G. Grothendieck, the above code gives a warning because the column names are similar in the list of dataframes. Since we only want to join by "Code", while creating the dataframe we can give unique names to columns so to avoid the warning and renaming later.
Updated data
set.seed(1234)
mat1 <- setNames(data.frame(matrix(nrow=4, ncol =3, rnorm(12,0,1))), c("Code", "x1", "x2"))
mat2 <- setNames(data.frame(matrix(nrow =5, ncol=3, rnorm(15,0,1))), c("Code", "x3", "x4"))
mat3 <- setNames(data.frame(matrix(nrow=3, ncol =3, rnorm(9,0,1))), c("Code", "x5", "x6"))
mat4 <- setNames(data.frame(matrix(nrow =6, ncol =3, rnorm(18,0,1))), c("Code", "x7", "x8"))
mat1$Code <- c(1,2,3,4)
mat2$Code <- c(2,3,4,5,6)
mat3$Code <- c(6,7,8)
mat4$Code <- c(1,2,3,4,5,6)
mat_set <- list(mat1, mat2, mat3, mat4)
and then we can use Reduce
Reduce(function(x, y) merge(x, y, by = "Code", all = TRUE), mat_set)
Related
I'm trying to make a new matrix using values from other matrix with R. I'm trying to match the names of rows and columns while importing the values. This is what what trying to do:
I have two matrices;
X1 X2 X3 X4
X1 0 9 8 0
X2 1 2 3 5
X4 6 1 2 4
X1 X2 X3 X4
X1 NA NA NA NA
X2 NA NA NA NA
X3 NA NA NA NA
X4 NA NA NA NA
I want to do
X1 X2 X3 X4
X1 0 9 8 0
X2 1 2 3 5
X3 NA NA NA NA
X4 6 1 2 4
These matrices are just simple examples of my dataset, my real data is more complicated.
Many thanks,
checking for rownames and colnames matches in both matrices will prevent subscript out of bounds error. See below.
mat2[rownames(mat2) %in% rownames(mat1),
colnames(mat2) %in% colnames(mat1)] <- mat1[rownames(mat1) %in% rownames(mat2),
colnames(mat1) %in% colnames(mat2)]
mat2
# X1 X2 X3 X4
# X1 0 9 8 0
# X2 1 2 3 5
# X3 NA NA NA NA
# X4 6 1 2 4
Data:
mat1 <- read.table(text = ' X1 X2 X3 X4
X1 0 9 8 0
X2 1 2 3 5
X4 6 1 2 4', header = TRUE)
mat1 <- as.matrix(mat1)
mat2 <- matrix(NA, nrow = 4, ncol = 4, dimnames = list(paste0("X", 1:4),
paste0("X", 1:4)))
If I understood your question you can do this:
# Building your matrices
mat1 <- matrix(runif(12), nrow = 3, ncol = 4)
mat2 <- matrix(NA, nrow = 4, ncol = 4)
labs <- paste0("x", 1:4)
colnames(mat1) <- colnames(mat2) <- labs
rownames(mat2) <- labs
rownames(mat1) <- labs[c(1:2, 4)]
#
rows <- sort(unique(c(rownames(mat1), rownames(mat2))))
result <- matrix(NA, nrow = length(rows), ncol = ncol(mat1))
result[match(rownames(mat1), rows), ] <- mat1
I have an empty matrix with a certain number of columns that I'm trying to fill row-by-row with output vectors of a for-loop. However, some of the output are not the same length as the number of columns as my matrix, and just want to fill up those "empty spaces" with NAs.
For example:
matrix.names <- c("x1", "x2", "x3", "x4", "y1", "y2", "y3", "y4", "z1", "z2", "z3", "z4")
my.matrix <- matrix(ncol = length(matrix.names))
colnames(my.matrix) <- matrix.names
This would be the output from one iteration:
x <- c(1,2)
y <- c(4,2,1,5)
z <- c(1)
Where I would want it in the matrix like this:
x1 x2 x3 x4 y1 y2 y3 y4 z1 z2 z3 z4
[1,] 1 2 NA NA 4 2 1 5 1 NA NA NA
The output from the next iteration would be, for example:
x <- c(1,1,1,1)
y <- c(0,4)
z <- c(4,1,3)
And added as a new row in the matrix:
x1 x2 x3 x4 y1 y2 y3 y4 z1 z2 z3 z4
[1,] 1 2 NA NA 4 2 1 5 1 NA NA NA
[2,] 1 1 1 1 0 4 NA NA 4 1 3 NA
It's not really a concern if I have a 0, it's just where there is no data. Also, the data is saved in such a way that whatever is there is listed in the row first, followed by NAs in empty slots. In other words, I'm not worried if an NA may pop up first.
Also, is such a thing better handled in data frames rather than matrices?
not the efficient answer : just a try
logic : extending the length to 4.(exception could be if already x/y/z is laready of length4) Therefore while rbinding I only extract the first 4 elements .
x[length(x)+1:4] <- NA
y[length(y)+1:4] <- NA
z[length(z)+1:4] <- NA
my.matrix <- rbind(my.matrix,c(x[1:4],y[1:4],z[1:4]))
Note : the exception I mentioned above is like below :
> x <- c(1,1,1,1)
> x
[1] 1 1 1 1
> x[length(x)+1:4] <- NA
> x
[1] 1 1 1 1 NA NA NA NA # therefore I extract only the first four
Here is an option to do this programmatically
d1 <- stack(mget(c("x", "y", "z")))[2:1]
nm <- with(d1, paste0(ind, ave(seq_along(ind),ind, FUN = seq_along)))
my.matrix[,match(nm,colnames(my.matrix), nomatch = 0)] <- d1$values
my.matrix
# x1 x2 x3 x4 y1 y2 y3 y4 z1 z2 z3 z4
#[1,] 1 2 NA NA 4 2 1 5 1 NA NA NA
Or another option is stri_list2matrix from stringi
library(stringi)
m1 <- as.numeric(stri_list2matrix(list(x,y, z)))
Change the 'x', 'y', 'z' values
m2 <- as.numeric(stri_list2matrix(list(x,y, z)))
rbind(m1, m2)
I would like to merge several matrices using their row names.
These matrices do not have the same number of rows and columns.
For instance:
m1 <- matrix(c(1, 2, 3, 4, 5, 6), 3, 2)
rownames(m1) <- c("a","b","c")
m2 <- matrix(c(1, 2, 3, 5, 4, 5, 6, 2), 4, 2)
rownames(m2) <- c("a", "b", "c", "d")
m3 <- matrix(c(1, 2, 3, 4), 2,2)
rownames(m3) <- c("d", "e")
mlist <- list(m1, m2, m3)
For them I would like to get:
Row.names V1.x V2.x V1.y V2.y V1.z V2.z
a 1 4 1 4 NA NA
b 2 5 2 5 NA NA
c 3 6 3 6 NA NA
d NA NA 5 2 1 3
e NA NA NA NA 2 4
I have tried to use lapply with the function merge:
M <- lapply(mlist, merge, mlist, by = "row.names", all = TRUE)
However, it did not work:
Error in data.frame(c(1, 2, 3, 4, 5, 6), c(1, 2, 3, 5, 4, 5, 6, 2), c(1, :
arguments imply differing number of rows: 3, 4, 2
Is there an elegant way to merge these matrices?
You are trying to apply a reduction (?Reduce) to the list of matrices, where the reduction is basically merge. The problem is that merge(m1, m2, by = "row.names", all = T) doesn't give you a new merged matrix with row names, but instead returns the row names in the first column. This is why we need additional logic in the reduction function.
Reduce(function(a,b) {
res <- merge(a,b,by = "row.names", all = T);
rn <- res[,1]; # Row.names column of merge
res <- res[,-1]; # Actual data
row.names(res) <- rn; # Assign row.names
return(res) # Return the merged data with proper row.names
},
mlist[-1], # Reduce (left-to-right) by applying function(a,b) repeatedly
init = mlist[[1]] # Start with the first matrix
)
Or alternatively:
df <- mlist[[1]]
for (i in 2:length(mlist)) {
df <- merge(df, mlist[[i]], by = "row.names", all=T)
rownames(df) <- df$Row.names
df <- df[ , !(names(df) %in% "Row.names")]
}
# V1.x V2.x V1.y V2.y V1 V2
# a 1 4 1 4 NA NA
# b 2 5 2 5 NA NA
# c 3 6 3 6 NA NA
# d NA NA 5 2 1 3
# e NA NA NA NA 2 4
This could also be conceptualised as a reshape operation if the right long-form data.frame is passed to the function:
tmp <- do.call(rbind, mlist)
tmp <- data.frame(tmp, id=rownames(tmp),
time=rep(seq_along(mlist),sapply(mlist,nrow)) )
reshape(tmp, direction="wide")
# id X1.1 X2.1 X1.2 X2.2 X1.3 X2.3
#a a 1 4 1 4 NA NA
#b b 2 5 2 5 NA NA
#c c 3 6 3 6 NA NA
#d d NA NA 5 2 1 3
#e e NA NA NA NA 2 4
I've got a dataframe which looks like follows:
# Code:
m3 <- c(NA, -3, NA, NA, -3)
m2 <- c(rep(NA, 5))
m1 <- c(rep(NA, 5))
Zero <- c(rep(NA, 5))
p1 <- c(1, NA, NA, 1, NA)
p2 <- c(NA, NA, NA, 2, NA)
p3 <- c(3, NA, 3, 3, NA)
df <- data.frame(m3, m2, m1, Zero, p1, p2, p3)
# Output:
m3 m2 m1 Zero p1 p2 p3
1 NA NA NA NA 1 NA 3
2 -3 NA NA NA NA NA NA
3 NA NA NA NA NA NA 3
4 NA NA NA NA 1 2 3
5 -3 NA NA NA NA NA NA
I need to insert a -3 in the whole row, if there is a -3 in the first column. I also need to delete all columns, but p1, p2, and p3. The final result should look like follows:
# Final output:
p1 p2 p3
1 1 NA 3
2 -3 -3 -3
3 NA NA 3
4 1 2 3
5 -3 -3 -3
I found a solution, but it seems very inefficient to me. I need to perform this operation multiple times and therefore need a code, which is as efficient as possible. My inefficient solution looks like follows:
# Inefficient code:
for(i in 1:length(df$m3)){
if(is.na(df$m3[i]) == FALSE){
df[i, ] <- -3
}
}
df <- df[ , 5:length(df)]
Is there a more efficient way? Thank you very much in advance!
update values:
df[df$m3 %in% -3,] <- -3
select columns:
df <- df[, c("p1", "p2", "p3")]
You can use data.table
dt <- data.table(df)
dt[m3 == -3, paste0('p', 1:3) := -3]
dt <- dt[, c("p1", "p2", "p3"), with = FALSE]
Hi guys I have a difficult situation to manage:
I have a data.frame that looks like this:
General_name
a
b
c
d
m
n
and another data.frame that looks like this:
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1
I have to match each element of the first data.frame with each element before = in the second data.frame[,2] so that finally I have to obtain the following table:
Names a b c d m n
First_names_list 34 4 NA NA NA NA
Second_names_list NA NA NA 2 98 32
Third_names_list NA NA 1 12 0.1 NA
Any suggestion? It seems to be too difficult to me.
Best
E.
Option 1
Here is one approach using dcast from "reshape2" and concat.split from my "splitstackshape" package:
library(splitstackshape)
## The following can also be done in 2 steps. The basic idea is to split
## the values into a semi-long form for `dcast` to be able to use. So,
## I've split first on the semicolon, and made the data into a long form
## at the same time, then I've split on =, but kept it wide that time.
out <- concat.split(concat.split.multiple(df, "V2", ";", "long"),
"V2", "=", drop = TRUE)
out
# V1 time V2_1 V2_2
# 1 First_names_list 1 a 34.0
# 2 Second_names_list 1 d 2.0
# 3 Third_names_list 1 c 1.0
# 4 First_names_list 2 b 4.0
# 5 Second_names_list 2 m 98.0
# 6 Third_names_list 2 d 12.0
# 7 First_names_list 3 <NA> NA
# 8 Second_names_list 3 n 32.0
# 9 Third_names_list 3 m 0.1
library(reshape2)
dcast(out[complete.cases(out), ], V1 ~ V2_1, value.var="V2_2")
# V1 a b c d m n
# 1 First_names_list 34 4 NA NA NA NA
# 2 Second_names_list NA NA NA 2 98.0 32
# 3 Third_names_list NA NA 1 12 0.1 NA
Option 2
Here's another option using a more recent version of data.table. The concept is very similar to the approach taken above.
library(data.table)
library(reshape2)
packageVersion("data.table")
# [1] ‘1.8.11’
dt <- data.table(df)
S1 <- dt[, list(X = unlist(strsplit(as.character(V2), ";"))), by = V1]
S1[, c("A", "B") := do.call(rbind.data.frame, strsplit(X, "="))]
S1
# V1 X A B
# 1: First_names_list a=34 a 34
# 2: First_names_list b=4 b 4
# 3: Second_names_list d=2 d 2
# 4: Second_names_list m=98 m 98
# 5: Second_names_list n=32 n 32
# 6: Third_names_list c=1 c 1
# 7: Third_names_list d=12 d 12
# 8: Third_names_list m=0.1 m 0.1
dcast.data.table(S1, V1 ~ A, value.var="B")
# V1 a b c d m n
# 1: First_names_list 34 4 NA NA NA NA
# 2: Second_names_list NA NA NA 2 98 32
# 3: Third_names_list NA NA 1 12 0.1 NA
Both of the above options assume we're starting with:
df <- structure(list(V1 = c("First_names_list", "Second_names_list",
"Third_names_list"), V2 = c("a=34;b=4", "d=2;m=98;n=32",
"c=1;d=12;m=0.1")), .Names = c("V1", "V2"), class = "data.frame",
row.names = c(NA, -3L))
Here is a solution, using apply within apply:
#Data frame 1
df1 <- read.table(text=
"General_name
a
b
c
d
m
n", header=T, as.is=T)
#Data frame 2
df2 <- read.table(text=
"col1 col2
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1", header=T, as.is=T)
#make lists for each row, sep by ";"
df2split <- strsplit(df2$col2,split=";")
#result
t(
sapply(seq(1:nrow(df2)),function(c){
x <- df2split[[c]]
sapply(df1$General_name,function(n){
t <- gsub(paste0(n,"="),"",x[grepl(n,x)])
ifelse(length(t)==0,NA,as.numeric(t))
})
})
)
I feel this is a slightly round-about way to do it so I look forward to a better solution as well. But this works.
library(data.table)
library(reshape2)
#creating datasets
dt <- data.table(read.csv(textConnection('
"First_names_list","a=34;b=4"
"Second_names_list","d=2;m=98;n=32"
"Third_names_list","c=1;d=12;m=0.1"
'),header = FALSE))
General_name = c('a','b','c','d','m','n')
TotalBreakup <- data.table(
V1 = General_name
)
# Fixing datatypes
TotalBreakup <- TotalBreakup[,lapply(.SD,as.character)]
dt <- dt[,lapply(.SD,as.character)]
# looping through each row and calculating breakdown
for(i in 1:nrow(dt))
{
# the next two statements are the workhorse of this code. Run each part of these statements step by step to see
dtlist <- strsplit(unlist(strsplit(dt[i,V2],";")),"=")
breakup <- data.table(
t(
matrix(
unlist(
strsplit(
unlist(
strsplit(
dt[i,V2],
";"
)
),
"="
)
),
nrow = 2
)
)
)
# fixing datatypes again
breakup <- breakup[,lapply(.SD,as.character)]
#appending to master dataset
TotalBreakup <- merge(TotalBreakup, breakup, by = "V1", all.x = TRUE)
}
#formatting results
setnames(TotalBreakup,c("Names",dt[,V1]))
TotalBreakup <- acast(melt(TotalBreakup,id.vars = "Names"),variable~Names)
Output -
> TotalBreakup
a b c d m n
First_names_list "34" "4" NA NA NA NA
Second_names_list NA NA NA "2" "98" "32"
Third_names_list NA NA "1" "12" "0.1" NA
A way is this:
#the second dataframe you provided
DF2 <- read.table(text = '
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1
', header = F, stringsAsFactors = F)
#empty dataframe
DF <- structure(list(a = c(NA, NA, NA), b = c(NA, NA, NA), c = c(NA,
NA, NA), d = c(NA, NA, NA), m = c(NA, NA, NA), n = c(NA, NA,
NA)), .Names = c("a", "b", "c", "d", "m", "n"), row.names = c("First_names_list",
"Second_names_list", "Third_names_list"), class = "data.frame")
DF
# a b c d m n
#First_names_list NA NA NA NA NA NA
#Second_names_list NA NA NA NA NA NA
#Third_names_list NA NA NA NA NA NA
#fill the dataframe
myls <- strsplit(DF2$V2, split = ";")
for(i in 1:length(myls))
{
sapply(myls[[i]],
function(x) { res <- unlist(strsplit(x, "=")) ; DF[i,res[1]] <<- res[2] })
}
DF
# a b c d m n
#First_names_list 34 4 <NA> <NA> <NA> <NA>
#Second_names_list <NA> <NA> <NA> 2 98 32
#Third_names_list <NA> <NA> 1 12 0.1 <NA>