Join matrices by both colnames and rownames in R - r

I would like to join matrices by both colnames and rownames in R:
m1 = matrix(c(1,2,3, 11,12,13), nrow = 2, ncol = 3, byrow = TRUE,
dimnames = list(c("r1", "r2"),
c("a", "b", "c")))
m2 = matrix(c(4, 5, 0, 2,3,4), nrow = 2, ncol = 3, byrow = TRUE,
dimnames = list(c("r2", "r3"),
c("d", "b", "c")))
Check m1:
> m1
a b c
r1 1 2 3
r2 11 12 13
Check m2:
> m2
d b c
r2 4 5 0
r3 2 3 4
I want to get m3 which looks like this:
> m3
a b c d
r1 1 2 3 0
r2 11 17 13 4
r3 0 3 4 2
I did't find an elegant way to do so. Using the rbind.fill.matrix function in package plyr, I can indirectly get m3.
require(plyr)
m3 = rbind.fill.matrix(m1, m2)
rownames(m3) = c(rownames(m1), rownames(m2))
m3[is.na(m3)]=0 # replace na with zero
m3 = t(sapply(by(m3,rownames(m3),colSums),identity)) # aggregate matrix by rownames
I guess there must be some better ways to do so. What's your suggestion?

The following seems valid:
tmp = rbind(as.data.frame(as.table(m1)), as.data.frame(as.table(m2)))
#tmp = aggregate(Freq ~ Var1 + Var2, tmp, sum) #unnecessary
xtabs(Freq ~ Var1 + Var2, tmp)
# Var2
#Var1 a b c d
# r1 1 2 3 0
# r2 11 17 13 4
# r3 0 3 4 2
edit: As noted by #AnandaMahto, xtabs is a 'contingency-table' and not a 'reshape-data' function and, so, it sums by default.

I used this code:
m1 = m1[sort(rownames(m1)),sort(colnames(m1))]
m2 = m2[sort(rownames(m2)),sort(colnames(m2))]
nr = unique(c(rownames(m1),rownames(m2)))
nc = unique(c(colnames(m1),colnames(m2)))
m3 = matrix(0,nr=length(nr),nc=length(nc),dimnames=list(nr,nc))
m3[rownames(m3)%in%rownames(m1),colnames(m3)%in%colnames(m1)]=m1
m3[rownames(m3)%in%rownames(m2),colnames(m3)%in%colnames(m2)]=m3[rownames(m3)%in%rownames(m2),colnames(m3)%in%colnames(m2)]+m2

Related

Loop through rows and variables with same prefix

Suppose we have the following data:
d <- data.frame(
"V" = c("A", "B"),
"X1" = c("A", "A"),
"X2" = c("B","B"),
"X3" = c("C", "C"),
"Y1" = c(1, 4),
"Y2" = c(2, 5),
"Y3" = c(3, 6)
)
d[] <- lapply(d, as.character)
d
V X1 X2 X3 Y1 Y2 Y3
1 A A B C 1 2 3
2 B A B C 4 5 6
I want to create a variable VAL that will take the value of Y[n] if V=X[n]
I can do it with ifelse statements but I want to avoid nested ifelse because n is unknown
d$VAL_ifelse = ifelse(d$V == d$X1,d$Y1,
ifelse(d$V == d$X2,d$Y2,
ifelse(d$V == d$X3,d$Y3,NA)))
I tried to create this loop but problem is with j I think ?
d_X_var=grep("^X", names(d), value=TRUE)
for(i in 1:nrow(d)){
for(j in 1:length(d_X_var)){
if((d[i,c('V')] == d[i,paste0('X',j)]) == TRUE){
d$VAL_loop[i] <- as.character(d[i,paste0('Y',j)])
} else if((d[i,c('V')] != d[i,paste0('X',j)]) == TRUE){
d$VAL_loop[i] <- NA
}
}
}
d
V X1 X2 X3 Y1 Y2 Y3 VAL_ifelse VAL_loop
1 A A B C 1 2 3 1 <NA>
2 B A B C 4 5 6 5 <NA>
We can use vectorized way to get VAL
d$Val <- d[5:7][which(d[2:4] == d$V, arr.ind = TRUE)]
d
# V X1 X2 X3 Y1 Y2 Y3 Val
#1 A A B C 1 2 3 1
#2 B A B C 4 5 6 5
The above is true when you know the column numbers beforehand of X and Y columns. If you don't know we can use grep first to get column numbers and then subset.
X_cols <- grep("^X", names(d))
Y_cols <- grep("^Y", names(d))
d$Val <- d[Y_cols][which(d[X_cols] == d$V, arr.ind = TRUE)]
We can use max.col from base Rin a vectorized way
d$Val <- d[5:7][cbind(seq_len(nrow(d)), max.col(d$V == d[2:4], 'first'))]
d
# V X1 X2 X3 Y1 Y2 Y3 Val
#1 A A B C 1 2 3 1
#2 B A B C 4 5 6 5
Update
If there are no matches we can get the output as NA with rowSums (data from the comments)
d <- data.frame( "V" = c("A", "B","C","D","C"), "X1" = c("A", "A","A","A","A"), "X2" = c("B","B","B","B","A"), "X3" = c("C", "C","C","D","A"), "Y1" = c(1, 4, 7, 10, 13), "Y2" = c(2, 5, 8, 11, 14), "Y3" = c(3, 6, 9, 12,15), "Val_expected" = c(1,5,9,12,NA) )
d[,] <- lapply(d, as.character)
d$Val <- d[5:7][cbind(seq_len(nrow(d)), max.col(d$V == d[2:4], 'first'))]
d$Val <- as.numeric(d$Val) * (NA^ !rowSums(d$V == d[2:4]))
d$Val
#[1] 1 5 9 12 NA
Here is a slightly convoluted way using ifelse and diag:
d$Val <- ifelse(d$V == diag(as.matrix(d[,2:4])), diag(as.matrix(d[,5:7])), NA)
Output:
V X1 X2 X3 Y1 Y2 Y3 Val
1 A A B C 1 2 3 1
2 B A B C 4 5 6 5

R-Converting Incidence matrix(csv file) to edge list format

I am studying social network analysis and will be using Ucinet to draw network graphs. For this, I have to convert the csv file to an edge list format. Converting the adjacency matrix to the edge list was successful. However, it is difficult to convert an incidence matrix to the edge list format.
The csv file('some.csv') I have, with a incidence matrix like this:
A B C D
a 1 0 3 1
b 0 0 0 2
c 3 2 0 1
The code that converted the adjacency matrix to the edge list was as follows:
x<-read.csv("C:/.../something.csv", header=T, row.names=1)
net<-as.network(x, matrix.type='adjacency', ignore.eval=FALSE, names.eval='dd', loops=FALSE)
el<-edgelist(net, attrname='dd')
write.csv(el, file='C:/.../result.csv')
Now It only succeedded in loading the file. I tried to follow the above method, but I get an error.
y<-read.csv("C:/.../some.csv", header=T, row.names=1)
net2<-network(y, matrix.type='incidence', ignore.eval=FALSE, names.eval='co', loops=FALSE)
Error in network.incidence(x, g, ignore.eval, names.eval, na.rm, edge.check) :
Supplied incidence matrix has empty head/tail lists. (Did you get the directedness right?)
I want to see the result in this way:
a A 1
a C 3
a D 1
b D 2
c A 3
c B 2
c D 1
I tried to put the values as the error said, but I could not get the result i wanted.
Thank you for any assistance with this.
Here's your data:
inc_mat <- matrix(
c(1, 0, 3, 1,
0, 0, 0, 2,
3, 2, 0, 1),
nrow = 3, ncol = 4, byrow = TRUE
)
rownames(inc_mat) <- letters[1:3]
colnames(inc_mat) <- LETTERS[1:4]
inc_mat
#> A B C D
#> a 1 0 3 1
#> b 0 0 0 2
#> c 3 2 0 1
Here's a generalized function that does the trick:
as_edgelist.weighted_incidence_matrix <- function(x, drop_rownames = TRUE) {
melted <- do.call(cbind, lapply(list(row(x), col(x), x), as.vector)) # 3 col matrix of row index, col index, and `x`'s values
filtered <- melted[melted[, 3] != 0, ] # drop rows where column 3 is 0
# data frame where first 2 columns are...
df <- data.frame(mode1 = rownames(x)[filtered[, 1]], # `x`'s rownames, indexed by first column in `filtered``
mode2 = colnames(x)[filtered[, 2]], # `x`'s colnames, indexed by the second column in `filtered`
weight = filtered[, 3], # the third column in `filtered`
stringsAsFactors = FALSE)
out <- df[order(df$mode1), ] # sort by first column
if (!drop_rownames) {
return(out)
}
`rownames<-`(out, NULL)
}
Take it for a spin:
el <- as_edgelist.weighted_incidence_matrix(inc_mat)
el
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Here are the results you wanted:
control_df <- data.frame(
mode1 = c("a", "a", "a", "b", "c", "c", "c"),
mode2 = c("A", "C", "D", "D", "A", "B", "D"),
weight = c(1, 3, 1, 2, 3, 2, 1),
stringsAsFactors = FALSE
)
control_df
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Do they match?
identical(control_df, el)
#> [1] TRUE
This might not be the most efficient way, but it produces expected result:
y <- matrix( c(1,0,3,0,0,2,3,0,0,1,2,1), nrow=3)
colnames(y) <- c("e.A","e.B","e.C","e.D")
dt <- data.frame(rnames=c("a","b","c"))
dt <- cbind(dt, y)
# rnames e.A e.B e.C e.D
#1 a 1 0 3 1
#2 b 0 0 0 2
#3 c 3 2 0 1
# use reshape () function to convert dataframe into the long format
M <- reshape(dt, direction="long", idvar = "rnames", varying = c("e.A","e.B","e.C","e.D"))
M <- M[M$e >0,]
M
# rnames time e
# a.A a A 1
# c.A c A 3
# c.B c B 2
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.D c D 1
# If M needs to be sorted by the column rnames:
M[order(M$rnames), ]
# rnames time e
# a.A a A 1
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.A c A 3
# c.B c B 2
# c.D c D 1

data.table - efficiently manipulate large data set

I am amazed by the blazing speed of data.table. The coding below does exactly what I need however when executed on a large table it does not perform very well.
convinced that this can be done faster with data.table but I do not see how.
Output
The output needs to be a matrix with the rownames a regular sequence of days.
For each column separately:
All values before the first value need to be NA
All values after the last value need to be NA
Between the first and the last value 0 need to be added as the do not exist in the input table
The following coding shows how the result should look like:
M <-
matrix(c(NA, NA, NA, 2, 0, 1, 3, 0, 2 , NA,
NA, NA, 3, 1, 3, 2, 1, 2, NA, NA),
ncol = 2,
dimnames = list(as.character((Sys.Date() + 0:9)),
c("E1", "E2")))
Output example
## E1 E2
## 2017-01-27 NA NA
## 2017-01-28 NA NA
## 2017-01-29 NA 2
## 2017-01-30 2 2
## 2017-01-31 0 2
## 2017-02-01 3 1
## 2017-02-02 1 3
## 2017-02-03 0 3
## 2017-02-04 2 NA
## 2017-02-05 NA NA
Input
The following table shows the source/input for the coding/function:
DS <- data.table(
E = c(rep("E1", 4), rep("E2", 6)),
C = c(c(Sys.Date() + c(3, 5, 6, 8)),
c(Sys.Date() + c(2, 3, 4, 5, 6, 7))),
S = round(runif(n = 10,min = 1, max = 3), 0),
key = c("E", "C"))
## E C S
## 1: E1 2017-01-30 3
## 2: E1 2017-02-01 1
## 3: E1 2017-02-02 2
## 4: E1 2017-02-04 1
## 5: E2 2017-01-29 3
## 6: E2 2017-01-30 2
## 7: E2 2017-01-31 3
## 8: E2 2017-02-01 1
## 9: E2 2017-02-02 2
## 10: E2 2017-02-03 3
Input example
Code working
The following few lines do exactly what I need and is simple. However it is not efficient.
The real table has 700 unique C values and 2 Million E values.
# Create the regular time line per day
CL <- c(C= (Sys.Date() + 0:9))
# Determine first and last per E
DM <- DS[, .(MIN = min(C), MAX = max(C)), by =.(E)]
# Generate all combinations
CJ <- CJ(E = DS$E, C = CL, unique = TRUE)
# Join
DC <- DS[CJ, on = .(E, C)][!is.na(E)]
# replace NA by 0
DC[is.na(S), S:=0]
# Lead-in
DC[DM, on=.(E, C<MIN), S:=NA]
# Lead-out
DC[DM, on=.(E, C>MAX), S:=NA]
# Cast to matrix format
DC2 <- dcast(
data = DC, formula = C ~ E,
fun.aggregate = sum, value.var = "S")
# coerce to matrix
M3 <- as.matrix(DC2[, -1])
# add row nanes
rownames(M3) <- format(CL, "%Y-%m-%d")
I made some long, un-readable, clumsy coding which creates the matrix with 1.2B cells in 35 secs. This must be possible as quick but far more elegant with data.table, however not like this.
A data.table, like a data.frame underneath everything is a list (with length = number of columns)
200 Million columns is a lot of columns - this will make anything slow.
The description of the conversion to "wide" will bloat the data with large number of NA values. You can almost certainly perform the analysis you need on the "long form" and using keys.
It isn't clear from your question what you need, but you can calculate the various sums
# convert to an IDate
DT[, CALDAY := as.IDate(CALDAY)]
# get range of dates
rangeDays <- DT[,range(CALDAY)]
all_days <- as.IDate(seq(rangeDays[1],rangeDays[2], by=1))
# create sums
DT_sum <- DT[, list(VALUE= sum(VALUE)), keyby = list(ENTITY, CALDAY)]
and then index using entity and dates.
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days)]
and if you need to replace NA with 0
na_replace <- function(x,repl=0){x[is.na(x)]<-repl;x}
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days), na_replace(VALUE)]
This does the trick. But still the performance is not good.
It takes DS as input parameter. The result is a data.table which should be coerced to matrix by:
M <- as.matrix(build_timeseries_DT(DS))
Function
build_timeseries_DT <- function(DS){
# regular time serie for complete range with index
dtC <- data.table(
CAL = seq(min(DS$C), max(DS$C), by = "day"))[, idx:= 1:.N]
# add row index (idx) to sales
DQ <- dtC[DS, on = "CAL"]
setkey(DQ, "ENT")
# calculate min index per ENT
DM <- DQ[, .(MIN = min(idx), MAX = max(idx)), by = .(ENT)]
# allocate memory, assign 0 and set rownames by reference
DT <- dtC[, .(CAL)][, (DM[, ENT]):= 0L][, CAL:= NULL]
setattr(DT, "row.names", format(dtC$CAL, "%Y-%m-%d"))
# Set NA for the Lead-in and out, next populate values by ref
for(j in colnames(DT)){
set(x = DT,
i = c(1L:(DM[j, MIN]), (DM[j, MAX]):DT[, .N]),
j = j,
value = NA )
set(x = DT,
i = DQ[j, idx],
j = j,
value = DQ[j, SLS] )}
return(DT)
}
Test Data
DS <- data.table(
ENT = c("A", "A", "A", "B", "B", "C", "C", "C", "D", "D"),
CAL = c(Sys.Date() + c(0, 5, 6, 3, 8, 1, 2, 9, 3, 5)),
SLS = as.integer(c(1, 2, 1, 2, 3, 1, 2, 3, 2, 1)),
key = c("ENT", "CAL"))
ENT CAL SLS
1: A 2017-01-28 1
2: A 2017-02-02 2
3: A 2017-02-03 1
4: B 2017-01-31 2
5: B 2017-02-05 3
6: C 2017-01-29 1
7: C 2017-01-30 2
8: C 2017-02-06 3
9: D 2017-01-31 2
10: D 2017-02-02 1
Result
as.matrix(build_timeseries_DT(DS))
A B C D
[1,] 1 NA NA NA
[2,] 0 NA 1 NA
[3,] 0 NA 2 NA
[4,] 0 2 0 2
[5,] 0 0 0 0
[6,] 2 0 0 1
[7,] 1 0 0 NA
[8,] NA 0 0 NA
[9,] NA 3 0 NA
[10,] NA NA 3 NA
result with colors

Print method for multiple matrices

Could someone please suggest a method to print several matrices side by side in the terminal window.
For the matrices m1 and m2, I would like the desired output below.
m1 <- m2 <- matrix(1:4, nrow=2, dimnames=list(c("a", "b"), c("d", "e")))
Desired output
m1 m2
d e d e
a 1 3 a 1 3
b 2 4 b 2 4
The reason is that I have several 2x2 matrices that i am using in calculations and want to show in a Rmarkdown doc. It takes up a bit too much of the page when printing length ways. Thanks.
EDIT
My attempt at a solution
fn <- function(x) setNames(data.frame(.=paste(" ", rownames(x)), x,
check.names=F, row.names=NULL),c(paste(substitute(x)), colnames(x)))
cbind(fn(m1), fn(m2))
# m1 d e m2 f g
#1 a 1 3 v 1 3
#2 b 2 4 w 2 4
But this of course doesnt look very good.
A little hack-ish, but I believe it is what you want:
m1 <- m2 <- m3 <- m4 <- matrix(1:4, nrow=2, dimnames=list(c("a", "b"), c("d", "e")))
fn <- function(x) setNames(data.frame(.=paste("", rownames(x)), x, check.names=F, row.names=NULL),c(" ", colnames(x)))
matrix.names <- Filter( function(x) 'matrix' %in% class( get(x) ), ls(pattern = "m") )
matrix.list <- lapply(matrix.names, get)
matrix.chain <- do.call(cbind, lapply(matrix.list, fn))
cat(" ", paste0(matrix.names, collapse = " "), "\n"); print(matrix.chain, row.names = FALSE)
m1 m2 m3 m4
d e d e d e d e
a 1 3 a 1 3 a 1 3 a 1 3
b 2 4 b 2 4 b 2 4 b 2 4

Insert count number of elements in columns into table in R

I'm working in R and I've got a matrix with A, B and NA values, and I would like to count the number of A or B or NA values in every column and insert the results into the table. I used the code below to account the A, B and NA.
mydata <- matrix(c(rep("A", 8), rep("B", 2), rep(NA, 2), rep("A", 4),
rep(c("B", "A", "A", "A"), 2), rep("A", 4)), ncol = 4, byrow = TRUE)
myFun <- function(x) {
data.frame(n.A = sum(x == "A", na.rm = TRUE), n.B = sum(x == "B",
na.rm = TRUE), n.NA = sum(is.na(x)))
}
count <- apply(mydata, 2, myFun)
Now, I need to insert the results from count (count <- apply(mydata, 2, myFun)) into the a dataframe as a table with only a header.
Almost identical in concept to mnel's answer, you can also try the following in base R:
sapply(as.data.frame(mydata),
function(x) table(factor(x, levels = unique(as.vector(mydata))),
useNA = "always"))
# V1 V2 V3 V4
# A 4 6 6 6
# B 3 1 0 0
# <NA> 0 0 1 1
Here, rather than manually specifying the factor levels, I've made use of the data in mydata.
I think the easiest with using plyr and adply or ldply
You can replace myfun with a call to table.
library(plyr)
adply(mydata,2, function(x) table(factor(x, levels = c('A','B')), useNA = 'always'))
# X1 A B NA
# 1 1 4 3 0
# 2 2 6 1 0
# 3 3 6 0 1
# 4 4 6 0 1
If you have large data, then plyr isn't the way go. apply will work nicely
apply(mydata, 2, function(x) {
xx <- table(factor(x, levels = c('A','B')), useNA = 'always')
names(xx) <- c('nA','nB', 'nNA')
xx})
[,1] [,2] [,3] [,4]
nA 4 6 6 6
nB 3 1 0 0
nNA 0 0 1 1

Resources