Data frame into a symmetric matrix while keeping all rows and columns - r

The question R DataFrame into square (symmetric) matrix is about transforming a data frame into a symmetric matrix given the following example:
# data
x <- structure(c(1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
.Dim = c(3L,5L),
.Dimnames = list(c("X", "Y", "Z"), c("A", "B", "C", "D", "E")))
x
# A B C D E
#X 1 1 0 0 0
#Y 1 1 0 0 0
#Z 0 0 0 1 0
# transformation
x <- x %*% t(x)
diag(x) <- 1
x
# X Y Z
#X 1 2 0
#Y 2 1 0
#Z 0 0 1
Now, I'm trying to keep the columns c("A", "B", "C", "D", "E") in the matrix, like this:
# X Y Z A B C D E
#X 1 0 0 1 1 0 0 0
#Y 0 1 0 1 1 0 0 0
#Z 0 0 1 0 0 0 1 0
#A 1 1 0 1 0 0 0 0
#B 1 1 0 0 1 0 0 0
#C 0 0 0 0 0 1 0 0
#D 0 0 1 0 0 0 1 0
#E 0 0 0 0 0 0 0 1
I'm pretty sure there's an easy way to do it similar to the first transformation including each case. Can anyone suggest a solution?
Thank you in advance!

This is just an augmentation, isn't it?
# Augment x into a square symmetric matrix: identity blocks on the diagonal,
# x in the upper-right block and t(x) in the lower-left block.
X <- as.matrix(x)
M <- rbind(cbind(diag(nrow(X)), X), cbind(t(X), diag(ncol(X))))
# cbind()/rbind() leave the columns coming from the unnamed identity block
# unlabeled, so apply the full label set to both dimensions when it is
# available for every row and column.
labels <- c(rownames(X), colnames(X))
if (length(labels) == nrow(M)) {
  dimnames(M) <- list(labels, labels)
}
M

Build it step by step (thanks to @李哲源 for pointing out m <- diag(sum(dim(x)))):
# Identity matrix with one row and one column per row name and column name of x
m <- diag(sum(dim(x)))
dimnames(m) <- rep(list(c(rownames(x), colnames(x))), 2)
# Fill the two off-diagonal blocks by name: x itself and its transpose
m[rownames(x), colnames(x)] <- x
m[colnames(x), rownames(x)] <- t(x)
# X Y Z A B C D E
#X 1 0 0 1 1 0 0 0
#Y 0 1 0 1 1 0 0 0
#Z 0 0 1 0 0 0 1 0
#A 1 1 0 1 0 0 0 0
#B 1 1 0 0 1 0 0 0
#C 0 0 0 0 0 1 0 0
#D 0 0 1 0 0 0 1 0
#E 0 0 0 0 0 0 0 1
You can wrap it into a function and then call it:
# Build the symmetric block matrix
#
#     [ I     x ]
#     [ t(x)  I ]
#
# from a matrix or data frame x, so that every row and every column of x
# becomes both a row and a column of the result.
#
# Arguments:
#   x  A matrix, or anything as.matrix() can convert (e.g. a data frame).
#
# Returns:
#   A square matrix with nrow(x) + ncol(x) rows and columns, 1 on the diagonal,
#   x in the upper-right block and t(x) in the lower-left block. Rows and
#   columns are labelled with the row names of x followed by its column names.
#   Missing row names fall back to "1", "2", ... and missing column names to
#   "V1", "V2", ....
makeSymMat <- function(x) {
  x <- as.matrix(x)
  n_row <- nrow(x)
  n_col <- ncol(x)

  m <- diag(n_row + n_col)

  # Address the blocks by position rather than by name: name lookup picks the
  # first match, which writes into the wrong cells when a row name of x also
  # occurs among its column names.
  row_idx <- seq_len(n_row)
  col_idx <- n_row + seq_len(n_col)
  m[row_idx, col_idx] <- x
  m[col_idx, row_idx] <- t(x)

  # as.matrix() can yield NULL dimnames (e.g. a data frame with automatic row
  # names), so substitute positional labels instead of failing.
  row_labels <- rownames(x)
  if (is.null(row_labels)) {
    row_labels <- as.character(row_idx)
  }
  col_labels <- colnames(x)
  if (is.null(col_labels)) {
    col_labels <- sprintf("V%d", seq_len(n_col))
  }
  labels <- c(row_labels, col_labels)
  dimnames(m) <- list(labels, labels)

  m
}
makeSymMat(x)

Related

Find most repeated sequence of values

Suppose I have a data frame that looks like this
ITEM
1 X
2 A
3 B
4 C
5 A
6 F
7 U
8 A
9 B
10 C
11 F
12 U
How can I obtain the most common sequence of values in the 'ITEM' column? In this case the most frequent sequence would be A, B, C, since it appears in rows 2 to 4 and 8 to 10.
I have already tried the function rle, as well as some of the solutions found here, and I haven't been lucky. Can I have a suggestion, hint, or package recommendation?
I guess you want the longest non-overlapping sub-string. There's some good explanation about the dynamic programming solution here.
x <- c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U")
n <- length(x)
# 1 where two positions hold the same value, kept only strictly above the
# diagonal so that each pair of positions is counted once
m1 <- outer(x, x, FUN = "==") & upper.tri(diag(n))
m1 <- m1 + 0
dimnames(m1) <- list(x, x)
m1
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 1 0 0 0 0
# B 0 0 0 0 0 0 0 0 1 0 0 0
# C 0 0 0 0 0 0 0 0 0 1 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 1 0
# U 0 0 0 0 0 0 0 0 0 0 0 1
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
# Accumulate run lengths in a copy; m1 keeps the raw 0/1 match indicators that
# the test inside the loop reads.
m2 = m1
# Visit every cell that has an upper-left diagonal neighbour.
for (i in 2:nrow(m1)){
for (j in 2:nrow(m1)){
# A match at (i, j) directly following a match at (i-1, j-1) means a run of
# equal elements continues along this diagonal.
if (m1[i-1, j-1] == 1 & m1[i, j] == 1){
# Extend the run only while the offset j - i exceeds the length accumulated so
# far (presumably this keeps the two occurrences from overlapping).
if (j - i > m2[i - 1, j - 1]){
# Carry the accumulated length into the current cell and clear the previous
# one, so only the cell where a run currently ends holds its length.
m2[i, j] = m2[i - 1, j - 1] + m2[i, j]
m2[i - 1, j - 1] = 0
} else {
# Otherwise the count at this cell is reset.
m2[i, j] = 0
}
}
}
}
m2
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 3 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 2
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
# Largest value stored in m2
ans_len = max(m2)
# Column index of every cell of m2 that holds this maximum
inds = c(which(m2 == ans_len, arr.ind = TRUE)[,2])
# For each such column, return the ans_len elements of x ending at that position
lapply(inds, function(ind) x[(ind - ans_len + 1):ind])
# [[1]]
# [1] "A" "B" "C"
A tidyverse solution mixed with nested apply functions. The solution is generalized and will report the most frequent non-trivial consecutive sequence that appears at least twice--tie goes to the longer sequence.
library(tidyverse)
# Data (FALSE spelled out: F is an ordinary variable that can be reassigned)
x <- data.frame(
  ITEM = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U"),
  stringsAsFactors = FALSE
)
# Convert x to a character vector
y <- x$ITEM
# Create list to check for sequence of each length 2 through n/2
# For a window length a, the inner sapply builds a columns; column k is y
# starting at position k, so each row holds a consecutive items of y.
l <- lapply(2:floor(length(y)/2), function(a) sapply(1:a, function(x) y[(0 + x):(length(y) - a + x)])) %>%
# One data frame per window length
lapply(as.data.frame) %>%
# Name the list elements "Consecutive2", "Consecutive3", ...
setNames(sapply(2:(length(.) + 1), function(a) paste0("Consecutive", a)))
# Show most frequent sequence(s), choosing the longest
# Per window length: paste each row into one string and tabulate the strings.
lapply(1:length(l), function(x) (as.data.frame(table(do.call(paste, l[[x]])), stringsAsFactors = F) %>%
# length is the character count of the pasted string (separators included).
# NOTE(review): items longer than one character would skew it - confirm.
dplyr::mutate(length = nchar(Var1)) %>%
# Keep only strings seen more than once that have the top frequency and the
# top character count for this window length.
dplyr::filter(length == max(length) & Freq == max(Freq) & Freq > 1)) ) %>%
# Drop window lengths for which nothing is left
.[which(sapply(., nrow) > 0)] %>%
dplyr::bind_rows() %>%
# Across window lengths: highest frequency first, ties go to the larger length
dplyr::filter(Freq == max(Freq)) %>%
dplyr::filter(length == max(length)) %>%
dplyr::rename(Sequence = Var1) %>%
dplyr::select(-length)
# Sequence Freq
#1 A B C 2

R - Create matrix from 3 raw vector

I have 3 vectors as the following:
A <- c("A", "B", "C", "D", "E")
B <- c("1/1/1", "1/1/1", "2/1/1", "2/1/1", "3/1/1")
C <- c(1, 1, -1, 1, -1)
and I want to create a matrix like the following using these 3 vectors:
- 1/1/1 2/1/1 3/1/1
A 1 0 0
B 1 0 0
C 0 -1 0
D 0 1 0
E 0 0 -1
where vector A and B are rows and columns respectively and I have the data as C.
Any help would be appreciated.
Use ?xtabs
xtabs(C ~ A+B)
# B
#A 1/1/1 2/1/1 3/1/1
# A 1 0 0
# B 1 0 0
# C 0 -1 0
# D 0 1 0
# E 0 0 -1
You can try:
# Zero-filled table with one row per distinct A and one column per distinct B,
# then filled via (row name, column name) matrix indexing. local() keeps the
# temporary out of the workspace.
local({
  row_keys <- unique(A)
  col_keys <- unique(B)
  out <- array(0, c(length(row_keys), length(col_keys)), list(row_keys, col_keys))
  out[cbind(A, B)] <- C
  out
})
# 1/1/1 2/1/1 3/1/1
#A 1 0 0
#B 1 0 0
#C 0 -1 0
#D 0 1 0
#E 0 0 -1
Another option is acast from reshape2 after creating a data.frame
library(reshape2)
acast(data.frame(A, B, C), A~B, value.var = "C", fill =0)
# 1/1/1 2/1/1 3/1/1
#A 1 0 0
#B 1 0 0
#C 0 -1 0
#D 0 1 0
#E 0 0 -1

In an NxN matrix give value "True" to pairs from data frame

I have created a 5x5 matrix where rows and columns have the same names and a data frame with name pairs:
# N x N matrix of zeros whose rows and columns share the first N lowercase
# letters as names
N <- 5
Names <- letters[1:N]
mat <- matrix(0, nrow = N, ncol = N, dimnames = list(Names, Names))
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
The data frame then consists of different pairs:
col1 col2
1 a c
2 c b
3 d b
4 d e
How can I match these in so that col1 only refers to rows in my matrix and col2 only to columns? The above should compute to the following result:
a b c d e
a 0 0 1 0 0
b 0 0 0 0 0
c 0 1 0 0 0
d 0 1 0 0 1
e 0 0 0 0 0
You can use match to create a "key" of which combinations need to be replaced with 1, like this:
key <- vapply(seq_along(mydf),
function(x) match(mydf[[x]],
dimnames(mat)[[x]]),
numeric(nrow(mydf)))
Then, use matrix indexing to replace the relevant values.
mat[key] <- 1
mat
a b c d e
a 0 0 1 0 0
b 0 0 0 0 0
c 0 1 0 0 0
d 0 1 0 0 1
e 0 0 0 0 0
You could also do:
mat[as.matrix(d1)] <- 1
mat
# a b c d e
#a 0 0 1 0 0
#b 0 0 0 0 0
#c 0 1 0 0 0
#d 0 1 0 0 1
#e 0 0 0 0 0
data
d1 <- structure(list(col1 = c("a", "c", "d", "d"), col2 = c("c", "b",
"b", "e")), .Names = c("col1", "col2"), class = "data.frame",
row.names = c("1", "2", "3", "4"))

Keep sorting of data.frame when using acast

When I use acast in R, the sorting of my data frame gets messed up. Imagine my data.frame looks like this
V1 V2 V3
1 D Y 0
2 E X 0
3 C N 0
4 B M 0
5 A S 0
doing acast(dd, V1 ~ V2, value.var="V3", fill = 0) will result in an ordered matrix, e.g.
M N S X Y
A 0 0 0 0 0
B 0 0 0 0 0
C 0 0 0 0 0
D 0 0 0 0 0
E 0 0 0 0 0
How do I keep the original sorting of the data frame?
Make V1 and V2 into factors, and when you do so, make their levels the order you want. The default ordering when making factors is to sort them, which is why you got the order you did the first time.
d <- data.frame(V1=c("D", "E", "C", "B", "A"), V2=c("Y","X","N","M","S"), V3=0)
d$V1 <- factor(d$V1, levels=unique(d$V1))
d$V2 <- factor(d$V2, levels=unique(d$V2))
> acast(d, V1~V2, value.var="V3", fun.aggregate=sum)
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0
You can do something like this :
m <- acast(dd, V1 ~ V2, value.var="V3", fill = 0)
m[dd$V1,dd$V2]
Which gives :
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0

How to do row-wise subtraction and replace a specific number with zero?

Step 1: I have a simplified dataframe like this:
df1 = data.frame (B=c(1,0,1), C=c(1,1,0)
, D=c(1,0,1), E=c(1,1,0), F=c(0,0,1)
, G=c(0,1,0), H=c(0,0,1), I=c(0,1,0))
B C D E F G H I
1 1 1 1 1 0 0 0 0
2 0 1 0 1 0 1 0 1
3 1 0 1 0 1 0 1 0
Step 2: I want to do row wise subtraction, i.e. (row1 - row2), (row1 - row3) and (row2 - row3)
row1-row2 1 0 1 0 0 -1 0 -1
row1-row3 0 1 0 1 -1 0 -1 0
row2-row3 -1 1 -1 1 -1 1 -1 1
Step 3: replace all -1 with 0
row1-row2 1 0 1 0 0 0 0 0
row1-row3 0 1 0 1 0 0 0 0
row2-row3 0 1 0 1 0 1 0 1
Would you mind showing me how to do this?
I like using the plyr library for things like this using the combn function to generate all possible pairs of rows/columns.
# library() stops with an error when plyr is not installed; require() would
# only return FALSE and let the script fail later at adply().
library(plyr)
# Each column of combos is one pair of row indices (first, second)
combos <- combn(nrow(df1), 2)
# For every pair, subtract the second row from the first and replace -1 by 0
adply(combos, 2, function(x) {
  out <- data.frame(df1[x[1], ] - df1[x[2], ])
  out[out == -1] <- 0
  out
})
Results in:
X1 B C D E F G H I
1 1 1 0 1 0 0 0 0 0
2 2 0 1 0 1 0 0 0 0
3 3 0 1 0 1 0 1 0 1
If necessary, you can drop the first column, plyr spits that out automagically for you.
Similar questions:
Sum pairwise rows with R?
Chi Square Analysis using for loop in R
Compare one row to all other rows in a file using R
For the record, I would do this:
# Every pair of row indices, one pair per column
cmb <- combn(seq_len(nrow(df1)), 2)
# Subtract all second rows from all first rows in one data-frame operation
out <- df1[cmb[1, ], ] - df1[cmb[2, ], ]
out[out < 0] <- 0
# Label each difference "row<i>-row<j>"
rownames(out) <- paste0("row", cmb[1, ], "-row", cmb[2, ])
This yields (the last line above is a bit of sugar, and may not be needed):
> out
B C D E F G H I
row1-row2 1 0 1 0 0 0 0 0
row1-row3 0 1 0 1 0 0 0 0
row2-row3 0 1 0 1 0 1 0 1
Which is fully vectorised and exploits indices to extend/extract the elements of df1 required for the row-by-row operation.
> df2 <- rbind(df1[1,]-df1[2,], df1[1,]-df1[3,], df1[2,]-df1[3,])
> df2
B C D E F G H I
1 1 0 1 0 0 -1 0 -1
2 0 1 0 1 -1 0 -1 0
21 -1 1 -1 1 -1 1 -1 1
> df2[df2==-1] <- 0
> df2
B C D E F G H I
1 1 0 1 0 0 0 0 0
2 0 1 0 1 0 0 0 0
21 0 1 0 1 0 1 0 1
If you'd like to change the name of the rows to those in your example:
> rownames(df2) <- c('row1-row2', 'row1-row3', 'row2-row3')
> df2
B C D E F G H I
row1-row2 1 0 1 0 0 0 0 0
row1-row3 0 1 0 1 0 0 0 0
row2-row3 0 1 0 1 0 1 0 1
Finally, if the number of rows is not known ahead of time, the following should do the trick:
df1 <- data.frame(
  B = c(1, 0, 1), C = c(1, 1, 0), D = c(1, 0, 1), E = c(1, 1, 0),
  F = c(0, 0, 1), G = c(0, 1, 0), H = c(0, 0, 1), I = c(0, 1, 0)
)
n <- nrow(df1)
if (n < 2) {
  # Fewer than two rows: there is no pair to subtract, so the result is empty
  # (1:(n - 1) would otherwise count downwards and index nonexistent rows).
  ret <- df1[0, , drop = FALSE]
} else {
  # All row pairs (i, j) with i < j, in the order (1,2), (1,3), ..., (n-1,n).
  # Subtracting them in one step avoids growing ret with rbind() in a loop.
  pairs <- combn(n, 2)
  ret <- df1[pairs[1, ], ] - df1[pairs[2, ], ]
  rownames(ret) <- paste0("row", pairs[1, ], "-row", pairs[2, ])
}
ret[ret == -1] <- 0
print(ret)

Resources