Adjacency Matrix from a dataframe - r

I am trying to convert an edgelist to an adjacent matrix.
Below is the sample data
#Sample Data
User<-c("1","1","2","3","4")
v1 <- c("b", "b", "a", "d", "c")
v2 <- c("c", "d", "c", "a", "a")
v3 <- c(0, 0, "d", 0, "b")
v4 <- c(0, 0, 0, 0, 0)
v5 <- c(0, 0, 0, 0, 0)
my_data<-data.frame(User, v1, v2, v3, v4, v5)
my_data
If you run this code you will get the below as output,
User v1 v2 v3 v4 v5
1 b c 0 0 0
1 b d 0 0 0
2 a c d 0 0
3 d a 0 0 0
4 c a b 0 0
Using the data, I want to create an adjacent matrix that looks like follows:
a b c d
a 0 0 2 2
b 0 0 1 1
c 2 1 0 1
d 2 1 1 0
Basically, the desired output diplays the count how many times each pair appeared in column v1~v5 in the sample data frame.
I have tried to use AdjacencyFromEdgelist function from dils library, also tried to create a matrix shell with NAs and fill out the matrix by looping through the dataframe.
However, I could not get neither way to work.

I think this may be close to what you have in mind. In the rows where there are more than 2 vertices, I considered every existing pairs:
library(igraph)
do.call(rbind, my_data[-1] |>
apply(1, \(x) x[x != 0]) |>
lapply(\(x) t(combn(x, m = 2)))) |>
graph_from_edgelist(directed = FALSE) %>%
as_adjacency_matrix()
4 x 4 sparse Matrix of class "dgCMatrix"
b c d a
b . 2 1 1
c 2 . 1 2
d 1 1 . 2
a 1 2 2 .
Or without the pip operator in base R:
tmp <- apply(my_data[-1], 1, function(x) x[x != 0])
tmp <- do.call(rbind, lapply(tmp, function(x) t(combn(x, m = 2))))
my_graph <- graph_from_edgelist(tmp, directed = FALSE)
adj_mat <- as_adjacency_matrix(my_graph)
adj_mat

Another attempt, minus the need to calculate all the combinations with combn
sel <- my_data[-1] != 0
dat <- data.frame(row=row(my_data[-1])[sel], value = my_data[-1][sel])
out <- crossprod(table(dat))
diag(out) <- 0
out
# value
#value a b c d
# a 0 1 2 2
# b 1 0 2 1
# c 2 2 0 1
# d 2 1 1 0
Matches the result from #AnoushiravanR:
adj_mat[c("a","b","c","d"), c("a","b","c","d")]
#4 x 4 sparse Matrix of class "dgCMatrix"
# a b c d
#a . 1 2 2
#b 1 . 2 1
#c 2 2 . 1
#d 2 1 1 .

Another igraph option
do.call(
rbind,
combn(df, 2, setNames, nm = c("from", "to"), simplify = FALSE)
) %>%
filter(from > 0 & to > 0) %>%
arrange(from) %>%
graph_from_data_frame(directed = FALSE) %>%
get.adjacency(sparse = FALSE)
gives
a b c d
a 0 1 2 2
b 1 0 2 1
c 2 2 0 1
d 2 1 1 0

Related

Creating new columns with combinations of string patterns in R

I have a data frame - in which I have a column with a lengthy string separated by _. Now I am interested in counting the patterns and several possible combinations from the long string. In the use case I provided below, you can find that I would like to count the occurrence of events A and B but not anything else.
If A and B repeat like A_B or B_A alone or if they repeats itself n number of times, I want to count them and also if there are several occurrences of those combinations.
Example data frame:
participant <- c("A", "B", "C")
trial <- c(1,1,2)
string_pattern <- c("A_B_A_C_A_B", "B_A_B_A_C_D_A_B", "A_B_C_A_B")
df <- data.frame(participant, trial, string_pattern)
Expected output:
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1. A 1 A_B_A_C_A_B 2 1 1 0 0
2. B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3. C 2 A_B_C_A_B 2 0 0 0 0
My code:
revised_df <- df%>%
dplyr::mutate(A_B = stringr::str_count(string_pattern, "A_B"),
B_A = stringr::str_count(string_pattern, "B_A"),
B_A_B = string::str_count(string_pattern, "B_A_B"))
My approach gets complicated as the number of combinations increases. Hence, looking for a better solution.
You could write a function to solve this:
m <- function(s){
a <- seq(nchar(s)-1)
start <- rep(a, rev(a))
stop <- ave(start, start, FUN = \(x)seq_along(x)+x)
b <- substring(s, start, stop)
gsub('(?<=\\B)|(?=\\B)', '_', b, perl = TRUE)
}
n <- function(x){
names(x) <- x
a <- strsplit(gsub("_", '', gsub("_[^AB]+_", ':', x)), ':')
b <- t(table(stack(lapply(a, \(y)unlist(sapply(y, m))))))
data.frame(pattern=x, as.data.frame.matrix(b), row.names = NULL)
}
n(string_pattern)
pattern A_B A_B_A B_A B_A_B B_A_B_A
1 A_B_A_C_A_B 2 1 1 0 0
2 B_A_B_A_C_D_A_B 2 1 2 1 1
3 A_B_C_A_B 2 0 0 0 0
Try: This checks each string row for current column name
library(dplyr)
df |>
mutate(A_B = 0, B_A = 0, A_B_A = 0, B_A_B = 0, B_A_B_A = 0) |>
mutate(across(A_B:B_A_B_A, ~ str_count(string_pattern, cur_column())))
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1 A 1 A_B_A_C_A_B 2 1 1 0 0
2 B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3 C 2 A_B_C_A_B 2 0 0 0 0

R-Converting Incidence matrix(csv file) to edge list format

I am studying social network analysis and will be using Ucinet to draw network graphs. For this, I have to convert the csv file to an edge list format. Converting the adjacency matrix to the edge list was successful. However, it is difficult to convert an incidence matrix to the edge list format.
The csv file('some.csv') I have, with a incidence matrix like this:
A B C D
a 1 0 3 1
b 0 0 0 2
c 3 2 0 1
The code that converted the adjacency matrix to the edge list was as follows:
x<-read.csv("C:/.../something.csv", header=T, row.names=1)
net<-as.network(x, matrix.type='adjacency', ignore.eval=FALSE, names.eval='dd', loops=FALSE)
el<-edgelist(net, attrname='dd')
write.csv(el, file='C:/.../result.csv')
Now It only succeedded in loading the file. I tried to follow the above method, but I get an error.
y<-read.csv("C:/.../some.csv", header=T, row.names=1)
net2<-network(y, matrix.type='incidence', ignore.eval=FALSE, names.eval='co', loops=FALSE)
Error in network.incidence(x, g, ignore.eval, names.eval, na.rm, edge.check) :
Supplied incidence matrix has empty head/tail lists. (Did you get the directedness right?)
I want to see the result in this way:
a A 1
a C 3
a D 1
b D 2
c A 3
c B 2
c D 1
I tried to put the values as the error said, but I could not get the result i wanted.
Thank you for any assistance with this.
Here's your data:
inc_mat <- matrix(
c(1, 0, 3, 1,
0, 0, 0, 2,
3, 2, 0, 1),
nrow = 3, ncol = 4, byrow = TRUE
)
rownames(inc_mat) <- letters[1:3]
colnames(inc_mat) <- LETTERS[1:4]
inc_mat
#> A B C D
#> a 1 0 3 1
#> b 0 0 0 2
#> c 3 2 0 1
Here's a generalized function that does the trick:
as_edgelist.weighted_incidence_matrix <- function(x, drop_rownames = TRUE) {
melted <- do.call(cbind, lapply(list(row(x), col(x), x), as.vector)) # 3 col matrix of row index, col index, and `x`'s values
filtered <- melted[melted[, 3] != 0, ] # drop rows where column 3 is 0
# data frame where first 2 columns are...
df <- data.frame(mode1 = rownames(x)[filtered[, 1]], # `x`'s rownames, indexed by first column in `filtered``
mode2 = colnames(x)[filtered[, 2]], # `x`'s colnames, indexed by the second column in `filtered`
weight = filtered[, 3], # the third column in `filtered`
stringsAsFactors = FALSE)
out <- df[order(df$mode1), ] # sort by first column
if (!drop_rownames) {
return(out)
}
`rownames<-`(out, NULL)
}
Take it for a spin:
el <- as_edgelist.weighted_incidence_matrix(inc_mat)
el
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Here are the results you wanted:
control_df <- data.frame(
mode1 = c("a", "a", "a", "b", "c", "c", "c"),
mode2 = c("A", "C", "D", "D", "A", "B", "D"),
weight = c(1, 3, 1, 2, 3, 2, 1),
stringsAsFactors = FALSE
)
control_df
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Do they match?
identical(control_df, el)
#> [1] TRUE
This might not be the most efficient way, but it produces expected result:
y <- matrix( c(1,0,3,0,0,2,3,0,0,1,2,1), nrow=3)
colnames(y) <- c("e.A","e.B","e.C","e.D")
dt <- data.frame(rnames=c("a","b","c"))
dt <- cbind(dt, y)
# rnames e.A e.B e.C e.D
#1 a 1 0 3 1
#2 b 0 0 0 2
#3 c 3 2 0 1
# use reshape () function to convert dataframe into the long format
M <- reshape(dt, direction="long", idvar = "rnames", varying = c("e.A","e.B","e.C","e.D"))
M <- M[M$e >0,]
M
# rnames time e
# a.A a A 1
# c.A c A 3
# c.B c B 2
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.D c D 1
# If M needs to be sorted by the column rnames:
M[order(M$rnames), ]
# rnames time e
# a.A a A 1
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.A c A 3
# c.B c B 2
# c.D c D 1

In R: Split a character vector to find specific characters and return a data frame

I want to be able to extract specific characters from a character vector in a data frame and return a new data frame. The information I want to extract is auditors remark on a specific company's income and balance sheet. My problem is that the auditors remarks are stored in vectors containing the different remarks. For instance:
vec = c("A C G H D E"). Since "A" %in% vec won't return TRUE, I have to use strsplit to break up each character vector in the data frame, hence "A" %in% unlist(strsplit(dat[i, 2], " "). This returns TRUE.
Here is a MWE:
dat <- data.frame(orgnr = c(1, 2, 3, 4), rat = as.character(c("A B C")))
dat$rat <- as.character(dat$rat)
dat[2, 2] <- as.character(c("A F H L H"))
dat[3, 2] <- as.character(c("H X L O"))
dat[4, 2] <- as.character(c("X Y Z A B C"))
Now, to extract information about every single letter in the rat coloumn, I've tried several approaches, following similar problems such as Roland's answer to a similar question (How to split a character vector into data frame?)
DF <- data.frame(do.call(rbind, strsplit(dat$rat, " ", fixed = TRUE)))
DF
X1 X2 X3 X4 X5 X6
1 A B C A B C
2 A F H L H A
3 H X L O H X
4 X Y Z A B C
This returnsthe following error message: Warning message:
In (function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 2)
It would be a desirable approach since it's fast, but I can't use DF since it recycles.
Is there a way to insert NA instead of the recycling because of the different length of the vectors?
So far I've found a solution to the problem by using for-loops in combination with ifelse-statements. However, with 3 mill obs. this approach takes years!
dat$A <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 3] <- ifelse("A" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
dat$B <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 4] <- ifelse("B" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
This gives the results I want:
dat
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1
I've searched through most of the relevant questions I could find here on StackOverflow. This one is really close to my problem: How to convert a list consisting of vector of different lengths to a usable data frame in R?, but I don't know how to implement strsplit with that approach.
We can use for-loop with grepl to achieve this task. + 0 is to convert the column form TRUE or FALSE to 1 or 0
for (col in c("A", "B")){
dat[[col]] <- grepl(col, dat$rat) + 0
}
dat
# orgnr rat A B
# 1 1 A B C 1 1
# 2 2 A F H L H 1 0
# 3 3 H X L O 0 0
# 4 4 X Y Z A B C 1 1
If performance is an issue, try this data.table approach.
library(data.table)
# Convert to data.table
setDT(dat)
# Create a helper function
dummy_fun <- function(col, vec){
grepl(col, vec) + 0
}
# Apply the function to A and B
dat[, c("A", "B") := lapply(c("A", "B"), dummy_fun, vec = rat)]
dat
# orgnr rat A B
# 1: 1 A B C 1 1
# 2: 2 A F H L H 1 0
# 3: 3 H X L O 0 0
# 4: 4 X Y Z A B C 1 1
using Base R:
a=strsplit(dat$rat," ")
b=data.frame(x=rep(dat$orgnr,lengths(a)),y=unlist(a),z=1)
cbind(dat,as.data.frame.matrix(xtabs(z~x+y,b)))
orgnr rat A B C F H L O X Y Z
1 1 A B C 1 1 1 0 0 0 0 0 0 0
2 2 A F H L H 1 0 0 1 2 1 0 0 0 0
3 3 H X L O 0 0 0 0 1 1 1 1 0 0
4 4 X Y Z A B C 1 1 1 0 0 0 0 1 1 1
From here you can Just call those columns that you want:
d=as.data.frame.matrix(xtabs(z~x+y,b))
cbind(dat,d[c("A","B")])
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1

Converting counts to individual observations in r

I have a data set that looks as follows
df <- data.frame( name = c("a", "b", "c"),
judgement1= c(5, 0, NA),
judgement2= c(1, 1, NA),
judgement3= c(2, 1, NA))
I want to reshape the dataframe to look like this
# name judgement1 judgement2 judgement3
# a 1 0 0
# a 1 0 0
# a 1 0 0
# a 1 0 0
# a 1 0 0
# b 1 0 0
# b 0 1 0
# b 0 0 1
And so on. I have seen that untable is recommended on some other threads, but it does not appear to work with the current version of r. Is there a package that can convert summarised counts into individual observations?
You could try something like this:
df <- data.frame( name = c("a", "b", "c"),
judgement1= c(5, 0, NA),
judgement2= c(1, 1, NA),
judgement3= c(2, 1, NA))
rep.vec <- colSums(df[colnames(df) %in% paste0("judgement", (1:nrow(df)), sep="")], na.rm = TRUE)
want <- data.frame(name=df$name, cbind(diag(nrow(df))))
colnames(want)[-1] <- paste0("judgement", (1:nrow(df)), sep="")
(want <- want[rep(1:nrow(want), rep.vec), ])
I wrote a function that works to give you your desired output:
untabl <- function(df, id.col, count.cols) {
df[is.na(df)] <- 0 # replace NAs
out <- lapply(count.cols, function(x) { # for each column with counts
z <- df[rep(1:nrow(df), df[,x]), ] # replicate rows
z[, -c(id.col)] <- 0 # set all other columns to zero
z[, x] <- 1 # replace the count values with 1
z
})
out <- do.call(rbind, out) # combine the list
out <- out[order(out[,c(id.col)]),] # reorder (you can change this)
rownames(out) <- NULL # return to simple row numbers
out
}
untabl(df = df, id.col = 1, count.cols = c(2,3,4))
# name judgement1 judgement2 judgement3
#1 a 1 0 0
#2 a 1 0 0
#3 a 1 0 0
#4 a 1 0 0
#5 a 1 0 0
#6 a 0 1 0
#7 b 0 1 0
#8 a 0 0 1
#9 a 0 0 1
#10 b 0 0 1
And for your reference, reshape::untable consists of the following code:
function (df, num)
{
df[rep(1:nrow(df), num), ]
}

Run loop to build co-occurrence matrix

I have the below dataframe and hoping to:
(1) build a 4x4 co-occurrence matrix.
(2) use a loop to run this, as I am using a much larger dataset with more variables.
a <- rep(c("a", "a", "b", "c"), 4)
b <- rep(c("b", "c", "d", "d"), 4)
df <- data.frame(a,b)
out <- matrix(0,
nrow = 4, # how do I call just the levels?
ncol = 4)
out
The below code did not work, but may assist in someone helping me figure this out.
for (i in 1:nrow(df)) {
ind <- which(lvls == df[i, "a"])
out[i, ind] <- 1
}
out
# loop over variables in b
for (j in 1:nrow(df)) {
ind <- which(lvls == df[j, "b"])
out[j, ind] <- 1
}
out
Here is the output I am hoping for...
[a] [b] [c] [d]
[a] 0 4 4 0
[b] 4 0 0 4
[c] 4 0 0 4
[d] 0 4 4 0
Any help would be great. Thanks in advance!
You could try
lvls <- sort(as.character(unique(unlist(df))))
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
m1[lower.tri(m1)] <- m1[upper.tri(m1)]
class(m1) <- "matrix"
dimnames(m1) <- unname(dimnames(m1)) #as suggested by #Richard Scriven
m1
# a b c d
# a 0 4 4 0
# b 4 0 0 4
# c 4 0 0 4
# d 0 4 4 0
Update
Suppose, if your data is changed (contributed by #user20650)
df[1, ] <- c("b", "a")
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
m2 <- m1 + t(m1)
m2 #you can convert to class `matrix` and change the dimnames as above
# b
#a b a c d
#b 0 4 0 4
#a 4 0 4 0
#c 0 4 0 4
#d 4 0 4 0
Update2
If, you don't want a symmetric matrix and would like to have the actual counts
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
indx <- !m1 & lower.tri(m1)
m1[indx] <- m1[t(indx)]
class(m1) <- "matrix"
dimnames(m1) <- unname(dimnames(m1))
m1
# b a c d
#b 0 1 0 4
#a 3 0 4 0
#c 0 4 0 4
#d 4 0 4 0
table(as.character(interaction(df,sep="")))
#ab ac ba bd cd
#3 4 1 4 4
Update3
Regarding multiple variables, I am not sure about the expected result, perhaps this helps:
indx <- combn(colnames(df1),2)
res <- Reduce(`+`,lapply(split(indx, col(indx)), function(x) table(df1[x])))
dimnames(res) <- unname(dimnames(res))
res
# a b c d e f g
#a 4 9 5 4 2 6 5
#b 8 6 13 6 5 9 3
#c 6 8 7 5 2 7 2
#d 4 3 5 6 2 2 6
#e 8 6 8 11 3 5 5
#f 4 4 3 5 2 1 4
#g 1 4 2 5 3 2 4
data
a <- rep(c("a", "a", "b", "c"), 4)
b <- rep(c("b", "c", "d", "d"), 4)
df <- data.frame(a,b, stringsAsFactors=FALSE)
Data with multiple columns
set.seed(24)
df1 <- as.data.frame(matrix(sample(letters[1:7], 6*16, replace=TRUE), ncol=6))
lvls1 <- sort(as.character(unique(unlist(df1))))
df1[] <- lapply(df1, function(x) factor(x, levels=lvls1))

Resources