Run loop to build co-occurrence matrix - r

I have the below dataframe and hoping to:
(1) build a 4x4 co-occurrence matrix.
(2) use a loop to run this, as I am using a much larger dataset with more variables.
a <- rep(c("a", "a", "b", "c"), 4)
b <- rep(c("b", "c", "d", "d"), 4)
df <- data.frame(a,b)
out <- matrix(0,
nrow = 4, # how do I call just the levels?
ncol = 4)
out
The below code did not work, but may assist in someone helping me figure this out.
for (i in 1:nrow(df)) {
ind <- which(lvls == df[i, "a"])
out[i, ind] <- 1
}
out
# loop over variables in b
for (j in 1:nrow(df)) {
ind <- which(lvls == df[j, "b"])
out[j, ind] <- 1
}
out
Here is the output I am hoping for...
[a] [b] [c] [d]
[a] 0 4 4 0
[b] 4 0 0 4
[c] 4 0 0 4
[d] 0 4 4 0
Any help would be great. Thanks in advance!

You could try
lvls <- sort(as.character(unique(unlist(df))))
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
m1[lower.tri(m1)] <- m1[upper.tri(m1)]
class(m1) <- "matrix"
dimnames(m1) <- unname(dimnames(m1)) #as suggested by #Richard Scriven
m1
# a b c d
# a 0 4 4 0
# b 4 0 0 4
# c 4 0 0 4
# d 0 4 4 0
Update
Suppose, if your data is changed (contributed by #user20650)
df[1, ] <- c("b", "a")
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
m2 <- m1 + t(m1)
m2 #you can convert to class `matrix` and change the dimnames as above
# b
#a b a c d
#b 0 4 0 4
#a 4 0 4 0
#c 0 4 0 4
#d 4 0 4 0
Update2
If, you don't want a symmetric matrix and would like to have the actual counts
df[] <- lapply(df, function(x) factor(x, levels=lvls) )
m1 <- table(df)
indx <- !m1 & lower.tri(m1)
m1[indx] <- m1[t(indx)]
class(m1) <- "matrix"
dimnames(m1) <- unname(dimnames(m1))
m1
# b a c d
#b 0 1 0 4
#a 3 0 4 0
#c 0 4 0 4
#d 4 0 4 0
table(as.character(interaction(df,sep="")))
#ab ac ba bd cd
#3 4 1 4 4
Update3
Regarding multiple variables, I am not sure about the expected result, perhaps this helps:
indx <- combn(colnames(df1),2)
res <- Reduce(`+`,lapply(split(indx, col(indx)), function(x) table(df1[x])))
dimnames(res) <- unname(dimnames(res))
res
# a b c d e f g
#a 4 9 5 4 2 6 5
#b 8 6 13 6 5 9 3
#c 6 8 7 5 2 7 2
#d 4 3 5 6 2 2 6
#e 8 6 8 11 3 5 5
#f 4 4 3 5 2 1 4
#g 1 4 2 5 3 2 4
data
a <- rep(c("a", "a", "b", "c"), 4)
b <- rep(c("b", "c", "d", "d"), 4)
df <- data.frame(a,b, stringsAsFactors=FALSE)
Data with multiple columns
set.seed(24)
df1 <- as.data.frame(matrix(sample(letters[1:7], 6*16, replace=TRUE), ncol=6))
lvls1 <- sort(as.character(unique(unlist(df1))))
df1[] <- lapply(df1, function(x) factor(x, levels=lvls1))

Related

Adjacency Matrix from a dataframe

I am trying to convert an edgelist to an adjacent matrix.
Below is the sample data
#Sample Data
User<-c("1","1","2","3","4")
v1 <- c("b", "b", "a", "d", "c")
v2 <- c("c", "d", "c", "a", "a")
v3 <- c(0, 0, "d", 0, "b")
v4 <- c(0, 0, 0, 0, 0)
v5 <- c(0, 0, 0, 0, 0)
my_data<-data.frame(User, v1, v2, v3, v4, v5)
my_data
If you run this code you will get the below as output,
User v1 v2 v3 v4 v5
1 b c 0 0 0
1 b d 0 0 0
2 a c d 0 0
3 d a 0 0 0
4 c a b 0 0
Using the data, I want to create an adjacent matrix that looks like follows:
a b c d
a 0 0 2 2
b 0 0 1 1
c 2 1 0 1
d 2 1 1 0
Basically, the desired output diplays the count how many times each pair appeared in column v1~v5 in the sample data frame.
I have tried to use AdjacencyFromEdgelist function from dils library, also tried to create a matrix shell with NAs and fill out the matrix by looping through the dataframe.
However, I could not get neither way to work.
I think this may be close to what you have in mind. In the rows where there are more than 2 vertices, I considered every existing pairs:
library(igraph)
do.call(rbind, my_data[-1] |>
apply(1, \(x) x[x != 0]) |>
lapply(\(x) t(combn(x, m = 2)))) |>
graph_from_edgelist(directed = FALSE) %>%
as_adjacency_matrix()
4 x 4 sparse Matrix of class "dgCMatrix"
b c d a
b . 2 1 1
c 2 . 1 2
d 1 1 . 2
a 1 2 2 .
Or without the pip operator in base R:
tmp <- apply(my_data[-1], 1, function(x) x[x != 0])
tmp <- do.call(rbind, lapply(tmp, function(x) t(combn(x, m = 2))))
my_graph <- graph_from_edgelist(tmp, directed = FALSE)
adj_mat <- as_adjacency_matrix(my_graph)
adj_mat
Another attempt, minus the need to calculate all the combinations with combn
sel <- my_data[-1] != 0
dat <- data.frame(row=row(my_data[-1])[sel], value = my_data[-1][sel])
out <- crossprod(table(dat))
diag(out) <- 0
out
# value
#value a b c d
# a 0 1 2 2
# b 1 0 2 1
# c 2 2 0 1
# d 2 1 1 0
Matches the result from #AnoushiravanR:
adj_mat[c("a","b","c","d"), c("a","b","c","d")]
#4 x 4 sparse Matrix of class "dgCMatrix"
# a b c d
#a . 1 2 2
#b 1 . 2 1
#c 2 2 . 1
#d 2 1 1 .
Another igraph option
do.call(
rbind,
combn(df, 2, setNames, nm = c("from", "to"), simplify = FALSE)
) %>%
filter(from > 0 & to > 0) %>%
arrange(from) %>%
graph_from_data_frame(directed = FALSE) %>%
get.adjacency(sparse = FALSE)
gives
a b c d
a 0 1 2 2
b 1 0 2 1
c 2 2 0 1
d 2 1 1 0

Find values in data frame 2 which is found in data frame 1, within a certain range

I want to find which values in df2 which is also present in df1, within a certain range. One value is considering both a and b in the data frames (a & b can't split up). For examples, can I find 9,1 (df1[1,1]) in df2? It doesn't have to be on the same position. Also, we can allow a diff of for example 1 for "a" and 1 for "b". For example, I want to find all values 9+-1,1+-1 in df2. "a" & "b" always go together, each row stick together. Does anyone have a suggestion of how to code this? Many many thanks!
set.seed(1)
a <- sample(10,5)
set.seed(1)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df1 <- data.frame(feature,a,b)
df1
> df1
feature a b
A 9 1
B 4 4
C 7 1
D 1 2
E 2 5
set.seed(2)
a <- sample(10,5)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df2 <- data.frame(feature,a,b)
df2
df2
feature a b
A 5 1
B 6 4
C 9 5
D 1 1
E 10 2
Not correct but Im imaging this can be done for a for loop somehow!
for(i in df1[,1]) {
for(j in df1[,2]){
s<- c(s,(df1[i,1] & df1[j,2]== df2[,1] & df2[,2]))# how to add certain allowed diff levels?
}
}
s
Output wanted:
feature_df1 <- LETTERS[1:5]
match <- c(1,0,0,1,0)
feature_df2 <- c("E","","","D", "")
df <- data.frame(feature_df1, match, feature_df2)
df
feature_df1 match feature_df2
A 1 E
B 0
C 0
D 1 D
E 0
I loooove data.table, which is (imo) the weapon of choice for these kind of problems..
library( data.table )
#make df1 and df2 a data.table
setDT(df1, key = "feature"); setDT(df2)
#now perform a join operation on each row of df1,
# creating an on-the-fly subset of df2
df1[ df1, c( "match", "feature_df2") := {
val = df2[ a %between% c( i.a - 1, i.a + 1) & b %between% c(i.b - 1, i.b + 1 ), ]
unique_val = sort( unique( val$feature ) )
num_val = length( unique_val )
list( num_val, paste0( unique_val, collapse = ";" ) )
}, by = .EACHI ][]
# feature a b match feature_df2
# 1: A 9 1 1 E
# 2: B 4 4 0
# 3: C 7 1 0
# 4: D 1 2 1 D
# 5: E 2 5 0
One way to go about this in Base R would be to split the data.frames() into a list of rows then calculate the absolute difference of row vectors to then evaluate how large the absolute difference is and if said difference is larger than a given value.
Code
# Find the absolute difference of all row vectors
listdif <- lapply(l1, function(x){
lapply(l2, function(y){
abs(x - y)
})
})
# Then flatten the list to a list of data.frames
listdifflat <- lapply(listdif, function(x){
do.call(rbind, x)
})
# Finally see if a pair of numbers is within our threshhold or not
m1 <- 2
m2 <- 3
listfin <- Map(function(x){
x[1] > m1 | x[2] > m2
},
listdifflat)
head(listfin, 1)
[[1]]
V1
[1,] TRUE
[2,] FALSE
[3,] TRUE
[4,] TRUE
[5,] TRUE
[6,] TRUE
[7,] TRUE
[8,] TRUE
[9,] TRUE
[10,] TRUE
Data
df1 <- read.table(text = "
4 1
7 5
1 5
2 10
13 6
19 10
11 7
17 9
14 5
3 5")
df2 <- read.table(text = "
15 1
6 3
19 6
8 2
1 3
13 7
16 8
12 7
9 1
2 6")
# convert df to list of row vectors
l1<- lapply(1:nrow(df1), function(x){
df1[x, ]
})
l2 <- lapply(1:nrow(df2), function(x){
df2[x, ]
})

R-Converting Incidence matrix(csv file) to edge list format

I am studying social network analysis and will be using Ucinet to draw network graphs. For this, I have to convert the csv file to an edge list format. Converting the adjacency matrix to the edge list was successful. However, it is difficult to convert an incidence matrix to the edge list format.
The csv file('some.csv') I have, with a incidence matrix like this:
A B C D
a 1 0 3 1
b 0 0 0 2
c 3 2 0 1
The code that converted the adjacency matrix to the edge list was as follows:
x<-read.csv("C:/.../something.csv", header=T, row.names=1)
net<-as.network(x, matrix.type='adjacency', ignore.eval=FALSE, names.eval='dd', loops=FALSE)
el<-edgelist(net, attrname='dd')
write.csv(el, file='C:/.../result.csv')
Now It only succeedded in loading the file. I tried to follow the above method, but I get an error.
y<-read.csv("C:/.../some.csv", header=T, row.names=1)
net2<-network(y, matrix.type='incidence', ignore.eval=FALSE, names.eval='co', loops=FALSE)
Error in network.incidence(x, g, ignore.eval, names.eval, na.rm, edge.check) :
Supplied incidence matrix has empty head/tail lists. (Did you get the directedness right?)
I want to see the result in this way:
a A 1
a C 3
a D 1
b D 2
c A 3
c B 2
c D 1
I tried to put the values as the error said, but I could not get the result i wanted.
Thank you for any assistance with this.
Here's your data:
inc_mat <- matrix(
c(1, 0, 3, 1,
0, 0, 0, 2,
3, 2, 0, 1),
nrow = 3, ncol = 4, byrow = TRUE
)
rownames(inc_mat) <- letters[1:3]
colnames(inc_mat) <- LETTERS[1:4]
inc_mat
#> A B C D
#> a 1 0 3 1
#> b 0 0 0 2
#> c 3 2 0 1
Here's a generalized function that does the trick:
as_edgelist.weighted_incidence_matrix <- function(x, drop_rownames = TRUE) {
melted <- do.call(cbind, lapply(list(row(x), col(x), x), as.vector)) # 3 col matrix of row index, col index, and `x`'s values
filtered <- melted[melted[, 3] != 0, ] # drop rows where column 3 is 0
# data frame where first 2 columns are...
df <- data.frame(mode1 = rownames(x)[filtered[, 1]], # `x`'s rownames, indexed by first column in `filtered``
mode2 = colnames(x)[filtered[, 2]], # `x`'s colnames, indexed by the second column in `filtered`
weight = filtered[, 3], # the third column in `filtered`
stringsAsFactors = FALSE)
out <- df[order(df$mode1), ] # sort by first column
if (!drop_rownames) {
return(out)
}
`rownames<-`(out, NULL)
}
Take it for a spin:
el <- as_edgelist.weighted_incidence_matrix(inc_mat)
el
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Here are the results you wanted:
control_df <- data.frame(
mode1 = c("a", "a", "a", "b", "c", "c", "c"),
mode2 = c("A", "C", "D", "D", "A", "B", "D"),
weight = c(1, 3, 1, 2, 3, 2, 1),
stringsAsFactors = FALSE
)
control_df
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Do they match?
identical(control_df, el)
#> [1] TRUE
This might not be the most efficient way, but it produces expected result:
y <- matrix( c(1,0,3,0,0,2,3,0,0,1,2,1), nrow=3)
colnames(y) <- c("e.A","e.B","e.C","e.D")
dt <- data.frame(rnames=c("a","b","c"))
dt <- cbind(dt, y)
# rnames e.A e.B e.C e.D
#1 a 1 0 3 1
#2 b 0 0 0 2
#3 c 3 2 0 1
# use reshape () function to convert dataframe into the long format
M <- reshape(dt, direction="long", idvar = "rnames", varying = c("e.A","e.B","e.C","e.D"))
M <- M[M$e >0,]
M
# rnames time e
# a.A a A 1
# c.A c A 3
# c.B c B 2
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.D c D 1
# If M needs to be sorted by the column rnames:
M[order(M$rnames), ]
# rnames time e
# a.A a A 1
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.A c A 3
# c.B c B 2
# c.D c D 1

In R: Split a character vector to find specific characters and return a data frame

I want to be able to extract specific characters from a character vector in a data frame and return a new data frame. The information I want to extract is auditors remark on a specific company's income and balance sheet. My problem is that the auditors remarks are stored in vectors containing the different remarks. For instance:
vec = c("A C G H D E"). Since "A" %in% vec won't return TRUE, I have to use strsplit to break up each character vector in the data frame, hence "A" %in% unlist(strsplit(dat[i, 2], " "). This returns TRUE.
Here is a MWE:
dat <- data.frame(orgnr = c(1, 2, 3, 4), rat = as.character(c("A B C")))
dat$rat <- as.character(dat$rat)
dat[2, 2] <- as.character(c("A F H L H"))
dat[3, 2] <- as.character(c("H X L O"))
dat[4, 2] <- as.character(c("X Y Z A B C"))
Now, to extract information about every single letter in the rat coloumn, I've tried several approaches, following similar problems such as Roland's answer to a similar question (How to split a character vector into data frame?)
DF <- data.frame(do.call(rbind, strsplit(dat$rat, " ", fixed = TRUE)))
DF
X1 X2 X3 X4 X5 X6
1 A B C A B C
2 A F H L H A
3 H X L O H X
4 X Y Z A B C
This returnsthe following error message: Warning message:
In (function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 2)
It would be a desirable approach since it's fast, but I can't use DF since it recycles.
Is there a way to insert NA instead of the recycling because of the different length of the vectors?
So far I've found a solution to the problem by using for-loops in combination with ifelse-statements. However, with 3 mill obs. this approach takes years!
dat$A <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 3] <- ifelse("A" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
dat$B <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 4] <- ifelse("B" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
This gives the results I want:
dat
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1
I've searched through most of the relevant questions I could find here on StackOverflow. This one is really close to my problem: How to convert a list consisting of vector of different lengths to a usable data frame in R?, but I don't know how to implement strsplit with that approach.
We can use for-loop with grepl to achieve this task. + 0 is to convert the column form TRUE or FALSE to 1 or 0
for (col in c("A", "B")){
dat[[col]] <- grepl(col, dat$rat) + 0
}
dat
# orgnr rat A B
# 1 1 A B C 1 1
# 2 2 A F H L H 1 0
# 3 3 H X L O 0 0
# 4 4 X Y Z A B C 1 1
If performance is an issue, try this data.table approach.
library(data.table)
# Convert to data.table
setDT(dat)
# Create a helper function
dummy_fun <- function(col, vec){
grepl(col, vec) + 0
}
# Apply the function to A and B
dat[, c("A", "B") := lapply(c("A", "B"), dummy_fun, vec = rat)]
dat
# orgnr rat A B
# 1: 1 A B C 1 1
# 2: 2 A F H L H 1 0
# 3: 3 H X L O 0 0
# 4: 4 X Y Z A B C 1 1
using Base R:
a=strsplit(dat$rat," ")
b=data.frame(x=rep(dat$orgnr,lengths(a)),y=unlist(a),z=1)
cbind(dat,as.data.frame.matrix(xtabs(z~x+y,b)))
orgnr rat A B C F H L O X Y Z
1 1 A B C 1 1 1 0 0 0 0 0 0 0
2 2 A F H L H 1 0 0 1 2 1 0 0 0 0
3 3 H X L O 0 0 0 0 1 1 1 1 0 0
4 4 X Y Z A B C 1 1 1 0 0 0 0 1 1 1
From here you can Just call those columns that you want:
d=as.data.frame.matrix(xtabs(z~x+y,b))
cbind(dat,d[c("A","B")])
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1

change data.frame column into rows in R

A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
I would like to have this:
df
1 2
A 1 6
B 2 7
C 3 8
D 4 9
E 5 0
If your dataframe is truly in that format, then all of your vectors will be character vectors. Or, you basically have a character matrix and you could do this:
data.frame(t(df))
It would be better, though, to just define it the way you want it from the get-go
df <- data.frame(c('A','B','C','D','E'),
c(1, 2, 3, 4, 5),
c(6, 7, 8, 9, 0))
You could also do this
df <- data.frame(LETTERS[1:5], 1:5, c(6:9, 0))
If you wanted to give the columns names, you could do this
df <- data.frame(L = LETTERS[1:5], N1 = 1:5, N2 = c(6:9, 0))
Sometimes, if I use read.DIF of Excel data the data gets transposed. Is that how you got the original data in? If so, you can call
read.DIF(filename, transpose = T)
to get the data in the correct orientation.
I really recommend data.table approach without manual steps becauce they are error-prone
A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
library('data.table')
dat.m <- melt(as.data.table(df, keep.rownames = "Vars"), id.vars = "Vars") # https://stackoverflow.com/a/44128640/54964
dat.m
Output
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
Vars variable value
1: 1 A 1
2: 2 A 6
3: 1 B 2
4: 2 B 7
5: 1 C 3
6: 2 C 8
7: 1 D 4
8: 2 D 9
9: 1 E 5
10: 2 E 0
R: 3.4.0 (backports)
OS: Debian 8.7

Resources