Build word co-occurrence edge list in R

I have a set of sentences and I want to build an undirected edge list of word co-occurrence within sentences, with the frequency of every edge. I looked at the tm package but didn't find a function for this. Is there some package/script I can use? Thanks a lot!
Note: a word doesn't co-occur with itself, and a word that appears two or more times in a sentence co-occurs with the other words in that sentence only once.
DF:
sentence_id text
1 a b c d e
2 a b b e
3 b c d
4 a e
5 a
6 a a a
OUTPUT
word1 word2 freq
a b 2
a c 1
a d 1
a e 3
b c 2
b d 2
b e 2
c d 2
c e 1
d e 1
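For reference, if you are open to additional packages, this can be done compactly with tidytext and widyr (a sketch, assuming both packages are installed and the example data is in a data frame dat with sentence_id and text columns, as in the first answer below):
library(dplyr); library(tidytext); library(widyr)
dat %>%
    mutate(text = as.character(text)) %>%    # ensure text is character, not factor
    unnest_tokens(word, text) %>%            # one row per (sentence_id, word)
    distinct(sentence_id, word) %>%          # a repeated word counts only once per sentence
    pairwise_count(word, sentence_id, upper = FALSE) %>%  # undirected pairs with frequencies
    arrange(item1, item2)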

The following is convoluted, so there's got to be a better approach:
dat <- read.csv(text="sentence_id, text
1, a b c d e
2, a b b e
3, b c d
4, a e", header=TRUE)
library(qdapTools); library(tidyr)
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
out <- x %*% t(x)
out[upper.tri(out, diag=TRUE)] <- NA
out2 <- matrix2df(out, "word1") %>%
    gather(word2, freq, -word1) %>%
    na.omit()
rownames(out2) <- NULL
out2
## word1 word2 freq
## 1 b a 2
## 2 c a 1
## 3 d a 1
## 4 e a 3
## 5 c b 2
## 6 d b 2
## 7 e b 2
## 8 d c 2
## 9 e c 1
## 10 e d 1
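If you want word1/word2 ordered as in the question's expected output, the pairs in out2 can be re-sorted afterwards, e.g. (a base R sketch):
w1 <- as.character(out2$word1); w2 <- as.character(out2$word2)
out2_sorted <- data.frame(word1 = pmin(w1, w2), word2 = pmax(w1, w2),
                          freq = out2$freq, stringsAsFactors = FALSE)
out2_sorted[order(out2_sorted$word1, out2_sorted$word2), ]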
Base only solution
out <- lapply(with(dat, split(text, sentence_id)), function(x) {
    strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
})

nms <- sort(unique(unlist(out)))

out2 <- lapply(out, function(x) {
    as.data.frame(table(x), stringsAsFactors = FALSE)
})

dat2 <- data.frame(x = nms)

for (i in seq_along(out2)) {
    m <- merge(dat2, out2[[i]], all.x = TRUE)
    names(m)[i + 1] <- dat[["sentence_id"]][i]
    dat2 <- m
}

dat2[is.na(dat2)] <- 0

x <- as.matrix(dat2[, -1]) > 0
out3 <- x %*% t(x)
out3[upper.tri(out3, diag = TRUE)] <- NA
dimnames(out3) <- list(dat2[[1]], dat2[[1]])

out4 <- na.omit(data.frame(
    word1 = rep(rownames(out3), ncol(out3)),
    word2 = rep(colnames(out3), each = nrow(out3)),
    freq = c(unlist(out3)),
    stringsAsFactors = FALSE
))
row.names(out4) <- NULL
out4

This is very closely related to @TylerRinker's answer, but using different tools.
library(splitstackshape)
library(reshape2)
# 'd' is the data frame read in with read.table in the base R answer below
temp <- crossprod(
    as.matrix(
        cSplit_e(d, "text", " ", type = "character",
                 fill = 0, drop = TRUE)[-1]))
temp[upper.tri(temp, diag = TRUE)] <- NA
melt(temp, na.rm = TRUE)
# Var1 Var2 value
# 2 text_b text_a 2
# 3 text_c text_a 1
# 4 text_d text_a 1
# 5 text_e text_a 3
# 8 text_c text_b 2
# 9 text_d text_b 2
# 10 text_e text_b 2
# 14 text_d text_c 2
# 15 text_e text_c 1
# 20 text_e text_d 1
The "text_" parts of "Var1" and "Var2" can be stripped easily with sub or gsub.

Here's a base R way:
d <- read.table(text='sentence_id text
1 "a b c d e"
2 "a b b e"
3 "b c d"
4 "a e"', header=TRUE, as.is=TRUE)
result.vec <- table(unlist(lapply(d$text, function(text) {
    pairs <- combn(unique(scan(text = text, what = '', sep = ' ')), m = 2)
    interaction(pairs[1, ], pairs[2, ])
})))
# a.b b.b c.b d.b a.c b.c c.c d.c a.d b.d c.d d.d a.e b.e c.e d.e
# 2 0 0 0 1 2 0 0 1 2 2 0 3 2 1 1
result <- subset(data.frame(do.call(rbind, strsplit(names(result.vec), '\\.')), freq=as.vector(result.vec)), freq > 0)
with(result, result[order(X1, X2),])
# X1 X2 freq
# 1 a b 2
# 5 a c 1
# 9 a d 1
# 13 a e 3
# 6 b c 2
# 10 b d 2
# 14 b e 2
# 11 c d 2
# 15 c e 1
# 16 d e 1

Related

Post-processing of full_join output to remove multiplicity

I have two data frames (df1, df2) and performed a full_join using the common column of interest, col1.
df1 <- data.frame(col1=c('A','D','C','C','E','E','I'),col2=c(4,7,8,3,2,4,9))
df2 <- data.frame(col1=c('A','A','B','C','C','E','E','I'),col2=c(4,1,6,8,3,2,1,9))
df1 %>% full_join(df2, by = "col1")
# col1 col2.x col2.y
# 1 A 4 4
# 2 A 4 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
# 12 I 9 9
# 13 B NA 6
As expected, the full_join produces multiple rows for repeated values of the joining column, and I wish to avoid that. I want to arrive at the following output. What kind of post-processing approaches do you suggest?
# col1 col2.x col2.y
# 1 A 4 4
# 2 A NA 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 3 3
# 6 E 2 2
# 7 E 4 1
# 8 I 9 9
# 9 B NA 6
More information:
Case 1: I do not want four rows in the output when the same value appears twice in both inputs:
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
instead, I want only two as:
# 4 C 8 8
# 5 C 3 3
Case 2: Similarly, when the values differ, I want them paired up row by row:
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
instead, I want only two rows as below:
# 8 E 2 2
# 9 E 4 1
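One simple post-processing idea (a dplyr sketch; the answers below use data.table and base R instead) is to number the duplicates within each col1 group and join on col1 plus that counter, so the first occurrence in df1 pairs with the first in df2, the second with the second, and NA pads whichever side runs out:
library(dplyr)
df1n <- df1 %>% group_by(col1) %>% mutate(rn = row_number()) %>% ungroup()
df2n <- df2 %>% group_by(col1) %>% mutate(rn = row_number()) %>% ungroup()
full_join(df1n, df2n, by = c("col1", "rn")) %>% select(-rn)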
A possible solution in two steps using the data.table package:
0) load the package & convert to data.tables
library(data.table)
setDT(df1)
setDT(df2)
1) define helper function
unlistSD <- function(x) {
    l <- length(x)
    ls <- sapply(x, lengths)
    m <- max(ls)
    newSD <- vector(mode = "list", length = l)
    for (i in 1:l) {
        u <- unlist(x[[i]])
        lu <- length(u)
        if (lu < m) {
            u <- c(u, rep(NA_real_, m - lu))
        }
        newSD[[i]] <- u
    }
    return(setNames(as.list(newSD), names(x)))
}
2) merge and apply helper function
merge(df1[, .(col2 = list(col2)), by = col1],
df2[, .(col2 = list(col2)), by = col1],
by = "col1", all = TRUE
)[, unlistSD(.SD), by = col1]
which gives the following result:
col1 col2.x col2.y
1: A 4 4
2: A NA 1
3: C 8 8
4: C 3 3
5: D 7 NA
6: E 2 2
7: E 4 1
8: I 9 9
9: B NA 6
Another possibility with base R:
unlistDF <- function(d, groupcols) {
    ds <- split(d[, setdiff(names(d), groupcols)], d[, groupcols])
    ls <- lapply(ds, function(x) max(sapply(x, lengths)))
    dl <- lapply(ds, function(x) lapply(as.list(x), unlist))
    du <- Map(function(x, y) {
        lapply(x, function(i) {
            if (length(i) < y) {
                c(i, rep(NA_real_, y - length(i)))
            } else i
        })
    }, x = dl, y = ls)
    ld <- lapply(du, as.data.frame)
    cbind(d[rep(1:nrow(d), ls), groupcols, drop = FALSE],
          do.call(rbind.data.frame, c(ld, make.row.names = FALSE)),
          row.names = NULL)
}
Now you can use this function as follows in combination with merge:
df <- merge(aggregate(col2 ~ col1, df1, as.list),
aggregate(col2 ~ col1, df2, as.list),
by = "col1", all = TRUE)
unlistDF(df, "col1")

Find immediate neighbors by group using data table or igraph

I have a data.table:
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"),
                     code_1 = c(2, 2, 2, 7, 8, NA, 5),
                     code_2 = c(NA, 3, NA, 3, NA, NA, 2),
                     code_3 = c(4, 1, 1, 4, 4, 1, 8))
group code_1 code_2 code_3
A 2 NA 4
B 2 3 1
C 2 NA 1
D 7 3 4
E 8 NA 4
F NA NA 1
G 5 2 8
What I would like to achieve is, for each group, to find the immediate neighbors based on the available codes. For example: group A has immediate neighbors B and C due to code_1 (code_1 equals 2 in all three groups) and immediate neighbors D and E due to code_3 (code_3 equals 4 in all those groups).
What I tried is for each code, subsetting the first column (group) based on the matches as follows:
groups$code_1_match = list()
for (row in 1:nrow(groups)) {
    set(groups, i = row, j = "code_1_match",
        list(groups$group[groups$code_1[row] == groups$code_1]))
}
group code_1 code_2 code_3 code_1_match
A 2 NA 4 A,B,C,NA
B 2 3 1 A,B,C,NA
C 2 NA 1 A,B,C,NA
D 7 3 4 D,NA
E 8 NA 4 E,NA
F NA NA 1 NA,NA,NA,NA,NA,NA,...
G 5 2 8 NA,G
This "kinda" works but I would assume there is a more data table kind of way of doing this. I tried
groups[, code_1_match_2 := list(group[code_1 == groups$code_1])]
But this doesn't work.
Am I missing some obvious data table trick to deal with it?
My ideal case result would look like this (which currently would require using my method for all 3 columns and then concatenating the results):
group code_1 code_2 code_3 Immediate neighbors
A 2 NA 4 B,C,D,E
B 2 3 1 A,C,D,F
C 2 NA 1 A,B,F
D 7 3 4 B,A
E 8 NA 4 A,D
F NA NA 1 B,C
G 5 2 8
Using igraph: get the 2nd-degree neighbours, drop the code_x_y nodes, and paste the remaining nodes together.
library(data.table)
library(igraph)
# reshape wide-to-long
x <- melt(groups, id.vars = "group")[!is.na(value)]
# convert to graph
g <- graph_from_data_frame(x[, .(from = group, to = paste0(variable, "_", value))])
# get 2nd degree neighbours
x1 <- ego(g, 2, nodes = groups$group)
# prettify the result
groups$res <- sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]),
groups$group[ -i ])))
# group code_1 code_2 code_3 res
# 1: A 2 NA 4 B, C, D, E
# 2: B 2 3 1 A, C, D, F
# 3: C 2 NA 1 A, B, F
# 4: D 7 3 4 B, A, E
# 5: E 8 NA 4 A, D
# 6: F NA NA 1 B, C
# 7: G 5 2 8
More info
This is what our data looks like before converting to an igraph object. We want to ensure that code_1 with value 2 is treated as different from code_2 with value 2, etc.
x[, .(from = group, to = paste0(variable, "_", value))]
# from to
# 1: A code_1_2
# 2: B code_1_2
# 3: C code_1_2
# 4: D code_1_7
# 5: E code_1_8
# 6: G code_1_5
# 7: B code_2_3
# 8: D code_2_3
# 9: G code_2_2
# 10: A code_3_4
# 11: B code_3_1
# 12: C code_3_1
# 13: D code_3_4
# 14: E code_3_4
# 15: F code_3_1
# 16: G code_3_8
Here is what the network looks like (plot not shown):
Note that A..G nodes are always connected through code_x_y.
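To inspect that structure yourself, a quick plotting sketch (the colours are arbitrary):
plot(g, vertex.color = ifelse(V(g)$name %in% groups$group, "tomato", "lightblue"))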
So we need to go to the 2nd degree: ego(..., order = 2) gives us neighbours up to and including 2nd-degree neighbours, and returns a list object.
To get the names:
lapply(x1, names)
# [[1]]
# [1] "A" "code_1_2" "code_3_4" "B" "C" "D" "E"
#
# [[2]]
# [1] "B" "code_1_2" "code_2_3" "code_3_1" "A" "C" "D" "F"
#
# [[3]]
# [1] "C" "code_1_2" "code_3_1" "A" "B" "F"
#
# [[4]]
# [1] "D" "code_1_7" "code_2_3" "code_3_4" "B" "A" "E"
#
# [[5]]
# [1] "E" "code_1_8" "code_3_4" "A" "D"
#
# [[6]]
# [1] "F" "code_3_1" "B" "C"
#
# [[7]]
# [1] "G" "code_1_5" "code_2_2" "code_3_8"
To prettify the result, we need to remove the code_x_y nodes and the origin node (the 1st node):
sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]), groups$group[ -i ])))
#[1] "B, C, D, E" "A, C, D, F" "A, B, F" "B, A, E" "A, D" "B, C" ""
There is probably a more practical way of achieving this, but you could do something like the following, using melt and joins:
mgrp <- melt(groups, id.vars = "group")[!is.na(value)]
setkey(mgrp, variable, value)
for (i in seq_along(groups$group)) {
    let = groups$group[i]
    set(
        groups,
        i = i,
        j = "inei",
        value = list(mgrp[mgrp[group == let], setdiff(unique(group), let)])
    )
}
groups
# group code_1 code_2 code_3 inei
# 1: A 2 NA 4 B,C,D,E
# 2: B 2 3 1 A,C,D,F
# 3: C 2 NA 1 A,B,F
# 4: D 7 3 4 B,A,E
# 5: E 8 NA 4 A,D
# 6: F NA NA 1 B,C
# 7: G 5 2 8
As mentioned by zx8754, use data.table::melt with combn and then igraph::as_adjacency_matrix:
library(data.table)
df <- melt(groups, id.vars="group", na.rm=TRUE)[,
if (.N > 1L) transpose(combn(group, 2L, simplify=FALSE)), value][, (1) := NULL]
library(igraph)
as_adjacency_matrix(graph_from_data_frame(df, FALSE))
output:
7 x 7 sparse Matrix of class "dgCMatrix"
A B C E D G F
A . 1 1 1 1 1 .
B 1 . 2 . 1 1 1
C 1 2 . . . 1 1
E 1 . . . 1 1 .
D 1 1 . 1 . . .
G 1 1 1 1 . . .
F . 1 1 . . . .
or without using igraph
x <- df[, unique(c(V1, V2))]
df <- rbindlist(list(df, data.table(x, x)))
tab <- table(df) #or xtabs(~ V1 + V2, data=df)
ans <- t(tab) + tab
diag(ans) <- 0L
ans
output:
V1
V2 A B C D E F G
A 0 1 1 1 1 0 1
B 1 0 2 1 0 1 1
C 1 2 0 0 0 1 1
D 1 1 0 0 1 0 0
E 1 0 0 1 0 0 1
F 0 1 1 0 0 0 0
G 1 1 1 0 1 0 0
This is inspired by @sindri_baldur's melt. This solution:
Melts the groups.
Performs a cartesian self-join.
Pastes together all the groups that match.
Joins back to the original data.table.
library(data.table)
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"), code_1 = c(2,2,2,7,8,NA,5), code_2 = c(NA,3,NA,3,NA,NA,2), code_3=c(4,1,1,4,4,1,8))
molten_grps = melt(groups, measure.vars = patterns("code"), na.rm = TRUE)
inei_dt = molten_grps[molten_grps,
on = .(variable, value),
allow.cartesian = TRUE
][,
.(inei = paste0(setdiff(i.group, .BY[[1L]]), collapse = ", ")),
by = group]
groups[inei_dt, on = .(group), inei := inei]
groups
#> group code_1 code_2 code_3 inei
#> <char> <num> <num> <num> <char>
#> 1: A 2 NA 4 B, C, D, E
#> 2: B 2 3 1 A, C, D, F
#> 3: C 2 NA 1 A, B, F
#> 4: D 7 3 4 B, A, E
#> 5: E 8 NA 4 A, D
#> 6: F NA NA 1 B, C
#> 7: G 5 2 8

R: Restricted permutations more efficient way than using for loops

I am trying to permute a character vector a of variable length, picking 3 elements every time, without repetition. Ordering counts only for the first element, not for the second and third (e.g. abc != bac != cab, but abc = acb and bca = bac). Each set of 3 permuted elements should be a row in a data frame b.
A vector with letters a,b,c,d,e would result in this expected output:
abc
abd
abe
acd
ace
ade
bac
bad
bae
bcd
bce
bde
cab
cad
cae
cbd
cbe
cde
dab
dac
dae
dbc
dbe
dce
eab
eac
ead
ebc
ebd
ecd
Using 3 for loops I think I was able to achieve this output, but it is slow if the vector is long.
a = letters[1:5]
aL = length(a)
b <- data.frame(var1 = character(),
                var2 = character(),
                var3 = character(),
                stringsAsFactors = FALSE)

# restricted permutations for moderation
pracma::tic()
for (i in 1:aL) {
    for (j in 1:(aL-1)) {
        for (k in (j+1):aL) {
            if (j != i & k != i) {
                b <- rbind(b, data.frame(a[i], a[j], a[k]))
            }
        }
    }
}
pracma::toc()
#> elapsed time is 0.070000 seconds
b
#> a.i. a.j. a.k.
#> 1 a b c
#> 2 a b d
#> 3 a b e
#> 4 a c d
#> 5 a c e
#> 6 a d e
#> 7 b a c
#> 8 b a d
#> 9 b a e
#> 10 b c d
#> 11 b c e
#> 12 b d e
#> 13 c a b
#> 14 c a d
#> 15 c a e
#> 16 c b d
#> 17 c b e
#> 18 c d e
#> 19 d a b
#> 20 d a c
#> 21 d a e
#> 22 d b c
#> 23 d b e
#> 24 d c e
#> 25 e a b
#> 26 e a c
#> 27 e a d
#> 28 e b c
#> 29 e b d
#> 30 e c d
Created on 2019-07-17 by the reprex package (v0.2.1)
How can I achieve the same outcome in less time? Is recursion faster?
Any help is greatly appreciated. Thank you.
I propose the following solution:
a = letters[1:5]
A = t(combn(a, 3))  # create all possible three-letter combinations, disregarding the order
Full = rbind(A, A[, 3:1], A[, c(2, 3, 1)])  # put each element of a combination in first place once
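As a quick sanity check (a sketch), Full has choose(5, 3) * 3 = 30 rows, the same set of restricted permutations as the loop output, just in a different row order:
b2 <- data.frame(Full, stringsAsFactors = FALSE)
nrow(b2)  # 30
head(b2)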
Here's one option for your specific example:
library(gtools)
library(dplyr)
# example vector
vec = letters[1:5]
# vectorised function to rearrange elements (based on your restriction)
f = function(x1,x2,x3) paste0(c(x1, sort(c(x2,x3))), collapse = " ")
f = Vectorize(f)
permutations(length(vec), 3, vec) %>% # get permutations
data.frame(., stringsAsFactors = F) %>% # save as data frame
mutate(vec = f(X1,X2,X3)) %>% # apply function to each row
distinct(vec, .keep_all = T) # keep distinct vec values
# X1 X2 X3 vec
# 1 a b c a b c
# 2 a b d a b d
# 3 a b e a b e
# 4 a c d a c d
# 5 a c e a c e
# 6 a d e a d e
# 7 b a c b a c
# ...
Not clear if you want your output to be 3 separate columns with 1 element each, or one column with the vector, so I'm keeping both for you to choose from. You can keep columns {X1, X2, X3} or just vec.
The following is a straightforward rewrite of the triple for loop as a triple lapply loop.
t1 <- system.time({
    for (i in 1:aL) {
        for (j in 1:(aL-1)) {
            for (k in (j+1):aL) {
                if (j != i & k != i) {
                    b <- rbind(b, data.frame(a[i], a[j], a[k]))
                }
            }
        }
    }
})
t2 <- system.time({
    d <- lapply(1:aL, function(i) {
        tmp <- lapply(1:(aL-1), function(j) {
            tmp <- lapply((j+1):aL, function(k) {
                if (j != i & k != i) c(a[i], a[j], a[k])
            })
            do.call(rbind, tmp)
        })
        do.call(rbind, tmp)
    })
    d <- do.call(rbind.data.frame, d)
    names(d) <- paste("a", 1:3, sep = ".")
})
all.equal(b, d)
#[1] "Names: 3 string mismatches"
rbind(t1, t2)
# user.self sys.self elapsed user.child sys.child
#t1 0.051 0 0.051 0 0
#t2 0.017 0 0.018 0 0

Convert a matrix to columns

Assume I have a matrix like the one below. It is symmetric: the values above and below the diagonal are the same. In other words, m[1, 2] and m[2, 1] are both 2.
> m = cbind(c(1,2,3),c(2,4,5),c(3,5,6))
> m
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 2 4 5
[3,] 3 5 6
I also have real names for 1, 2, and 3.
>Real_name
A B C # A represents 1, B represents 2, and C represents 3.
I would like to convert the matrix to 3 columns containing the corresponding real names for each pair, and each pair must be unique: A x B is the same as B x A, so we keep A x B only. How can I achieve this in R?
A A 1
A B 2
A C 3
B B 4
B C 5
C C 6
The following is straightforward:
m <- cbind(c(1,2,3), c(2,4,5), c(3,5,6))
## read `?lower.tri` and try `v <- lower.tri(m, diag = TRUE)` to see what `v` is
## read `?which` and try `which(v, arr.ind = TRUE)` to see what it gives
ij <- which(lower.tri(m, diag = TRUE), arr.ind = TRUE)
Real_name <- LETTERS[1:3]
data.frame(row = Real_name[ij[, 1]], col = Real_name[ij[, 2]], val = c(m[ij]))
# row col val
#1 A A 1
#2 B A 2
#3 C A 3
#4 B B 4
#5 C B 5
#6 C C 6
colnames(m) <- c("A", "B", "C")
rownames(m) <- c("A", "B", "C")
m[lower.tri(m)] = NA # replace lower triangular elements with NA
data.table::melt(m, na.rm = TRUE) # melt and remove NA
# Var1 Var2 value
#1 A A 1
#4 A B 2
#5 B B 4
#7 A C 3
#8 B C 5
#9 C C 6
Or you can do it in a single line: melt(replace(m, lower.tri(m), NA), na.rm = TRUE)
This will also work:
g <- expand.grid(1:ncol(m), 1:ncol(m))
g <- g[g[,2]>=g[,1],]
cbind.data.frame(sapply(g, function(x) Real_name[x]), Val=m[as.matrix(g)])
Var1 Var2 Val
1 A A 1
2 A B 2
3 B B 4
4 A C 3
5 B C 5
6 C C 6

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it by looping row by row, but is it possible to do it without looping?
Thank you!
Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f
> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
There you go; this should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]
