Union of intersecting vectors in a list in R

I have a list of vectors as follows.
data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"),
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))
I am trying to achieve the following:
Check whether any of the vectors intersect with each other
If intersecting vectors are found, get their union
So the desired output is
out <- list(v1=c("a", "b", "c", "d", "n"), v2=c("g", "h", "k", "i"))
I can get the union of a group of intersecting sets as follows.
Reduce(union, list(data[[1]], data[[3]], data[[4]]))
Reduce(union, list(data[[2]], data[[5]]))
How do I first identify the intersecting vectors? Is there a way of dividing the list into lists of groups of intersecting vectors?
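For reference, the pairwise overlap test at the heart of this is easy to sketch in base R (this only flags intersecting pairs; it does not do the grouping):
# TRUE at [i, j] when data[[i]] and data[[j]] share at least one element
overlaps <- outer(seq_along(data), seq_along(data),
                  Vectorize(function(i, j) length(intersect(data[[i]], data[[j]])) > 0))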
Update
Here is an attempt using data.table. It gets the desired result, but is still slow for large lists such as the example dataset.
library(data.table)
data <- sapply(data, function(x) paste(x, collapse=", "))
data <- as.data.frame(data, stringsAsFactors = FALSE)
repeat {
  M <- nrow(data)
  data <- data.table(data, key = "data")
  data <- data[, list(dataelement = unique(unlist(strsplit(data, ", ")))), by = list(data)]
  data <- data.table(data, key = "dataelement")
  data <- data[, list(data = paste0(sort(unique(unlist(strsplit(data, split=", ")))), collapse=", ")), by = "dataelement"]
  data$dataelement <- NULL
  data <- unique(data)
  N <- nrow(data)
  if (M == N) break
}
data <- strsplit(as.character(data$data), ", ")

This is really a graph problem, so I like to use the igraph library for it. Using your sample data, you can do:
library(igraph)
#build edgelist
el <- do.call("rbind",lapply(data, embed, 2))
#make a graph
gg <- graph.edgelist(el, directed=F)
#partition the graph into disjoint sets
split(V(gg)$name, clusters(gg)$membership)
# $`1`
# [1] "b" "a" "c" "d" "n"
#
# $`2`
# [1] "h" "g" "k" "i"
And we can view the results with
V(gg)$color=c("green","purple")[clusters(gg)$membership]
plot(gg)

Here's another approach using only base R
Update
Next update after akrun's comment and with his sample data:
data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))
Modified function:
x <- lapply(seq_along(data), function(i) {
  if (!any(data[[i]] %in% unlist(data[-i]))) {
    data[[i]]
  } else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
    NULL
  } else {
    z <- lapply(data[-seq_len(i)], intersect, data[[i]])
    z <- names(z[sapply(z, length) >= 1L])
    if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
  }
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"
This works well with the original sample data, MrFlick's sample data and akrun's sample data.

Efficiency be damned and do you people even sleep? Base R only and much slower than the fastest answer. Since I wrote it, might as well post it.
f.union = function(x) {
  repeat {
    n = length(x)
    # pairwise overlap matrix: m[i, j] is TRUE if x[[i]] and x[[j]] share an element
    m = matrix(FALSE, nrow = n, ncol = n)
    for (i in 1:n) {
      for (j in 1:n) {
        m[i, j] = any(x[[i]] %in% x[[j]])
      }
    }
    # union each element with everything it overlaps
    o = apply(m, 2, function(v) Reduce(union, x[v]))
    # done when every element overlaps only itself
    if (all(apply(m, 1, sum) == 1)) {
      return(o)
    } else {
      x = unique(o)
    }
  }
}
f.union(data)
[[1]]
[1] "a" "b" "c" "d" "n"
[[2]]
[1] "g" "h" "k" "i"
Because I like being slow. (loaded library outside of benchmark)
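The table below was presumably produced by a call along these lines (a sketch; each competitor is another answer on this page wrapped as a zero-argument function):
library(microbenchmark)
microbenchmark(vlo(), akrun(), flick(), josh(), doc(), times = 1000L)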
Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   vlo()  896.435 1070.6540 1315.8194 1129.4710 1328.6630  7859.999  1000
 akrun()  596.263  658.6590  789.9889  694.1360  804.9035  3470.158  1000
 flick()  805.854  928.8160 1160.9509 1001.8345 1172.0965  5780.824  1000
  josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909  1000 <- deleted :-(
   doc()  254.462  288.9875  354.6084  302.6415  338.9565  2734.795  1000

One option would be to use combn and then find the intersections. There may be easier options.
indx <- combn(names(data), 2)
lst <- lapply(split(indx, col(indx)),
              function(i) Reduce(`intersect`, data[i]))
indx1 <- names(lst[sapply(lst, length) > 0])
indx2 <- indx[, as.numeric(indx1)]
indx3 <- apply(indx2, 2, sort)
lapply(split(1:ncol(indx3), indx3[1,]),
       function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"
#$v2
#[1] "g" "h" "k" "i"
Update
You could use combnPrim from library(gRbase) to make this even faster. Using a slightly bigger dataset
library(gRbase)
set.seed(25)
data <- setNames(lapply(1:1e3, function(i) sample(letters,
                 sample(1:20), replace=FALSE)), paste0("v", 1:1000))
and comparing with the fastest. These are modified functions based on the OP's comments to @docendo discimus.
akrun2M <- function(){
  ind <- sapply(seq_along(data), function(i){ # copied from @docendo discimus
    !any(data[[i]] %in% unlist(data[-i]))
  })
  data1 <- data[!ind]
  indx <- combnPrim(names(data1), 2)
  lst <- lapply(split(indx, col(indx)),
                function(i) Reduce(`intersect`, data1[i]))
  indx1 <- names(lst[sapply(lst, length) > 0])
  indx2 <- indx[, as.numeric(indx1)]
  indx3 <- apply(indx2, 2, sort)
  c(data[ind], lapply(split(1:ncol(indx3), indx3[1,]),
    function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
}
doc2 <- function(){
  x <- lapply(seq_along(data), function(i) {
    if (!any(data[[i]] %in% unlist(data[-i]))) {
      data[[i]]
    } else {
      z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                           intersect, data[[i]])))])
      if (is.null(z)) z else union(data[[i]], z)
    }
  })
  x[!sapply(x, is.null)]
}
Benchmarks
microbenchmark(doc2(), akrun2M(), times=10L)
# Unit: seconds
#      expr      min       lq     mean   median       uq      max neval cld
#    doc2() 35.43687 53.76418 54.77813 54.34668 62.86665 67.76754    10   b
# akrun2M() 26.64997 28.74721 38.02259 35.35081 47.56781 49.82158    10  a

I came across a similar problem that prompted me to look everywhere for a solution. I finally found a very good one thanks to a number of great contributors here; however, when I saw this post I thought I would write my own custom function for this purpose. It's not actually elegant, and it's too slow, but I think it's quite effective and can do the trick for now until I make some improvements:
anoush <- function(x) {
  # First we check whether x is a list
  stopifnot(is.list(x))
  # Then we take every element of the input and calculate the intersect between
  # that element & the others. In case there are any, we store the indices
  # in the `vec` vector. So in the end we have a list called `ind` whose elements
  # are all the indices connected with the corresponding elements of the original
  # list. For example, the first element of `ind` is `1`, `2`, `3`, which means
  # these elements of the original list have common values.
  ind <- lapply(1:length(x), function(a) {
    vec <- c()
    for (i in 1:length(x)) {
      if (length(unique(base::intersect(x[[a]], x[[i]]))) > 0) {
        vec <- c(vec, i)
      }
    }
    vec
  })
  # Then we go on to again compare each element of `ind` with the other elements,
  # and in case there is any intersect we calculate the `union` of them.
  # For each element we end up with a list of accumulated values, but
  # in the end we use `Reduce` to capture only the last one. So for each
  # element of `ind` we end up having a collection of indices that also
  # result in duplicated values. For example, elements `1` through `5` of
  # `dup_ind` contain the same value because in the original list these
  # elements have common values.
  dup_ind <- lapply(1:length(ind), function(a) {
    out <- c()
    for (i in 1:length(ind)) {
      if (length(unique(base::intersect(ind[[a]], ind[[i]]))) > 0) {
        out[[i]] <- union(ind[[a]], ind[[i]])
      }
      vec2 <- Reduce("union", out)
    }
    vec2
  })
  # Here we get rid of the duplicated elements of the list by means of the
  # `relist` function, and since in this process all the duplicated elements
  # turn into `integer(0)`, I have filtered those out.
  un <- unlist(dup_ind)
  res <- Map(`[`, dup_ind, relist(!duplicated(un), skeleton = dup_ind))
  res2 <- Filter(length, res)
  sapply(res2, function(a) unique(unlist(lapply(a, function(b) `[[`(x, b)))))
}
OP's Data Sample
> anoush(data)
[[1]]
[1] "a" "b" "c" "d" "n"
[[2]]
[1] "g" "h" "k" "i"
Dear @akrun's Data Sample
data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))
> anoush(data)
[[1]]
[1] "g" "k"
[[2]]
[1] "a" "b" "c" "d"

In general, you cannot do much better/faster than the Floyd-Warshall algorithm, which is as follows:
library(Rcpp)
cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
    int n = w.nrow();
    for( int k = 0; k < n; k++ )
      for( int i = 0; i < (n-1); i++ )
        for( int j = i+1; j < n; j++ )
          if( w(i,k) && w(k,j) ) {
            w(i,j) = true;
            w(j,i) = true;
          }
    return w;
  }")
fw.union <- function(x) {
  n <- length(x)
  w <- matrix(FALSE, nrow=n, ncol=n)
  for (i in 1:n) {
    w[i,i] <- TRUE
  }
  for (i in 1:(n-1)) {
    for (j in (i+1):n) {
      w[i,j] <- w[j,i] <- any(x[[i]] %in% x[[j]])
    }
  }
  apply(unique(floyd(w)), 1, function(y) { Reduce(union, x[y]) })
}
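A quick check on the OP's data (my addition; apply() returns a list here because the two groups have different lengths):
fw.union(data)
# [[1]]
# [1] "a" "b" "c" "d" "n"
#
# [[2]]
# [1] "g" "h" "k" "i"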
Running benchmarks would be interesting, though. Preliminary tests suggest that my implementation is about 2-3 times faster than Vlo's.
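A minimal sketch of such a comparison, assuming fw.union() from this answer and f.union() from Vlo's answer are both defined in the session:
library(microbenchmark)
microbenchmark(fw.union(data), f.union(data), times = 100L)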

Related

R Loop all vectors in a list between each other

I'm still a little green with R, so bear with me.
I have a list of vectors and I would like to compare each vector in the list and then tack any matching vector onto the end of the one it matches. I am looking for a robust, repeatable solution, regardless of the number of vectors in the list.
So if I have a list (lst) made of vectors:
lst <- list(c("a", "b"), c("b", "c"), c("e", "f"), c("c", "g"))
I want to get a list of vectors like this as a result:
[[1]]
[1] "a" "b" "c" "g"
[[2]]
[1] "e" "f"
So I've been able to make this work for a singular instance:
if (any(lst[[1]] %in% lst[[2]])) {
  c(lst[[1]], lst[[2]])
}
but now I'm trying to loop it over the entire list and this is what I have so far, but I'm a little stuck:
endmembers <- lapply(seq_along(lst), function(i, j){
  x <- lst[[i]]
  x2 <- lst[[j]]
  if (any(x %in% x2)) {
    c(x, x2)
  }
})
I would use a recursive function to stick all the components together, then remove list items that are contained within other items:
#### helper functions ----
# Recursive function to stick list items together
fun <- function(x, d) {
  i <- which(sapply(d, function(y) y[1]) == tail(x, 1))
  if (length(i) > 0) {
    y <- d[[i[1]]]
    x <- c(x, y[2:length(y)])
    x <- fun(x, d)
  }
  x
}
# is a vector inside another vector? - must be in the same sequence and order
inside <- function(x, y) {
  if (isTRUE(all.equal(x, y)))
    return(FALSE)
  if (length(x) > length(y))
    return(FALSE)
  if (!any(x %in% y))
    return(FALSE)
  !is.unsorted(sapply(x, function(a, b) which(a == b), b = y), strictly = TRUE)
}
#### analysis ----
# Stick vectors together if last == first
d <- lapply(lst, fun, d = lst)
# remove list items that are inside other list items - there might be a more
# elegant solution to this, I'm confused by it.
d[!apply(
  sapply(d,
         function(x, y) sapply(y, function(x, y) inside(x, y), y = x),
         y = d),
  1,
  any)]
An easy option is using igraph
library(igraph)
u <- cluster_infomap(graph_from_data_frame(as.data.frame(do.call(rbind,lst))))
out <- split(u$names,u$membership)
which gives
> out
$`1`
[1] "a" "b" "c" "g"
$`2`
[1] "e" "f"
If you want a base R solution with for loops, here is one version
out <- lst[1]
for (v in lst) {
  flag <- 1
  for (k in 1:length(out)) {
    if (any(v %in% out[[k]])) {
      out[[k]] <- union(out[[k]], v)
      flag <- 0
      break
    }
  }
  if (flag) out[[length(out) + 1]] <- v
}
such that
> out
[[1]]
[1] "a" "b" "c" "g"
[[2]]
[1] "e" "f"
In case anyone wants to know what I did: I followed the code from "merging sets which have even one element in common R" that was posted in the comments.
library(igraph)
# adjacency matrix: TRUE when two vectors share at least one element
m <- sapply(lst, function(x) sapply(lst, function(y) (any(x %in% y))))
# determine the groups of the graph constructed from m
groups <- groups(components(graph_from_adjacency_matrix(m)))
# get the unique elements of each group
endmembers <- lapply(groups, function(x) sort(unique(unlist(lst[x]))))

R: replace a value of a vector with multiple values

Let's say I have a vector a with length(a) = l and l >= 1.
The element "x" occurs at least once in a, but we don't know its exact position.
I want to replace every "x" in a with the values c(1,2,3).
For example: if a = c("y","x","z"), then I want the result after the replacement to be a = c("y",1,2,3,"z").
I thought of doing it this way:
l <- length(a)
pos.x <- which(a == "x")
if (l == 1L & pos.x == 1L) {
  a <- c(1,2,3)
} else if (l > 1L & pos.x == 1) {
  a <- c(1,2,3, a[-1])
} else if (l > 1L & pos.x == l) {
  a <- c(a[-l], 1,2,3)
} else if (l >= 3 & pos.x != 1 & pos.x != l) {
  a <- c(a[1:(pos.x - 1)], 1,2,3, a[(pos.x + 1):l])
}
While this code does work, my question is whether there is a more 'elegant' way to solve this problem, one that needs less processing power and that could replace more than one "x".
Thank you!
Here's a simple vectorized solution with base R -
a <- c("y","x","z","y","x","z") # vector to search
b <- 1:3 # replacement values
a <- rep(a, 1 + (length(b) - 1)*(a == "x")) # repeat only "x" length(b) times
a[a == "x"] <- b # replace "x" with replacement values i.e. b
[1] "y" "1" "2" "3" "z" "y" "1" "2" "3" "z"
Here is an option using a for loop
a <- c("y","x","z","y","x","z")
b <- c(1,2,3)
The 'trick' is to create a list first, then replace all "x" with b and finally call unlist.
a_list <- as.list(a)
for(i in which(a_list == "x")) {
a_list[[i]] <- b
}
Result
unlist(a_list)
#[1] "y" "1" "2" "3" "z" "y" "1" "2" "3" "z"
Please consider @Shree's answer!
Here is why:
n <- 1e6
set.seed(1)
a <- sample(c("x", "y", "z"), size = n, replace = TRUE)
b <- 1:3
library(microbenchmark)
benchmark <- microbenchmark(
markus = markus(a, b),
IceCreamToucan = IceCreamToucan(a, b),
Shree = Shree(a, b)
)
library(ggplot2) # autoplot() needs ggplot2
autoplot(benchmark)
# Unit: milliseconds
#           expr       min        lq     mean    median       uq       max neval
#         markus 403.38464 467.03277 615.8078 556.74067 754.5117 1095.7035   100
# IceCreamToucan 401.34614 462.92680 602.1556 526.08280 687.8436 1422.0629   100
#          Shree  52.33867  65.32323 157.6680  97.34066 162.0638  650.2571   100
functions
markus <- function(a, b) {
  a_list <- as.list(a)
  for (i in which(a_list == "x")) {
    a_list[[i]] <- b
  }
  unlist(a_list)
}
Shree <- function(a, b) {
  a <- rep(a, 1 + (length(b) - 1)*(a == "x"))
  a[a == "x"] <- b
  a
}
# from the comments
IceCreamToucan <- function(a, b) {
  a_list <- as.list(a)
  w <- which(a_list == "x")
  a_list[w] <- rep(list(b), length(w)) # changed your answer slightly here
  unlist(a_list)
}

Comparing strings in multiple R data frames to retrieve the first characters that are in agreement

I have the following data frames in R.
df1<-as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("ACTG","ACT","GTAC","GTC","GACT")))
df2<-as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("AC","ATC","GTCA","GC","GAC")))
I am trying to determine the longest possible string that is consistent between the two Nucs columns.
So far, I have tried this:
x1 <- strsplit(as.character(df1$Nucs),"")
x2 <- strsplit(as.character(df2$Nucs),"")
x <- Map(intersect, x1, x2)
sapply( x, paste0, collapse="")
This gives me the following:
[1] "AC" "ACT" "GTAC" "GC" "GAC"
which is not quite what I want, because in the case of Site 3 I have GTAC and GTCA, so I only want the first characters that are consistent between the strings, i.e. GT.
Does anybody have any ideas on how I can go about this?
I also found a solution that you can try:
CompareVectors <- function(x, y){
  comp_length <- min(length(y), length(x))
  x <- x[1:comp_length]
  y <- y[1:comp_length]
  compare <- x == y
  id <- which(compare == FALSE)[1]
  if (!is.na(id)) {
    x <- x[which(compare[1:(id - 1)])]
  }
  return(paste(x, collapse = ""))
}
OUTPUT:
sapply(1 : length(x1), function(i) CompareVectors(x1[[i]], x2[[i]]))
[1] "AC" "A" "GT" "G" "GAC"
So here's my not very efficient solution:
df1 <- as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("ACTG","ACT","GTAC","GTC","GACT")))
df2 <- as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("AC","ATC","GTCA","GC","GAC")))
x1 <- strsplit(as.character(df1$Nucs),"")
x2 <- strsplit(as.character(df2$Nucs),"")
for (i in 1:nrow(df1)) {
  a <- ""
  for (j in 1:min(length(x1[[i]]), length(x2[[i]]))) {
    a <- paste(a, x1[[i]][j] == x2[[i]][j], sep=",")
  }
  print(head(x1[[i]], sum(as.logical(strsplit(a, ",")[[1]][-1]))))
}
Output:
[1] "A" "C"
[1] "A"
[1] "G" "T"
[1] "G"
[1] "G" "A" "C"
Would you like me to comment the code?
You can try this, although a bit lengthy:
sapply(1:nrow(df1), function(x) {
  s1 <- unlist(strsplit(as.character(df1$Nucs[x]), split = ''))
  s2 <- unlist(strsplit(as.character(df2$Nucs[x]), split = ''))
  n <- min(length(s1), length(s2))
  i <- 1
  while (i <= n) {
    if (s1[i] != s2[i]) {
      break
    }
    i <- i + 1
  }
  if (i > 1)
    paste(s1[1:(i-1)], collapse = '')
  else
    ''
})
# [1] "AC" "A" "GT" "G" "GAC"
Here's another solution. It may not be covering all possible cases but that's probably easy to extend.
df1<-as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("ACTG","ACT","GTAC","GTC","GACT")))
df2<-as.data.frame(cbind(Site=c(1,2,3,4,5),Nucs=c("AC","ATC","GTCA","GC","GAC")))
mapply(x = as.list(df1$Nucs), y = as.list(df2$Nucs), FUN = function(x, y) {
  x <- as.character(x); y <- as.character(y) # doesn't work with factors
  # To keep everything in one easy-to-debug chunk, just switch in case
  # x is shorter than y.
  if (!(nchar(x) >= nchar(y))) {
    xp <- y
    yp <- x
  } else {
    xp <- x
    yp <- y
  }
  # create elements to work on and a vector for storage
  to.glue <- strsplit(xp, "")[[1]]
  out <- rep(NA, times = length(to.glue)) # used as output
  # If one string is shorter than the other, extract one element
  # at a time and see if there's a match in y. If yes, then proceed
  # to the second element, concatenate it with the first one and
  # see if this pattern is present anywhere in y...
  for (i in 1:length(to.glue)) {
    glued <- paste(to.glue[1:i], collapse = "")
    fm <- pmatch(x = glued, table = yp)
    if (is.na(fm)) {
      return(out[i-1])
    } else {
      out[i] <- glued
    }
  }
})
[1] "AC" "A" "GT" "G" "GAC"

is there a way to extend LETTERS past 26 characters e.g., AA, AB, AC...?

I use LETTERS most of the time for my factors but today I tried to go beyond 26 characters:
LETTERS[1:32]
I expected an automatic recursive factorization (AA, AB, AC, ...), but was disappointed. Is this simply a limitation of LETTERS, or is there a way to get what I'm looking for using another function?
Would 702 be enough?
LETTERS702 <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
If not, how about 18,278?
MOAR_LETTERS <- function(n = 2) {
  n <- as.integer(n[1L])
  if (!is.finite(n) || n < 2)
    stop("'n' must be a length-1 integer >= 2")
  res <- vector("list", n)
  res[[1]] <- LETTERS
  for (i in 2:n)
    res[[i]] <- c(sapply(res[[i-1L]], function(y) paste0(y, LETTERS)))
  unlist(res)
}
ml <- MOAR_LETTERS(3)
str(ml)
# chr [1:18278] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" ...
This solution uses recursion. Usage is a bit different in the sense that MORELETTERS is not a long vector you have to store and possibly expand as your inputs get larger. Instead, it is a function that converts your numbers into the new base.
extend <- function(alphabet) function(i) {
  base10toA <- function(n, A) {
    stopifnot(n >= 0L)
    N <- length(A)
    j <- n %/% N
    if (j == 0L) A[n + 1L] else paste0(Recall(j - 1L, A), A[n %% N + 1L])
  }
  vapply(i - 1L, base10toA, character(1L), alphabet)
}
MORELETTERS <- extend(LETTERS)
MORELETTERS(1:1000)
# [1] "A" "B" ... "ALL"
MORELETTERS(c(1, 26, 27, 1000, 1e6, .Machine$integer.max))
# [1] "A" "Z" "AA" "ALL" "BDWGN" "FXSHRXW"
You can make what you want like this:
LETTERS2<-c(LETTERS[1:26], paste0("A",LETTERS[1:26]))
Another solution for excel style column names, generalized to any number of letters
#' Excel Style Column Names
#'
#' @param n maximum number of letters in column name
excel_style_colnames <- function(n){
  unlist(Reduce(
    function(x, y) as.vector(outer(x, y, 'paste0')),
    lapply(1:n, function(x) LETTERS),
    accumulate = TRUE
  ))
}
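A quick usage check (my addition):
x <- excel_style_colnames(2)
length(x)
# [1] 702
head(x)
# [1] "A" "B" "C" "D" "E" "F"
tail(x)
# [1] "UZ" "VZ" "WZ" "XZ" "YZ" "ZZ"
Note the two-letter block comes out in outer()'s column-major order rather than alphabetical order.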
A variant on eipi10's method (ordered correctly) using data.table:
library(data.table)
BIG_LETTERS <- c(LETTERS,
                 do.call("paste0", CJ(LETTERS, LETTERS)),
                 do.call("paste0", CJ(LETTERS, LETTERS, LETTERS)))
Yet another option:
l2 = c(LETTERS, sort(do.call("paste0", expand.grid(LETTERS, LETTERS[1:3]))))
Adjust the two instances of LETTERS inside expand.grid to get the number of letter pairs you'd like.
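For example, making both instances the full LETTERS gives the complete set of 702 one- and two-letter names (l2_full is just an illustrative name):
l2_full = c(LETTERS, sort(do.call("paste0", expand.grid(LETTERS, LETTERS))))
length(l2_full)
# [1] 702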
A function to produce Excel-style column names, i.e.
# A, B, ..., Z, AA, AB, ..., AZ, BA, BB, ..., ..., ZZ, AAA, ...
letterwrap <- function(n, depth = 1) {
  args <- lapply(1:depth, FUN = function(x) return(LETTERS))
  x <- do.call(expand.grid, args = list(args, stringsAsFactors = FALSE))
  x <- x[, rev(names(x)), drop = FALSE]
  x <- do.call(paste0, x)
  if (n <= length(x)) return(x[1:n])
  return(c(x, letterwrap(n - length(x), depth = depth + 1)))
}
letterwrap(26^2 + 52) # through AAZ
## This will take a few seconds:
# x <- letterwrap(1e6)
It's probably not the fastest, but it extends indefinitely and is nicely predictable. Took about 20 seconds to produce through 1 million, BDWGN.
(For a few more details, see here: https://stackoverflow.com/a/21689613/903061)
A little late to the party, but I want to play too.
You can also use sub, and sprintf in place of paste0 and get a length 702 vector.
c(LETTERS, sapply(LETTERS, sub, pattern = " ", x = sprintf("%2s", LETTERS)))
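To unpack the trick (my annotation): sprintf("%2s", ...) pads each letter to width two with a leading space, and sub() then replaces that space with the outer letter:
sprintf("%2s", "B")
# [1] " B"
sub(pattern = " ", replacement = "A", x = " B")
# [1] "AB"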
Here's another addition to the list. This seems a bit faster than Gregor's (comparison done on my computer - using length.out = 1e6 his took 12.88 seconds, mine was 6.2), and can also be extended indefinitely. The flip side is that it's 2 functions, not just 1.
make.chars <- function(length.out, case, n.char = NULL) {
  if (is.null(n.char))
    n.char <- ceiling(log(length.out, 26))
  m <- sapply(n.char:1, function(x) {
    rep(rep(1:26, each = 26^(x-1)), length.out = length.out)
  })
  m.char <- switch(case,
                   'lower' = letters[m],
                   'upper' = LETTERS[m]
  )
  dim(m.char) <- dim(m)
  apply(m.char, 1, function(x) paste(x, collapse = ""))
}
get.letters <- function(length.out, case = 'upper'){
  max.char <- ceiling(log(length.out, 26))
  grp <- rep(1:max.char, 26^(1:max.char))[1:length.out]
  unlist(lapply(unique(grp), function(n) make.chars(length(grp[grp == n]), case = case, n.char = n)))
}
##
make.chars(5, "lower", 2)
#> [1] "AA" "AB" "AC" "AD" "AE"
make.chars(5, "lower")
#> [1] "A" "B" "C" "D" "E"
make.chars(5, "upper", 4)
#> [1] "AAAA" "AAAB" "AAAC" "AAAD" "AAAE"
tmp <- get.letters(800)
head(tmp)
#> [1] "A" "B" "C" "D" "E" "F"
tail(tmp)
#> [1] "ADO" "ADP" "ADQ" "ADR" "ADS" "ADT"
Created on 2019-03-22 by the reprex package (v0.2.1)

Find disjoint groups (Connected Components) in a set of vectors [duplicate]

Given a list:
foo <- list(c("a", "b", "d"), c("c", "b"), c("c"),
c("b", "d"), c("e", "f"), c("e", "g"))
what is an efficient way to get a list that contains the disjoint sets of its content?
Here I want to obtain:
[[1]]
[1] "a" "b" "c" "d"
[[2]]
[1] "e" "f" "g"
The solutions I have managed to come up with seemed overly complicated and slow (I'm working with a largish list (4000+ elements) whose elements can contain up to hundreds of values).
Thanks!
Solutions benchmarking
Thank you all for your input. The igraph approach is really nice. I did some benchmarking of the proposed solutions, and using igraph with @flodel's suggestion is the most efficient. The example here (iGrp) has 3170 elements.
> microbenchmark(igraph_method(iGrp), igraph_method2(iGrp), iterative_method(iGrp), times=10L)
## Unit: milliseconds
## expr min lq median uq max neval
## igraph_method(iGrp) 6892.8534 7140.0287 7229.5569 7396.2458 8044.9796 10
## igraph_method2(iGrp) 381.4555 391.2097 442.3282 472.5641 537.4885 10
## iterative_method(iGrp) 7118.7857 7272.9568 7595.9700 7675.2888 8485.4388 10
#### functions used
igraph_method <- function(lst) {
  edg <- do.call("rbind", lapply(lst, function(x) {
    if (length(x) > 1) t(combn(x, 2)) else NULL
  }))
  g <- graph.data.frame(edg)
  split(V(g)$name, clusters(g)$membership)
}
igraph_method2 <- function(lst) {
  edg <- do.call("rbind", lapply(lst, function(x) {
    if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
  }))
  g <- graph.data.frame(edg)
  split(V(g)$name, clusters(g)$membership)
}
iterative_method <- function(lst) {
  Reduce(function(l, x) {
    matches <- sapply(l, function(i) any(x %in% i))
    if (any(matches)) {
      combined <- unique(c(unlist(l[matches]), x))
      l[matches] <- NULL        # Delete old entries
      l <- c(l, list(combined)) # Add combined entries
    } else {
      l <- c(l, list(x))        # New list entry
    }
    l
  }, lst, init = list())
}
One way to approach this sort of problem is to build a graph where the nodes are the values in your list and edges connect values that have appeared together. Then you're just asking for the connected components of that graph. The igraph package in R makes this pretty easy. First, you'll want to build a data frame with the edges:
edges <- do.call(rbind, lapply(foo, function(x) {
  if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
}))
edges
# [,1] [,2]
# [1,] "a" "b"
# [2,] "b" "d"
# [3,] "c" "b"
# [4,] "b" "d"
# [5,] "e" "f"
# [6,] "e" "g"
Then, you can build your graph from the edges and compute the connected components:
library(igraph)
g <- graph.data.frame(edges, directed=FALSE)
split(V(g)$name, clusters(g)$membership)
# $`1`
# [1] "a" "b" "c" "d"
#
# $`2`
# [1] "e" "f" "g"
For reasonably large problems, this approach seems to be modestly faster than an iterative approach:
values = as.character(1:2000)
set.seed(144)
foo <- lapply(1:4000, function(x) sample(values, rbinom(1, 10, .5)))
library(microbenchmark)
microbenchmark(josilber(foo), lundberg(foo))
# Unit: milliseconds
#          expr      min       lq   median       uq       max neval
# josilber(foo) 251.8007 281.0168 297.2446 314.6714  635.7916   100
# lundberg(foo) 640.0575 714.9658 761.3777 827.5415 1118.3517   100
Here is an iterative approach, building a list for the result, and combining elements as they are seen together:
Reduce(function(l, x) {
  matches <- sapply(l, function(i) any(x %in% i))
  if (any(matches)) {
    combined <- unique(c(unlist(l[matches]), x))
    l[matches] <- NULL        # Delete old entries
    l <- c(l, list(combined)) # Add combined entries
  } else {
    l <- c(l, list(x))        # New list entry
  }
  l
}, foo, init = list())
## [[1]]
## [1] "a" "b" "d" "c"
##
## [[2]]
## [1] "e" "f" "g"
