Extract single linkage clusters from very large pairs list - r

I have a very large pairs list that I need to break down into single linkage communities. So far I have been able to do this entirely in R just fine. But I need to prepare for the eventuality that the entire list may be too large to hold in memory, or for igraph's R implementation to handle. A very simple version of this task looks like:
library(igraph)
df <- data.frame("p1" = c("a", "a", "d", "d"),
"p2" = c("b", "c", "e", "f"),
"val" = c(0.5, 0.75, 0.25, 0.35))
g <- graph_from_data_frame(d = df,
directed = FALSE)
sg <- groups(components(g))
sg <- sapply(sg,
function(x) induced_subgraph(graph = g,
vids = x),
USE.NAMES = FALSE,
simplify = FALSE)
if df is incredibly large - on the scale of hundreds of millions, to tens of billions of rows, is there a way for me to extract individual positions of sg without having to build g in it's entirety? It's relatively easy for me to store representations of df outside of R either as a compressed txt file or as a sqlite database.

To adress the problem with igraph's R implementation (assuming the dataset is still holdable in RAM, otherwise see #Paul Brodersen's answer):
The solution below works by specifying one element of the graph and then going over all connections until no further edges are found. It therefore creates the subgraph without building the whole graph. It looks a bit hacky compared to a recursive function but scales better.
library(igraph)
reduce_graph <- function(df, element) {
stop = F
elements_to_inspect <- element
rows_graph <-0
while(stop ==F) {
graph_parts <- df[df$p1 %in% elements_to_inspect |
df$p2 %in% elements_to_inspect,]
elements_to_inspect <- unique(c(unique(graph_parts$p1),
unique(graph_parts$p2)))
if(dim(graph_parts)[1] == rows_graph) {
stop <-TRUE
} else {
rows_graph <- dim(graph_parts)[1]
}
}
return(graph_parts)
}
df <- data.frame("p1" = c("a", "a", "d", "d","o"),
"p2" = c("b", "c", "e", "f","u"),
"val" = c(100, 0.75, 0.25, 0.35,1))
small_graph <- reduce_graph(df, "f")
g <- graph_from_data_frame(d = small_graph,
directed = FALSE)
sg <- groups(components(g))
sg <- sapply(sg,
function(x) induced_subgraph(graph = g,
vids = x),
USE.NAMES = FALSE,
simplify = FALSE)
One can test the speed on a bigger dataset.
##larger dataset with lots of sparse graphs.
set.seed(100)
p1 <- as.character(sample(1:10000000, 1000000, replace=T))
p2 <- as.character(sample(1:10000000, 1000000, replace=T))
val <- rep(1, 1000000)
df <- data.frame("p1" = p1,
"p2" = p2,
"val" = val)
small_graph <- reduce_graph(df, "9420672") #has 3 pairwise connections
g <- graph_from_data_frame(d = small_graph,
directed = FALSE)
sg <- groups(components(g))
sg <- sapply(sg,
function(x) induced_subgraph(graph = g,
vids = x),
USE.NAMES = FALSE,
simplify = FALSE)
Building groups and subgraph takes one second, compared to multiple minutes for the whole graph on my machine. This of course depends on how sparsely connected the graphs are.

Related

Automatically categorize and add annotations using pheatmap in R

I have a dataframe made by the school grades of some students in different subjects. The students are also characterized by their gender (F or M), that is included as a suffix in their names (e.g. Anne_F, Albert_M, etc...)
With these data I have created an heatmap with the package pheatmap(), in this way:
library(pheatmap)
Anne_F <- c(9,7,6,10,6)
Carl_M <- c(6,7,9,5,7)
Albert_M <- c(8,8,8,7,9)
Kate_F <- c(10,5,10,9,5)
Emma_F <- c(6,8,10,8,7)
matrix <- cbind(Anne_F, Carl_M, Albert_M, Kate_F, Emma_F)
rownames(matrix) <- c("Math", "Literature", "Arts", "Science", "Music")
print(matrix)
heatmap <- pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
)
heatmap
Which gives this matrix
and the relative plot:
Now I would like to automatically recognize if a student is Male or Female and add this as a column annotation in the heatmap, in order to have a graph like this:
I have thought to create two vectors, one with the name of the students:
name <- c("Anne", "Carl", "Albert", "Kate", "Emma") and one with the respective genders: gender <- c("F", "M", "M", "F", "F") , but I can't figure out how to associate names with genders, and to show them on the heatmap.
I don't mean to manually associate one-name to one-gender (as Anne to F, Albert to M, etc,). I need to take the entire vector of names and associate it with the corresponding vector of genders (and then annotate them on the heatmap), because their number will increase in the future.
Many thanks in advance for your help.
You need to use annotation_col option in pheatmap.
library(pheatmap)
# split matrix into "Name" and "Gender"
name_gender_matrix <- str_split_fixed(colnames(matrix), "_", 2)
# Data that maps to the heatmap should be set at the rownames
annot_col <- data.frame(row.names = name_gender_matrix[, 1], Gender = name_gender_matrix[, 2])
# Align the column name of your matrix and with the annotation
colnames(matrix) <- rownames(annot_col)
heatmap <- pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
annotation_col = annot_col
)
With the given data, you could achieve your desired output like this:
Gender <- sapply(colnames(matrix), function(x) strsplit(x, "_")[[1]][2])
df <- as.data.frame(Gender)
pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
annotation_col = df,
annotation_colors = list(Gender = c(M = "#6ef88a", F = "#d357fe"))
)

ggarrange generates an empty pdf file

I am dealing with a function that takes a big data frame (36 rows and 194 columns) which performs a Principal Component Analysis and then generates a list of plots where I have the combination of 26 Principal Components which are 325 in total, using 'expand.grid'.
My problem is that when I am using ggarrange(), from ggpubr, to merge all the plots in only one pdf file, this file is empty.
My code:
a = 26
row.pairs = 325
PC.Graph <- function(df, col1, col2, tag, id){
df1 <- df[,-c(col1:col2)]
pca <- prcomp(df1, scale. = T)
pc.summ <- summary(pca)
a <- sum(pc.summ$importance[3,] < 0.975)
b <- c(1:a)
pc.grid <- expand.grid(b, b)
pc.pairs <- pc.grid[pc.grid$Var1 < pc.grid$Var2,]
row.pairs <- nrow(pc.pairs)
components <- c(1:row.pairs)
S.apply.FUN <- function(x){
c <- sapply(pc.pairs, "[", x, simplify = F)
pcx <- c$Var1
pcy <- c$Var2
df2 <- df
row.names(df2) <- df[, tag]
name = paste("PCA_", pcx, "_vs_", pcy)
autoplot(pca, data = df2, colour = id, label = T, label.repel = T, main = name,
x = pcx, y = pcy)
}
all.plots <- Map(S.apply.FUN, components)
pdf(file = "All_PC.pdf", width = 50, height = 70)
print(ggarrange(all.plots))
dev.off()
}
PC.Graph(Final_DF, col1 = 1, col2 = 5, tag = "Sample", id = "Maturation")
You would have to pass a plotlist to ggarrange, but I am not sure you would get any useful plot out of that plot area in the PDF file, so I would advise you to split the plotlist into chunks (e.g. of 20) and plot these to multiple pages.
Specifically, I would export all.plots from your PC.Graph function (and remove the code to write to PDF there).
I would also change the expand.grid(b, b) to t(combn(b, 2)), since you don't need to plot the PC combinations twice.
Then I would do something like this:
# export the full list of plots
plots <- PC.Graph(Final_DF, col1 = 1, col2 = 5, tag = "Sample", id = "Maturation")
# split the plotlist
splitPlots <- split(plots, ceiling(seq_along(plots)/20))
plotPlots <- function(x){
out <- cowplot::plot_grid(plotlist = x, ncol = 5, nrow = 4)
plot(out)
}
pdf(file = "All_PC.pdf", width = 50, height = 45)
lapply(splitPlots, plotPlots)
dev.off()

Perform an operation with complete cases without changing the original vectors

I would like to calculate a rank-biserial correlation. But the (only it seems) package can't handle missing values that well. It has no built in "na.omit = TRUE" function. I could remove the missings in the data frame, but that would be a hustle with many different calculations.
n <- 500
df <- data.frame(id = seq (1:n),
ord = sample(c(0:3), n, rep = TRUE),
sex = sample(c("m", "f"), n, rep = TRUE, prob = c(0.55, 0.45))
)
df <- as.data.frame(apply (df, 2, function(x) {x[sample( c(1:n), floor(n/10))] <- NA; x} ))
library(rcompanion)
wilcoxonRG(x = df$ord, g = df$sex, verbose = T)
I imagine something stupidly easy like "complete.cases(wilcoxonRG(x = df$ord, g = df$sex, verbose = T)). It's probably not that hard but I could only find comeplete data frame manipulations. Thanks in advance!

How to calculate a maximum-bottleneck path with igraph?

Given a capacity network with a single source and a single sink, how can I calculate the maximum-bottleneck path (also known as the widest path or maximum capacity path problem) using igraph?
I've read (e.g. here or even with pseudocode there) that it is possible with some modifications to Dijkstra’s algorithm, but I do not want to dive into algortihm development but use igraph instead.
Example
library(igraph)
set.seed(21)
nodes = cbind(
'id' = c('Fermenters', 'Methanogens', 'carbs', 'CO2', 'H2', 'other', 'CH4', 'H2O')
)
from <- c('carbs', rep('Fermenters', 3), rep('Methanogens', 2), 'CO2', 'H2')
to <- c('Fermenters', 'other', 'CO2', 'H2', 'CH4', 'H2O', rep('Methanogens', 2))
weight <- sample(1 : 20, 8)
links <- data.frame(from, to, weight, stringsAsFactors = FALSE)
net = graph_from_data_frame(links, vertices = nodes, directed = T)
## Calculate max-bottleneck here !
# # disabled because just vis
# plot(net, edge.width = E(net)$weight)
# require(networkD3)
# require(tidyverse)
#
# d3net <- igraph_to_networkD3(net, group = rep(1, 8))
# forceNetwork(
# Links = mutate(d3net$links, weight = E(net)$weight), Nodes = d3net$nodes,
# Source = 'source', Target = 'target',
# NodeID = 'name', Group = "group", Value = "weight",
# arrows = TRUE, opacity = 1, opacityNoHover = 1
# )
So with respect to the example, how would I calculate the maximum capacity path from carbs to H2O?
I don't know how efficient this would be, but you could use igraph to find all "simple" paths, then calculate the minimum edge weight of each, then choose the max...
require(tibble)
require(igraph)
nodes = data_frame('id' = c('A', "B", "C", "D"))
links = tribble(
~from, ~to, ~weight,
"A" , "B", 10,
"B", "C", 10,
"C", "D", 6,
"A", "D", 4,
)
net = graph_from_data_frame(links, vertices = nodes, directed = T)
simple_paths <- all_simple_paths(net, "A", "D")
simple_paths[which.max(
sapply(simple_paths, function(path) {
min(E(net, path = path)$weight)
})
)]
# [[1]]
# + 4/4 vertices, named, from 061ab8d:
# [1] A B C D
You could try the same idea as in IGRAPH IN R: Find the path between vertices that maximizes the product of edge attributes. Invert the weights, divide by the total to keep the weights < 1 (to keep the log-weights positive), and take the min:
x<-shortest_paths(net,3,8, weights=-log(E(net)$weight/sum(E(net)$weight)), output="epath")[[2]]
E(net)[x[[1]]]
min(E(net)$weight[x[[1]]])
which gives
+ 4/8 edges from 57589bc (vertex names):
[1] carbs ->Fermenters Fermenters ->H2 H2 ->Methanogens Methanogens->H2O
[1] 10

How to color branches in R dendogram as a function of the classes in it?

I wish to visualize how well a clustering algorithm is doing (with certain distance metric). I have samples and their corresponding classes.
To visualize, I cluster and I wish to color the branches of a dendrogram by the items in the cluster. The color will be the color most items in the hierarchical cluster correspond to (given by the data\classes).
Example: If my clustering algorithm chose indexes 1,21,24 to be a certain cluster (at a certain level) and I have a csv file containing a class number in each row corresponding to lets say 1,2,1. I want this edge to be coloured 1.
Example Code:
require(cluster)
suppressPackageStartupMessages(library(dendextend))
dir <- 'distance_metrics/'
filename <- 'aligned.csv'
my.data <- read.csv(paste(dir, filename, sep=""), header = T, row.names = 1)
my.dist <- as.dist(my.data)
real.clusters <-read.csv("clusters", header = T, row.names = 1)
clustered <- diana(my.dist)
# dend <- colour_branches(???dend, max(real.clusters)???)
plot(dend)
EDIT:
another example partial code
dir <- 'distance_metrics/' # csv in here contains a symmetric matrix
clust.dir <- "clusters/" #csv in here contains a column vector with classes
my.data <- read.csv(paste(dir, filename, sep=""), header = T, row.names = 1)
filename <- 'table.csv'
my.dist <- as.dist(my.data)
real.clusters <-read.csv(paste(clust.dir, filename, sep=""), header = T, row.names = 1)
clustered <- diana(my.dist)
dnd <- as.dendrogram(clustered)
Both node and edge color attributes can be set recursively on "dendrogram" objects (which are just deeply nested lists) using dendrapply. The cluster package also features an as.dendrogram method for "diana" class objects, so conversion between the object types is seamless. Using your diana clustering and borrowing some code from #Edvardoss iris example, you can create the colored dendrogram as follows:
library(cluster)
set.seed(999)
iris2 <- iris[sample(x = 1:150,size = 50,replace = F),]
clust <- diana(iris2)
dnd <- as.dendrogram(clust)
## Duplicate rownames aren't allowed, so we need to set the "labels"
## attributes recursively. We also label inner nodes here.
rectify_labels <- function(node, df){
newlab <- df$Species[unlist(node, use.names = FALSE)]
attr(node, "label") <- (newlab)
return(node)
}
dnd <- dendrapply(dnd, rectify_labels, df = iris2)
## Create a color palette as a data.frame with one row for each spp
uniqspp <- as.character(unique(iris$Species))
colormap <- data.frame(Species = uniqspp, color = rainbow(n = length(uniqspp)))
colormap[, 2] <- c("red", "blue", "green")
colormap
## Now color the inner dendrogram edges
color_dendro <- function(node, colormap){
if(is.leaf(node)){
nodecol <- colormap$color[match(attr(node, "label"), colormap$Species)]
attr(node, "nodePar") <- list(pch = NA, lab.col = nodecol)
attr(node, "edgePar") <- list(col = nodecol)
}else{
spp <- attr(node, "label")
dominantspp <- levels(spp)[which.max(tabulate(spp))]
edgecol <- colormap$color[match(dominantspp, colormap$Species)]
attr(node, "edgePar") <- list(col = edgecol)
}
return(node)
}
dnd <- dendrapply(dnd, color_dendro, colormap = colormap)
## Plot the dendrogram
plot(dnd)
The function you are looking for is color_brances from the dendextend R package, using the arguments clusters and col. Here is an example (based on Shaun Wilkinson's example):
library(cluster)
set.seed(999)
iris2 <- iris[sample(x = 1:150,size = 50,replace = F),]
clust <- diana(iris2)
dend <- as.dendrogram(clust)
temp_col <- c("red", "blue", "green")[as.numeric(iris2$Species)]
temp_col <- temp_col[order.dendrogram(dend)]
temp_col <- factor(temp_col, unique(temp_col))
library(dendextend)
dend %>% color_branches(clusters = as.numeric(temp_col), col = levels(temp_col)) %>%
set("labels_colors", as.character(temp_col)) %>%
plot
there are suspicions that misunderstood the question however I'll try to answer:
from my previous objectives were rewritten by the example of iris
clrs <- rainbow(n = 3) # create palette
clrs <- clrs[iris$Species] # assign colors
plot(x = iris$Sepal.Length,y = iris$Sepal.Width,col=clrs) # simple test colors
# cluster
dt <- cbind(iris,clrs)
dt <- dt[sample(x = 1:150,size = 50,replace = F),] # create short dataset for visualization convenience
empty.labl <- gsub("."," ",dt$Species) # create a space vector with length of names intended for reserve place to future text labels
dst <- dist(x = scale(dt[,1:4]),method = "manhattan")
hcl <- hclust(d = dst,method = "complete")
plot(hcl,hang=-1,cex=1,labels = empty.labl, xlab = NA,sub=NA)
dt <- dt[hcl$order,] # sort rows for order objects in dendrogramm
text(x = seq(nrow(dt)), y=-.5,labels = dt$Species,srt=90,cex=.8,xpd=NA,adj=c(1,0.7),col=as.character(dt$clrs))

Resources