How to generate the N most dissimilar combinations in R - r

I have a set of 6 colour codes (x), a set of N individuals that each need to be labelled with a unique colour code, and four locations on each animal, each of which can carry a different colour. I have 6 different colors.
So, the codes for two individual might be;
1. red, blue, blue,white
2. white,yellow,pink,yellow
However, as the colour at each position can fall off, I would like to generate a redundant labelling scheme, that would allow still allow an individual to be distinguished from others, even after it loses the colour at one (or even two?) locations.
Even though 6 colours and 4 positions gives 1296 combinations, I am finding it difficult to select the N most dissimilar combinations:
Reproducible example:
library(gtools)
x <- c("white", "red", "green", "blue", "pink", "yellow")
Perms <- permutations(n=6,r=4,v=x,repeats.allowed=T)
print(nrow(Perms))
head(Perms)
Note that the first 6 combinations differ in the colour at only 1 position - loss of this code by >1 individual would mean that they can no longer be distinguished!
So, for values of N between 50-150, how to select the N most dissimilar combinations?
Thanks !

I cannot conclusively answer your question, but I have an idea that might help you.
Build string codes with the first letter of each color:
library(gtools)
x <- c("w", "r", "g", "b", "p", "y")
Perms <- permutations(n=6,r=4,v=x,repeats.allowed=T)
m <- apply(Perms, 1, paste, collapse = "")
> head(m)
[1] "bbbb" "bbbg" "bbbp" "bbbr" "bbbw" "bbby"
Sample n codes:
set.seed(1)
n <- 50
y <- sample(m, n)
Create a n*n matrix of Levenshtein distances:
library(vwr)
lvmat <- sapply(y, function(x) levenshtein.distance(x, y))
> lvmat[1:5, 1:5]
grrp pgpg rprr yprw gggp
grrp 0 4 3 3 2
pgpg 4 0 4 4 3
rprr 3 4 0 2 4
yprw 3 4 2 0 4
gggp 2 3 4 4 0
Now you could maximize sum(lvmat), maybe via bootstrapping or whatever floats your boat, to get the sample of most dissimilar combinations.

Reproducible example of LAPs suggestion above.
Note, due to the reliance upon random sampling, this still does not guarantee that there will be no code pairs that differ at only one position. Still, it's a good start -thanks LAP!
# install.packages("gtools")
library(gtools)
library(vwr)
## Available colours
x <- c("W", "R", "G", "B", "P", "Y")
## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(permutations(n=6,r=4,v=x,repeats.allowed=T), stringsAsFactors = F) ; colnames(body) <- c("Head","Thorax","L_gaster","R_gaster")
## concatenate each colour-code to a sequence without spaces, etc
m <- paste( body$Head, body$Thorax, body$L_gaster, body$R_gaster, sep="")
##
set.seed(1)
COLONY_SIZE <- 50 ## How many adult workers in the colony excluding the queen
N_Attempts <- 1000 ## How many alternative solutions to generate - the more the better, but it takes longer
## prepare data-containers
Summary <- NULL
LvList <- list()
for (TRY in 1:N_Attempts)
{print(paste(TRY,"of",N_Attempts))
y <- sample(m, COLONY_SIZE) ## randomly sample COLONY_SIZE codes
## measure pairwise Levenshtein distances for all pair combinations
Matrix <- sapply(y, function(x) levenshtein.distance(x, y))
diag(Matrix) <- NA ## eliminate self-self measure (distance = 0)
Matrix[lower.tri(Matrix)] <- NA ## dist i-j = dist j-i
## store solution
LvList[[TRY]] <- Matrix
## summarize each solution using three metrics:
## (i) the average pair distance (higher is better)
## (ii) the number of 'close' code pairs (those with the minimum distance of 1 - lower is better)
## (iii) the maximum number of 'close' code *pairs across all codes (lower is better)
Summary <- rbind(Summary, data.frame(Mean_Distance = mean(Matrix, na.rm=T),
N_close_pairs = sum(Matrix[!is.na(Matrix)]==1),
N_close_pairs_per_ant = max(rowSums( Matrix==1, na.rm=T)) ))
}
## ***Find the solution with the fewest pairs wiRth the lowest distance***
Summary$Mean_Distance_Rank <- rank(Summary$Mean_Distance)
Summary$N_close_pairs_Rank <- rank(-Summary$N_close_pairs)
Summary$N_close_pairs_per_ant_Rank <- rank(-Summary$N_close_pairs_per_ant)
Summary$Rank_Total <- Summary$Mean_Distance_Rank + Summary$N_close_pairs_Rank + Summary$N_close_pairs_per_ant_Rank
solution <- rownames( LvList[[which.max(Summary$Rank_Total)]] )
## Highlight candidate solutions
Colour <- rep(rgb(0,0,0,0.1,1),nrow(Summary) )
Colour [which.max(Summary$Rank_Total) ] <- "red"
pairs(Summary[,c("Mean_Distance","N_close_pairs","N_close_pairs_per_ant")], col=Colour, bg=Colour, pch=21, cex=1.4)
## format into a table
SOLUTION <- data.frame(Code=1:COLONY_SIZE, t(as.data.frame(sapply(solution, strsplit, ""))))
colnames(SOLUTION)[2:5] <- c("Head","Thorax","L_gaster","R_gaster")

Here's a better approach that does not rely upon blind sampling, but instead represents the similarity between each code pair as an edge in a network, and then uses the igraph function largest_ivs to searche for the most dissimilar code pairs:
rm(list=ls())
library(gtools)
library(igraph)
##
outputfolder <- "XXXXXXXXXX"
dir.create(outputfolder,showWarnings = F)
setwd(outputfolder)
## Available colours
x <- c("W", "R", "G", "B", "P", "Y")
## Generate all possible colour combinations, for 6 colours & 4 positions
body <- data.frame(permutations(n=6,r=4,v=x,repeats.allowed=T), stringsAsFactors = F) ; colnames(body) <- c("Head","Thorax","L_gaster","R_gaster")
write.table(body,file="Paint_marks_full_list.txt",col.names=T,row.names=F,quote=F,append=F)
## Generate edge list
edge_list <- data.frame(comb_1=character(),comb_2=character(),similarity=character())
if (!file.exists("Edge_list.txt")){
write.table(edge_list,file="Edge_list.txt",col.names=T,row.names=F,quote=F,append=F)
}else{
edge_list <- read.table("Edge_list.txt",header=T,stringsAsFactors = F)
}
if (nrow(edge_list)>0){
last_i <- edge_list[nrow(edge_list),"comb_1"]
last_j <- edge_list[nrow(edge_list),"comb_2"]
}
if (!(last_i==(nrow(body)-1)&last_j==nrow(body))){
for (i in last_i:(nrow(body)-1)){
print(paste("Combination",i))
for (j in (i+1):nrow(body)){
if (i>last_i|j>last_j){
simil <- length(which(body[i,]==body[j,]))
if (simil>0){
write.table(data.frame(comb_1=i,comb_2=j,similarity=simil),file="Edge_list.txt",col.names=F,row.names=F,quote=F,append=T)
}
}
}
}
}
######let's make 3 graphs with edges representing overlap between combinations ###
##First graph, in which ANY overlap between two combinations is seen as an edge. Will be used to produce list of paint combination with no overlap
net1 <- graph.data.frame(edge_list[c("comb_1","comb_2")],directed=F)
##Second graph, in which only overlaps of 2 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 1 spot in common
net2 <- graph.data.frame(edge_list[which(edge_list$similarity>=2),c("comb_1","comb_2")],directed=F)
##Third graph, in which only overlaps of 3 or more spots is seen as an edge. Will be used to produce list of paint combinations with no more than 2 spots in common
net3 <- graph.data.frame(edge_list[which(edge_list$similarity>=3),c("comb_1","comb_2")],directed=F)
#######Now let's use the ivs function to get independent vertex sets, i.e., set of vertices with no connections between any of them
no_overlap_list <- largest_ivs(net1)
max_one_spot_overlap_list <- largest_ivs(net2)
max_two_spots_overlap_list <- largest_ivs(net3)

Related

Strange igraph behaviour, generating duplicate vertices

I'm constructing large directional graphs (using igraph, from R) and have discovered a strange issue in which vertices are apparently duplicated for certain vertex names. This issue doesn't happen in small graphs, and the issue only appears to arise when the vertex names reach 1e+05. There is a clear regularity to the vertices that get duplicated. To jump ahead, the vertex duplication looks like this (generated in section 2 of the below code):
name_vertex id_cell id_vertex
1: 100000 100000 97355
2: 1e+05 100000 1435205
3: 200000 200000 197106
4: 2e+05 200000 1435206
5: 400000 400000 396605
6: 4e+05 400000 1435207
7: 500000 500000 496356
8: 5e+05 500000 1435208
9: 700000 700000 695855
10: 7e+05 700000 1435209
11: 800000 800000 795606
12: 8e+05 800000 1435210
13: 1000000 1000000 995105
14: 1e+06 1000000 1435211
The duplication occurs when 1e+05 is reached and then duplicates are generated for that and every subsequent vertex that is xe+0n where x is in 1:9 and n is >=5 (note that in this graph there is no 3e+05 valued vertex by construction- it lies on the matrix margin- and this is why it isn't present).
All x0.. versions of the vertices hold the outgoing edges, while the xe+0.. versions hold the incoming edges.
Reproducible example:
(note: the way in which I generate the adjacency dataframe owes more to the pipeline I've been using to generate graphs for my use case. The issue could presumably be generated more directly).
The below code generates a matrix, identifies each cell's adjacencies and then constructs a graph from these. Cells at the matrix margin are assigned 0 values to remove them from the adjacency table (to prevent wrapping round the edges).
There are three sections:
(1) running for matrix dimension 100x100: correct behaviour
(2) running for matrix dimension 1200x1200: duplication
(3) unpacking the duplication issue
NOTE: it takes 30 seconds or so and 3-4GB RAM to produce the graph in (2)
# packages
library(data.table); library(igraph)
# function to get adjacent cells in a matrix
get_adjacent <- function(cells, n_row, n_col) {
adjacencies_i <- c(cells-n_row - 1,
cells-n_row,
cells-n_row+1,
cells-1,
cells+1,
cells+n_row-1,
cells+n_row,
cells+n_row+1)
return(adjacencies_i)
}
# function to get the margins of a matrix (i.e. 1-deep outer margin of cells)
get_margins <- function(matrix) {
dims <- dim(matrix)
bottom_right <- prod(dims)
top_right <- (bottom_right - dims[1])
c(1:dims[1], # first column
top_right:bottom_right, # last column
seq(1, top_right, dims[1]), # top row
seq(dims[1], bottom_right, dims[1])) # bottom row
}
# (1) Before creating the failure case, produce a much smaller graph that
# has the correct behaviour
# generate a matrix of 1-valued cells
test_mat <- matrix(1, ncol=100, nrow=100)
# remove edge cells to prevent the adjacencies wrapping around the edges
test_mat[get_margins(test_mat)] <- 0
# plot: all black cells are those that should be represented in the graph, and
# each of these cells should each be linked to their immediately adjacent neighbours
# (including diagonals - see get_adjacent function)
image(test_mat, asp=1, col=c("red", "black"))
# calculate the adjacency dataframe to calculate a graph from
permitted_cells <- which(test_mat[] == 1)
n_row <- dim(test_mat)[1]
n_col <- dim(test_mat)[2]
# full set of adjacencies
adj <- data.table(from = rep(permitted_cells, (1*2 + 1)^2 - 1),
to = get_adjacent(permitted_cells, n_row, n_col))
# remove those that are 0-valued
adj_permitted <- adj[to %in% permitted_cells,]
# calculate graph
g <- graph_from_data_frame(adj_permitted[,list(from, to)], directed = T)
# get vertex names
vertex_names <- names(V(g))
graph_vertices <- data.table(name_vertex = vertex_names,
id_cell = as.integer(vertex_names),
id_vertex = 1:length(vertex_names))
setorder(graph_vertices, id_cell)
# looks good: same number of vertices in graph as there are 1-valued cells in the
# original matrix
print(paste0("n_vertices: ", nrow(graph_vertices)))
print(paste0("n_cells: ", sum(test_mat)))
## (2) failure case. Code is identical to the above, save for the dimensions of
## the matrix being much larger (1200 rather than 100), and the image() function
## is commented out.
# generate a matrix of 1-valued cells
test_mat <- matrix(1, ncol=1200, nrow=1200)
# remove edge cells to prevent the adjacencies wrapping around the edges
test_mat[get_margins(test_mat)] <- 0
# plot: all black cells are those that should be represented in the graph, and
# each of these cells should each be linked to their immediately adjacent neighbours
# (including diagonals - see get_adjacent function)
# image(test_mat, asp=1, col=c("red", "black"))
# calculate the adjacency dataframe to calculate a graph from
permitted_cells <- which(test_mat[] == 1)
n_row <- dim(test_mat)[1]
n_col <- dim(test_mat)[2]
# full set of adjacencies
adj <- data.table(from = rep(permitted_cells, (1*2 + 1)^2 - 1),
to = get_adjacent(permitted_cells, n_row, n_col))
# remove those that are 0-valued
adj_permitted <- adj[to %in% permitted_cells,]
# calculate graph
g <- graph_from_data_frame(adj_permitted[,list(from, to)], directed = T)
# get vertex names
vertex_names <- names(V(g))
graph_vertices <- data.table(name_vertex = vertex_names,
id_cell = as.integer(vertex_names),
id_vertex = 1:length(vertex_names))
setorder(graph_vertices, id_cell)
# there are 7 more vertices than there are 1-valued cells
print(paste0("n_vertices: ", nrow(graph_vertices)))
print(paste0("n_cells: ", sum(test_mat)))
print(paste0("n_extra_vertices: ", nrow(graph_vertices) - sum(test_mat)))
# (3) What are these extra vertices?
# get duplicated vertices
duplicated_vertices <-
graph_vertices[id_cell %in% graph_vertices[duplicated(id_cell),id_cell]]
setorder(duplicated_vertices, id_cell, id_vertex)
# the 7 additional vertices arise through duplication
nrow(duplicated_vertices)
print(duplicated_vertices)
# xe+.. version has the incoming edges
incoming <- adjacent_vertices(g, duplicated_vertices$id_vertex, mode="in")
incoming[unlist(lapply(incoming, function(x) length(x) != 0))]
# x0.. version has outgoing edges
outgoing <- adjacent_vertices(g, duplicated_vertices$id_vertex, mode="out")
outgoing[unlist(lapply(outgoing, function(x) length(x) != 0))]
To (finally) get to the question. What is going on here? Is there something I can do to prevent this behaviour? The workaround I currently have is to take the incoming edges that are received by the xe+0.. version and add edges to these vertices for the x0.. version, before deleting the xe+0.. version of the vertex.
The problem seems to be caused by R (or igraph) equating the two forms 100000 and 1e+05. I managed to resolve it by adding the statement options(scipen=99) at the start of the script, which stops R from using the e notation.

Sampling from a subset of data

I have the following problem.
I have multiple subarrays (say 2) that I have populated with character labels (1, 2, 3, 4, 5). My algorithm selects labels at random based on occurrence probabilities.
How can I get R to instead select labels 1:3 for subarray 1 and 4:5 for subarray 2, say, without using subsetting (i.e., []). That is, I want a random subset of labels to be selected for each subarray, instead of all labels assigned to each subarray manually using [].
I know sample() should help.
Using subsetting (which I don't want) one would do
x <- 1:5
sample(x[1:3], size, prob = probs[1:3])
but this assigns labels 1:3 to ALL subarrays.
Would
sample(sample(x), size, replace = TRUE, prob = probs)
work?
Any ideas? Please let me know if this is unclear.
Here is a small example, which selects labels from 1:5 for each of 10 subarrays.
set.seed(1)
N <- 10
K <- 2
Hstar <- 5
probs <- rep(1/Hstar, Hstar)
perms <- 5
## Set up container(s) to hold the identity of each individual from each permutation ##
num.specs <- ceiling(N / K)
## Create an ID for each haplotype ##
haps <- 1:Hstar
## Assign individuals (N) to each subpopulation (K) ##
specs <- 1:num.specs
## Generate permutations, assume each permutation has N individuals, and sample those individuals' haplotypes from the probabilities ##
gen.perms <- function() {
sample(haps, size = num.specs, replace = TRUE, prob = probs) # I would like each subarray to contain a random subset of 1:5.
}
pop <- array(dim = c(perms, num.specs, K))
for (i in 1:K) {
pop[,, i] <- replicate(perms, gen.perms())
}
pop
Hopefully this helps.
I think what you actually want is something like that
num.specs <- 3
haps[sample(seq(haps),size = num.specs,replace = F)]
[1] 3 5 4
That is a random subset of your vector haps ?
Not quite what you want (returns list of matrices instead of 3D array) but this might help
lapply(split(1:5, cut(1:5, breaks=c(0, 2, 5))), function(i) matrix(sample(i, 25, replace=TRUE), ncol=5))
Use cut and split to partition your vector of character labels before sampling them. Here I split your character labels at the value 2. Also, rather than sampling 5 numbers 5 times, you can sample 25 numbers once, and convert to matrix.

Sort list into hash table according to specific comparisson criteria in R

I am looking for a way, in R, to convert a list into a hash table, grouping elements that are similar according to a specific criteria.
The details are specific to "graph theory", as explained bellow, but I suppose the answer is a general procedure to hash based on some specific criteria.
The list is comprised of "graph" objects (from igraph package).
library(igraph)
#Creating the list of graphs
edgeList <- data.frame(
idA=c(008, 001, 001, 010, 047, 002, 005, 005),
idB=c(100, 010, 020, 030, 030, 001, 011, 111)
)
edgeList$idB= edgeList$idB+0.1
g <- graph_from_data_frame(edgeList, directed = TRUE)
g_list <- decompose(g, mode = "weak")
#from the 8 edges we obtain 5 graphs (connected components of the original graph)
The similarity criteria is that graphs must be isomorphic:
isomorphic(g_list[[1]],g_list[[4]])
How can I hash the indexes for the elements in g_list into a hash table?
For this toy example the expected result should be:
g_inded_hash
[[1]]
[1] 1 4
[[2]]
[1] 2 5
[[3]]
[1] 3
(not necessarily a list, but some data structure that groups graphs (1 and 4) and (2 and 5) which are similar)
In reality, I have 40 millions of (small) graphs that I need to group according to the isomorphisms.
From searching I found the answer must be related to the hash package or environment, but could not adapt that into a solution.
EDIT: changed directed = TRUE in graph_from_data_frame(), above.
Since isomorphism is transitive, we can look at all the pairs of components (i,j), such that i < j, then build a graph where the nodes are the components and the edges are defined by the isomorphic property. The hash table can be extracted from the connected components of this new graph.
# all pairs (i,j) such that i < j
combinations <- unlist(sapply(seq_along(g_list),
function(j) lapply(seq_len(j-1),
function(i) c(i,j))),
recursive = FALSE)
# filter the isomorphic pairs
iso <- Filter(function(pair) isomorphic(g_list[[pair[1]]],g_list[[pair[2]]]),
combinations)
# convert to data frame
df <- data.frame(matrix(unlist(iso), ncol = 2, byrow = TRUE))
# build graph where the vertices are the components
# and the edges indicate the isomorphic property
g_iso <- graph_from_data_frame(df, directed = FALSE)
# identify groups that share the same property
groups <- clusters(g_iso)$membership
# the names are the indices of g_list
g_hash <- lapply(unique(groups),
function(i) as.integer(names(which(groups == i))))
Result:
> g_hash
[[1]]
[1] 2 3 5
[[2]]
[1] 1 4
This does not match the expected result in the question but isomorphic(g_list[[2]],g_list[[3]]) and isomorphic(g_list[[3]],g_list[[5]]) are true.
It's probably not the most straightforward way to do this but that's what came to mind.
I managed to write a solution for my problem. It is probably not very "Rish", not very efficient, with all the loops, but I think it works. Please let me know of a better way to do this.
gl_hash <- list()
gl_hash[1] <- 1
j <- 1
for(i in 2:length(gl)) {
m <- 0
for(k in 1:j){
if(isomorphic( gl[[ gl_hash[[k]][1] ]], gl[[i]])) {
gl_hash[[k]] <- c(gl_hash[[1]],i)
m <- 1
break
}
}
if(m==0) {
j <- j+ 1
gl_hash[j] <- i
}
}

How to collapse branches in a phylogenetic tree by the label in their nodes or leaves?

I have built a phylogenetic tree for a protein family that can be split into different groups, classifying each one by its type of receptor or type of response. The nodes in the tree are labeled as the type of receptor.
In the phylogenetic tree I can see that proteins that belong to the same groups or type of receptor have clustered together in the same branches. So I would like to collapse these branches that have labels in common, grouping them by a given list of keywords.
The command would be something like this:
./collapse_tree_by_label -f phylogenetic_tree.newick -l list_of_labels_to_collapse.txt -o collapsed_tree.eps(or pdf)
My list_of_labels_to_collapse.txt would be like this:
A
B
C
D
My newick tree would be like this:
(A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2)
The output image without collapsing is like this:
http://i.stack.imgur.com/pHkoQ.png
The output image collapsing should be like this (collapsed_tree.eps):
http://i.stack.imgur.com/TLXd0.png
The width of the triangles should represent the branch length, and the high of the triangles must represent the number of nodes in the branch.
I have been playing with the "ape" package in R. I was able to plot a phylogenetic tree, but I still can't figure out how to collapse the branches by keywords in the labels:
require("ape")
This will load the tree:
cat("((A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
Here should be the code to collapse
This will plot the tree:
plot(tree.test)
Your tree as it is stored in R already has the tips stored as polytomies. It's just a matter of plotting the tree with triangles representing the polytomies.
There is no function in ape to do this, that I am aware of, but if you mess with the plotting function a little bit you can pull it off
# Step 1: make edges for descendent nodes invisible in plot:
groups <- c("A", "B", "C", "D")
group_edges <- numeric(0)
for(group in groups){
group_edges <- c(group_edges,getMRCA(tree.test,tree.test$tip.label[grepl(group, tree.test$tip.label)]))
}
edge.width <- rep(1, nrow(tree.test$edge))
edge.width[tree.test$edge[,1] %in% group_edges ] <- 0
# Step 2: plot the tree with the hidden edges
plot(tree.test, show.tip.label = F, edge.width = edge.width)
# Step 3: add triangles
add_polytomy_triangle <- function(phy, group){
root <- length(phy$tip.label)+1
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_nodes <- which(phy$tip.label %in% group_node_labels)
group_mrca <- getMRCA(phy,group_nodes)
tip_coord1 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[1])
tip_coord2 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[length(group_nodes)])
node_coord <- c(dist.nodes(phy)[root, group_mrca], mean(c(tip_coord1[2], tip_coord2[2])))
xcoords <- c(tip_coord1[1], tip_coord2[1], node_coord[1])
ycoords <- c(tip_coord1[2], tip_coord2[2], node_coord[2])
polygon(xcoords, ycoords)
}
Then you just have to loop through the groups to add the triangles
for(group in groups){
add_polytomy_triangle(tree.test, group)
}
I've also been searching for this kind of tool for ages, not so much for collapsing categorical groups, but for collapsing internal nodes based on a numerical support value.
The di2multi function in the ape package can collapse nodes to polytomies, but it currently can only does this by branch length threshold.
Here is a rough adaptation that allows collapsing by a node support value threshold instead (default threshold = 0.5).
Use at your own risk, but it works for me on my rooted Bayesian tree.
di2multi4node <- function (phy, tol = 0.5)
# Adapted di2multi function from the ape package to plot polytomies
# based on numeric node support values
# (di2multi does this based on edge lengths)
# Needs adjustment for unrooted trees as currently skips the first edge
{
if (is.null(phy$edge.length))
stop("the tree has no branch length")
if (is.na(as.numeric(phy$node.label[2])))
stop("node labels can't be converted to numeric values")
if (is.null(phy$node.label))
stop("the tree has no node labels")
ind <- which(phy$edge[, 2] > length(phy$tip.label))[as.numeric(phy$node.label[2:length(phy$node.label)]) < tol]
n <- length(ind)
if (!n)
return(phy)
foo <- function(ancestor, des2del) {
wh <- which(phy$edge[, 1] == des2del)
for (k in wh) {
if (phy$edge[k, 2] %in% node2del)
foo(ancestor, phy$edge[k, 2])
else phy$edge[k, 1] <<- ancestor
}
}
node2del <- phy$edge[ind, 2]
anc <- phy$edge[ind, 1]
for (i in 1:n) {
if (anc[i] %in% node2del)
next
foo(anc[i], node2del[i])
}
phy$edge <- phy$edge[-ind, ]
phy$edge.length <- phy$edge.length[-ind]
phy$Nnode <- phy$Nnode - n
sel <- phy$edge > min(node2del)
for (i in which(sel)) phy$edge[i] <- phy$edge[i] - sum(node2del <
phy$edge[i])
if (!is.null(phy$node.label))
phy$node.label <- phy$node.label[-(node2del - length(phy$tip.label))]
phy
}
This is my answer based on phytools::phylo.toBackbone function,
see http://blog.phytools.org/2013/09/even-more-on-plotting-subtrees-as.html, and http://blog.phytools.org/2013/10/finding-edge-lengths-of-all-terminal.html. First, load the function at the end of code.
library(ape)
library(phytools) #phylo.toBackbone
library(phangorn)
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);"
, file = "ex.tre", sep = "\n")
phy <- read.tree("ex.tre")
groups <- c("A", "B|C", "D")
backboneoftree<-makebackbone(groups,phy)
# tip.label clade.label N depth
# 1 A_1 A 10 0.2481818
# 2 B_1 B|C 6 0.9400000
# 3 D_1 D 5 0.4600000
{
tryCatch(dev.off(),error=function(e){""})
par(fig=c(0,0.5,0,1), mar = c(0, 0, 2, 0))
plot(phy, main="Original" )
par(fig=c(0.5,1,0,1), oma = c(0, 0, 1.2, 0), xpd=NA, new=T)
plot(backboneoftree)
title(main="Clades")
}
makebackbone <- function(groupings,phy){
listofspecies <- phy$tip.label
listtopreserve <- character()
newedgelengths <- meandistnode<- lengthofclades<- numeric()
for (i in 1:length(groupings)){
bestmrca<-getMRCA(phy,grep(groupings[i], phy$tip.label) )
mrcatips<-phy$tip.label[unlist(phangorn::Descendants(phy,bestmrca, type="tips") )]
listtopreserve[i] <- mrcatips[1]
meandistnode[i] <- mean(dist.nodes(phy)[unlist(lapply(mrcatips,
function(x) grep(x, phy$tip.label) ) ),bestmrca] )
lengthofclades[i] <- length(mrcatips)
provtree <- drop.tip(phy,mrcatips, trim.internal=F, subtree = T)
n3 <- length(provtree$tip.label)
newedgelengths[i] <- setNames(provtree$edge.length[sapply(1:n3,function(x,y)
which(y==x),
y=provtree$edge[,2])],
provtree$tip.label)[provtree$tip.label[grep("tips",provtree$tip.label)] ]
}
newtree <- drop.tip(phy,setdiff(listofspecies,listtopreserve),
trim.internal = T)
n <- length(newtree$tip.label)
newtree$edge.length[sapply(1:n,function(x,y)
which(y==x),
y=newtree$edge[,2])] <- newedgelengths + meandistnode
trans <- data.frame(tip.label=newtree$tip.label,clade.label=groupings,
N=lengthofclades, depth=meandistnode )
rownames(trans) <- NULL
print(trans)
backboneoftree <- phytools::phylo.toBackbone(newtree,trans)
return(backboneoftree)
}
EDIT: I haven't tried this, but it might be another answer: "Script and function to transform the tip branches of a tree , i.e the thickness or to triangles, with the width of both correlating with certain parameters (e.g., species number of the clade) (tip.branches.R)"
https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/index.html
I think the script is finally doing what I wanted.
From the answer that #CactusWoman provided, I changed the code a little bit so the script will try to find the MRCA that represents the largest branch that matches to my search pattern. This solved the problem of not merging non-polytomic branches, or collapsing the whole tree because one matching node was mistakenly outside the correct branch.
In addition, I included a parameter that represents the limit for the pattern abundance ratio in a given branch, so we can select and collapse/group branches that have at least 90% of its tips matching to the search pattern, for example.
library(geiger)
library(phylobase)
library(ape)
#functions
find_best_mrca <- function(phy, group, threshold){
group_matches <- phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)]
group_mrca <- getMRCA(phy,phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)])
group_leaves <- tips(phy, group_mrca)
match_ratio <- length(group_matches)/length(group_leaves)
if( match_ratio < threshold){
#start searching for children nodes that have more than 95% of descendants matching to the search pattern
mrca_children <- descendants(as(phy,"phylo4"), group_mrca, type="all")
i <- 1
new_ratios <- NULL
nleaves <- NULL
names(mrca_children) <- NULL
for(new_mrca in mrca_children){
child_leaves <- tips(tree.test, new_mrca)
child_matches <- grep(group, child_leaves, ignore.case=TRUE)
new_ratios[i] <- length(child_matches)/length(child_leaves)
nleaves[i] <- length(tips(phy, new_mrca))
i <- i+1
}
match_result <- data.frame(mrca_children, new_ratios, nleaves)
match_result_sorted <- match_result[order(-match_result$nleaves,match_result$new_ratios),]
found <- numeric(0);
print(match_result_sorted)
for(line in 1:nrow(match_result_sorted)){
if(match_result_sorted$ new_ratios[line]>=threshold){
return(match_result_sorted$mrca_children[line])
found <- 1
}
}
if(found==0){return(found)}
}else{return(group_mrca)}
}
add_triangle <- function(phy, group,phylo_plot){
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_mrca <- getMRCA(phy,group_node_labels)
group_nodes <- descendants(as(tree.test,"phylo4"), group_mrca, type="tips")
names(group_nodes) <- NULL
x<-phylo_plot$xx
y<-phylo_plot$yy
x1 <- max(x[group_nodes])
x2 <-max(x[group_nodes])
x3 <- x[group_mrca]
y1 <- min(y[group_nodes])
y2 <- max(y[group_nodes])
y3 <- y[group_mrca]
xcoords <- c(x1,x2,x3)
ycoords <- c(y1,y2,y3)
polygon(xcoords, ycoords)
return(c(x2,y3))
}
#main
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
# Step 1: Find the best MRCA that matches to the keywords or search patten
groups <- c("A", "B|C", "D")
group_labels <- groups
group_edges <- numeric(0)
edge.width <- rep(1, nrow(tree.test$edge))
count <- 1
for(group in groups){
best_mrca <- find_best_mrca(tree.test, group, 0.90)
group_leaves <- tips(tree.test, best_mrca)
groups[count] <- paste(group_leaves, collapse="|")
group_edges <- c(group_edges,best_mrca)
#Step2: Remove the edges of the branches that will be collapsed, so they become invisible
edge.width[tree.test$edge[,1] %in% c(group_edges[count],descendants(as(tree.test,"phylo4"), group_edges[count], type="all")) ] <- 0
count = count +1
}
#Step 3: plot the tree hiding the branches that will be collapsed/grouped
last_plot.phylo <- plot(tree.test, show.tip.label = F, edge.width = edge.width)
#And save a copy of the plot so we can extract the xy coordinates of the nodes
#To get the x & y coordinates of a plotted tree created using plot.phylo
#or plotTree, we can steal from inside tiplabels:
last_phylo_plot<-get("last_plot.phylo",envir=.PlotPhyloEnv)
#Step 4: Add triangles and labels to the collapsed nodes
for(i in 1:length(groups)){
text_coords <- add_triangle(tree.test, groups[i],last_phylo_plot)
text(text_coords[1],text_coords[2],labels=group_labels[i], pos=4)
}
This doesn't address depicting the clades as triangles, but it does help with collapsing low-support nodes. The library ggtree has a function as.polytomy which can be used to collapse nodes based on support values.
For example, to collapse bootstraps less than 50%, you'd use:
polytree = as.polytomy(raxtree, feature='node.label', fun=function(x) as.numeric(x) < 50)

Algorithm for automating pairwise significance grouping labels in R

After struggling with this problem for a while, I am hoping to get some advice here. I am wondering if anyone is aware of an automated method for determining pairwise grouping labels based on significance. The question is independent of the significance test (e.g. Tukey for parametric or Mann-Whitney for non-parametric) - given these pairwise comparisons, some boxplot-type figures often represent these groupings with a sub-script:
I have done this example by hand, which can be quite tedious. I think that the sequence of labeling in the algorithm should be based on the number of levels in each group - e.g. those groups containing single levels that are significantly different from all other levels should be named first, then groups containing 2 levels, then 3, etc., all the while checking that new groupings add a new needed grouping and do not violate and differences.
In the example below, the tricky part is getting the algorithm to recognize that level 1 should be grouped with 3 and 5, but 3 and 5 should not be grouped (i.e. share a label).
Example code:
set.seed(1)
n <- 7
n2 <- 100
mu <- cumsum(runif(n, min=-3, max=3))
sigma <- runif(n, min=1, max=3)
dat <- vector(mode="list", n)
for(i in seq(dat)){
dat[[i]] <- rnorm(n2, mean=mu[i], sd=sigma[i])
}
df <- data.frame(group=as.factor(rep(seq(n), each=n2)), y=unlist(dat))
bp <- boxplot(y ~ group, df, notch=TRUE)
kr <- kruskal.test(y ~ group, df)
kr
mw <- pairwise.wilcox.test(df$y, df$g)
mw
mw$p.value > 0.05 # TRUE means that the levels are not significantly different at the p=0.05 level
# 1 2 3 4 5 6
#2 FALSE NA NA NA NA NA
#3 TRUE FALSE NA NA NA NA
#4 FALSE FALSE FALSE NA NA NA
#5 TRUE FALSE FALSE FALSE NA NA
#6 FALSE FALSE FALSE TRUE FALSE NA
#7 FALSE FALSE FALSE FALSE FALSE FALSE
text(x=1:n, y=bp$stats[4,], labels=c("AB", "C", "A", "D", "B", "D", "E"), col=1, cex=1.5, pos=3, font=2)
First let me restate the problem in the language of graph theory. Define a graph as follows. Each sample gives rise to a vertex that represents it. Between two vertices, there is an edge if and only if some test indicates that the samples represented by those vertices could not be distinguished statistically. In graph theory, a clique is a set of vertices such that, between every two vertices in the set, there is an edge. We're looking for a collection of cliques such that every edge in the graph belongs to (at least? exactly?) one of the cliques. We'd like to use as few cliques as possible. (This problem is called clique edge cover, not clique cover.) We then assign each clique its own letter and label its members with that letter. Each sample distinguishable from all others gets its own letter as well.
For example, the graph corresponding to your sample input could be drawn like this.
3---1---5 4--6
My proposed algorithm is the following. Construct the graph and use the Bron--Kerbosch algorithm to find all maximal cliques. For the graph above, these are {1, 3}, {1, 5}, and {4, 6}. The set {1}, for example, is a clique, but it is not maximal because it is a subset of the clique {1, 3}. The set {1, 3, 5} is not a clique because there is no edge between 3 and 5. In the graph
1
/ \
3---5 4--6,
the maximal cliques would be {1, 3, 5} and {4, 6}.
Now search recursively for a small clique edge cover. The input to our recursive function is a set of edges remaining to be covered and the list of maximal cliques. Find the least edge in the remaining set, where, e.g., edge (1,2) < (1,5) < (2,3) < (2,5) < (3,4). For each maximal clique that contains this edge, construct a candidate solution comprised of that clique and the output of a recursive call where the clique edges are removed from set of edges remaining. Output the best candidate.
Unless there are very few edges, this may be too slow. The first performance improvement is memoize: maintain a map from inputs to outputs of the recursive function so that we can avoid doing the work twice. If that doesn't work, then R should have an interface to an integer program solver, and we can use integer programming to determine the best collection of cliques. (I'll explain this more if the other approach is insufficient.)
I thought I would post the solution that I was able to derive with additional help from the following question:
set.seed(1)
n <- 7
n2 <- 100
mu <- cumsum(runif(n, min=-3, max=3))
sigma <- runif(n, min=1, max=3)
dat <- vector(mode="list", n)
for(i in seq(dat)){
dat[[i]] <- rnorm(n2, mean=mu[i], sd=sigma[i])
}
df <- data.frame(group=as.factor(rep(seq(n), each=n2)), y=unlist(dat))
bp <- boxplot(y ~ group, df, notch=TRUE)
#significance test
kr <- kruskal.test(y ~ group, df)
mw <- pairwise.wilcox.test(df$y, df$g)
#matrix showing connections between levels
g <- as.matrix(mw$p.value > 0.05)
g <- cbind(rbind(NA, g), NA)
g <- replace(g, is.na(g), FALSE)
g <- g + t(g)
diag(g) <- 1
rownames(g) <- 1:n
colnames(g) <- 1:n
g
#install.packages("igraph")
library(igraph)
# Load data
same <- which(g==1)
topology <- data.frame(N1=((same-1) %% n) + 1, N2=((same-1) %/% n) + 1)
topology <- topology[order(topology[[1]]),] # Get rid of loops and ensure right naming of vertices
g3 <- simplify(graph.data.frame(topology,directed = FALSE))
get.data.frame(g3)
# Plot graph
plot(g3)
# Calcuate the maximal cliques
res <- maximal.cliques(g3)
# Reorder given the smallest level
res <- sapply(res, sort)
res <- res[order(sapply(res,function(x)paste0(sort(x),collapse=".")))]
ml<-max(sapply(res, length))
reord<-do.call(order, data.frame(
do.call(rbind,
lapply(res, function(x) c(sort(x), rep.int(0, ml-length(x))))
)
))
res <- res[reord]
lab.txt <- vector(mode="list", n)
lab <- letters[seq(res)]
for(i in seq(res)){
for(j in res[[i]]){
lab.txt[[j]] <- paste0(lab.txt[[j]], lab[i])
}
}
bp <- boxplot(y ~ group, df, notch=TRUE, outline=FALSE, ylim=range(df$y)+c(0,1))
text(x=1:n, y=bp$stats[5,], labels=lab.txt, col=1, cex=1, pos=3, font=2)
Cool code.
I think you need to quote the function order() when calling do.call:
reord<-do.call("order", data.frame(
do.call(rbind,
lapply(res, function(x) c(sort(x), rep.int(0, ml-length(x))))
)
))

Resources