Igraph - A way to extract which nodes got into which communities - r

I found this code online here: https://blog.revolutionanalytics.com/2015/08/contracting-and-simplifying-a-network-graph.html
library(igraph)

# Load the prepared igraph file (downloaded from GitHub beforehand).
gs <- readRDS("pdb/depGraph-CRAN.rds")

# Every step below looks vertices up by name. On a graph without a `name`
# vertex attribute `cl$names` is NULL, so `degree(gs)[cl$names]` is numeric(0)
# and the cluster labels come out empty. Give unnamed graphs default names.
if (is.null(V(gs)$name)) {
  V(gs)$name <- as.character(seq_len(vcount(gs)))
}

set.seed(42)

# Compute communities (clusters)
cl <- walktrap.community(gs, steps = 5)
cl$degree <- degree(gs)[cl$names]

# Assign node with highest degree as name for each cluster
cl$cluster <- unname(
  ave(cl$degree, cl$membership, FUN = function(x) names(x)[which.max(x)])
)

# Keep a record of which original vertices ended up in which cluster, keyed by
# the cluster's representative name, before the vertex names are overwritten.
cluster_members <- split(cl$names, cl$cluster)

V(gs)$name <- cl$cluster

# Contract graph ----------------------------------------------------------
# Contract vertices: weights are summed, so a contracted vertex's weight is
# the number of original vertices it absorbed.
E(gs)$weight <- 1
V(gs)$weight <- 1
gcon <- contract.vertices(
  gs, cl$membership,
  vertex.attr.comb = list(weight = "sum", name = function(x) x[1], "ignore")
)
# Simplify edges
gcon <- simplify(
  gcon,
  edge.attr.comb = list(weight = "sum", function(x) length(x))
)
# Keep only clusters that absorbed more than 20 original vertices.
gcc <- induced.subgraph(gcon, V(gcon)$weight > 20)
V(gcc)$degree <- unname(degree(gcc))

# ------------------------------------------------------------------------
set.seed(42)
par(mar = rep(0.1, 4))
g.layout <- layout.kamada.kawai(gcc)
plot.igraph(
  gcc,
  edge.arrow.size = 0.1,
  layout = g.layout,
  vertex.size = 0.5 * (V(gcc)$degree)
)
This code contracts nodes and simplifies edges. It reduces my graph from over 500 nodes to around 39, which is great! However, I want to know which nodes ended up in which clusters in order to check if the procedure makes sense.
I also get this error when using the code:
> V(gs)$name <- cl$cluster
Warning message:
In length(vattrs[[name]]) <- vc : length of NULL cannot be changed
> (degree(gs)[cl$names])
numeric(0) <-- there seems to be nothing?
> unname(ave(cl$degree, cl$membership,
+ FUN=function(x)names(x)[which.max(x)]))
numeric(0) <-- there seems to be nothing?
Is this causing my problem or can I find my answer somewhere else?

Related

iGraph R package bug?

When I run the simple code below to add a weight to the edge between node 3 and node 5 and then look at the edge weights, I see NA values.
# Reproduction of the problem: an undirected graph with 10 vertices and a
# single edge between vertices 3 and 5, whose weight should become 0.3.
g<-make_empty_graph(directed = F)
g<-add.vertices(g,c(10))
g<-add_edges(g,c(3,5))
# NOTE(review): `index` is documented as an edge sequence, so c(3,5) is
# presumably read as edge positions 3 and 5 rather than the vertex pair 3-5.
# The graph has only one edge, which would explain the NA reported below.
g<-set_edge_attr(graph = g,name="weight",index = c(3,5),value = 0.3)
E(g)$weight
plot(g)
After execution I get this
> E(g)$weight
[1] NA
> plot(g)
>
Is this a bug or I'm doing something incorrectly?
The issue is that you have specified the index argument incorrectly when calling set_edge_attr.
This will produce the correct result:
# Same graph as in the question: 10 isolated vertices plus one edge (3 -- 5).
g <- add_edges(
  add.vertices(make_empty_graph(directed = FALSE), 10),
  c(3, 5)
)
# Address the edge through the edge sequence E(g), not through vertex ids.
g <- set_edge_attr(g, name = "weight", index = E(g), value = 0.3)
E(g)$weight
plot(g)
As you can see from ?set_edge_attr:
index: An optional edge sequence to set the attributes of a subset of
edges
So now, let say you have another edge and want to set it a value of 10:
# 10 vertices and two edges, added in the order (3 -- 5), (4 -- 5).
g <- make_empty_graph(directed = FALSE)
g <- add.vertices(g, 10)
g <- add_edges(g, c(3, 5, 4, 5))
# Each edge is addressed by its position in the edge sequence.
g <- set_edge_attr(g, name = "weight", index = E(g)[1], value = 0.3)
g <- set_edge_attr(g, name = "weight", index = E(g)[2], value = 10)
E(g)$weight
plot(g)
You use E(g)[1] for the first edge and E(g)[2] for the second because E(g) returns all of your edges in the order you added them (edge 1 is c(3,5) and edge 2 is c(4,5)).
Best!

R - Finding least cost path through raster image (maze)?

How can I find a non-linear path through raster image data? e.g., least cost algorithm? Starting and ending points are known and given as:
Start point = (0,0)
End point = (12,-5)
For example, extract the approximate path of a winding river through a (greyscale) raster image.
# Build a noisy but reproducible "winding river": a sine curve sampled every
# 0.01 on [0, 12] with one standard-normal draw added per sample.
set.seed(123)
df <- data.frame(x = seq(0, 12, by = .01))
df$y <- 10 * sin(df$x) + rnorm(nrow(df))

# Rasterise into "pixels": count the samples falling into each (y, x) cell
# after rounding y to integers and x to one decimal.
# assumption: image color is greyscale, only need one numeric value, v
img <- as.data.frame(
  table(y = round(df$y, 0), x = round(df$x, 1)),
  responseName = "v",
  stringsAsFactors = FALSE
)
img$y <- as.numeric(img$y)
img$x <- as.numeric(img$x)
## preview the fake "winding river" raster image
library(ggplot2)
ggplot(img, aes(x = x, y = y, fill = v)) +
  geom_raster()
As I was writing up my example, I stumbled upon an answer using the 'gdistance' r package... hopefully others will find this useful.
# Least-cost path through the fake raster using the 'gdistance' package.
library(gdistance)
library(sp)
library(ggplot2)
# convert to something rasterFromXYZ() understands
spdf <- SpatialPixelsDataFrame(points = img[c("x","y")], data = img["v"])
# use rasterFromXYZ to make a RasterLayer
r <- rasterFromXYZ(spdf)
# make a transition layer, specifying a sensible function and the number of connection directions
# (here: the smaller of the two cell values, with 8 connection directions)
tl <- transition(r, function(x) min(x), 8)
## mean(x), min(x), and max(x) produced similar results for me
# extract the shortest path as something we can plot
# (from the known start point (0, 0) to the known end point (12, -5))
sPath <- shortestPath(tl, c(0,0), c(12,-5), output = "SpatialLines")
# conversion for ggplot
sldf <- fortify(SpatialLinesDataFrame(sPath, data = data.frame(ID = 1)))
# plot the original raster, truth (white), and the shortest path solution (green)
# ("truth" is the noise-free curve 10*sin(x) the fake data was generated from)
ggplot(img) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf, aes(x=long,y=lat), color="green")
I wanted to make sure that I wasn't just giving myself too easy of a problem... so I made a noisier version of the image.
# Noisier variant of the image: every empty pixel (v == 0) is replaced by a
# uniform random value between 3 and 8; non-empty pixels keep their count.
img2 <- img
img2$v <- ifelse(img2$v==0, runif(sum(img2$v==0),3,8), img2$v)
spdf2 <- SpatialPixelsDataFrame(points = img2[c("x","y")], data = img2["v"])
r2 <- rasterFromXYZ(spdf2)
# for this noisier image, I needed a different transition function.
# The one from the vignette worked well enough for this example.
tl2 <- transition(r2, function(x) 1/mean(x), 8)
# same start (0, 0) and end (12, -5) points as in the first example
sPath2 <- shortestPath(tl2, c(0,0), c(12,-5), output = "SpatialLines")
sldf2 <- fortify(SpatialLinesDataFrame(sPath2, data = data.frame(ID = 1)))
# raster, noise-free 10*sin(x) curve (white) and the computed path (green)
ggplot(img2) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img2, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf2, aes(x=long,y=lat), color="green")
UPDATE: using real raster data...
I wanted to see if the same workflow would work on an actual real-world raster image and not just fake data, so...
library(jpeg)
# melt() and dcast() below come from reshape2, which the original snippet used
# without loading.
library(reshape2)
# grab some river image...
url <- "https://c8.alamy.com/comp/AMDPJ6/fiji-big-island-winding-river-aerial-AMDPJ6.jpg"
download.file(url, "river.jpg", mode = "wb")
jpg <- readJPEG("./river.jpg")
# Long format: one row per (y, x, channel) cell of the image array.
img3 <- melt(jpg, varnames = c("y", "x", "rgb"))
img3$rgb <- as.character(
  factor(img3$rgb, levels = c(1, 2, 3), labels = c("r", "g", "b"))
)
# Back to one row per pixel with r/g/b columns. Naming value.var explicitly
# avoids dcast() having to guess the value column.
img3 <- dcast(img3, x + y ~ rgb, value.var = "value")
# convert rgb to greyscale
img3$v <- img3$r * .21 + img3$g * .72 + img3$b * .07
For rgb to greyscale, see: https://stackoverflow.com/a/27491947/2371031
# define some start/end point coordinates
# (row 1 is used as the start and row 2 as the end in shortestPath() below)
pts_df <- data.frame(x = c(920, 500),
y = c(880, 50))
# set a reference "grey" value as the mean of the start and end point "v"s
ref_val <- mean(c(subset(img3, x==pts_df[1,1] & y==pts_df[1,2])$v,
subset(img3, x==pts_df[2,1] & y==pts_df[2,2])$v))
spdf3 <- SpatialPixelsDataFrame(points = img3[c("x","y")], data = img3["v"])
r3 <- rasterFromXYZ(spdf3)
# transition layer defines "conductance" between two points
# x is the two point values, "v" = c(v1, v2)
# 0 = no conductance, >>1 = good conductance, so
# make a transition function that encourages only small changes in v compared to the reference value.
# (note operator precedence: `^` binds tighter than `/`, so this evaluates
# 1 / (max(abs(x/ref_val - 1))^2) - 1)
tl3 <- transition(r3, function(x) (1/max(abs((x/ref_val)-1))^2)-1, 8)
sPath3 <- shortestPath(tl3, as.numeric(pts_df[1,]), as.numeric(pts_df[2,]), output = "SpatialLines")
sldf3 <- fortify(SpatialLinesDataFrame(sPath3, data = data.frame(ID = 1)))
# plot greyscale with points and path
# (start/end points in red, computed path in green, y axis reversed)
ggplot(img3) +
geom_raster(aes(x,y, fill=v)) +
scale_fill_continuous(high="white", low="black") +
scale_y_reverse() +
geom_point(data=pts_df, aes(x,y), color="red") +
geom_path(data=sldf3, aes(x=long,y=lat), color="green")
I played around with different transition functions before finding one that worked. This one is probably more complex than it needs to be, but it works. You can increase the power term (from 2 to 3,4,5,6...) and it continues to work. It did not find a correct solution with the power term removed.
Alternative solution using igraph package.
Found an alternative set of answers using 'igraph' r package. I think it is important to note that one of the big differences here is that 'igraph' supports n-dimensional graphs whereas 'gdistance' only supports 2D graphs. So, for example, extending this answer into 3D is relatively easy.
library(igraph)
# make a 2D lattice graph, with same dimensions as "img"
l <- make_lattice(
  dimvector = c(length(unique(img$y)), length(unique(img$x))),
  directed = FALSE,
  circular = FALSE
)
summary(l)
# > IGRAPH ba0963d U--- 3267 6386 -- Lattice graph
# > + attr: name (g/c), dimvector (g/n), nei (g/n), mutual (g/l), circular (g/l)
# set vertex attributes
V(l)$x <- img$x
V(l)$y <- img$y
V(l)$v <- img$v
# "color" is a known attribute that will be used by plot.igraph()
# The palette has one entry per distinct value, so index it through factor()
# codes. `img$v + 1` only works while the values are exactly 0, 1, 2, ...
# with no gaps.
V(l)$color <- grey.colors(length(unique(img$v)))[factor(img$v)]
# compute edge weights as a function of attributes of the two connected vertices
el <- get.edgelist(l)
# "weight" is a known edge attribute, and is used in shortest_path()
# I was confused about weights... lower weights are better, Inf weights will be avoided.
# also note from help: "if all weights are positive, then Dijkstra's algorithm is used."
E(l)$weight <- 1 / (pmax(V(l)[el[, 1]]$v, V(l)[el[, 2]]$v))
# The weights are fractional (and Inf where both pixels are empty), so
# `weight + 1` is not a valid palette index. Use factor() codes instead.
E(l)$color <- grey.colors(length(unique(E(l)$weight)))[factor(E(l)$weight)]
Edge weights calculation courtesy of: https://stackoverflow.com/a/27446127/2371031 (thanks!)
# locate the start (0, 0) and end (12, -5) vertices by their coordinates
start <- V(l)[V(l)$x == 0 & V(l)$y == 0]
end <- V(l)[V(l)$x == 12 & V(l)$y == -5]
# shortest path, returned as both a vertex and an edge sequence
result <- shortest_paths(l, from = start, to = end, output = "both")
# paint everything that lies on the shortest path green
V(l)$color <- replace(V(l)$color, V(l) %in% result$vpath[[1]], "green")
E(l)$color <- replace(E(l)$color, E(l) %in% result$epath[[1]], "green")
# then paint the two end points red
V(l)$color <- replace(V(l)$color, V(l) %in% c(start, end), "red")
plot(
  l,
  vertex.shape = "square",
  vertex.size = 2,
  vertex.frame.color = NA,
  vertex.label = NA,
  curved = FALSE
)
Second (noisier) example requires a different formula to compute edge weights.
# Noisier image: empty pixels get a uniform random value in [3, 8].
img2 <- img
img2$v <- ifelse(img2$v == 0, runif(sum(img2$v == 0), 3, 8), img2$v)
l <- make_lattice(
  dimvector = c(length(unique(img2$y)), length(unique(img2$x))),
  directed = FALSE,
  circular = FALSE
)
# vertex attributes: coordinates, pixel value and a grey level per value
V(l)$x <- img2$x
V(l)$y <- img2$y
V(l)$v <- img2$v
V(l)$color <- grey.colors(length(unique(img2$v)))[factor(img2$v)]
# edge weights
el <- get.edgelist(l)
# proper edge weight calculation is the key to a good solution...
# here: the smaller pixel value of the two vertices an edge connects
E(l)$weight <- pmin(V(l)$v[el[, 1]], V(l)$v[el[, 2]])
E(l)$color <- grey.colors(length(unique(E(l)$weight)))[factor(E(l)$weight)]
start <- V(l)[V(l)$x == 0 & V(l)$y == 0]
end <- V(l)[V(l)$x == 12 & V(l)$y == -5]
# shortest path, returned as both a vertex and an edge sequence
result <- shortest_paths(l, from = start, to = end, output = "both")
# paint everything that lies on the shortest path green
V(l)$color <- replace(V(l)$color, V(l) %in% result$vpath[[1]], "green")
E(l)$color <- replace(E(l)$color, E(l) %in% result$epath[[1]], "green")
# then paint the two end points red
V(l)$color <- replace(V(l)$color, V(l) %in% c(start, end), "red")
plot(
  l,
  vertex.shape = "square",
  vertex.size = 2,
  vertex.frame.color = NA,
  vertex.label = NA,
  curved = FALSE
)

Shortest Paths based on edge attribute with igraph

I'm trying to get the shortest paths of a graph but based on its edge ids.
So having the following graph:
library(igraph)
set.seed(45)
# random directed graph on 25 vertices, edge probability 1/10
g <- erdos.renyi.game(n = 25, p.or.m = 1 / 10, directed = TRUE)
# each edge gets one of three ids, drawn with replacement
E(g)$id <- sample(1:3, ecount(g), replace = TRUE)
The shortest_paths(g, 1, V(g)) function finds all the shortest paths from node 1 to all the other nodes. However, I would like to calculate this, not just by following the geodesic distance, but a mix between the geodesic distance, and the minimum of edge id changes.
For example, if this were a train network, the edge ids would represent trains. I would like to calculate how to get from node A to all the other nodes using the shortest path, while changing trains as few times as possible.
OK I think I have a working solution, although the code is a little ugly. The basic algorithm (lets call it gs(i, j)) goes like this: If we want to find the shortest train journey from i to j (gs(i, j)) we:
find the shortest path from i to j considering all trains. if this path is length 0 or 1 return it (there is either no path or a path on 1 train)
split the graph up by 'trains' (subset graph by edges) so as to consider each train network separately, and find the shortest path between i and j in each individual train network
if a single train will get you from i to j, return the train route with the fewest stops between i and j, else
if no single train runs from i to j then call gs(i, j-1) where (j-1) is the stop before j in the shortest path between i and j on the full network.
So basically, we look to see if a single train can do it, and if it can't we call the function recursively looking if a single train can get you to the stop before the last stop, etc. etc.
library(igraph)
# The data from the question
set.seed(45)
g <- erdos.renyi.game(n = 25, p.or.m = 1 / 10, directed = TRUE)
E(g)$id <- sample(1:3, ecount(g), replace = TRUE)
# colour each edge by its train id
plot(g, edge.color = E(g)$id)
# Find routes that prefer staying on a single "train" (edge id).
#
# For every destination the shortest path on the full graph is computed
# first. If some single train (the subgraph of edges sharing one `id`) also
# connects `from` and the destination, the shortest such single-train path is
# returned instead. Otherwise the route is built recursively: the best route
# to the stop before the destination on the full-graph path, followed by the
# final hop.
#
# Arguments:
#   g     igraph graph whose edges carry an `id` attribute (the train).
#   from  id of the start vertex.
#   to    id(s) of the destination vertex/vertices.
#
# Returns a list of vertex sequences, one per element of `to`. Unreachable
# destinations keep the empty path returned by shortest_paths().
#
# Two defects of the original are fixed here:
#   * `for (i in length(target.sp))` visited only the LAST destination; the
#     loop now runs over every destination.
#   * `length(target.sp[[i]] > 1)` had a misplaced parenthesis and was true
#     for any non-empty path; single-vertex paths (from == to) are now left
#     untouched.
get.shortest.routes <- function(g, from, to) {
  # One subgraph per train id. Vertices are kept so vertex ids stay
  # comparable across the subgraphs and the full graph.
  train.routes <- lapply(unique(E(g)$id), function(id) {
    subgraph.edges(g, eids = which(E(g)$id == id), delete.vertices = FALSE)
  })
  target.sp <- shortest_paths(g, from = from, to = to, output = "vpath")$vpath
  single.train.paths <- lapply(train.routes, function(gs) {
    shortest_paths(gs, from = from, to = to, output = "vpath")$vpath
  })

  for (i in seq_along(target.sp)) {
    full_path <- target.sp[[i]]
    n_stops <- length(full_path)
    if (n_stops > 1) {
      cands <- lapply(single.train.paths, function(l) l[[i]])
      cand_len <- vapply(cands, length, integer(1))
      if (any(cand_len > 0L)) {
        # Shortest non-empty single-train path; first one wins on ties.
        best <- which(cand_len == min(cand_len[cand_len > 0L]))[1]
        target.sp[[i]] <- cands[[best]]
      } else {
        # No single train reaches the destination: route to the stop before
        # it, then append the last hop (dropping the duplicated stop).
        first_stop <- as.numeric(full_path[1])
        prev_stop <- as.numeric(full_path[n_stops - 1])
        last_stop <- as.numeric(full_path[n_stops])
        target.sp[[i]] <- c(
          get.shortest.routes(g, from = first_stop, to = prev_stop)[[1]],
          get.shortest.routes(g, from = prev_stop, to = last_stop)[[1]][-1]
        )
      }
    }
  }
  target.sp
}
OK now lets run some tests. If you squint at the graph above you can see that the path from vertex 5 to vertex 21 is length-2 if you take two trains, but that you can get there on 1 train if you pass through an extra station. Our new function should return the longer path:
shortest_paths(g, 5, 21)$vpath
#> [[1]]
#> + 3/25 vertices, from b014eb9:
#> [1] 5 13 21
get.shortest.routes(g, 5, 21)
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> [[1]]
#> + 4/25 vertices, from c22246c:
#> [1] 5 13 15 21
Lets make a really easy graph where we are sure what we want to see: here we should get 1-2-4-5 instead of 1-3-5:
# Toy network: edges 1->2, 2->4 and 4->5 all carry train id 1, while
# 1->3 (id 2) and 3->5 (id 3) are on two different trains.
df <- data.frame(from = c(1, 1, 2, 3, 4), to = c(2, 3, 4, 5, 5))
g1 <- graph_from_data_frame(df)
# ids are assigned in the row order of `df`
E(g1)$id <- c(1, 2, 1, 3, 1)
plot(g1, edge.color = E(g1)$id)
get.shortest.routes(g1, 1, 5)
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> [[1]]
#> + 4/5 vertices, named, from c406649:
#> [1] 1 2 4 5
I'm sure there is a more rigorous solution, and you'll probably want to optimize the code a bit. For instance, I just realized that I don't stop the function immediately if the shortest path on the full graph has only two nodes -- doing so would avoid some needless computations! This was a fun problem; I hope some other answers get posted.
Created on 2018-05-11 by the reprex package (v0.2.0).
Here is my take on the problem. A few notes:
1) all_simple_paths will not scale well with large or highly connected graphs
2) I favored fewest changes above all else, which means a path with two changes and a dist of 40 will beat a path with three changes and a dist of 3.
3) I can imagine an even faster approach if the number of changes and the distance swap priority when there is no path on a single id
library(igraph)
# First your data
set.seed(45)
g <- erdos.renyi.game(25, 1 / 10, directed = TRUE)
E(g)$id <- sample(1:3, length(E(g)), replace = TRUE)
plot(g, edge.color = E(g)$id)

## Option 1:
# Enumerate every simple path from vertex 1 to vertex 18 as vertex ids.
rst <- all_simple_paths(g, from = 1, to = 18, mode = "out")
rst <- lapply(rst, as_ids)
stopifnot(length(rst) > 0) # nothing to rank if 18 is unreachable from 1

# get.edge.ids() wants consecutive vertex pairs: c(v1, v2, v2, v3, ...).
# Interleaving "all but last" with "all but first" also handles two-vertex
# paths, where the original `x[2:(length(x) - 1)]` indexed backwards.
rst1 <- lapply(rst, function(x) as.vector(rbind(x[-length(x)], x[-1])))

# Edge ids and the train (edge `id` attribute) used on each hop.
rst2 <- lapply(rst1, function(vp) {
  eid <- get.edge.ids(graph = g, vp = vp)
  data.frame(eid = eid, train = E(g)$id[eid])
})

# `changes` is the number of runs of identical train ids along the path
# (number of train changes + 1); `dist` is the number of hops.
rst3 <- data.frame(
  pathID = seq_along(rst),
  changes = vapply(rst2, function(x) length(rle(x$train)$lengths), integer(1)),
  dist = vapply(rst2, nrow, integer(1))
)

# Fewest changes first, hop count as tie-breaker.
spath <- rst3[order(rst3$changes, rst3$dist), ][1, 1]

# Vertex IDs
rst[[spath]]
#[1] 1 23 8 18
plot(
  g,
  edge.color = E(g)$id,
  vertex.color = ifelse(V(g) %in% rst[[spath]], "firebrick", "gray80"),
  edge.arrow.size = 0.5
)

Neighbor groups based on cluster assignment is slow

I am doing some analysis using iGraph in R, and I am currently doing a calculation that is very expensive. I need to do it across all of the nodes in my graph, so if someone knows a more efficient way to do it, I would appreciate it.
I start out with a graph, g. I first do some community detection on the graph
library(igraph)
# NOTE(review): rbinom(10 * 5, ...) draws only 50 values, which matrix() then
# recycles to fill all 8000 x 8000 cells -- presumably a stand-in for the real
# adjacency data; confirm before relying on this graph.
adj_matrix <- matrix(rbinom(10 * 5, 1, 0.5), ncol = 8000, nrow = 8000)
g <- graph_from_adjacency_matrix(adj_matrix, mode = 'undirected', diag = FALSE)
# community detection; note that the result reuses the name of base::c()
c <- cluster_louvain(g)
Then, I basically assign each cluster to 1 of 2 groups
# draw one 0/1 group label per element of the clustering result `c`
nc <- length(c)
assignments <- rbinom(nc, 1, .5)
Now, for each node, I want to find out what percentage of its neighbors are in a given group (as defined by the cluster assignments). I currently do this in the following way:
# Share of a vertex's neighbours whose community was assigned to group 1.
#
#   g            igraph graph
#   vertex       the vertex whose neighbours are inspected
#   c            community object; membership(c) gives each vertex's community
#   assignments  vector with one group label per community
#
# Returns a single number; NaN for a vertex without neighbours (0 / 0).
pct_neighbors_1 <- function(g, vertex, c, assignments) {
  nbrs <- neighbors(g, vertex)
  nbr_groups <- assignments[membership(c)[nbrs]]
  sum(nbr_groups == 1) / length(nbrs)
}
And then, given that I have a dataframe with each row corresponding to one vertex in the graph, I do this for all vertices with
# Evaluate pct_neighbors_1() once per row of `data`. The call relies on row i
# of `data` describing vertex i of `g`.
# The extra arguments must use the function's own parameter names (`g`, `c`):
# the original `graph =` / `community =` match no formal and make the call
# fail with "unused arguments". Named arguments are matched first, so the
# row index falls through to `vertex`.
data$pct_neighbors_1 <- vapply(
  seq_len(nrow(data)),
  pct_neighbors_1,
  numeric(1),
  g = g,
  c = c,
  assignments = assignments
)
Is there somewhere in here that I can make things more efficient? Thanks!
This should be faster :
library(igraph)
# fixed seed so the example is reproducible
set.seed(1234)
# random graph with 1000 vertices, edge probability 0.1
nverts <- 1000
g <- igraph::random.graph.game(nverts, 0.1, type = "gnp", directed = FALSE)
# community detection
c <- cluster_louvain(g)
# one 0/1 group label per community
nc <- length(c)
assignments <- rbinom(nc, 1, .5)
# computed once, outside the loop: does each vertex sit in a community that
# was assigned to group 1?
vertsInAssignments <- membership(c) %in% which(assignments == 1)
# per vertex: share of its neighbours that sit in such a community
probs <- vapply(
  seq_len(vcount(g)),
  function(i) {
    neigh <- neighbors(g, i)
    sum(vertsInAssignments[neigh]) / length(neigh)
  },
  numeric(1)
)

How to collapse branches in a phylogenetic tree by the label in their nodes or leaves?

I have built a phylogenetic tree for a protein family that can be split into different groups, classifying each one by its type of receptor or type of response. The nodes in the tree are labeled as the type of receptor.
In the phylogenetic tree I can see that proteins that belong to the same groups or type of receptor have clustered together in the same branches. So I would like to collapse these branches that have labels in common, grouping them by a given list of keywords.
The command would be something like this:
./collapse_tree_by_label -f phylogenetic_tree.newick -l list_of_labels_to_collapse.txt -o collapsed_tree.eps(or pdf)
My list_of_labels_to_collapse.txt would be like this:
A
B
C
D
My newick tree would be like this:
(A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2)
The output image without collapsing is like this:
http://i.stack.imgur.com/pHkoQ.png
The output image collapsing should be like this (collapsed_tree.eps):
http://i.stack.imgur.com/TLXd0.png
The width of the triangles should represent the branch length, and the height of the triangles must represent the number of nodes in the branch.
I have been playing with the "ape" package in R. I was able to plot a phylogenetic tree, but I still can't figure out how to collapse the branches by keywords in the labels:
# library() stops with an error if ape is missing; require() only returns
# FALSE and lets the script fail later at the first ape call.
library("ape")
This will load the tree:
# Write the example tree (tips A_1..A_4, B_1..B_3, C_1..C_2, D_1..D_5) to
# "ex.tre" in Newick format, then read it back as a tree object.
cat("((A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
Here should be the code to collapse
This will plot the tree:
plot(tree.test)
Your tree as it is stored in R already has the tips stored as polytomies. It's just a matter of plotting the tree with triangles representing the polytomies.
There is no function in ape to do this, that I am aware of, but if you mess with the plotting function a little bit you can pull it off
# Step 1: find the most recent common ancestor (MRCA) of each group's tips;
# edges leaving those nodes are drawn with zero width, i.e. hidden.
groups <- c("A", "B", "C", "D")
group_edges <- as.numeric(unlist(lapply(groups, function(grp) {
  getMRCA(tree.test, tree.test$tip.label[grepl(grp, tree.test$tip.label)])
})))
edge.width <- ifelse(tree.test$edge[, 1] %in% group_edges, 0, 1)
# Step 2: plot the tree with the hidden edges
plot(tree.test, show.tip.label = FALSE, edge.width = edge.width)
# Step 3: add triangles
#
# Draw a triangle over one collapsed group on the current tree plot.
#
#   phy    tree object with a `tip.label` element (as read by read.tree())
#   group  pattern matched against the tip labels with grepl()
#
# The triangle's apex is the group's MRCA and its base runs between the
# first and last matching tip. As in the original, both base corners use the
# FIRST tip's distance from the root, which gives a vertical right-hand edge.
# The root is taken to be node `length(phy$tip.label) + 1`.
#
# Called for its side effect (polygon()); returns NULL invisibly. A group
# with fewer than two tips has no MRCA, so nothing is drawn for it.
add_polytomy_triangle <- function(phy, group) {
  root <- length(phy$tip.label) + 1
  group_nodes <- which(grepl(group, phy$tip.label))
  if (length(group_nodes) < 2L) {
    return(invisible(NULL))
  }
  group_mrca <- getMRCA(phy, group_nodes)

  # dist.nodes() builds the full node-by-node distance matrix; compute it
  # once (the original did it three times) and keep only the root's row.
  root_dist <- dist.nodes(phy)[root, ]

  first_tip <- group_nodes[1]
  last_tip <- group_nodes[length(group_nodes)]
  x_tip <- root_dist[first_tip]

  xcoords <- c(x_tip, x_tip, root_dist[group_mrca])
  ycoords <- c(first_tip, last_tip, mean(c(first_tip, last_tip)))
  polygon(xcoords, ycoords)
}
Then you just have to loop through the groups to add the triangles
# one triangle per group
invisible(lapply(groups, function(grp) add_polytomy_triangle(tree.test, grp)))
I've also been searching for this kind of tool for ages, not so much for collapsing categorical groups, but for collapsing internal nodes based on a numerical support value.
The di2multi function in the ape package can collapse nodes to polytomies, but it currently can only do this by a branch length threshold.
Here is a rough adaptation that allows collapsing by a node support value threshold instead (default threshold = 0.5).
Use at your own risk, but it works for me on my rooted Bayesian tree.
# Collapse weakly supported internal nodes into polytomies.
#
# Adapted from ape's di2multi(), which collapses by edge length; this version
# collapses every internal node whose numeric node label (support value) is
# below `tol`.
#
#   phy  tree as a list with `edge`, `edge.length`, `tip.label`, `Nnode` and
#        `node.label`, where node.label[i] belongs to node
#        length(tip.label) + i.
#   tol  support threshold; nodes with support < tol are collapsed.
#
# Returns the modified tree, or `phy` unchanged if nothing falls below `tol`.
# The root has no incoming edge, so its label is never considered.
#
# Changes from the original adaptation:
#   * the "no node labels" check now runs before the label is read; it used
#     to be unreachable because the numeric check failed first on NULL;
#   * support values are looked up by node number rather than by position in
#     the edge matrix, so the result no longer depends on edge ordering;
#   * nodes whose label is not numeric (NA) are kept instead of producing NA
#     indices.
di2multi4node <- function(phy, tol = 0.5) {
  if (is.null(phy$edge.length)) {
    stop("the tree has no branch length")
  }
  if (is.null(phy$node.label)) {
    stop("the tree has no node labels")
  }
  if (is.na(suppressWarnings(as.numeric(phy$node.label[2])))) {
    stop("node labels can't be converted to numeric values")
  }

  n_tip <- length(phy$tip.label)
  # Edges that lead to an internal node, and that node's support value.
  internal <- which(phy$edge[, 2] > n_tip)
  support <- suppressWarnings(
    as.numeric(phy$node.label[phy$edge[internal, 2] - n_tip])
  )
  ind <- internal[!is.na(support) & support < tol]
  n <- length(ind)
  if (!n) {
    return(phy)
  }

  node2del <- phy$edge[ind, 2]
  anc <- phy$edge[ind, 1]

  # Re-attach every surviving edge whose parent is being deleted to the
  # nearest ancestor that is kept. The root is never deleted, so the walk
  # upwards always terminates.
  for (k in setdiff(seq_len(nrow(phy$edge)), ind)) {
    parent <- phy$edge[k, 1]
    while (parent %in% node2del) {
      parent <- anc[match(parent, node2del)]
    }
    phy$edge[k, 1] <- parent
  }

  phy$edge <- phy$edge[-ind, , drop = FALSE]
  phy$edge.length <- phy$edge.length[-ind]
  phy$Nnode <- phy$Nnode - n

  # Renumber the remaining nodes so the numbering stays contiguous.
  sel <- phy$edge > min(node2del)
  for (i in which(sel)) {
    phy$edge[i] <- phy$edge[i] - sum(node2del < phy$edge[i])
  }
  phy$node.label <- phy$node.label[-(node2del - n_tip)]
  phy
}
This is my answer based on phytools::phylo.toBackbone function,
see http://blog.phytools.org/2013/09/even-more-on-plotting-subtrees-as.html, and http://blog.phytools.org/2013/10/finding-edge-lengths-of-all-terminal.html. First, load the function at the end of code.
# Example driver for makebackbone(). As the text above says, the function
# defined at the end must be loaded before this part is run.
library(ape)
library(phytools) #phylo.toBackbone
library(phangorn)
# Write the example tree to "ex.tre" and read it back.
# NOTE(review): the branch length in `A_9:01` looks like a typo for `0.1` --
# confirm; the output quoted below was produced with the string as written.
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);"
, file = "ex.tre", sep = "\n")
phy <- read.tree("ex.tre")
# one pattern per clade to collapse; "B|C" matches tips of either group
groups <- c("A", "B|C", "D")
backboneoftree<-makebackbone(groups,phy)
# Table printed by makebackbone(), as posted by the author:
# tip.label clade.label N depth
# 1 A_1 A 10 0.2481818
# 2 B_1 B|C 6 0.9400000
# 3 D_1 D 5 0.4600000
{
# close any open graphics device; the error raised when none is open is ignored
tryCatch(dev.off(),error=function(e){""})
# left half of the device: the original tree
par(fig=c(0,0.5,0,1), mar = c(0, 0, 2, 0))
plot(phy, main="Original" )
# right half of the device: the collapsed backbone (new=T keeps the first plot)
par(fig=c(0.5,1,0,1), oma = c(0, 0, 1.2, 0), xpd=NA, new=T)
plot(backboneoftree)
title(main="Clades")
}
# Collapse groups of tips into clades for phytools::phylo.toBackbone().
#
#   groupings  character vector of regular expressions, one per clade; each
#              is matched against the tip labels with grep()
#   phy        tree object (as read by ape::read.tree())
#
# For every grouping the MRCA of the matching tips is found, and ALL tips
# below that MRCA form the clade. One representative tip per clade is kept in
# the backbone. Its edge length becomes the clade's stem length plus the mean
# MRCA-to-tip distance, which is also passed on as the triangle depth.
#
# Prints the translation table (tip.label, clade.label, N, depth) and returns
# the backbone tree.
#
# Changes from the original:
#   * mean depth is taken over each clade tip exactly once. The original
#     looked tips up with grep(label, tip.label), so "A_1" also matched
#     "A_10" and counted it twice. For the example tree the posted depth of
#     clade A, 0.2481818, is 2.73 / 11; the correct value is 2.53 / 10 = 0.253.
#   * rows are aligned to the backbone's tip order, so `groupings` no longer
#     has to be given in the same order as the clades appear in the tree.
#   * dist.nodes() is computed once instead of once per grouping.
makebackbone <- function(groupings, phy) {
  listofspecies <- phy$tip.label
  n_groups <- length(groupings)
  listtopreserve <- character(n_groups)
  newedgelengths <- meandistnode <- lengthofclades <- numeric(n_groups)
  node_dist <- dist.nodes(phy)

  for (i in seq_along(groupings)) {
    bestmrca <- getMRCA(phy, grep(groupings[i], phy$tip.label))
    if (is.null(bestmrca)) {
      stop("grouping '", groupings[i], "' matches fewer than two tips",
           call. = FALSE)
    }
    tip_ids <- unlist(phangorn::Descendants(phy, bestmrca, type = "tips"))
    mrcatips <- phy$tip.label[tip_ids]
    listtopreserve[i] <- mrcatips[1]
    meandistnode[i] <- mean(node_dist[tip_ids, bestmrca])
    lengthofclades[i] <- length(mrcatips)

    # Replace the clade by a single placeholder tip (its label contains
    # "tips"); the placeholder's edge length is the clade's stem length.
    provtree <- drop.tip(phy, mrcatips, trim.internal = FALSE, subtree = TRUE)
    placeholder <- grep("tips", provtree$tip.label)
    newedgelengths[i] <-
      provtree$edge.length[match(placeholder, provtree$edge[, 2])]
  }

  if (anyDuplicated(listtopreserve)) {
    stop("two groupings resolve to the same clade", call. = FALSE)
  }

  newtree <- drop.tip(phy, setdiff(listofspecies, listtopreserve),
                      trim.internal = TRUE)
  # Position of each backbone tip within `groupings`.
  ord <- match(newtree$tip.label, listtopreserve)
  tip_edges <- match(seq_along(newtree$tip.label), newtree$edge[, 2])
  newtree$edge.length[tip_edges] <- (newedgelengths + meandistnode)[ord]

  trans <- data.frame(tip.label = newtree$tip.label,
                      clade.label = groupings[ord],
                      N = lengthofclades[ord],
                      depth = meandistnode[ord])
  rownames(trans) <- NULL
  print(trans)
  phytools::phylo.toBackbone(newtree, trans)
}
EDIT: I haven't tried this, but it might be another answer: "Script and function to transform the tip branches of a tree , i.e the thickness or to triangles, with the width of both correlating with certain parameters (e.g., species number of the clade) (tip.branches.R)"
https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/index.html
I think the script is finally doing what I wanted.
From the answer that #CactusWoman provided, I changed the code a little bit so the script will try to find the MRCA that represents the largest branch that matches to my search pattern. This solved the problem of not merging non-polytomic branches, or collapsing the whole tree because one matching node was mistakenly outside the correct branch.
In addition, I included a parameter that represents the limit for the pattern abundance ratio in a given branch, so we can select and collapse/group branches that have at least 90% of its tips matching to the search pattern, for example.
library(geiger)
library(phylobase)
library(ape)
#functions
# Find the node that best represents a group of tips.
#
#   phy        tree object with a `tip.label` element
#   group      pattern matched (case-insensitively) against the tip labels
#   threshold  minimum share of a node's tips that must match `group`
#
# If at least `threshold` of the tips below the MRCA of all matching tips
# match the pattern, that MRCA is returned. Otherwise every descendant of the
# MRCA is scored by the share of its tips that match, and the descendant with
# the most tips that still reaches `threshold` is returned. The candidate
# table is printed, as in the original.
#
# Returns a node number, or 0 if no descendant reaches the threshold.
#
# Changes from the original:
#   * candidates are evaluated on `phy` rather than on the global
#     `tree.test`, so the function works for any tree passed in;
#   * "nothing found" returns 0 as the code intended. `found` used to start
#     as numeric(0), which made `if (found == 0)` fail;
#   * the unreachable `found <- 1` after `return()` is gone.
find_best_mrca <- function(phy, group, threshold) {
  group_matches <-
    phy$tip.label[grepl(group, phy$tip.label, ignore.case = TRUE)]
  group_mrca <- getMRCA(phy, group_matches)
  group_leaves <- tips(phy, group_mrca)
  match_ratio <- length(group_matches) / length(group_leaves)
  if (match_ratio >= threshold) {
    return(group_mrca)
  }

  # Score every descendant of the MRCA: share of matching tips and tip count.
  mrca_children <- descendants(as(phy, "phylo4"), group_mrca, type = "all")
  names(mrca_children) <- NULL
  new_ratios <- nleaves <- numeric(length(mrca_children))
  for (i in seq_along(mrca_children)) {
    child_leaves <- tips(phy, mrca_children[i])
    child_matches <- grep(group, child_leaves, ignore.case = TRUE)
    new_ratios[i] <- length(child_matches) / length(child_leaves)
    nleaves[i] <- length(child_leaves)
  }

  match_result <- data.frame(mrca_children, new_ratios, nleaves)
  # Largest clades first; among equally large ones, lowest ratio first.
  match_result_sorted <-
    match_result[order(-match_result$nleaves, match_result$new_ratios), ]
  print(match_result_sorted)

  hit <- which(match_result_sorted$new_ratios >= threshold)
  if (length(hit) == 0L) {
    return(0)
  }
  match_result_sorted$mrca_children[hit[1]]
}
# Draw a triangle for one collapsed group on an already plotted tree.
#
#   phy         tree object with a `tip.label` element
#   group       pattern matched against the tip labels with grepl()
#   phylo_plot  plot coordinates with elements `xx` and `yy`, indexed by node
#               number
#
# The apex sits on the group's MRCA; the base is vertical, at the largest x
# among the group's tips, and spans their lowest to highest y.
#
# Returns c(x, y): the base's x coordinate and the MRCA's y coordinate, which
# the caller uses to place the group label.
#
# Change from the original: the tips are looked up in `phy` instead of the
# global `tree.test`, so the function works for any tree passed in.
add_triangle <- function(phy, group, phylo_plot) {
  group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
  group_mrca <- getMRCA(phy, group_node_labels)
  group_nodes <- descendants(as(phy, "phylo4"), group_mrca, type = "tips")
  names(group_nodes) <- NULL

  x <- phylo_plot$xx
  y <- phylo_plot$yy
  x_base <- max(x[group_nodes])
  y_apex <- y[group_mrca]

  polygon(
    c(x_base, x_base, x[group_mrca]),
    c(min(y[group_nodes]), max(y[group_nodes]), y_apex)
  )
  c(x_base, y_apex)
}
#main
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")

# Step 1: find the best MRCA for each keyword / search pattern
groups <- c("A", "B|C", "D")
group_labels <- groups # the original patterns double as display labels
group_edges <- numeric(0)
edge.width <- rep(1, nrow(tree.test$edge))
for (k in seq_along(group_labels)) {
  best_mrca <- find_best_mrca(tree.test, group_labels[k], 0.90)
  group_leaves <- tips(tree.test, best_mrca)
  # From here on the group is identified by the exact tips below its MRCA.
  groups[k] <- paste(group_leaves, collapse = "|")
  group_edges <- c(group_edges, best_mrca)
  # Step 2: hide every edge inside the clade by giving it zero width
  hidden_parents <- c(
    best_mrca,
    descendants(as(tree.test, "phylo4"), best_mrca, type = "all")
  )
  edge.width[tree.test$edge[, 1] %in% hidden_parents] <- 0
}

# Step 3: plot the tree with the collapsed branches hidden
last_plot.phylo <- plot(
  tree.test,
  show.tip.label = FALSE,
  edge.width = edge.width
)
# The x/y coordinates of the plotted nodes are kept by ape in .PlotPhyloEnv
# (the same place tiplabels() reads them from).
last_phylo_plot <- get("last_plot.phylo", envir = .PlotPhyloEnv)

# Step 4: add a triangle and a label for every collapsed group
for (i in seq_along(groups)) {
  text_coords <- add_triangle(tree.test, groups[i], last_phylo_plot)
  text(text_coords[1], text_coords[2], labels = group_labels[i], pos = 4)
}
This doesn't address depicting the clades as triangles, but it does help with collapsing low-support nodes. The library ggtree has a function as.polytomy which can be used to collapse nodes based on support values.
For example, to collapse bootstraps less than 50%, you'd use:
# collapse every node whose numeric node label (bootstrap) is below 50
polytree <- as.polytomy(
  raxtree,
  feature = "node.label",
  fun = function(x) as.numeric(x) < 50
)

Resources