Related
I am trying to do something similar to this and this post. I have an igraph object and want to remove vertices(arrows) based on an values in a column of the edges dataframe, color the edges(circles) by a group, and change the line/arrow size based on the same column in the edges dataframe. Here is some reproducible code that looks exactly like my data:
# Data
edges <- data.frame(
"agency.from" = c(rep("a",4),rep("b",4),rep("c",4),rep("d",4)),
"agency.to" = c(rep(c("a","b","c","d"),4)),
"comm.freq" = sample(0:5,16, replace=TRUE))
nodes <- data.frame(
"agency" = c("a","b","c","d"),
"group" = c("x", "y", "x", "y"),
"state" = c("i", "j", "j", "i"))
# make igraph object
net <- graph_from_data_frame(d=edges, vertices=nodes, directed=T)
plot(net)
# remove loops
net2 <- simplify(net, remove.multiple = T, remove.loops = T)
plot(net2)
Which gives me:
this
# remove vertices where communication frequency is 1 and 0
net3 <- delete.vertices(net2, which(E(net2)$comm.freq == 1))
net4 <- delete.vertices(net3, which(E(net2)$comm.freq == 0))
plot(net4)
Which does not change the plot at all
Then I try to change the colors and sizes:
# color edges by group
colrs <- c("gray50", "tomato")
V(net4)$color <- colrs[V(net4)$group]
plot(net4)
# make size of arrow based on communication frequency
plot(net4, edge.width = E(net4)$comm.freq * 5, edge.arrow.size = E(net4)$comm.freq)
And still nothing changes
I followed the code provided in the other posts and I'm just really confused why nothing will work.
Any help is much appreciated!
The simplify() function removed your edge attributes. You need to specify how you want those values to be preserved when simplifying your graph. If you just want to keep the first possible value, you can do
net2 <- simplify(net, remove.multiple = T, remove.loops = T, edge.attr.comb=list("first"))
And then you use delete.vertices but you are passing indexes for edges, not vertices. If you want to drop both vertices that are adjacent to an edge with that given property, it should look more like
net3 <- delete_vertices(net2, V(net2)[.inc(E(net2)[comm.freq==1])])
net4 <- delete_vertices(net3, V(net3)[.inc(E(net3)[comm.freq==0])])
And then for the colors you have values like "x" and "y" for group, but you are indexing into the colrs vector which has no idea what "x" and "y" correspond to. It would be better to use a named vector. For example
colrs <- c(x="gray50", y="tomato")
V(net4)$color <- colrs[V(net4)$group]
How can I find a non-linear path through raster image data? e.g., least cost algorithm? Starting and ending points are known and given as:
Start point = (0,0)
End point = (12,-5)
For example, extract the approximate path of a winding river through a (greyscale) raster image.
# fake up some noisy, but reproducible, "winding river" data
set.seed(123)
df <- data.frame(x=seq(0,12,by=.01),
y=sapply(seq(0,12,by=.01), FUN = function(i) 10*sin(i)+rnorm(1)))
# convert to "pixels" of raster data
# assumption: image color is greyscale, only need one numeric value, v
img <- data.frame(table(round(df$y,0), round(df$x,1)))
names(img) <- c("y","x","v")
img$y <- as.numeric(as.character(img$y))
img$x <- as.numeric(as.character(img$x))
## take a look at the fake "winding river" raster image...
library(ggplot2)
ggplot(img) +
geom_raster(aes(x=x,y=y,fill=v))
As I was writing up my example, I stumbled upon an answer using the 'gdistance' r package... hopefully others will find this useful.
library(gdistance)
library(sp)
library(ggplot2)
# convert to something rasterFromXYZ() understands
spdf <- SpatialPixelsDataFrame(points = img[c("x","y")], data = img["v"])
# use rasterFromXYZ to make a RasterLayer
r <- rasterFromXYZ(spdf)
# make a transition layer, specifying a sensible function and the number of connection directions
tl <- transition(r, function(x) min(x), 8)
## mean(x), min(x), and max(x) produced similar results for me
# extract the shortest path as something we can plot
sPath <- shortestPath(tl, c(0,0), c(12,-5), output = "SpatialLines")
# conversion for ggplot
sldf <- fortify(SpatialLinesDataFrame(sPath, data = data.frame(ID = 1)))
# plot the original raster, truth (white), and the shortest path solution (green)
ggplot(img) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf, aes(x=long,y=lat), color="green")
I wanted to make sure that I wasn't just giving myself too easy of a problem... so I made a noisier version of the image.
img2 <- img
img2$v <- ifelse(img2$v==0, runif(sum(img2$v==0),3,8), img2$v)
spdf2 <- SpatialPixelsDataFrame(points = img2[c("x","y")], data = img2["v"])
r2 <- rasterFromXYZ(spdf2)
# for this noisier image, I needed a different transition function.
# The one from the vignette worked well enough for this example.
tl2 <- transition(r2, function(x) 1/mean(x), 8)
sPath2 <- shortestPath(tl2, c(0,0), c(12,-5), output = "SpatialLines")
sldf2 <- fortify(SpatialLinesDataFrame(sPath2, data = data.frame(ID = 1)))
ggplot(img2) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img2, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf2, aes(x=long,y=lat), color="green")
UPDATE: using real raster data...
I wanted to see if the same workflow would work on an actual real-world raster image and not just fake data, so...
library(jpeg)
# grab some river image...
url <- "https://c8.alamy.com/comp/AMDPJ6/fiji-big-island-winding-river-aerial-AMDPJ6.jpg"
download.file(url, "river.jpg", mode = "wb")
jpg <- readJPEG("./river.jpg")
img3 <- melt(jpg, varnames = c("y","x","rgb"))
img3$rgb <- as.character(factor(img3$rgb, levels = c(1,2,3), labels=c("r","g","b")))
img3 <- dcast(img3, x + y ~ rgb)
# convert rgb to greyscale
img3$v <- img3$r*.21 + img3$g*.72 + img3$b*.07
For rgb to greyscale, see: https://stackoverflow.com/a/27491947/2371031
# define some start/end point coordinates
pts_df <- data.frame(x = c(920, 500),
y = c(880, 50))
# set a reference "grey" value as the mean of the start and end point "v"s
ref_val <- mean(c(subset(img3, x==pts_df[1,1] & y==pts_df[1,2])$v,
subset(img3, x==pts_df[2,1] & y==pts_df[2,2])$v))
spdf3 <- SpatialPixelsDataFrame(points = img3[c("x","y")], data = img3["v"])
r3 <- rasterFromXYZ(spdf3)
# transition layer defines "conductance" between two points
# x is the two point values, "v" = c(v1, v2)
# 0 = no conductance, >>1 = good conductance, so
# make a transition function that encourages only small changes in v compared to the reference value.
tl3 <- transition(r3, function(x) (1/max(abs((x/ref_val)-1))^2)-1, 8)
sPath3 <- shortestPath(tl3, as.numeric(pts_df[1,]), as.numeric(pts_df[2,]), output = "SpatialLines")
sldf3 <- fortify(SpatialLinesDataFrame(sPath3, data = data.frame(ID = 1)))
# plot greyscale with points and path
ggplot(img3) +
geom_raster(aes(x,y, fill=v)) +
scale_fill_continuous(high="white", low="black") +
scale_y_reverse() +
geom_point(data=pts_df, aes(x,y), color="red") +
geom_path(data=sldf3, aes(x=long,y=lat), color="green")
I played around with different transition functions before finding one that worked. This one is probably more complex than it needs to be, but it works. You can increase the power term (from 2 to 3,4,5,6...) and it continues to work. It did not find a correct solution with the power term removed.
Alternative solution using igraph package.
Found an alternative set of answers using 'igraph' r package. I think it is important to note that one of the big differences here is that 'igraph' supports n-dimensional graphs whereas 'gdistance' only supports 2D graphs. So, for example, extending this answer into 3D is relatively easy.
library(igraph)
# make a 2D lattice graph, with same dimensions as "img"
l <- make_lattice(dimvector = c(length(unique(img$y)),
length(unique(img$x))), directed=F, circular=F)
summary(l)
# > IGRAPH ba0963d U--- 3267 6386 -- Lattice graph
# > + attr: name (g/c), dimvector (g/n), nei (g/n), mutual (g/l), circular (g/l)
# set vertex attributes
V(l)$x = img$x
V(l)$y = img$y
V(l)$v = img$v
# "color" is a known attribute that will be used by plot.igraph()
V(l)$color = grey.colors(length(unique(img$v)))[img$v+1]
# compute edge weights as a function of attributes of the two connected vertices
el <- get.edgelist(l)
# "weight" is a known edge attribute, and is used in shortest_path()
# I was confused about weights... lower weights are better, Inf weights will be avoided.
# also note from help: "if all weights are positive, then Dijkstra's algorithm is used."
E(l)$weight <- 1/(pmax(V(l)[el[, 1]]$v, V(l)[el[, 2]]$v))
E(l)$color = grey.colors(length(unique(E(l)$weight)))[E(l)$weight+1]
Edge weights calculation courtesy of: https://stackoverflow.com/a/27446127/2371031 (thanks!)
# find the start/end vertices
start = V(l)[V(l)$x == 0 & V(l)$y == 0]
end = V(l)[V(l)$x == 12 & V(l)$y == -5]
# get the shortest path, returning "both" (vertices and edges)...
result <- shortest_paths(graph = l, from = start, to = end, output = "both")
# color the edges that were part of the shortest path green
V(l)$color = ifelse(V(l) %in% result$vpath[[1]], "green", V(l)$color)
E(l)$color = ifelse(E(l) %in% result$epath[[1]], "green", E(l)$color)
# color the start and end vertices red
V(l)$color = ifelse(V(l) %in% c(start,end), "red", V(l)$color)
plot(l, vertex.shape = "square", vertex.size=2, vertex.frame.color=NA, vertex.label=NA, curved=F)
Second (noisier) example requires a different formula to compute edge weights.
img2 <- img
img2$v <- ifelse(img2$v==0, runif(sum(img2$v==0),3,8), img2$v)
l <- make_lattice(dimvector = c(length(unique(img2$y)),
length(unique(img2$x))), directed=F, circular=F)
# set vertex attributes
V(l)$x = img2$x
V(l)$y = img2$y
V(l)$v = img2$v
V(l)$color = grey.colors(length(unique(img2$v)))[factor(img2$v)]
# compute edge weights
el <- get.edgelist(l)
# proper edge weight calculation is the key to a good solution...
E(l)$weight <- (pmin(V(l)[el[, 1]]$v, V(l)[el[, 2]]$v))
E(l)$color = grey.colors(length(unique(E(l)$weight)))[factor(E(l)$weight)]
start = V(l)[V(l)$x == 0 & V(l)$y == 0]
end = V(l)[V(l)$x == 12 & V(l)$y == -5]
# get the shortest path, returning "both" (vertices and edges)...
result <- shortest_paths(graph = l, from = start, to = end, output = "both")
# color the edges that were part of the shortest path green
V(l)$color = ifelse(V(l) %in% result$vpath[[1]], "green", V(l)$color)
E(l)$color = ifelse(E(l) %in% result$epath[[1]], "green", E(l)$color)
# color the start and end vertices red
V(l)$color = ifelse(V(l) %in% c(start,end), "red", V(l)$color)
plot(l, vertex.shape = "square", vertex.size=2, vertex.frame.color=NA, vertex.label=NA, curved=F)
Problem
I have two separate networks with no overlapping nodes or edges, they both have the same attributes. I want to combine these two networks into a single network which would then be made up of two distinct components.
However when I try to merge them using the union command the attributes are renamed from "attribute" to "attribute_1" and "attribute_2". That this will happen is stated in the command help file, but I cannot find an obvious way to merge these two networks.
The situation is shown in the below code block
library(igraph)
#create a 4 node network of two components
adjmat <- rep(0, 16)
adjmat[c(2,5,12,15)] <- 1
g <- graph.adjacency(matrix(adjmat, nrow = 4) , mode = "undirected")
#give attributes naming the nodes and the edges
g <- set_vertex_attr(g, "name", value = paste0("Node_", 1:4))
g <- set_edge_attr(g, "name", value = paste0("Edge_",1:2))
#I am interested in the type attribute
g <- set_edge_attr(g, "type", value = c("foo", "bar"))
plot(g)
#Decompose into seperate networks
gList <- decompose(g)
g2 <-union(gList[[1]], gList[[2]])
#vertices are fine but edges have been renamed as stated in the helpfile for union.
get.edge.attribute(g2)
get.vertex.attribute(g2)
Work around
Currently the two separate networks originate from the same original network so I have been able to make a hack however this isn't always the case and I would like a more igraph way of merging the two.
The hack is below
#To solve this problem I do the following
#Create two dataframes from the edge characteristics of the network and combine into a single dataframe
P <- rbind(as_data_frame(gList[[1]]),
as_data_frame(gList[[2]]))
g3 <- set.edge.attribute(g, "type", value = P$type[match(P$name, get.edge.attribute(g, "name"))])
#Edges are now correct
get.edge.attribute(g3)matrix(adjmat, nrow = 4)
get.vertex.attribute(g3)
Is there a function in igraph that would merge the two seperate networks into a single network whilst maintaining the attributes as is?
I have made the below version of union, which accepts two graphs with an arbitrary number of overlapping attributes and merges them into a single graph where the attributes do not have the "_x" suffix. The graphs can be entirely independent or have overlapping nodes.
In the case of overlapping nodes the attributes of graph 1 take precedence
library(dplyr)
library(igraph)
union2<-function(g1, g2){
#Internal function that cleans the names of a given attribute
CleanNames <- function(g, target){
#get target names
gNames <- parse(text = (paste0(target,"_attr_names(g)"))) %>% eval
#find names that have a "_1" or "_2" at the end
AttrNeedsCleaning <- grepl("(_\\d)$", gNames )
#remove the _x ending
StemName <- gsub("(_\\d)$", "", gNames)
NewnNames <- unique(StemName[AttrNeedsCleaning])
#replace attribute name for all attributes
for( i in NewnNames){
attr1 <- parse(text = (paste0(target,"_attr(g,'", paste0(i, "_1"),"')"))) %>% eval
attr2 <- parse(text = (paste0(target,"_attr(g,'", paste0(i, "_2"),"')"))) %>% eval
g <- parse(text = (paste0("set_",target,"_attr(g, i, value = ifelse(is.na(attr1), attr2, attr1))"))) %>%
eval
g <- parse(text = (paste0("delete_",target,"_attr(g,'", paste0(i, "_1"),"')"))) %>% eval
g <- parse(text = (paste0("delete_",target,"_attr(g,'", paste0(i, "_2"),"')"))) %>% eval
}
return(g)
}
g <- igraph::union(g1, g2)
#loop through each attribute type in the graph and clean
for(i in c("graph", "edge", "vertex")){
g <- CleanNames(g, i)
}
return(g)
}
Using the previous example
g4 <-union2(gList[[1]], gList[[2]])
#As we would like
get.edge.attribute(g4)
get.vertex.attribute(g4)
I have built a phylogenetic tree for a protein family that can be split into different groups, classifying each one by its type of receptor or type of response. The nodes in the tree are labeled as the type of receptor.
In the phylogenetic tree I can see that proteins that belong to the same groups or type of receptor have clustered together in the same branches. So I would like to collapse these branches that have labels in common, grouping them by a given list of keywords.
The command would be something like this:
./collapse_tree_by_label -f phylogenetic_tree.newick -l list_of_labels_to_collapse.txt -o collapsed_tree.eps(or pdf)
My list_of_labels_to_collapse.txt would be like this:
A
B
C
D
My newick tree would be like this:
(A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2)
The output image without collapsing is like this:
http://i.stack.imgur.com/pHkoQ.png
The output image collapsing should be like this (collapsed_tree.eps):
http://i.stack.imgur.com/TLXd0.png
The width of the triangles should represent the branch length, and the high of the triangles must represent the number of nodes in the branch.
I have been playing with the "ape" package in R. I was able to plot a phylogenetic tree, but I still can't figure out how to collapse the branches by keywords in the labels:
require("ape")
This will load the tree:
cat("((A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
Here should be the code to collapse
This will plot the tree:
plot(tree.test)
Your tree as it is stored in R already has the tips stored as polytomies. It's just a matter of plotting the tree with triangles representing the polytomies.
There is no function in ape to do this, that I am aware of, but if you mess with the plotting function a little bit you can pull it off
# Step 1: make edges for descendent nodes invisible in plot:
groups <- c("A", "B", "C", "D")
group_edges <- numeric(0)
for(group in groups){
group_edges <- c(group_edges,getMRCA(tree.test,tree.test$tip.label[grepl(group, tree.test$tip.label)]))
}
edge.width <- rep(1, nrow(tree.test$edge))
edge.width[tree.test$edge[,1] %in% group_edges ] <- 0
# Step 2: plot the tree with the hidden edges
plot(tree.test, show.tip.label = F, edge.width = edge.width)
# Step 3: add triangles
add_polytomy_triangle <- function(phy, group){
root <- length(phy$tip.label)+1
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_nodes <- which(phy$tip.label %in% group_node_labels)
group_mrca <- getMRCA(phy,group_nodes)
tip_coord1 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[1])
tip_coord2 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[length(group_nodes)])
node_coord <- c(dist.nodes(phy)[root, group_mrca], mean(c(tip_coord1[2], tip_coord2[2])))
xcoords <- c(tip_coord1[1], tip_coord2[1], node_coord[1])
ycoords <- c(tip_coord1[2], tip_coord2[2], node_coord[2])
polygon(xcoords, ycoords)
}
Then you just have to loop through the groups to add the triangles
for(group in groups){
add_polytomy_triangle(tree.test, group)
}
I've also been searching for this kind of tool for ages, not so much for collapsing categorical groups, but for collapsing internal nodes based on a numerical support value.
The di2multi function in the ape package can collapse nodes to polytomies, but it currently can only does this by branch length threshold.
Here is a rough adaptation that allows collapsing by a node support value threshold instead (default threshold = 0.5).
Use at your own risk, but it works for me on my rooted Bayesian tree.
di2multi4node <- function (phy, tol = 0.5)
# Adapted di2multi function from the ape package to plot polytomies
# based on numeric node support values
# (di2multi does this based on edge lengths)
# Needs adjustment for unrooted trees as currently skips the first edge
{
if (is.null(phy$edge.length))
stop("the tree has no branch length")
if (is.na(as.numeric(phy$node.label[2])))
stop("node labels can't be converted to numeric values")
if (is.null(phy$node.label))
stop("the tree has no node labels")
ind <- which(phy$edge[, 2] > length(phy$tip.label))[as.numeric(phy$node.label[2:length(phy$node.label)]) < tol]
n <- length(ind)
if (!n)
return(phy)
foo <- function(ancestor, des2del) {
wh <- which(phy$edge[, 1] == des2del)
for (k in wh) {
if (phy$edge[k, 2] %in% node2del)
foo(ancestor, phy$edge[k, 2])
else phy$edge[k, 1] <<- ancestor
}
}
node2del <- phy$edge[ind, 2]
anc <- phy$edge[ind, 1]
for (i in 1:n) {
if (anc[i] %in% node2del)
next
foo(anc[i], node2del[i])
}
phy$edge <- phy$edge[-ind, ]
phy$edge.length <- phy$edge.length[-ind]
phy$Nnode <- phy$Nnode - n
sel <- phy$edge > min(node2del)
for (i in which(sel)) phy$edge[i] <- phy$edge[i] - sum(node2del <
phy$edge[i])
if (!is.null(phy$node.label))
phy$node.label <- phy$node.label[-(node2del - length(phy$tip.label))]
phy
}
This is my answer based on phytools::phylo.toBackbone function,
see http://blog.phytools.org/2013/09/even-more-on-plotting-subtrees-as.html, and http://blog.phytools.org/2013/10/finding-edge-lengths-of-all-terminal.html. First, load the function at the end of code.
library(ape)
library(phytools) #phylo.toBackbone
library(phangorn)
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);"
, file = "ex.tre", sep = "\n")
phy <- read.tree("ex.tre")
groups <- c("A", "B|C", "D")
backboneoftree<-makebackbone(groups,phy)
# tip.label clade.label N depth
# 1 A_1 A 10 0.2481818
# 2 B_1 B|C 6 0.9400000
# 3 D_1 D 5 0.4600000
{
tryCatch(dev.off(),error=function(e){""})
par(fig=c(0,0.5,0,1), mar = c(0, 0, 2, 0))
plot(phy, main="Original" )
par(fig=c(0.5,1,0,1), oma = c(0, 0, 1.2, 0), xpd=NA, new=T)
plot(backboneoftree)
title(main="Clades")
}
makebackbone <- function(groupings,phy){
listofspecies <- phy$tip.label
listtopreserve <- character()
newedgelengths <- meandistnode<- lengthofclades<- numeric()
for (i in 1:length(groupings)){
bestmrca<-getMRCA(phy,grep(groupings[i], phy$tip.label) )
mrcatips<-phy$tip.label[unlist(phangorn::Descendants(phy,bestmrca, type="tips") )]
listtopreserve[i] <- mrcatips[1]
meandistnode[i] <- mean(dist.nodes(phy)[unlist(lapply(mrcatips,
function(x) grep(x, phy$tip.label) ) ),bestmrca] )
lengthofclades[i] <- length(mrcatips)
provtree <- drop.tip(phy,mrcatips, trim.internal=F, subtree = T)
n3 <- length(provtree$tip.label)
newedgelengths[i] <- setNames(provtree$edge.length[sapply(1:n3,function(x,y)
which(y==x),
y=provtree$edge[,2])],
provtree$tip.label)[provtree$tip.label[grep("tips",provtree$tip.label)] ]
}
newtree <- drop.tip(phy,setdiff(listofspecies,listtopreserve),
trim.internal = T)
n <- length(newtree$tip.label)
newtree$edge.length[sapply(1:n,function(x,y)
which(y==x),
y=newtree$edge[,2])] <- newedgelengths + meandistnode
trans <- data.frame(tip.label=newtree$tip.label,clade.label=groupings,
N=lengthofclades, depth=meandistnode )
rownames(trans) <- NULL
print(trans)
backboneoftree <- phytools::phylo.toBackbone(newtree,trans)
return(backboneoftree)
}
EDIT: I haven't tried this, but it might be another answer: "Script and function to transform the tip branches of a tree , i.e the thickness or to triangles, with the width of both correlating with certain parameters (e.g., species number of the clade) (tip.branches.R)"
https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/index.html
I think the script is finally doing what I wanted.
From the answer that #CactusWoman provided, I changed the code a little bit so the script will try to find the MRCA that represents the largest branch that matches to my search pattern. This solved the problem of not merging non-polytomic branches, or collapsing the whole tree because one matching node was mistakenly outside the correct branch.
In addition, I included a parameter that represents the limit for the pattern abundance ratio in a given branch, so we can select and collapse/group branches that have at least 90% of its tips matching to the search pattern, for example.
library(geiger)
library(phylobase)
library(ape)
#functions
find_best_mrca <- function(phy, group, threshold){
group_matches <- phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)]
group_mrca <- getMRCA(phy,phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)])
group_leaves <- tips(phy, group_mrca)
match_ratio <- length(group_matches)/length(group_leaves)
if( match_ratio < threshold){
#start searching for children nodes that have more than 95% of descendants matching to the search pattern
mrca_children <- descendants(as(phy,"phylo4"), group_mrca, type="all")
i <- 1
new_ratios <- NULL
nleaves <- NULL
names(mrca_children) <- NULL
for(new_mrca in mrca_children){
child_leaves <- tips(tree.test, new_mrca)
child_matches <- grep(group, child_leaves, ignore.case=TRUE)
new_ratios[i] <- length(child_matches)/length(child_leaves)
nleaves[i] <- length(tips(phy, new_mrca))
i <- i+1
}
match_result <- data.frame(mrca_children, new_ratios, nleaves)
match_result_sorted <- match_result[order(-match_result$nleaves,match_result$new_ratios),]
found <- numeric(0);
print(match_result_sorted)
for(line in 1:nrow(match_result_sorted)){
if(match_result_sorted$ new_ratios[line]>=threshold){
return(match_result_sorted$mrca_children[line])
found <- 1
}
}
if(found==0){return(found)}
}else{return(group_mrca)}
}
add_triangle <- function(phy, group,phylo_plot){
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_mrca <- getMRCA(phy,group_node_labels)
group_nodes <- descendants(as(tree.test,"phylo4"), group_mrca, type="tips")
names(group_nodes) <- NULL
x<-phylo_plot$xx
y<-phylo_plot$yy
x1 <- max(x[group_nodes])
x2 <-max(x[group_nodes])
x3 <- x[group_mrca]
y1 <- min(y[group_nodes])
y2 <- max(y[group_nodes])
y3 <- y[group_mrca]
xcoords <- c(x1,x2,x3)
ycoords <- c(y1,y2,y3)
polygon(xcoords, ycoords)
return(c(x2,y3))
}
#main
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
# Step 1: Find the best MRCA that matches to the keywords or search patten
groups <- c("A", "B|C", "D")
group_labels <- groups
group_edges <- numeric(0)
edge.width <- rep(1, nrow(tree.test$edge))
count <- 1
for(group in groups){
best_mrca <- find_best_mrca(tree.test, group, 0.90)
group_leaves <- tips(tree.test, best_mrca)
groups[count] <- paste(group_leaves, collapse="|")
group_edges <- c(group_edges,best_mrca)
#Step2: Remove the edges of the branches that will be collapsed, so they become invisible
edge.width[tree.test$edge[,1] %in% c(group_edges[count],descendants(as(tree.test,"phylo4"), group_edges[count], type="all")) ] <- 0
count = count +1
}
#Step 3: plot the tree hiding the branches that will be collapsed/grouped
last_plot.phylo <- plot(tree.test, show.tip.label = F, edge.width = edge.width)
#And save a copy of the plot so we can extract the xy coordinates of the nodes
#To get the x & y coordinates of a plotted tree created using plot.phylo
#or plotTree, we can steal from inside tiplabels:
last_phylo_plot<-get("last_plot.phylo",envir=.PlotPhyloEnv)
#Step 4: Add triangles and labels to the collapsed nodes
for(i in 1:length(groups)){
text_coords <- add_triangle(tree.test, groups[i],last_phylo_plot)
text(text_coords[1],text_coords[2],labels=group_labels[i], pos=4)
}
This doesn't address depicting the clades as triangles, but it does help with collapsing low-support nodes. The library ggtree has a function as.polytomy which can be used to collapse nodes based on support values.
For example, to collapse bootstraps less than 50%, you'd use:
polytree = as.polytomy(raxtree, feature='node.label', fun=function(x) as.numeric(x) < 50)
I want to make a simple phylogenetic tree for a marine biology course as an educative example. I have a list of species with taxonomic rank:
Group <- c("Benthos","Benthos","Benthos","Benthos","Benthos","Benthos","Zooplankton","Zooplankton","Zooplankton","Zooplankton",
"Zooplankton","Zooplankton","Fish","Fish","Fish","Fish","Fish","Fish","Phytoplankton","Phytoplankton","Phytoplankton","Phytoplankton")
Domain <- rep("Eukaryota", length(Group))
Kingdom <- c(rep("Animalia", 18), rep("Chromalveolata", 4))
Phylum <- c("Annelida","Annelida","Arthropoda","Arthropoda","Porifera","Sipunculida","Arthropoda","Arthropoda","Arthropoda",
"Arthropoda","Echinoidermata","Chorfata","Chordata","Chordata","Chordata","Chordata","Chordata","Chordata","Heterokontophyta",
"Heterokontophyta","Heterokontophyta","Dinoflagellata")
Class <- c("Polychaeta","Polychaeta","Malacostraca","Malacostraca","Demospongiae","NA","Malacostraca","Malacostraca",
"Malacostraca","Maxillopoda","Ophiuroidea","Actinopterygii","Chondrichthyes","Chondrichthyes","Chondrichthyes","Actinopterygii",
"Actinopterygii","Actinopterygii","Bacillariophyceae","Bacillariophyceae","Prymnesiophyceae","NA")
Order <- c("NA","NA","Amphipoda","Cumacea","NA","NA","Amphipoda","Decapoda","Euphausiacea","Calanioda","NA","Gadiformes",
"NA","NA","NA","NA","Gadiformes","Gadiformes","NA","NA","NA","NA")
Species <- c("Nephtys sp.","Nereis sp.","Gammarus sp.","Diastylis sp.","Axinella sp.","Ph. Sipunculida","Themisto abyssorum","Decapod larvae (Zoea)",
"Thysanoessa sp.","Centropages typicus","Ophiuroidea larvae","Gadus morhua eggs / larvae","Etmopterus spinax","Amblyraja radiata",
"Chimaera monstrosa","Clupea harengus","Melanogrammus aeglefinus","Gadus morhua","Thalassiosira sp.","Cylindrotheca closterium",
"Phaeocystis pouchetii","Ph. Dinoflagellata")
dat <- data.frame(Group, Domain, Kingdom, Phylum, Class, Order, Species)
dat
I would like to get a dendrogram (cluster analysis) and use Domain as the first cutting point, Kindom as the second, Phylum as the third, etc. Missing values should be ignored (no cutting point, a straight line instead). Group should be used as a coloring category for the labels.
I am a bit uncertain how to make a distance matrix from this data frame. There are a lot of phylogenetic tree packages for R, they seem to want newick data / DNA / other advanced information. Thus help with this would be appreciated.
It's probably a bit lame to answer my own question, but I found an easier solution. Maybe it helps someone one day.
library(ape)
taxa <- as.phylo(~Kingdom/Phylum/Class/Order/Species, data = dat)
col.grp <- merge(data.frame(Species = taxa$tip.label), dat[c("Species", "Group")], by = "Species", sort = F)
cols <- ifelse(col.grp$Group == "Benthos", "burlywood4", ifelse(col.grp$Group == "Zooplankton", "blueviolet", ifelse(col.grp$Group == "Fish", "dodgerblue", ifelse(col.grp$Group == "Phytoplankton", "darkolivegreen2", ""))))
plot(taxa, type = "cladogram", tip.col = cols)
Note that all columns have to be factors. This demonstrates the work flow with R. It takes a week to find out something, although the code itself is just a couple of rows =)
If you wanted to draw the tree by hand
(this is probably not the best way to do it),
you could start as follows
(it is not a complete answer:
the colours are missing,
and the edges are too long).
This assumes that the data has already been sorted.
# Data: remove Group
dat <- data.frame(Domain, Kingdom, Phylum, Class, Order, Species)
# Start a new plot
par(mar=c(0,0,0,0))
plot(NA, xlim=c(0,ncol(dat)+1), ylim=c(0,nrow(dat)+1),
type="n", axes=FALSE, xlab="", ylab="", main="")
# Compute the position of each node and find all the edges to draw
positions <- NULL
links <- NULL
for(k in 1:ncol(dat)) {
y <- tapply(1:nrow(dat), dat[,k], mean)
y <- y[ names(y) != "NA" ]
positions <- rbind( positions, data.frame(
name = names(y),
x = k,
y = y
))
}
links <- apply( dat, 1, function(u) {
u <- u[ !is.na(u) & u != "NA" ]
cbind(u[-length(u)],u[-1])
} )
links <- do.call(rbind, links)
rownames(links) <- NULL
links <- unique(links[ order(links[,1], links[,2]), ])
# Draw the edges
for(i in 1:nrow(links)) {
from <- positions[links[i,1],]
to <- positions[links[i,2],]
lines( c(from$x, from$x, to$x), c(from$y, to$y, to$y) )
}
# Add the text
text(positions$x, positions$y, label=positions$name)