I want to modify the properties of the leaves in a dendrogram produced from plot of an hclust object. Minimally, I want to change the colors, but any help you can provide will be appreciated.
I did try to google the answer, but but every solution that I saw seemed alot harder than what I would have guessed.
A while ago, Joris Meys kindly provided me with this snippet of code that changes the color of leaves. Modify it to reflect your attributes.
clusDendro <- as.dendrogram(Clustering)
labelColors <- c("red", "blue", "darkgreen", "darkgrey", "purple")
## function to get colorlabels
colLab <- function(n) {
if(is.leaf(n)) {
a <- attributes(n)
# clusMember - a vector designating leaf grouping
# labelColors - a vector of colors for the above grouping
labCol <- labelColors[clusMember[which(names(clusMember) == a$label)]]
attr(n, "nodePar") <- c(a$nodePar, lab.col = labCol)
}
n
}
## Graph
clusDendro <- dendrapply(clusDendro, colLab)
op <- par(mar = par("mar") + c(0,0,0,2))
plot(clusDendro,
main = "Major title",
horiz = T, type = "triangle", center = T)
par(op)
Here is a solution for this question using a new package called "dendextend", built exactly for this sort of thing.
You can see many examples in the presentations and vignettes of the package, in the "usage" section in the following URL: https://github.com/talgalili/dendextend
Here is the solution for this question:
# define dendrogram object to play with:
dend <- as.dendrogram(hclust(dist(USArrests[1:3,]), "ave"))
# loading the package
install.packages('dendextend') # it is now on CRAN
library(dendextend)# let's add some color:
labels_colors(dend) <- 2:4
labels_colors(dend)
plot(dend)
It is not clear what you want to use it for, but I often need to identify a branch in a dendrogram. I've hacked the rect.hclust method to add a density and label input.
You would call it like this:
k <- 3 # number of branches to identify
labels.to.identify <- c('1','2','3')
required.density <- 10 # the density of shading lines, in lines per inch
rect.hclust.nice(tree, k, labels=labels.to.identify, density=density.required)
Here is the function
rect.hclust.nice = function (tree, k = NULL, which = NULL, x = NULL, h = NULL, border = 2,
cluster = NULL, density = NULL,labels = NULL, ...)
{
if (length(h) > 1 | length(k) > 1)
stop("'k' and 'h' must be a scalar")
if (!is.null(h)) {
if (!is.null(k))
stop("specify exactly one of 'k' and 'h'")
k <- min(which(rev(tree$height) < h))
k <- max(k, 2)
}
else if (is.null(k))
stop("specify exactly one of 'k' and 'h'")
if (k < 2 | k > length(tree$height))
stop(gettextf("k must be between 2 and %d", length(tree$height)),
domain = NA)
if (is.null(cluster))
cluster <- cutree(tree, k = k)
clustab <- table(cluster)[unique(cluster[tree$order])]
m <- c(0, cumsum(clustab))
if (!is.null(x)) {
if (!is.null(which))
stop("specify exactly one of 'which' and 'x'")
which <- x
for (n in 1L:length(x)) which[n] <- max(which(m < x[n]))
}
else if (is.null(which))
which <- 1L:k
if (any(which > k))
stop(gettextf("all elements of 'which' must be between 1 and %d",
k), domain = NA)
border <- rep(border, length.out = length(which))
labels <- rep(labels, length.out = length(which))
retval <- list()
for (n in 1L:length(which)) {
rect(m[which[n]] + 0.66, par("usr")[3L], m[which[n] +
1] + 0.33, mean(rev(tree$height)[(k - 1):k]), border = border[n], col = border[n], density = density, ...)
text((m[which[n]] + m[which[n] + 1]+1)/2, grconvertY(grconvertY(par("usr")[3L],"user","ndc")+0.02,"ndc","user"),labels[n])
retval[[n]] <- which(cluster == as.integer(names(clustab)[which[n]]))
}
invisible(retval)
}
Related
I have a time-calibrated phylogenetic tree from BEAST and I would like to make a figure in which its nodes are rotated to match an arbitrary ordering. The following code works perfectly to plot the tree with the nodes in the order they are in the input file.
library("phytools")
library("phyloch")
library("strap")
library("coda")
t <- read.beast("mcctree.tre") # I couldn't upload the file here
t$root.time <- t$height[1]
num_taxa <- length(t$tip.label)
display_all_node_bars <- TRUE
names_list <-vector()
for (name in t$tip){
v <- strsplit(name, "_")[[1]]
if(display_all_node_bars){
names_list = c(names_list, name)
}
else if(v[length(v)]=="0"){
names_list = c(names_list, name)
}
}
nids <- vector()
pos <- 1
len_nl <- length(names_list)
for(n in names_list){
for(nn in names_list[pos:len_nl]){
if(n != nn){
m <- getMRCA(t,c(n, nn))
if(m %in% nids == FALSE){
nids <- c(nids, m)
}
}
}
pos <- pos+1
}
pdf("tree.pdf", width = 20, height = 20)
geoscalePhylo(tree = t,
x.lim = c(-2,21),
units = c("Epoch"),
tick.scale = "myr",
boxes = FALSE,
width = 1,
cex.tip = 2,
cex.age = 3,
cex.ts = 2,
erotate = 0,
label.offset = 0.1)
lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
for(nv in nids){
bar_xx_a <- c(lastPP$xx[nv]+t$height[nv-num_taxa]-t$"height_95%_HPD_MIN"[nv-num_taxa],
lastPP$xx[nv]-(t$"height_95%_HPD_MAX"[nv-num_taxa]-t$height[nv-num_taxa]))
lines(bar_xx_a, c(lastPP$yy[nv], lastPP$yy[nv]), col = rgb(0, 0, 1, alpha = 0.3), lwd = 12)
}
t$node.label <- t$posterior
p <- character(length(t$node.label))
p[t$node.label >= 0.95] <- "black"
p[t$node.label < 0.95 & t$node.label >= 0.75] <- "gray"
p[t$node.label < 0.75] <- "white"
nodelabels(pch = 21, cex = 1.5, bg = p)
dev.off()
The following code is my attempt to rotate the nodes in the way I want (following this tutorial: http://blog.phytools.org/2015/04/finding-closest-set-of-node-rotations.html). And it works for rotating the nodes. However, the blue bars indicating the confidence intervals of the divergence time estimates get out of their correct place - this is what I would like help to correct. This will be used in much larger files with hundreds of branches - the example here is simplified.
new.order <- c("Sp8","Sp9","Sp10","Sp7","Sp6","Sp5","Sp4","Sp2","Sp3","Ou1","Ou2","Sp1")
t2 <- setNames(1:Ntip(t), new.order)
new.order.tree <- minRotate(t, t2)
new.order.tree$root.time <- t$root.time
new.order.tree$height <- t$height
new.order.tree$"height_95%_HPD_MIN" <- t$"height_95%_HPD_MIN"
new.order.tree$"height_95%_HPD_MAX" <- t$"height_95%_HPD_MAX"
pdf("reordered_tree.pdf", width = 20, height = 20)
geoscalePhylo(tree = new.order.tree,
x.lim = c(-2,21),
units = c("Epoch"),
tick.scale = "myr",
boxes = FALSE,
width = 1,
cex.tip = 2,
cex.age = 3,
cex.ts = 2,
erotate = 0,
label.offset = 0.1)
lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
for(nv in nids){
bar_xx_a <- c(lastPP$xx[nv]+new.order.tree$height[nv-num_taxa]-new.order.tree$"height_95%_HPD_MIN"[nv-num_taxa],
lastPP$xx[nv]-(new.order.tree$"height_95%_HPD_MAX"[nv-num_taxa]-new.order.tree$height[nv-num_taxa]))
lines(bar_xx_a, c(lastPP$yy[nv], lastPP$yy[nv]), col = rgb(0, 0, 1, alpha = 0.3), lwd = 12)
}
new.order.tree$node.label <- t$posterior
p <- character(length(new.order.tree$node.label))
p[new.order.tree$node.label >= 0.95] <- "black"
p[new.order.tree$node.label < 0.95 & new.order.tree$node.label >= 0.75] <- "gray"
p[new.order.tree$node.label < 0.75] <- "white"
nodelabels(pch = 21, cex = 1.5, bg = p)
dev.off()
I've found several similar questions here and in other forums, but none dealing specifically with time-calibrated trees - which is the core of the problem described above.
The short answer is that phyTools::minRotate() doesn't recognize the confidence intervals as associated with nodes. If you contact the phyTools maintainers, they may well be able to add this functionality quite easily.
Meanwhile, you can correct this yourself.
I don't know how read.beast() saves confidence intervals – let's say they're saved in t$conf.int. (Type unclass(t) at the R command line to see the full structure; you should be able to identify the appropriate property.)
If the tree's node labels are unique, then you can infer the new sequence of nodes using match():
library("phytools")
new.order <- c("Sp8","Sp9","Sp10","Sp7","Sp6","Sp5","Sp4","Sp2","Sp3","Ou1","Ou2","Sp1")
# Set up a fake initial tree -- you would load the tree from a file
tree <- rtree(length(new.order))
tree$tip.label <- sort(new.order)
tree$node.label <- seq_len(tree$Nnode)
tree$conf.int <- seq_len(tree$Nnode) * 10
# Plot tree
par(mfrow = c(1, 2), mar = rep(0, 4), cex = 0.9) # Create space
plot(tree, show.node.label = TRUE)
nodelabels(tree$conf.int, adj = 1) # Annotate "correct" intervals
# Re-order nodes with minRotate
noTree <- minRotate(tree, setNames(seq_along(new.order), new.order))
plot(noTree, show.node.label = TRUE)
# Move confidence intervals to correct node
tree$conf.int <- tree$conf.int[match(noTree$node.label, tree$node.label)]
nodelabels(tree$conf.int, adj = 1)
If you can't guarantee that the node labels are unique, you can always overwrite them in a temporary object:
# Find node order
treeCopy <- tree
treeCopy$node.label <- seq_len(tree$Nnode)
nodeOrder <- match(minRotate(treeCopy)$node.label, treeCopy$node.label)
# Apply node order
tree$conf.int <- tree$conf.int[nodeOrder]
I am trying to add a padding of 20 for the maximum of 5 curves I am plotting.
library(ggplot2)
... some code to compute 5 (bins) distributions y3(nSample,bin)
color_curve <- c("red", "green", "grey", "black")
max_y <- as.double(which.max(density(y3))+20)
# Save PDF - Chisquare
pdf(file="Chisquare_Distribution.pdf")
for (i in 1:nRed) {
if (i == 1)
plot(density(y3[,i]), col="blue", xlim=c(min(y3),max(y3)), ylim=c(0,max_y), main="z= 0.9595, 1.087, 1.2395, 1.45, 1.688")
else
lines(density(y3[,i]), col=color_curve[i-1])
}
dev.off()
But I get the following error at execution :
Error in which.max(density(y3)) :
'list' object cannot be coerced to type 'double'
Execution halted
I would like to add this padding of 20 to the maximum of all 5 distributions but it fails with this error.
How to fix this ?
Update 1
Thanks for the suggested answer. Unfortunately, now, I get the following figure :
As you can see, the maximum in y_limit is too high for the 5 distributions, I don't know where it could come from.
Update 2
With the new command, I have the following figure :
I get an under-estimated value for searching the max among the 5 distributions.
Edit
I provide the entire code to generate the plots with the input file:
my_data <- read.delim("Array_total_WITH_Shot_Noise.txt", header = FALSE, sep = " ")
array_2D <- array(my_data)
z_ph <- c(0.9595, 1.087, 1.2395, 1.45, 1.688)
b_sp <- c(1.42904922, 1.52601862, 1.63866958, 1.78259615, 1.91956918)
b_ph <- c(sqrt(1+z_ph))
ratio_squared <- (b_sp/b_ph)^2
nRed <- 5
nRow <- NROW(my_data)
nSample_var <- 1000000
nSample_mc <- 1000
Cl<-my_data[,2:length(my_data)]#suppose cl=var(alm)
Cl_sp <- array(0, dim=c(nRow,nRed))
Cl_ph <- array(0, dim=c(nRow,nRed))
length(Cl)
for (i in 1:length(Cl)) {
#(shape/rate) convention :
Cl_sp[,i] <-(Cl[, i] * ratio_squared[i])
Cl_ph[,i] <- (Cl[, i])
}
L <- array_2D[,1]
L <- 2*(array_2D[,1])+1
# Weighted sum of Chi squared distribution
y3_1<-array(0,dim=c(nSample_var,nRed));y3_2<-array(0,dim=c(nSample_var,nRed));y3<-array(0,dim=c(nSample_var,nRed));
for (i in 1:nRed) {
for (j in 1:nRow) {
# Try to summing all the random variable
y3_1[,i] <- y3_1[,i] + Cl_sp[j,i] * rchisq(nSample_var,df=L[j])
y3_2[,i] <- y3_2[,i] + Cl_ph[j,i] * rchisq(nSample_var,df=L[j])
}
y3[,i] <- y3_1[,i]/y3_2[,i]
}
print(paste0('n=',nSample_mc*nSample_var))
for (i in 1:nRed) {
# compute the standard deviation of the ratio by Monte-Carlo
print(paste0('red=',i,',mean_fid = ', ratio_squared[i],',mean_exp = ', mean(y3[,i])))
print(paste0('numerator : var = ', var(y3_1[,i]), ', sigma = ', sd(y3_1[,i])))
print(paste0('denominator : var = ', var(y3_2[,i]), ', sigma = ', sd(y3_2[,i])))
print(paste0('var = ', var(y3[,i]), ', sigma = ', sd(y3[,i])))
}
print('#############################################################')
# par(mfrow=c(2,nRed))
color_curve <- c("red", "green", "grey", "black")
# Save PDF - Chisquare
pdf(file="Chisquare_Distribution.pdf")
for (i in 1:nRed) {
if (i == 1)
plot(density(y3[,i]), col="blue", xlim=c(min(y3),max(y3)), main="z= 0.9595, 1.087, 1.2395, 1.45, 1.688")
else
lines(density(y3[,i]), col=color_curve[i-1])
}
dev.off()
Hoping this will help you to do a version ggplot2 of the classical R plots.
No code to generate your sample data was provided, so I used the code #akrun had used previously: y3 <- matrix(rnorm(5000), ncol = 5)
library(tidyverse)
as.data.frame(y3) %>%
mutate(row = row_number()) %>% # add row to simplify next step
pivot_longer(-row) %>% # reshape long
ggplot(aes(value, color = name)) + # map x to value, color to name
geom_density()
I wonder how you can simplify these two :
plot (payroll,wins)
id = identify(payroll, wins,labels = code, n = 5)
plot (payroll,wins)
with(data, text(payroll, wins, labels = code, pos = 1, cex=0.5))
using other alternatives - pch() dan as.numeric()?
Not sure it's easier but you change pch during identification as below (taken from the R-help). Every time you click empty point change to filled-in dot.
# data simulation
data <- data.frame(payroll = rnorm(10), wins = rnorm(10), code = letters[1:10])
identifyPch <- function(x, y = NULL, n = length(x), plot = FALSE, pch = 19, ...)
{
xy <- xy.coords(x, y)
x <- xy$x
y <- xy$y
sel <- rep(FALSE, length(x))
while (sum(sel) < n) {
ans <- identify(x[!sel], y[!sel], labels = which(!sel), n = 1, plot = plot, ...)
if(!length(ans)) {
break
}
ans <- which(!sel)[ans]
points(x[ans], y[ans], pch = pch)
sel[ans] <- TRUE
}
## return indices of selected points
which(sel)
}
if(dev.interactive()) { ## use it
with(data, plot(payroll,wins))
id = with(data, identifyPch(payroll, wins))
}
I would like to keep some cells in attention by making their borders clearly distinct from anything else.
The parameter rect.col is used to colorise all borders but I want to colorise only borders of the cells (3,3) and (7,7), for instance, by any halo color etc heat.colors(100) or rainbow(12).
Code:
library("corrplot")
library("psych")
ids <- seq(1,11)
M.cor <- cor(mtcars)
colnames(M.cor) <- ids
rownames(M.cor) <- ids
p.mat <- psych::corr.test(M.cor, adjust = "none", ci = F)
p.mat <- p.mat[["r"]]
corrplot(M.cor,
method = "color",
type = "upper",
tl.col = 'black',
diag = TRUE,
p.mat = p.mat,
sig.level = 0.0000005
)
Fig. 1 Output of the top code without cell bordering,
Fig. 2 Output after manually converting all coordinates to upper triangle but artifact at (10,1),
Fig. 3 Output with window size fix
Input: locations by ids (3,3) and (7,7)
Expected output: two cells where borders marked on upper triangle
Pseudocode
# ids must be id.pairs
# or just a list of two lists
createBorders <- function(id.pairs) {
labbly(id.pairs,function(z){
x <- z$V1
y <- z$V2
rect(x+0.5, y+0.5, x+1.5, y+1.5) # user20650
})
}
corrplot(...)
# TODO Which datastructure to use there in the function as the paired list of ids?
createBorders(ids.pairs)
Testing user20650's proposal
rect(2+0.5, 9+0.5, 3+0.5, 10+0.5, border="white", lwd=2)
Output in Fig. 2.
It would be great to have a function for this.
Assume you have a list of IDs.
I think there is something wrong with the placement because (2,3),(9,10) leads to the point in (2,3),(2,3).
Iterating user20650's Proposal in Chat
library("corrplot")
library("psych")
ids <- seq(1,11)
M.cor <- cor(mtcars)
colnames(M.cor) <- ids
rownames(M.cor) <- ids
p.mat <- psych::corr.test(M.cor, adjust = "none", ci = F)
p.mat <- p.mat[["r"]]
# Chat of http://stackoverflow.com/q/40538304/54964 user20650
cb <- function(corrPlot, ..., rectArgs = list() ){
lst <- list(...)
n <- ncol(corrPlot)
nms <- colnames(corrPlot)
colnames(corrPlot) <- if(is.null(nms)) 1:ncol(corrPlot) else nms
xleft <- match(lst$x, colnames(corrPlot)) - 0.5
ybottom <- n - match(lst$y, colnames(corrPlot)) + 0.5
lst <- list(xleft=xleft, ybottom=ybottom, xright=xleft+1, ytop=ybottom+1)
do.call(rect, c(lst, rectArgs))
}
plt <- corrplot(M.cor,
method = "color",
type = "upper",
tl.col = 'black',
diag = TRUE,
p.mat = p.mat,
sig.level = 0.0000005
)
cb(plt, x=c(1, 3, 5), y=c(10, 7, 4), rectArgs=list(border="white", lwd=3))
Output where only one cell border marked in Fig. 3.
Expected output: three cell borders marked
Restriction in Fig. 2 approach
You have to work all coordinates first to upper triangle.
So you can now call only the following where output has an artifact at (10,1) in Fig. 2
cb(plt, x=c(10, 7, 5), y=c(1, 3, 4), rectArgs=list(border="white", lwd=3))
Expected output: no artifact at (10,1)
The cause of the artifact can be white background, but it occurs also if the border color is red so most probably it is not the cause.
Solution - fix the window size and its output in Fig. 3
pdf("Rplots.pdf", height=10, width=10)
plt <- corrplot(M.cor,
method = "color",
type = "upper",
tl.col = 'black',
diag = TRUE,
p.mat = p.mat,
sig.level = 0.0000005
)
cb(plt, x=c(10, 7, 5), y=c(1, 3, 4), rectArgs=list(border="red", lwd=3))
dev.off()
R: 3.3.1
OS: Debian 8.5
Docs corrplot: here
My proposal where still pseudocode mark.ids. I found best to have plt and mark.ids as the options of corrplotCellBorders which creates corrplot with bordered wanted cells
mark.ids <- {x <- c(1), y <- c(2)} # TODO pseudocode
corrplotCellBorders(plt, mark.ids)
cb(plt, x, y, rectArgs=list(border="red", lwd=3))
# Chat of https://stackoverflow.com/q/40538304/54964 user20650
# createBorders.r, test.createBorders.
cb <- function(corrPlot, ..., rectArgs = list() ){
# ... pass named vector of x and y names
# for upper x > y, lower x < y
lst <- list(...)
n <- ncol(corrPlot)
nms <- colnames(corrPlot)
colnames(corrPlot) <- if(is.null(nms)) 1:ncol(corrPlot) else nms
xleft <- match(lst$x, colnames(corrPlot)) - 0.5
ybottom <- n - match(lst$y, colnames(corrPlot)) + 0.5
lst <- list(xleft=xleft, ybottom=ybottom, xright=xleft+1, ytop=ybottom+1)
do.call(rect, c(lst, rectArgs))
}
corrplotCellBorders <- function(plt, mark.ids) {
x <- mark.ids$x
y <- mark.ids$y
cb(plt, x, y, rectArgs=list(border="red", lwd=3))
}
Open
How to create mark.ids such that you can call its items by mark.ids$x and mark.ids$y?
Integrate point order neutrality for the upper triangle here
How do I label the y-axis, using timeSeries::plot, with Greek letters? i.e. change SB, SP, etc. to \alpha, \beta etc., I'm am aware I need expression(), in some way. However I can't even get to the labels (I normally use ggplot2). Code below.
# install.packages("xtable", dependencies = TRUE)
library("timeSeries")
## Load Swiss Pension Fund Benchmark Data -
LPP <- LPP2005REC[1:12, 1:4]
colnames(LPP) <- abbreviate(colnames(LPP), 2)
finCenter(LPP) <- "GMT"
timeSeries::plot(LPP, type = "o")
It have been pointed out that the object structure, obtained with str(), is quite particular in LPP compared to say this object z
z <- ts(matrix(rnorm(300), 100, 3), start = c(1961, 1), frequency = 12)
plot(z)
If any one has an answer to both or any I would appreciate it. I realize I can convert the data and plot it with ggplot2, I have seen that here on SO, but I am interested in doing in directly on the timeSeries object LPP and the stats (time-series object) z
[ REVISION & Edited ]
When plot.type is "multiple", we can't define ylab directly. Both plot(ts.obj) (S3 method) and plot(timeSeries.obj) (S4 method) take colnames(obj) as ylab, and I don't know any methods of using Greek letters as colname. (The difference in structure mainly comes from the difference of S3 and S4; colnames(timeSeries.obj) is equivalent to timeSeries.obj#units; the defaults is Series i and TS.i).
We can step in ylab using the arugument, panel (It wants a function and the default is lines). It is used in for(i in 1:ncol(data)). I couldn't give panel.function a suitable "i" (I guess it can in some way, but I didn't think up), so I got "i" using which col the data matches.
for timeSeries
ylabs <- expression(alpha, beta, gamma, delta)
row1 <- LPP[1,]
timeSeries.panel.f <- function(x, y, ...) {
lines(x, y, ...)
mtext(ylabs[which(row1 %in% y[1])], 2, line = 3)
}
plot(LPP, panel = timeSeries.panel.f, type = "o", ann = F)
title("Title")
mtext("Time", 1, line = 3)
## If you aren't so concerned about warnings, here is more general.
## (Many functions read `...` and they return warnings).
timeSeries.panel.f2 <- function(x, y, ..., ylabs = ylabs, row1 = row1) {
lines(x, y, ...)
mtext(ylabs[which(row1 %in% y[1])], 2, line = 3)
}
plot(LPP, panel = timeSeries.panel.f2, type = "o", ann = F,
ylabs = expression(alpha, beta, gamma, delta), row1 = LPP[1,])
title("Title")
mtext("Time", 1, line = 3)
for ts
ylabs <- expression(alpha, beta, gamma)
row1 <- z[1,]
ts.panel.f <- function(y, ...) {
lines(y, ...)
mtext(ylabs[which(row1 %in% y[1])], 2, line = 3)
}
plot(z, panel = ts.panel.f, ann = F)
title("Title")
mtext("Time", 1, line = 3)
Of course you can archieve it using new functions made from the original (mostly the same as the original). I showed only the modified points.
modified plot(ts.obj) (made from plot.ts)
my.plot.ts <- function(~~~, my.ylab = NULL) {
:
nm <- my.ylab # before: nm <- colnames(x)
:
}
# use
my.plot.ts(z, my.ylab = expression(alpha, beta, gamma), type = "o")
modified plot(timeSeries.obj)
# made from `.plot.timeSeries`
my.plot.timeSeries <- function(~~~, my.ylab = NULL) {
:
my.plotTimeSeries(~~~, my.ylab = my.ylab)
}
# made from `timeSeries:::.plotTimeSeries`
my.plotTimeSeries <- function(~~~, my.ylab) {
:
nm <- my.ylab # before: nm <- colnames(x)
:
}
#use
my.plot.timeSeries(LPP, my.ylab = expression(alpha, beta, gamma, delta), type="o")