Coloring clusters in ggdendro with long labels - r

I am creating dendrograms using ggdendro and coloring them according to cutpoints in the branches. I'm using the approach provided by #jlhoward in this question (Colorize Clusters in Dendogram with ggplot2) but I run into problems when my leaf labels are very long.
Here is some example code:
df <- USArrests
labs <- paste("veryverylongtitlename",1:50,sep="")
rownames(df) <- labs
library(ggplot2)
library(ggdendro)
hc <- hclust(dist(df), "ave") # heirarchal clustering
dendr <- dendro_data(hc, type="rectangle") # convert for ggplot
clust <- cutree(hc,k=2) # find 2 clusters
clust.df <- data.frame(label=names(clust), cluster=factor(clust))
# dendr[["labels"]] has the labels, merge with clust.df based on
label column
dendr[["labels"]] <- merge(dendr[["labels"]],clust.df, by="label")
# plot the dendrogram; note use of color=cluster in geom_text(...)
ggplot() +
geom_segment(data=segment(dendr), aes(x=x, y=y, xend=xend,
yend=yend)) +
geom_text(data=label(dendr), aes(x, y, label=label, hjust=0, color=cluster),
size=3) +
coord_flip() + scale_y_reverse(expand=c(0.2, 0)) +
theme(axis.line.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.y=element_blank(),
axis.title.y=element_blank(),
panel.background=element_rect(fill="white"),
panel.grid=element_blank())
As you can see, the labels here get cut off. I found this answer (decrease size of dendogram (or y-axis) ggplot), but I don't want to use it because I very much like the ability to use cutree to define my clusters. How can I manipulate the above code to fit the long labels? Many thanks!

Related

ggplotly with geom_ribbon grouping

I'm having some problems in converting a ggplot in to a plotly object, and retaining the same legend attributes. What I want:
For grouped series, a single line for fit, and faded region for ribbon of same colour, with transparency
No lines at the edge of the ribbon
Grouped legends for the lines, points and ribbons
Here is the code showing the 2 approaches I tried based on this answer:
ggplot: remove lines at ribbon edges
Both have an undesirable effect as you can see when running. Any suggestions would be great :)
library(plotly)
library(ggplot2)
# fake data
set.seed(1)
dt <- data.frame(x=rep(1:7,2), group=rep(letters[1:2], each=7), value=runif(14))
dt$lwr <- dt$value*.9
dt$upr <- dt$value*1.1
# build plot in ggplot, don't want lines at the edge
pl <- ggplot(data=dt, aes(y=value, x=x, group=group, colour=group,
fill=group)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=.3, linetype=0) +
theme_minimal()
# looks ok, no lines at the edges
pl
# lines at edges, single group
ggplotly(pl)
# alternative: try reverting colour to NA
pl2 <- ggplot(data=dt, aes(y=value, x=x, group=group, colour=group,
fill=group)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=.3, colour=NA) +
theme_minimal()
# looks ok
pl2
# no lines, but now not grouped, and some weird naming
ggplotly(pl2)
Thanks, Jonny
EDIT:
Addition to the accepted answer, in functional form
# dd: ggplotly object
library(stringi)
library(rvest)
remove_ggplotly_ribbon_lines <- function(dd){
find <- rvest::pluck(dd$x$data, "fillcolor")
w <- which(!sapply(find, is.null))
for(i in w){
dd$x$data[[i]]$line$color <-
stringi::stri_replace_all_regex(dd$x$data[[i]]$line$color, ",[\\d.]*\\)$", ",0.0)")
}
return(dd)
}
remove_ggplotly_ribbon_lines(ggplotly(pl))
Hi this is more a comment than an answer but I do not have right to post comments.
If you investigate the ggplotly object you will see that it is actually just a list. Changing the right elements of the list helps in controlling plot options.
The solution below just changes the alpha of the lines at ribbon edges. Hope this helps
library(plotly)
set.seed(1)
dt <- data.frame(x=rep(1:7,2), group=rep(letters[1:2], each=7), value=runif(14))
dt$lwr <- dt$value*.9
dt$upr <- dt$value*1.1
# build plot in ggplot, don't want lines at the edge
pl <- ggplot(data=dt, aes(y=value, x=x, group=group, colour=group,
fill=group)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=.3, linetype=0) +
theme_minimal()
# looks ok, no lines at the edges
pl
# no lines at edges
dd = ggplotly(pl)
dd$x$data[[3]]$line$color = "rgba(248,118,109,0.0)"
dd$x$data[[4]]$line$color = "rgba(0,191,196,0.0)"
dd

How do I just remove one of the two legends in GGPLOT

I have the following code:
library(ggplot2)
df <- data.frame(iris) # iris dataset
pca <- prcomp(df[,1:4], retx=T, scale.=T) # scaled pca [exclude species col]
scores <- pca$x[,1:3] # scores for first three PC's
# k-means clustering [assume 3 clusters]
km <- kmeans(scores, centers=3, nstart=5)
ggdata <- data.frame(scores, Cluster=km$cluster, Species=df$Species)
# stat_ellipse is not part of the base ggplot package
source("https://raw.githubusercontent.com/tidyverse/ggplot2/master/R/stat-ellipse.R")
ggplot(ggdata) +
geom_point(aes(x=PC1, y=PC2, color=factor(Species)), size=5, shape=20) +
stat_ellipse(aes(x=PC1,y=PC2,fill=factor(Species)),
geom="polygon", level=0.95, alpha=0.2) +
guides(color=guide_legend("Species"),fill=guide_legend("Cluster"))
Which produces this:
As stated in that picture how do I just remove 'Cluster' legend?
Set your fill guide to "none"
ggplot(ggdata) +
geom_point(aes(x=PC1, y=PC2, color=factor(Species)), size=5, shape=20) +
stat_ellipse(aes(x=PC1,y=PC2, fill=factor(Species)),
geom="polygon", level=0.95, alpha=0.2)+
guides(color=guide_legend("Species"), fill = "none")
Edit: 20221129 - changed scale = FALSE to scale = "none", as per:
The <scale> argument of guides() cannot be FALSE. Use "none" instead as of ggplot2
3.3.4.

Cutting out a cluster from dendrogram

I am using this link to plot a nice dendrogram with colored labels as per the categories.
The second answer is what I am looking at in this link (Tree cut and Rectangles around clusters for a horizontal dendrogram in R )which uses the code below:
d <- dist(t(mat[,3:ncol(mat)]), method = "euclidean")
H.fit <- hclust(d, method="ward")
groups <- cutree(H.fit, k=16) # cut tree into clusters
hcdata<- dendro_data(H.fit, type="rectangle")
hcdata$labels <- merge(x = hcdata$labels, y = pm_gtex_comb, by.x = "label", by.y = "sample",all=TRUE)
ggplot() +
geom_segment(data=segment(hcdata), aes(x=x, y=y, xend=xend, yend=yend)) +
geom_text(data=label(hcdata), aes(x, y, label=label, hjust=0, color=cluster),
size=3) +
geom_rect(data=rect, aes(xmin=X1-.3, xmax=X2+.3, ymin=0, ymax=ymax),
color="red", fill=NA)+
geom_hline(yintercept=0.33, color="blue")+
coord_flip() + scale_y_reverse(expand=c(0.2, 0)) +
theme_dendro()
I want to cut out some of the clusters as I have 16 clusters,with 145 labels so that I can view only few clusters as I want to focus/cut-out/zoom in only on couple of them.Is there any way to do this on hclust object .This is only for having a nice visualization as the figure gets messy with 145 labels.Since I want to color as per the labels,I think ggdendro suits pretty well.
For example in this link ,if you look at 3)Zooming-in on dendrograms
http://gastonsanchez.com/blog/how-to/2012/10/03/Dendrograms.html
You could try prune from the package dendextend (which can do lots of other nifty things):
library(dendextend)
hc <- hclust(dist(USArrests), "ave")
clusters <- cutree(hc, k=3)
par(mfrow=c(1,2), mar=c(6, 4, 2, 3))
plot(as.dendrogram(hc), main="regular")
plot(dend <- prune(as.dendrogram(hc), names(clusters[clusters==1])),
ylim=range(hc$height), main="without cluster #1")
or if you insist on ggdendro:
ggdendro::ggdendrogram(dend)
A ggplot2 plot can be created also by using dendextend:
library(dendextend)
ggd1 <- as.ggdend(dend)
library(ggplot2)
ggplot(ggd1)

ggplot2: add conditional density curves describing both dimensions of scatterplot

I have scatterplots of 2D data from two categories. I want to add density lines for each dimension -- not outside the plot (cf. Scatterplot with marginal histograms in ggplot2) but right on the plotting surface. I can get this for the x-axis dimension, like this:
set.seed(123)
dim1 <- c(rnorm(100, mean=1), rnorm(100, mean=4))
dim2 <- rnorm(200, mean=1)
cat <- factor(c(rep("a", 100), rep("b", 100)))
mydf <- data.frame(cbind(dim2, dim1, cat))
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() +
stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
It looks like this:
But I want an analogous pair of density curves running vertically, showing the distribution of points in the y-dimension. I tried
stat_density(aes(y=dim2, x=0+(..scaled..))), position="identity", geom="line)
but receive the error "stat_density requires the following missing aesthetics: x".
Any ideas? thanks
You can get the densities of the dim2 variables. Then, flip the axes and store them in a new data.frame. After that it is simply plotting them on top of the other graph.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() +
stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
stuff <- ggplot_build(p)
xrange <- stuff[[2]]$ranges[[1]]$x.range # extract the x range, to make the new densities align with y-axis
## Get densities of dim2
ds <- do.call(rbind, lapply(unique(mydf$cat), function(lev) {
dens <- with(mydf, density(dim2[cat==lev]))
data.frame(x=dens$y+xrange[1], y=dens$x, cat=lev)
}))
p + geom_path(data=ds, aes(x=x, y=y, color=factor(cat)))
So far I can produce:
distrib_horiz <- stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() + distrib_horiz
And:
distrib_vert <- stat_density(data=mydf, aes(x=dim2, y=(-2+(..scaled..))),
position="identity", geom="line")
ggplot(data=mydf, aes(x=dim2, y=dim1, colour=as.factor(cat))) +
geom_point() + distrib_vert + coord_flip()
But combining them is proving tricky.
So far I have only a partial solution since I didn't manage to obtain a vertical stat_density line for each individual category, only for the total set. Maybe this can nevertheless help as a starting point for finding a better solution. My suggestion is to try with the ggMarginal() function from the ggExtra package.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() + stat_density(aes(x=dim1, y=(-2+(..scaled..))),
position="identity", geom="line")
library(ggExtra)
ggMarginal(p,type = "density", margins = "y", size = 4)
This is what I obtain:
I know it's not perfect, but maybe it's a step in a helpful direction. At least I hope so. Looking forward to seeing other answers.

XY scatter plot with heatmap strip at margin in r

Here data and hypithesis:
set.seed(1234)
myd <- data.frame (X = rnorm (100), Y = rnorm (100, 10, 3))
just catorizing X and Y, sometime this may be different variable than X and Y
and is category itself
myd$xcat <- cut (myd$X, 10)
myd$ycat <- cut (myd$Y, 10)
I want to make nice plot like the following, where the catories are plotting as strip of heatmap plots
require(ggplot2)
ggplot(myd, aes(x=X, y=Y)) + geom_point(shape=1) + theme_bw()
Is this possible ggplot2 or other packages or need a specialized solution?
One way to achieve this is to make three separate plots with ggplot2 and then use viewport() and grid.layout() to arrange them together.
First plot contains just middle part (scatter plot). px and py are heatmaps (made with geom_tile()) for the x and y axis. Most important part is to use the same theme() settings in plots (just change x to y). Used color="white" for some elements to ensure that there is a place for that element (to have correct dimensions) but they are not visible on plot.
#Scatter plot without axis titles
p<-ggplot(myd, aes(x=X, y=Y)) + geom_point(shape=1) +
theme_bw() + theme(axis.title=element_blank())
#tile plot for the x axis
px<-ggplot(myd,aes(x=xcat,y=1,fill=xcat))+geom_tile()+
scale_x_discrete(expand=c(0,0))+
scale_fill_hue(h=c(0,180))+
scale_y_continuous(expand=c(0,0),breaks=1,labels="10")+
theme(legend.position="none",
axis.title=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(color="white"),
axis.ticks.y=element_line(color="white"))
#tile plot for the y axis
py<-ggplot(myd,aes(x=1,y=ycat,fill=ycat))+geom_tile()+
scale_y_discrete(expand=c(0,0))+
scale_x_continuous(expand=c(0,0),breaks=1,labels="1")+
scale_fill_hue(h=c(181,360))+
theme(legend.position="none",
axis.title=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.x=element_text(color="white"),
axis.ticks.x=element_line(color="white"))
#Define layout for the plots (2 rows, 2 columns)
layt<-grid.layout(nrow=2,ncol=2,heights=c(7/8,1/8),widths=c(1/8,7/8),default.units=c('null','null'))
#View the layout of plots
grid.show.layout(layt)
#Draw plots one by one in their positions
grid.newpage()
pushViewport(viewport(layout=layt))
print(py,vp=viewport(layout.pos.row=1,layout.pos.col=1))
print(p,vp=viewport(layout.pos.row=1,layout.pos.col=2))
print(px,vp=viewport(layout.pos.row=2,layout.pos.col=2))
A solution in base plot:
brX <- seq(min(myd$X),max(myd$X),length=11)
brY <- seq(min(myd$Y),max(myd$Y),length=11)
layout(matrix(c(1,0,2,3),nrow=2),width=c(2,8),height=c(8,2))
par(mar=c(0,3,5,0))
plot(NA,ylim=range(myd$Y),xlim=c(0,1),axes=F,ann=F,xaxs="i")
rect(0,brY[-length(brY)],1,brY[-1],
col=colorRampPalette(c("red","yellow","green"))(length(brY)-1))
par(mar=c(0,0,5,5))
plot(NA,xlim=range(myd$X),ylim=range(myd$Y),ann=F,xaxt="n",yaxt="n")
abline(h=pretty(myd$Y),v=pretty(myd$X), col="grey95")
points(myd$X,myd$Y,pch=21)
axis(3)
axis(4)
par(mar=c(3,0,0,5))
plot(NA,xlim=range(myd$X),ylim=c(0,1),axes=F,ann=F,yaxs="i")
rect(brX[-length(brX)],0,brX[-1],1,
col=colorRampPalette(c("blue","white","red"))(length(brX)-1))

Resources