R: ggplot height adjustment for clustering dendrogram - r

The idea is to combine R packages ClustOfVar and ggdendro to give a visual summary of variable clustering.
When there are few columns in the data, the result is pretty good except that there are areas not covered(as circled in the chart below). Using mtcars for example:
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
fit = hclustvar(X.quanti = mtcars)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
When there are a large number of columns, another issue occurs. That is, the height of the color tiles part does not match the height the dendrogram.
library(ClustOfVar)
library(ggdendro)
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
fit = hclustvar(X.quanti = X)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
#Sandy Muspratt has actually provided an excellent solution to this IF we have the R upgraded to version 3.3.1.
R: ggplot slight adjustment for clustering summary
But since I cannot change the version of the R deployed in the corporate server, I wonder if there is any other workaround that can align these two parts.

As far as I can tell, your code is not far wrong. The problem is that you are trying to match a continuous scale to a discrete scale when you merge the two plots. Also, it appears that ggdendrogram() adds additional space to the y-axis.
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
# Data
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
# Cluster analysis
fit = hclustvar(X.quanti = X)
# Labels data frames
df2 <- data.frame(cluster = cutree(fit, k =5),
states = factor(fit$labels, levels = fit$labels[fit$order]))
df3 <- ddply(df2, .(cluster), summarise, pos = mean(as.numeric(states)))
# Dendrogram
# scale_x_continuous() for p1 should match scale_x_discrete() from p2
# scale_x_continuous strips off the labels. I grab them from df2
# scale _y_continuous() puts a little space between the labels and the dendrogram
p1 <- ggdendrogram(as.dendrogram(fit), rotate = TRUE) +
scale_x_continuous(expand = c(0, 0.5), labels = levels(df2$states), breaks = 1:length(df2$states)) +
scale_y_continuous(expand = c(0.02, 0))
# Tiles and labels
p2 <- ggplot(df2,aes(states, y = 1, fill = factor(cluster))) +
geom_tile() +
scale_y_continuous(expand = c(0, 0)) +
scale_x_discrete(expand = c(0, 0)) +
geom_text(data = df3, aes(x = pos, label = cluster)) +
coord_flip() +
theme(axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
legend.position = "none")
# Get the ggplot grobs
gp1 <- ggplotGrob(p1)
gp2 <- ggplotGrob(p2)
# Make sure the heights match
maxHeight <- unit.pmax(gp1$heights, gp2$heights)
gp1$heights <- as.list(maxHeight)
gp2$heights <- as.list(maxHeight)
# Combine the two plots
grid.arrange(gp2, gp1, ncol = 2,widths = c(1/6, 5/6))

Related

How to align multiple legends and avoid overlapping in ggplot?

The bounty expires in 7 days. Answers to this question are eligible for a +50 reputation bounty.
Electrino wants to draw more attention to this question.
I am trying to create a plot that combines 2 separate legends and a grid of multiple plots. The issue I'm having is I'm finding it difficult to align the legends so they are visible and not overlapping. hopefully the example below will explain what I mean.
To begin I am going to create 2 plots. In these two plots I am only interested in the legends, and I am discarding the actual plot (so please ignore the actual plots in these two plots). To get just the legend I am using the cowplot package.
library(ggplot2)
library(cowplot)
# -------------------------------------------------------------------------
# plot 1 ------------------------------------------------------------------
# create fake data
dfLegend_1 <- data.frame(x = LETTERS[1:10], y = c(1:10))
# set colours
pointColours <- c(A = "#F5736A", B = "#D58D00", C = "#A0A300",
D = "#36B300", E = "#00BC7B", F = "#00BCC2",
G = "#00ADF4", H = "#928DFF", I = "#E568F0",
J = "#808080")
# plot
ggLegend_1 <- ggplot(dfLegend_1, aes(x=x, y=y))+
geom_point(aes(fill = pointColours), shape = 22, size = 10) +
scale_fill_manual(values = unname(pointColours),
label = names(pointColours),
name = 'Variable') +
theme(legend.key.size = unit(0.5, "cm")) +
theme_void()
# get legend
legend_1 <- get_legend(ggLegend_1)
# -------------------------------------------------------------------------
# plot 2 ------------------------------------------------------------------
# Create fake data
dflegend_2 <- data.frame(
x = runif(100),
y = runif(100),
z2 = abs(rnorm(100))
)
# plot
ggLegend_2 <- ggplot(dflegend_2, aes(x=x, y = y))+
geom_point(aes(color = z2), shape = 22, size = 10) +
scale_color_gradientn(
colours = rev(colorRampPalette(c('steelblue', '#f7fcfd', 'orange'))(5)),
limits = c(0,10),
name = 'Gradient',
guide = guide_colorbar(
frame.colour = "black",
ticks.colour = "black"
))
# get legend
legend_2 <- get_legend(ggLegend_2)
Then I am creating many plots (in this example, I am creating 20 individual plots) and plotting them on a grid:
# create data
dfGrid <- data.frame(x = rnorm(10), y = rnorm(10))
# make a list of plots
plotList <- list()
for(i in 1:20){
plotList[[i]] <- ggplot(dfGrid) +
geom_ribbon(aes(x = x, ymin = min(y), ymax = 0), fill = "red", alpha = .5) +
geom_ribbon(aes(x = x, ymin = min(0), ymax = max(y)), fill = "blue", alpha = .5) +
theme_void()
}
# plot them on a grid
gridFinal <- cowplot::plot_grid(plotlist = plotList)
Finally, I am joining the two legends together and adding them to the grid of many plots:
# add legends together into on single plot
legendFinal <- plot_grid(legend_2, legend_1, ncol = 1)
# plot everything on the same plot
plot_grid(gridFinal, legendFinal, rel_widths = c(3, 1))
This results in something that looks like this:
As you can see, the legends overlap and are not very well spaced. I was wondering if there is any way to fit everything in whilst having the legends appropriately spaced and readable?
I should also note, that, in general, there can be any number of variables and any number of gridded plots.
One option to fix your issue would be to switch to patchwork to glue your plots and the legends together. Especially I make use of the design argument to assign more space to the Variable legend. However, you should be aware that legends are much less flexible compared to plots, i.e. the size of legends is in absolute units and will not adjust to the available space. Hence, I'm not sure whether my solution will fit your desire for a "one-size-fits-all" approach.
library(patchwork)
design <-
"
ABCDEU
FGHIJV
KLMNOV
PQRSTV
"
plotList2 <- c(plotList, list(legend_2, legend_1))
wrap_plots(plotList2) +
plot_layout(design = design)

Align multiple plots in ggplot2 when some have legends and others don't

I have used the method indicated here to align graphs sharing the same abscissa.
But I can't make it work when some of my graphs have a legend and others don't.
Here is an example:
library(ggplot2)
library(reshape2)
library(gridExtra)
x = seq(0, 10, length.out = 200)
y1 = sin(x)
y2 = cos(x)
y3 = sin(x) * cos(x)
df1 <- data.frame(x, y1, y2)
df1 <- melt(df1, id.vars = "x")
g1 <- ggplot(df1, aes(x, value, color = variable)) + geom_line()
print(g1)
df2 <- data.frame(x, y3)
g2 <- ggplot(df2, aes(x, y3)) + geom_line()
print(g2)
gA <- ggplotGrob(g1)
gB <- ggplotGrob(g2)
maxWidth <- grid::unit.pmax(gA$widths[2:3], gB$widths[2:3])
gA$widths[2:3] <- maxWidth
gB$widths[2:3] <- maxWidth
g <- arrangeGrob(gA, gB, ncol = 1)
grid::grid.newpage()
grid::grid.draw(g)
Using this code, I have the following result:
What I would like is to have the x axis aligned and the missing legend being filled by a blank space. Is this possible?
Edit:
The most elegant solution proposed is the one by Sandy Muspratt below.
I implemented it and it works quite well with two graphs.
Then I tried with three, having different legend sizes, and it doesn't work anymore:
library(ggplot2)
library(reshape2)
library(gridExtra)
x = seq(0, 10, length.out = 200)
y1 = sin(x)
y2 = cos(x)
y3 = sin(x) * cos(x)
y4 = sin(2*x) * cos(2*x)
df1 <- data.frame(x, y1, y2)
df1 <- melt(df1, id.vars = "x")
g1 <- ggplot(df1, aes(x, value, color = variable)) + geom_line()
g1 <- g1 + theme_bw()
g1 <- g1 + theme(legend.key = element_blank())
g1 <- g1 + ggtitle("Graph 1", subtitle = "With legend")
df2 <- data.frame(x, y3)
g2 <- ggplot(df2, aes(x, y3)) + geom_line()
g2 <- g2 + theme_bw()
g2 <- g2 + theme(legend.key = element_blank())
g2 <- g2 + ggtitle("Graph 2", subtitle = "Without legend")
df3 <- data.frame(x, y3, y4)
df3 <- melt(df3, id.vars = "x")
g3 <- ggplot(df3, aes(x, value, color = variable)) + geom_line()
g3 <- g3 + theme_bw()
g3 <- g3 + theme(legend.key = element_blank())
g3 <- g3 + scale_color_discrete("This is indeed a very long title")
g3 <- g3 + ggtitle("Graph 3", subtitle = "With legend")
gA <- ggplotGrob(g1)
gB <- ggplotGrob(g2)
gC <- ggplotGrob(g3)
gB = gtable::gtable_add_cols(gB, sum(gC$widths[7:8]), 6)
maxWidth <- grid::unit.pmax(gA$widths[2:5], gB$widths[2:5], gC$widths[2:5])
gA$widths[2:5] <- maxWidth
gB$widths[2:5] <- maxWidth
gC$widths[2:5] <- maxWidth
g <- arrangeGrob(gA, gB, gC, ncol = 1)
grid::grid.newpage()
grid::grid.draw(g)
This results in the following figure:
My main problem with the answers found here and in other questions regarding the subject is that people "play" quite a lot with the vector myGrob$widths without actually explaining why they are doing it. I have seen people modify myGrob$widths[2:5] others myGrob$widths[2:3] and I just can't find any documentation explaining what those columns are.
My objective is to create a generic function such as:
AlignPlots <- function(...) {
# Retrieve the list of plots to align
plots.list <- list(...)
# Initialize the lists
grobs.list <- list()
widths.list <- list()
# Collect the widths for each grob of each plot
max.nb.grobs <- 0
longest.grob <- NULL
for (i in 1:length(plots.list)){
if (i != length(plots.list)) {
plots.list[[i]] <- plots.list[[i]] + theme(axis.title.x = element_blank())
}
grobs.list[[i]] <- ggplotGrob(plots.list[[i]])
current.grob.length <- length(grobs.list[[i]])
if (current.grob.length > max.nb.grobs) {
max.nb.grobs <- current.grob.length
longest.grob <- grobs.list[[i]]
}
widths.list[[i]] <- grobs.list[[i]]$widths[2:5]
}
# Get the max width
maxWidth <- do.call(grid::unit.pmax, widths.list)
# Assign the max width to each grob
for (i in 1:length(grobs.list)){
if(length(grobs.list[[i]]) < max.nb.grobs) {
grobs.list[[i]] <- gtable::gtable_add_cols(grobs.list[[i]],
sum(longest.grob$widths[7:8]),
6)
}
grobs.list[[i]]$widths[2:5] <- as.list(maxWidth)
}
# Generate the plot
g <- do.call(arrangeGrob, c(grobs.list, ncol = 1))
return(g)
}
Expanding on #Axeman's answer, you can do all of this with cowplot without ever needing to use draw_plot directly. Essentially, you just make the plot in two columns -- one for the plots themselves and one for the legends -- and then place them next to each other. Note that, because g2 has no legend, I am using an empty ggplot object to hold the place of that legend in the legends column.
library(cowplot)
theme_set(theme_minimal())
plot_grid(
plot_grid(
g1 + theme(legend.position = "none")
, g2
, g3 + theme(legend.position = "none")
, ncol = 1
, align = "hv")
, plot_grid(
get_legend(g1)
, ggplot()
, get_legend(g3)
, ncol =1)
, rel_widths = c(7,3)
)
Gives
The main advantage here, in my mind, is the ability to set and skip legends as needed for each of the subplots.
Of note is that, if all of the plots have a legend, plot_grid handles the alignment for you:
plot_grid(
g1
, g3
, align = "hv"
, ncol = 1
)
gives
It is only the missing legend in g2 that causes problems.
Therefore, if you add a dummy legend to g2 and hide it's elements, you can get plot_grid to do all of the alignment for you, instead of worrying about manually adjusting rel_widths if you change the size of the output
plot_grid(
g1
, g2 +
geom_line(aes(color = "Test")) +
scale_color_manual(values = NA) +
theme(legend.text = element_blank()
, legend.title = element_blank())
, g3
, align = "hv"
, ncol = 1
)
gives
This also means that you can easily have more than one column, but still keep the plot areas the same. Simply removing , ncol = 1 from above yields a plot with 2 columns, but still correctly spaced (though you'll need to adjust the aspect ratio to make it useable):
As #baptiste suggested, you can also move the legends over so that they are all aligned to the left of in the "legend" portion of the plot by adding theme(legend.justification = "left") to the plots with the legends (or in theme_set to set globally), like this:
plot_grid(
g1 +
theme(legend.justification = "left")
,
g2 +
geom_line(aes(color = "Test")) +
scale_color_manual(values = NA) +
theme(legend.text = element_blank()
, legend.title = element_blank())
, g3 +
theme(legend.justification = "left")
, align = "hv"
, ncol = 1
)
gives
The patchwork package by Thomas Lin Pedersen does this all automagically:
library(patchwork)
g1 + g2 + plot_layout(ncol = 1)
Can hardly get any easier than that.
There might now be easier ways to do this, but your code was not far wrong.
After you have ensured that the widths of columns 2 and 3 in gA are the same as those in gB, check the widths of the two gtables: gA$widths and gB$widths. You will notice that the gA gtable has two additional columns not present in the gB gtable, namely widths 7 and 8. Use the gtable function gtable_add_cols() to add the columns to the gB gtable:
gB = gtable::gtable_add_cols(gB, sum(gA$widths[7:8]), 6)
Then proceed with arrangeGrob() ....
Edit: For a more general solution
Package egg (available on github) is experimental and fragile, but works nicely with your revised set of plots.
# install.package(devtools)
devtools::install_github("baptiste/egg")
library(egg)
grid.newpage()
grid.draw(ggarrange(g1,g2,g3, ncol = 1))
Thanks to this and that, posted in the comments (and then removed), I came up with the following general solution.
I like the answer from Sandy Muspratt and the egg package seems to do the job in a very elegant manner, but as it is "experimental and fragile", I preferred using this method:
#' Vertically align a list of plots.
#'
#' This function aligns the given list of plots so that the x axis are aligned.
#' It assumes that the graphs share the same range of x data.
#'
#' #param ... The list of plots to align.
#' #param globalTitle The title to assign to the newly created graph.
#' #param keepTitles TRUE if you want to keep the titles of each individual
#' plot.
#' #param keepXAxisLegends TRUE if you want to keep the x axis labels of each
#' individual plot. Otherwise, they are all removed except the one of the graph
#' at the bottom.
#' #param nb.columns The number of columns of the generated graph.
#'
#' #return The gtable containing the aligned plots.
#' #examples
#' g <- VAlignPlots(g1, g2, g3, globalTitle = "Alignment test")
#' grid::grid.newpage()
#' grid::grid.draw(g)
VAlignPlots <- function(...,
globalTitle = "",
keepTitles = FALSE,
keepXAxisLegends = FALSE,
nb.columns = 1) {
# Retrieve the list of plots to align
plots.list <- list(...)
# Remove the individual graph titles if requested
if (!keepTitles) {
plots.list <- lapply(plots.list, function(x) x <- x + ggtitle(""))
plots.list[[1]] <- plots.list[[1]] + ggtitle(globalTitle)
}
# Remove the x axis labels on all graphs, except the last one, if requested
if (!keepXAxisLegends) {
plots.list[1:(length(plots.list)-1)] <-
lapply(plots.list[1:(length(plots.list)-1)],
function(x) x <- x + theme(axis.title.x = element_blank()))
}
# Builds the grobs list
grobs.list <- lapply(plots.list, ggplotGrob)
# Get the max width
widths.list <- do.call(grid::unit.pmax, lapply(grobs.list, "[[", 'widths'))
# Assign the max width to all grobs
grobs.list <- lapply(grobs.list, function(x) {
x[['widths']] = widths.list
x})
# Create the gtable and display it
g <- grid.arrange(grobs = grobs.list, ncol = nb.columns)
# An alternative is to use arrangeGrob that will create the table without
# displaying it
#g <- do.call(arrangeGrob, c(grobs.list, ncol = nb.columns))
return(g)
}
One trick is to plot and align the graphs without any legends, and then plotting the legend separately next to it. cowplot has a convenience function for quickly getting the legend from a plot, and plot_grid allows for automatic allignment.
library(cowplot)
theme_set(theme_grey())
l <- get_legend(g1)
ggdraw() +
draw_plot(plot_grid(g1 + theme(legend.position = 'none'), g2, ncol = 1, align = 'hv'),
width = 0.9) +
draw_plot(l, x = 0.9, y = 0.55, width = 0.1, height = 0.5)
Using grid.arrange
library(ggplot2)
library(reshape2)
library(gridExtra)
x = seq(0, 10, length.out = 200)
y1 = sin(x)
y2 = cos(x)
y3 = sin(x) * cos(x)
df1 <- data.frame(x, y1, y2)
df1 <- melt(df1, id.vars = "x")
g1 <- ggplot(df1, aes(x, value, color = variable)) + geom_line()
df2 <- data.frame(x, y3)
g2 <- ggplot(df2, aes(x, y3)) + geom_line()
#extract the legend from the first graph
temp <- ggplotGrob(g1)
leg_index <- which(sapply(temp$grobs, function(x) x$name) == "guide-box")
legend <- temp$grobs[[leg_index]]
#remove the legend of the first graph
g1 <- g1 + theme(legend.position="none")
#define position of each grobs/plots and width and height ratio
grid_layout <- rbind(c(1,3),
c(2,NA))
grid_width <- c(5,1)
grid_heigth <- c(1,1)
grid.arrange(
grobs=list(g1, g2,legend),
layout_matrix = grid_layout,
widths = grid_width,
heights = grid_heigth)

R plot: Uniform distance between ticks for non-uniform numbers

I am trying to recreate the basic temperature trend of this Paleotemperature figure in R. (Original image and data.)
The scale interval of the x-axis changes from 100s of millions of years to 10s of millions to millions, and then to 100s of thousands, and so on, but the ticks marks are evenly spaced. The original figure was carefully laid out in five separate graphs in Excel to achieve the spacing. I am trying to get the same x-axis layout in R.
I have tried two basic approaches. The first approach was to use par(fig=c(x1,x2,y1,y2)) to make five separate graphs placed side by side. The problem is that the intervals among tick marks is not uniform and labels overlap.
#1
par(fig=c(0,0.2,0,0.5), mar=c(3,4,0,0))
plot(paleo1$T ~ paleo1$Years, col='red3', xlim=c(540,60), bty='l',type='l', ylim=c(-6,15), ylab='Temperature Anomaly (°C)')
abline(0,0,col='gray')
#2
par(fig=c(0.185,0.4,0,0.5), mar=c(3,0,0,0), new=TRUE)
plot(paleo2$T ~ paleo2$Year, col='forestgreen', axes=F, type='l', xlim=c(60,5), ylab='', ylim=c(-6,15))
axis(1, xlim=c(60,5))
abline(0,0,col='gray')
#etc.
The second approach (and my preferred approach, if possible) is to plot the data in a single graph. This causes non-uniform distance among tick marks because they follow their "natural" order. (Edit: example data added as well as link to full data set.).
years <- c(500,400,300,200,100,60,50,40,30,20,10,5,4,3,2,1)
temps <- c(13.66, 8.6, -2.16, 3.94, 8.44, 5.28, 12.98, 8.6, 5, 5.34, 3.66, 2.65, 0.78, 0.25, -1.51, -1.77)
test <- data.frame(years, temps)
names(test) <- c('Year','T')
# The full csv file can be used with this line instead of the above.
# test <- read.csv('https://www.dropbox.com/s/u0dfmlvzk0ztpkv/paleo_test.csv?dl=1')
plot(test$T ~ test$Year, type='l', xaxt='n', xlim=c(520,1), bty='l', ylim=c(-5,15), xlab="", ylab='Temperature Anomaly (°C)')
ticklabels = c(500,400,300,200,100,60,50,40,30,20,10,5,4,3,2,1)
axis(1, at=ticklabels)
Adding log='x' to plot comes closest but the intervals between ticks are still not even and the actual scale is, of course, not a log scale.
My examples only go down to 1 million years because I am trying to solve the problem first but my the goal is to match the original figure above. I am open to ggplot solutions although I am only fleetingly familiar with it.
I will strike a different note by saying: don't. In my experience, the harder something is to do in ggplot2 (and to a lesser extent, base graphics), the less likely it is to be a good idea. Here, the problem is that consistently changing the scales like is more likely to lead the viewer astray.
Instead, I recommend using a log scale and manually setting your cutoffs.
First, here is some longer data, just to cover the full likely scale of your question:
longerTest <-
data.frame(
Year = rep(1:9, times = 6) * rep(10^(3:8), each = 9)
, T = rnorm(6*9))
Then, I picked some cutoffs to place the labels at in the plot. These can be adjusted to whatever you want, but are at least a starting point for reasonably spaced ticks:
forLabels <-
rep(c(1,2,5), times = 6) * rep(10^(3:8), each = 3)
Then, I manually set some things to append to the labels. Thus, instead of having to say "Thousands of years" under part of the axis, you can just label those with a "k". Each order of magnitude gets a value. Nnote that the names are just to help keep things straight: below I just use the index to match. So, if you skip the first two, you will need to adjust the indexing below.
toAppend <-
c("1" = "0"
, "2" = "00"
, "3" = "k"
, "4" = "0k"
, "5" = "00k"
, "6" = "M"
, "7" = "0M"
, "8" = "00M")
Then, I change my forLabels into the text versions I want to use by grabbing the first digit, and concatenating with the correct suffix from above.
myLabels <-
paste0(
substr(as.character(forLabels), 1, 1)
, toAppend[floor(log10(forLabels))]
)
This gives:
[1] "1k" "2k" "5k" "10k" "20k" "50k" "100k" "200k" "500k" "1M" "2M"
[12] "5M" "10M" "20M" "50M" "100M" "200M" "500M"
You could likely use these for base graphics, but getting the log scale to do what you want is sometimes tricky. Instead, since you said you are open to a ggplot2 solution, I grabbed this modified log scale from this answer to get a log scale that runs from big to small:
library("scales")
reverselog_trans <- function(base = exp(1)) {
trans <- function(x) -log(x, base)
inv <- function(x) base^(-x)
trans_new(paste0("reverselog-", format(base)), trans, inv,
log_breaks(base = base),
domain = c(1e-100, Inf))
}
Then, just pass in the data, and set the scale with the desired breaks:
ggplot(longerTest
, aes(x = Year
, y = T)) +
geom_line() +
scale_x_continuous(
breaks = forLabels
, labels = myLabels
, trans=reverselog_trans(10)
)
Gives:
Which has a consistent scale, but is labelled far more uniformly.
If you want colors, you can do that using cut:
ggplot(longerTest
, aes(x = Year
, y = T
, col = cut(log10(Year)
, breaks = c(3,6,9)
, labels = c("Thousands", "Millions")
, include.lowest = TRUE)
, group = 1
)) +
geom_line() +
scale_x_continuous(
breaks = forLabels
, labels = myLabels
, trans=reverselog_trans(10)
) +
scale_color_brewer(palette = "Set1"
, name = "How long ago?")
Here is a version using facet_wrap to create different scales. I used 6 here, but you can set whatever thresholds you want instead.
longerTest$Period <-
cut(log10(longerTest$Year)
, breaks = c(3, 4, 5, 6, 7, 8, 9)
, labels = paste(rep(c("", "Ten", "Hundred"), times = 2)
, rep(c("Thousands", "Millions"), each = 3) )
, include.lowest = TRUE)
longerTest$Period <-
factor(longerTest$Period
, levels = rev(levels(longerTest$Period)))
newBreaks <-
rep(c(2,4,6,8, 10), times = 6) * rep(10^(3:8), each = 5)
newLabels <-
paste0(
substr(as.character(newBreaks), 1, 1)
, toAppend[floor(log10(newBreaks))]
)
ggplot(longerTest
, aes(x = Year
, y = T
)) +
geom_line() +
facet_wrap(~Period, scales = "free_x") +
scale_x_reverse(
breaks = newBreaks
, labels = newLabels
)
gives:
Here is a start:
#define the panels
breaks <- c(-Inf, 8, 80, Inf)
test$panel <- cut(test$Year, breaks, labels = FALSE)
test$panel <- ordered(test$panel, levels = unique(test$panel))
#for correct scales
dummydat <- data.frame(Year = c(0, 8, 8, 80, 80, max(test$Year)),
T = mean(test$T),
panel = ordered(rep(1:3, each = 2), levels = levels(test$panel)))
library(ggplot2)
ggplot(test, aes(x = Year, y = T, color = panel)) +
geom_line() +
geom_blank(data = dummydat) + #for correct scales
facet_wrap(~ panel, nrow = 1, scales = "free_x") +
theme_minimal() + #choose a theme you like
theme(legend.position = "none", #and customize it
panel.spacing.x = unit(0, "cm"),
strip.text = element_blank() ,
strip.background = element_blank()) +
scale_x_reverse(expand = c(0, 0))
Here's a basic example of doing it with separate plots using gridExtra. This may be useful to combine with extra grobs, for instance to create the epoch boxes across the top (not done here). If so desired, this might be best combined with Roland's solution.
# ggplot with gridExtra
library('ggplot2')
library('gridExtra')
library('grid')
d1 <- test[1:5, ]
d2 <- test[6:11, ]
d3 <- test[12:16, ]
plot1 <- ggplot(d1, aes(y = T, x = seq(1:nrow(d1)))) +
geom_line() +
ylim(c(-5, 15)) +
theme_minimal() +
theme(axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,1), "cm")) +
scale_x_continuous(breaks=)
plot2 <- ggplot(d2, aes(y = T, x = seq(1:nrow(d2)))) +
geom_line() +
ylim(c(-5, 15)) +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,0), "cm"))
plot3 <- ggplot(d3, aes(y = T, x = seq(1:nrow(d3)))) +
geom_line() +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,0), "cm")) +
ylim(c(-5, 15))
# put together
grid.arrange(plot1, plot2, plot3, nrow = 1,
widths = c(1.5,1,1)) # allow extra width for first plot which has y axis

A shared legend for z-scores and corresponding p-values in a heatmap

I have a z-scores matrix:
set.seed(1)
z.score.mat <- matrix(rnorm(1000),nrow=100,ncol=10)
which are the result of some biological experimental data, and a corresponding p-value matrix:
p.val.mat <- pnorm(abs(z.score.mat),lower.tail = F)
Both have identical dimnames:
rownames(z.score.mat) <- paste("p",1:100,sep="")
colnames(z.score.mat) <- paste("c",1:10,sep="")
rownames(p.val.mat) <- paste("p",1:100,sep="")
colnames(p.val.mat) <- paste("c",1:10,sep="")
I'm plotting a hierarchically clustered heatmap of the z-scores like this:
hc.col <- hclust(dist(z.score.mat))
dd.col <- as.dendrogram(hc.col)
col.ord <- order.dendrogram(dd.col)
hc.row <- hclust(dist(t(z.score.mat)))
dd.row <- as.dendrogram(hc.row)
row.ord <- order.dendrogram(dd.row)
clustered.mat <- z.score.mat[col.ord,row.ord]
clustered.mat.names <- attr(clustered.mat,"dimnames")
clustered.mat.df <- as.data.frame(clustered.mat)
colnames(clustered.mat.df) <- clustered.mat.names[[2]]
clustered.mat.df[,"process"] <- clustered.mat.names[[1]]
clustered.mat.df[,"process"] <- with(clustered.mat.df,factor(clustered.mat.df[,"process"],levels=clustered.mat.df[,"process"],ordered=TRUE))
require(reshape2)
clustered.mat.df <- reshape2::melt(clustered.mat.df,id.vars="process")
colnames(clustered.mat.df)[2:3] <- c("condition","z.score")
clustered.mat.df$p.value <- sapply(1:nrow(clustered.mat.df),function(x) p.val.mat[which(rownames(p.val.mat) == clustered.mat.df$process[x]),which(colnames(p.val.mat) == clustered.mat.df$condition[x])])
lab.legend <- colnames(clustered.mat.df)[3]
lab.row <- colnames(clustered.mat.df)[1]
lab.col <- colnames(clustered.mat.df)[2]
require(ggplot2)
ggplot(clustered.mat.df,aes(x=condition,y=process))+
geom_tile(aes(fill=z.score))+
scale_fill_gradient2(lab.legend,high="darkred",low="darkblue")+
theme_bw()+
theme(legend.key=element_blank(),
legend.position="right",
panel.border=element_blank(),
strip.background=element_blank(),
axis.text.x=element_text(angle=45,vjust=0.5)
)
My question is if it is possible, and how, to have on one side of the legend bar the z-score range (which is currently on the right hand) and on the other side the corresponding p-value range?
This is quite fiddly when the plot dimensions change, but you do get the required result:
br <- seq(-3, 3, 1)
lab <- round(pnorm(abs(br),lower.tail = F), 3)
p <- ggplot(clustered.mat.df,aes(x=condition,y=process))+
geom_tile(aes(fill=z.score), show.legend = FALSE)+
scale_fill_gradient2(lab.legend, high="darkred", low="darkblue", breaks = br)
p1 <- ggplot(clustered.mat.df,aes(x=condition,y=process))+
geom_tile(aes(fill=z.score))+
scale_fill_gradient2(lab.legend, high="darkred", low="darkblue", breaks = br) +
guides(fill = guide_colorbar(title = '', label.position = 'right', barheight = 10))
p2 <- ggplot(clustered.mat.df,aes(x=condition,y=process))+
geom_tile(aes(fill=z.score))+
scale_fill_gradient2(lab.legend, high="darkred", low="darkblue", breaks = br, labels = lab) +
guides(fill = guide_colorbar('', label.position = 'left', barheight = 10))
library(cowplot)
l1 <- get_legend(p1)
l2 <- get_legend(p2)
ggdraw() +
draw_plot(p, width = 0.85) +
draw_grob(l1, 0.89, 0, 0.1, 1) +
draw_grob(l2, 0.85, 0, 0.1, 1) +
draw_label('p z', 0.88, 0.675, hjust = 0)
This approach uses gtable and grid functions. It takes the legend from your plot, edits the legend so that the p values appear on the left side, then puts the edited legend back into the plot.
# Your data
set.seed(1)
z.score.mat <- matrix(rnorm(1000),nrow=100,ncol=10)
# which are the result of some biological experimental data, and a corresponding p-value matrix:
p.val.mat <- pnorm(abs(z.score.mat),lower.tail = F)
rownames(z.score.mat) <- paste("p",1:100,sep="")
colnames(z.score.mat) <- paste("c",1:10,sep="")
rownames(p.val.mat) <- paste("p",1:100,sep="")
colnames(p.val.mat) <- paste("c",1:10,sep="")
hc.col <- hclust(dist(z.score.mat))
dd.col <- as.dendrogram(hc.col)
col.ord <- order.dendrogram(dd.col)
hc.row <- hclust(dist(t(z.score.mat)))
dd.row <- as.dendrogram(hc.row)
row.ord <- order.dendrogram(dd.row)
clustered.mat <- z.score.mat[col.ord,row.ord]
clustered.mat.names <- attr(clustered.mat,"dimnames")
clustered.mat.df <- as.data.frame(clustered.mat)
colnames(clustered.mat.df) <- clustered.mat.names[[2]]
clustered.mat.df[,"process"] <- clustered.mat.names[[1]]
clustered.mat.df[,"process"] <- with(clustered.mat.df,factor(clustered.mat.df[,"process"],levels=clustered.mat.df[,"process"],ordered=TRUE))
require(reshape2)
clustered.mat.df <- reshape2::melt(clustered.mat.df,id.vars="process")
colnames(clustered.mat.df)[2:3] <- c("condition","z.score")
clustered.mat.df$p.value <- sapply(1:nrow(clustered.mat.df),function(x) p.val.mat[which(rownames(p.val.mat) == clustered.mat.df$process[x]),which(colnames(p.val.mat) == clustered.mat.df$condition[x])])
lab.legend <- colnames(clustered.mat.df)[3]
lab.row <- colnames(clustered.mat.df)[1]
lab.col <- colnames(clustered.mat.df)[2]
# Your plot
require(ggplot2)
p = ggplot(clustered.mat.df,aes(x=condition,y=process))+
geom_tile(aes(fill=z.score))+
scale_fill_gradient2(lab.legend,high="darkred",low="darkblue") +
theme_bw()+
theme(legend.key=element_blank(),
legend.position="right",
panel.border=element_blank(),
strip.background=element_blank(),
axis.text.x=element_text(angle=45,vjust=0.5))
library(gtable)
library(grid)
# Get the ggplot grob
g = ggplotGrob(p)
# Get the legend
index = which(g$layout$name == "guide-box")
leg = g$grobs[[index]]
# Get the legend labels
# and calculate corresponding p values
z.breaks = as.numeric(leg$grobs[[1]]$grobs[[3]]$label)
p.breaks = as.character(round(pnorm(abs(z.breaks), lower.tail = F), 3))
# Get the width of the longest p.break string, taking account of font and font size
w = lapply(na.omit(p.breaks), function(x) grobWidth(textGrob(x,
gp = gpar(fontsize = leg$grobs[[1]]$grobs[[3]]$gp$fontsize,
fontfamily = leg$grobs[[1]]$grobs[[3]]$gp$fontfamily))))
w = do.call(unit.pmax, w)
w = convertX(w, "mm")
# Add columns to the legend gtable to take p.breaks,
# setting the width of relevant column to w
leg$grobs[[1]] = gtable_add_cols(leg$grobs[[1]], leg$grobs[[1]]$widths[3], 1)
leg$grobs[[1]] = gtable_add_cols(leg$grobs[[1]], w, 1)
# Construct grob containing p.breaks
# Begin with the z.score grob, then make relevant changes
p.values = leg$grobs[[1]]$grobs[[3]]
p.values[c("label", "x", "hjust")] = list(p.breaks, unit(1, "npc"), 1)
# Put the p.values grob into the legend gtable
leg$grobs[[1]] = gtable_add_grob(leg$grobs[[1]], p.values, t=4, l=2,
name = "p.values", clip = "off")
# Put 'p' and 'z' labels into the legend gtable
leg$grobs[[1]] = gtable_add_grob(leg$grobs[[1]], list(textGrob("p"), textGrob("z")),
t=2, l=c(2,6), clip = "off")
# Drop the current legend title
leg$grobs[[1]]$grobs[[4]] = nullGrob()
# Put the legend back into the plot,
# and make sure the relevant column is wide enough to take the new legend
g$grobs[[index]] = leg
g$widths[8] = g$widths[8] + sum(leg$grobs[[1]]$widths[2:3])
# Draw the plot
grid.newpage()
grid.draw(g)
Not precisely what you described, but you could put both p values and z values into the same labels on one side of the legend:
z.breaks = c(-2,0,2)
p.breaks = pnorm(abs(z.breaks),lower.tail = F)
ggplot(clustered.mat.df,aes(x=condition,y=process)) +
geom_tile(aes(fill = z.score)) +
scale_fill_gradient2("z score (p value)", high="darkred",low="darkblue",
breaks = z.breaks,
labels = paste0(z.breaks, ' (p = ', round(p.breaks,2), ')') ) +
theme_bw() +
theme(legend.key = element_blank(),
legend.position = 'right',
panel.border = element_blank(),
strip.background = element_blank(),
axis.text.x=element_text(angle=45,vjust=0.5))

Parallel co-ordinates plot in R (ggparcoord)

I am facing a somewhat strange situation while plotting a parallel co-ordinates plot using ggparcoord. I am running the following code and it is running perfectly fine:
# Load required packages
require(GGally)
# Load datasets
data(state)
df <- data.frame(state.x77,
State = state.name,
Abbrev = state.abb,
Region = state.region,
Division = state.division
)
# Generate basic parallel coordinate plot
p <- ggparcoord(data = df,
# Which columns to use in the plot
columns = 1:4,
# Which column to use for coloring data
groupColumn = 11,
# Allows order of vertical bars to be modified
order = "anyClass",
# Do not show points
showPoints = FALSE,
# Turn on alpha blending for dense plots
alphaLines = 0.6,
# Turn off box shading range
shadeBox = NULL,
# Will normalize each column's values to [0, 1]
scale = "uniminmax" # try "std" also
)
# Start with a basic theme
p <- p + theme_minimal()
# Decrease amount of margin around x, y values
p <- p + scale_y_continuous(expand = c(0.02, 0.02))
p <- p + scale_x_discrete(expand = c(0.02, 0.02))
# Remove axis ticks and labels
p <- p + theme(axis.ticks = element_blank())
p <- p + theme(axis.title = element_blank())
p <- p + theme(axis.text.y = element_blank())
# Clear axis lines
p <- p + theme(panel.grid.minor = element_blank())
p <- p + theme(panel.grid.major.y = element_blank())
# Darken vertical lines
p <- p + theme(panel.grid.major.x = element_line(color = "#bbbbbb"))
# Move label to bottom
p <- p + theme(legend.position = "bottom")
# Figure out y-axis range after GGally scales the data
min_y <- min(p$data$value)
max_y <- max(p$data$value)
pad_y <- (max_y - min_y) * 0.1
# Calculate label positions for each veritcal bar
lab_x <- rep(1:4, times = 2) # 2 times, 1 for min 1 for max
lab_y <- rep(c(min_y - pad_y, max_y + pad_y), each = 4)
# Get min and max values from original dataset
lab_z <- c(sapply(df[, 1:4], min), sapply(df[, 1:4], max))
# Convert to character for use as labels
lab_z <- as.character(lab_z)
# Add labels to plot
p <- p + annotate("text", x = lab_x, y = lab_y, label = lab_z, size = 3)
# Display parallel coordinate plot
print(p)
I get the following output:
The moment I want to subset the data to display fewer region levels using the following statement:
df<-df[which(df$Region %in% c('South','West','Northeast')),]
I start receiving the following error:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
Why am I getting this error when the number of levels I want to display are clearly more than 2?
Any help on this would be much appreciated.
I figured what the problem was. I had to convert the column into factor.
df$Region <- factor(df$Region)
The above piece of code fixes the error.

Resources