Related
ggrepel provides an excellent series of functions for annotating ggplot2 graphs and the examples page contains lots of nice hints of how to expand its functionality, including moving the labels generated away from both the axes of the plot, other labels, and so on.
However, one thing that isn't covered is moving the labels away from manually drawn lines with geom_hline() and geom_vline(), as may occur, for example, in making an annotated volcano plot.
Here's a simple MWE to highlight the problem:
library("tidyverse")
library("ggrepel")
dat <- subset(mtcars, wt > 2.75 & wt < 3.45)
dat$car <- rownames(dat)
ggplot(dat, aes(wt, mpg, label = car)) +
geom_point(color = "red") +
geom_text_repel(seed = 1) + #Seed for reproducibility
geom_vline(xintercept = 3.216) + #Deliberately chosen "bad" numbers
geom_hline(yintercept = 19.64) + theme_bw()
This produces the following output:
Note how the lines overlap the text of the labels and obscure it (is that "Horret 4 Drive" or "Hornet 4 Drive"?)
Jiggling the points about a bit post facto you can make a far nicer fit – I have simply shifted some of the labels a tiny bit to get them off the line.
Is it possible to get ggrepel to do this automatically? I know the example given isn't totally stable (other seeds give acceptable results) but for complex plots with a large number of points it definitely is a problem.
Edit: If you're curious, a far less "minimum" working example would be the below (taken from bioconductor):
download.file("https://raw.githubusercontent.com/biocorecrg/CRG_RIntroduction/master/de_df_for_volcano.rds", "de_df_for_volcano.rds", method="curl")
tmp <- readRDS("de_df_for_volcano.rds")
de <- tmp[complete.cases(tmp), ]
de$diffexpressed <- "NO"
# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"
de$diffexpressed[de$log2FoldChange > 0.6 & de$pvalue < 0.05] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
de$diffexpressed[de$log2FoldChange < -0.6 & de$pvalue < 0.05] <- "DOWN"
# Create a new column "delabel" to de, that will contain the name of genes differentially expressed (NA in case they are not)
de$delabel <- NA
de$delabel[de$diffexpressed != "NO"] <- de$gene_symbol[de$diffexpressed != "NO"]
#Actually do plot
ggplot(data=de, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
This produces the below, where the text-overlapping-lines problem is quite obvious:
I don't think there's a built-in way to do this.
A non-elegant hack off the top of my head is to add invisible points along the intercept lines which the labels will then repel away from.
dat <- subset(mtcars, wt > 2.75 & wt < 3.45)
dat$car <- rownames(dat)
xintercept = 3.216
yintercept = 19.64
dat %>%
mutate(alpha = 1) %>%
bind_rows(.,
tibble(wt = seq(from = min(.$wt), to = max(.$wt), length.out = 20), mpg = yintercept, car = '', alpha = 0),
tibble(wt = xintercept, mpg = seq(from = min(.$mpg), to = max(.$mpg), length.out = 20), car = '', alpha = 0)
) %>%
ggplot(aes(wt, mpg, label = car, alpha = alpha)) +
geom_point(color = "red") +
geom_text_repel(seed = 1) + #Seed for reproducibility
geom_vline(xintercept = xintercept) +
geom_hline(yintercept = yintercept) + theme_bw() +
scale_alpha_identity()
One (admittedly unorthodox) solution would be to plot "invisible" text along the intercept lines and thus trick geom_text_repel into staying away from them. The complication is that you have to add several filler rows to your data set and then modify the plot to render the filler as invisible. But the end result is pretty clean:
dat2 <- bind_rows(
data.frame(wt = seq(min(dat$wt), max(dat$wt), length = 20), mpg = 19.64, car = 'O'),
data.frame(mpg = seq(min(dat$mpg), max(dat$mpg), length = 20), wt = 3.216, car = 'O'),
dat
)
ggplot(dat2, aes(wt, mpg, label = car)) +
geom_point(data = filter(dat2, car != 'O'), color = "red") +
geom_text_repel(aes(color = car == 'O'), seed = 1, show.legend = F) + #Seed for reproducibility
geom_vline(xintercept = 3.216) + #Deliberately chosen "bad" numbers
geom_hline(yintercept = 19.64) +
scale_color_manual(values = c('black', 'transparent'))
theme_bw()
I'm not sure if there's any functions that allows ggrepel to do this automatically. One way to hack around this is to create multiple subsets of data, and add nudge to the label. Here I used the volcano plot as an example.
library(ggplot2)
library(ggrepel)
ggplot(data=de, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel(data = subset(de, log2FoldChange < -0.6),
nudge_x = -0.05) +
geom_text_repel(data = subset(de, log2FoldChange > 0.6),
nudge_x = 0.08) +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
I would like to plot a graph from a Discriminant Function Analysis in which points must have a black border and be filled with specific colors and confidence ellipses must be the same color as the points are filled. Using the following code, I get almost the graph I want, except that points do not have a black border:
library(ggplot2)
library(ggord)
library(MASS)
data("iris")
set.seed(123)
linear <- lda(Species~., iris)
linear
dfaplot <- ggord(linear, iris$Species, labcol = "transparent", arrow = NULL, poly = FALSE, ylim = c(-11, 11), xlim = c(-11, 11))
dfaplot +
scale_shape_manual(values = c(16,15,17)) +
scale_color_manual(values = c("#00FF00","#FF00FF","#0000FF")) +
theme(legend.position = "none")
PLOT 1
I could put a black border on the points by using the following code, but then confidence ellipses turn black.
dfaplot +
scale_shape_manual(values = c(21,22,24)) +
scale_color_manual(values = c("black","black","black")) +
scale_fill_manual(values = c("#00FF00","#FF00FF","#0000FF")) +
theme(legend.position = "none")
PLOT 2
I would like to keep the ellipses as in the first graph, but the points as in the second one. However, I am being unable to figure out how I could do this. If anyone has suggestions on how to do this, I would be very grateful. I am using the "ggord" package because I learned how to run the analysis using it, but if anyone has suggestions on how to do the same with only ggplot, it would be fine.
This roughly replicates what is going on in ggord. Looking at the source for the package, the ellipses are implemented differently in ggord than below, hence the small differences. If that is a big deal you can review the source and make changes. By default, geom_point doesn't have a fill attribute. So we set the shapes to a character type that does, and then specify color = 'black' in geom_point(). The full code (including projecting the original data) is below.
set.seed(123)
linear <- lda(Species~., iris)
linear
# Get point x, y coordinates
df <- data.frame(predict(linear, iris[, 1:4]))
df$species <- iris$Species
# Get explained variance for each axis
var_exp <- 100 * linear$svd ^ 2 / sum(linear$svd ^ 2)
ggplot(data = df,
aes(x = x.LD1,
y = x.LD2)) +
geom_point(aes(fill = species,
shape = species),
size = 4) +
stat_ellipse(aes(color = species),
level = 0.95) +
ylim(c(-11, 11)) +
xlim(c(-11, 11)) +
ylab(paste("LD2 (",
round(var_exp[2], 2),
"%)")) +
xlab(paste("LD1 (",
round(var_exp[1], 2),
"%)")) +
scale_color_manual(values = c("#00FF00","#FF00FF","#0000FF")) +
scale_fill_manual(values = c("#00FF00","#FF00FF","#0000FF")) +
scale_shape_manual(values = c(21, 22, 24)) +
coord_fixed(1) +
theme_bw() +
theme(
legend.position = "none"
)
To plot arrows, you can grab the scaling from the output it and plot it with geom_segment. I played with the colors/alpha so they were visible in the plot below.
scaling <- data.frame(linear$scaling)
...
geom_segment(data = scaling,
aes(x = 0,
y = 0,
xend = LD1,
yend = LD2),
arrow = arrow(),
color = "black") +
geom_text(data = scaling,
aes(x = ifelse(LD1 <= 0.1, LD1 - 2, LD1 + 2),
y = ifelse(LD2 <= 0.1, LD2 - 1, LD2 + 1)),
label = rownames(scaling),
color = "black") +
...
Data: Data
Code:
## Load the data
ifpricc = read.csv(file = "IFPRI_CCAgg2050.csv", heade=TRUE)
#-----------------------------------------------------------------------
# Plotting Kernel density distribution for the final yield impact data
#-----------------------------------------------------------------------
ifpricc.df = as.data.frame(ifpricc)
ifpricc_mlt.df = melt(ifpricc.df, id.vars=c("crop","codereg","reg","sres","gcm","scen"))
kernel = ggplot(data=subset(ifpricc_mlt.df, reg %in% c("Canada","United States","Oceania","OECD Europe","Eastern Europe","Former USSR") & gcm %in% c("CSIRO","MIROC","noCC")),
aes(x = value, y = ..density..))
kernel = kernel + geom_density(aes(fill = gcm), alpha=.4, subset = .(crop %in% c("WHET")),
position="identity", stat="density", size=0.75,
bw = "nrd0", adjust = 1.5,
kernel = c("gaussian"))
kernel = kernel + scale_fill_manual(name="GCM model",breaks=c("CSIRO","MIROC","noCC"), values=c("red","blue","gray80"))
kernel = kernel + facet_grid(sres ~ reg, scale="free") + scale_y_continuous(breaks=seq(0,2,.25))
kernel = kernel + labs(title="Kernel density distribution - with and without climate change", y="Density", x="Yield") + theme_bw()
kernel = kernel + theme(plot.title=element_text(face="bold", size=rel(2), hjust=0.5, vjust=1.5, family="serif"),
axis.text.x=element_text(color="black", size=rel(2), hjust=0.5, family="serif"),
axis.text.y=element_text(color="black", size=rel(2), hjust=1, family="serif"),
axis.title.x=element_text(face="bold", color="black", size=rel(1.6), hjust=0.5, vjust=0.2, family="serif"),
axis.title.y=element_text(face="bold", color="black", size=rel(1.6), hjust=0.5, vjust=0.2, family="serif"),
strip.text=element_text(face="bold", size=rel(1.5), family="serif"),
legend.text=element_text(face="bold", size=rel(1.25), family="serif"),
legend.title=element_text(face="bold", size=rel(1.45), family="serif"))
Results:
Question:
What I am trying to achieve here is to plot Kernel density curves. My problem is that I want to overlay the baseline kernel curves (in lower facet) over the colored ones (two upper facets) and which represent deviations from the baseline. Any help will be greatly appreciated.
Cheers :)
Alternative Question:
So I tinkered a bit after looking up potential solutions on the website, and I came up with this: instead of faceting using facet_grid(sres ~ reg) by "sres" x "reg", I faceted by using facet_wrap(~ reg). It produces something closer to what I am intending .
The problem now is that I cannot identify the distribution by "sres", which is what I am looking for. To solve this, I thought to annotate the plot by adding vertical lines that plot the mean of the data by "sres". But I kind of am at loss of how to move from here.
Any suggestions?
If I understand, I think this is what you want. You need to rearrange the data: repeat the lines in the data frame containing the noCC factor, once with sres = A1B, and once with sres = B1. That way, the noCC density curve will appear in the A1B facets and in the B1 facets.
In addition, the melting of the data frame has no effect other than to create a column of 1s. Also, I do the subsetting outside the call to ggplot2.
library(ggplot2)
ifpricc = read.csv(file = "IFPRI_CCAgg2050.csv", heade=TRUE)
# Subset the data frame
df = subset(ifpricc,
reg %in% c("Canada","United States","Oceania","OECD Europe","Eastern Europe","Former USSR") &
gcm %in% c("CSIRO","MIROC","noCC") &
crop %in% c("WHET"))
# Manipulate the data frame
x = df[df$sres == "PM", ]
x = rbind(x, x)
x$sres = rep(c("A1B", "B1"), each = dim(x)[1]/2)
df = df[df$sres != "PM",]
df = rbind(df, x)
# Draw the plot
ggplot(data=df, aes(x = yield, fill = gcm)) +
geom_density(alpha=.4, size=0.75, adjust = 1.5) +
scale_fill_manual(name="GCM model",breaks=c("CSIRO","MIROC","noCC"),
values=c("red","blue","gray80")) +
facet_grid(sres ~ reg, scale="free") + scale_y_continuous(breaks=seq(0,2,.25))
EDIT: The facet_wrap version:
The idea is to draw two charts: one for A1B, and the second for B1; then to put the two charts together using functions from the gridExtra package. But that will give a legend for each chart. It would look better with only one legend. Therefore, draw one of the charts so that its legend can be extracted. Then draw the two charts without their legends, and put the two charts and the legend together.
library(ggplot)
library(gridExtra)
library(gtable)
ifpricc = read.csv(file = "IFPRI_CCAgg2050.csv", heade=TRUE)
# Subset the data frame
df = subset(ifpricc,
reg %in% c("Canada","United States","Oceania","OECD Europe","Eastern Europe","Former USSR") &
gcm %in% c("CSIRO","MIROC","noCC") &
crop %in% c("WHET"))
# Draw first chart
p1 = ggplot(data=df[df$sres != "B1", ], aes(x = yield, fill = gcm)) +
geom_density(alpha=.4, size=0.75, adjust = 1.5) +
ggtitle("sres = A1B") +
scale_fill_manual(name="GCM model",breaks=c("CSIRO","MIROC","noCC"),
values=c("red","blue","gray80")) +
facet_wrap( ~ reg, scales = "free_x") + scale_y_continuous(breaks=seq(0,2,.25))
# Extract its legend
legend = gtable_filter(ggplot_gtable(ggplot_build(p1)), "guide-box")
# Redraw the first chart without its legend
p1 = p1 + guides(fill = FALSE)
# Draw the second chart without its legend
p2 = ggplot(data=df[df$sres != "A1B", ], aes(x = yield, fill = gcm)) +
geom_density(alpha=.4, size=0.75, adjust = 1.5) +
ggtitle("sres = B1") +
scale_fill_manual(name="GCM model",breaks=c("CSIRO","MIROC","noCC"),
values=c("red","blue","gray80"), guide = "none") +
facet_wrap( ~ reg, scales = "free_x") + scale_y_continuous(breaks=seq(0,2,.25))
# Combine the two charts and the legend (and a main title)
grid.arrange(arrangeGrob(p1, p2, ncol = 1),
legend, widths = unit.c(unit(1, "npc") - legend$width, legend$width), nrow = 1,
main = textGrob("Kernel density distribution - with and without climate change",
vjust = 1, gp = gpar(fontface = "bold")))
I have a dataframe in R like this:
dat = data.frame(Sample = c(1,1,2,2,3), Start = c(100,300,150,200,160), Stop = c(180,320,190,220,170))
And I would like to plot it such that the x-axis is the position and the y-axis is the number of samples at that position, with each sample in a different colour. So in the above example you would have some positions with height 1, some with height 2 and one area with height 3. The aim being to find regions where there are a large number of samples and what samples are in that region.
i.e. something like:
&
---
********- -- **
where * = Sample 1, - = Sample 2 and & = Sample 3
My first try:
dat$Sample = factor(dat$Sample)
ggplot(aes(x = Start, y = Sample, xend = Stop, yend = Sample, color = Sample), data = dat) +
geom_segment(size = 2) +
geom_segment(aes(x = Start, y = 0, xend = Stop, yend = 0), size = 2, alpha = 0.2, color = "black")
I combine two segment geometries here. One draws the colored vertical bars. These show where Samples have been measured. The second geometry draws the grey bar below where the density of the samples is shown. Any comments to improve on this quick hack?
This hack may be what you're looking for, however I've greatly increased the size of the dataframe in order to take advantage of stacking by geom_histogram.
library(ggplot2)
dat = data.frame(Sample = c(1,1,2,2,3),
Start = c(100,300,150,200,160),
Stop = c(180,320,190,220,170))
# Reformat the data for plotting with geom_histogram.
dat2 = matrix(ncol=2, nrow=0, dimnames=list(NULL, c("Sample", "Position")))
for (i in seq(nrow(dat))) {
Position = seq(dat[i, "Start"], dat[i, "Stop"])
Sample = rep(dat[i, "Sample"], length(Position))
dat2 = rbind(dat2, cbind(Sample, Position))
}
dat2 = as.data.frame(dat2)
dat2$Sample = factor(dat2$Sample)
plot_1 = ggplot(dat2, aes(x=Position, fill=Sample)) +
theme_bw() +
opts(panel.grid.minor=theme_blank(), panel.grid.major=theme_blank()) +
geom_hline(yintercept=seq(0, 20), colour="grey80", size=0.15) +
geom_hline(yintercept=3, linetype=2) +
geom_histogram(binwidth=1) +
ylim(c(0, 20)) +
ylab("Count") +
opts(axis.title.x=theme_text(size=11, vjust=0.5)) +
opts(axis.title.y=theme_text(size=11, angle=90)) +
opts(title="Segment Plot")
png("plot_1.png", height=200, width=650)
print(plot_1)
dev.off()
Note that the way I've reformatted the dataframe is a bit ugly, and will not scale well (e.g. if you have millions of segments and/or large start and stop positions).
I am new to R and am trying to plot 3 histograms onto the same graph.
Everything worked fine, but my problem is that you don't see where 2 histograms overlap - they look rather cut off.
When I make density plots, it looks perfect: each curve is surrounded by a black frame line, and colours look different where curves overlap.
Can someone tell me if something similar can be achieved with the histograms in the 1st picture? This is the code I'm using:
lowf0 <-read.csv (....)
mediumf0 <-read.csv (....)
highf0 <-read.csv(....)
lowf0$utt<-'low f0'
mediumf0$utt<-'medium f0'
highf0$utt<-'high f0'
histogram<-rbind(lowf0,mediumf0,highf0)
ggplot(histogram, aes(f0, fill = utt)) + geom_histogram(alpha = 0.2)
Using #joran's sample data,
ggplot(dat, aes(x=xx, fill=yy)) + geom_histogram(alpha=0.2, position="identity")
note that the default position of geom_histogram is "stack."
see "position adjustment" of this page:
geom_histogram documentation
Your current code:
ggplot(histogram, aes(f0, fill = utt)) + geom_histogram(alpha = 0.2)
is telling ggplot to construct one histogram using all the values in f0 and then color the bars of this single histogram according to the variable utt.
What you want instead is to create three separate histograms, with alpha blending so that they are visible through each other. So you probably want to use three separate calls to geom_histogram, where each one gets it's own data frame and fill:
ggplot(histogram, aes(f0)) +
geom_histogram(data = lowf0, fill = "red", alpha = 0.2) +
geom_histogram(data = mediumf0, fill = "blue", alpha = 0.2) +
geom_histogram(data = highf0, fill = "green", alpha = 0.2) +
Here's a concrete example with some output:
dat <- data.frame(xx = c(runif(100,20,50),runif(100,40,80),runif(100,0,30)),yy = rep(letters[1:3],each = 100))
ggplot(dat,aes(x=xx)) +
geom_histogram(data=subset(dat,yy == 'a'),fill = "red", alpha = 0.2) +
geom_histogram(data=subset(dat,yy == 'b'),fill = "blue", alpha = 0.2) +
geom_histogram(data=subset(dat,yy == 'c'),fill = "green", alpha = 0.2)
which produces something like this:
Edited to fix typos; you wanted fill, not colour.
While only a few lines are required to plot multiple/overlapping histograms in ggplot2, the results are't always satisfactory. There needs to be proper use of borders and coloring to ensure the eye can differentiate between histograms.
The following functions balance border colors, opacities, and superimposed density plots to enable the viewer to differentiate among distributions.
Single histogram:
plot_histogram <- function(df, feature) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)))) +
geom_histogram(aes(y = ..density..), alpha=0.7, fill="#33AADE", color="black") +
geom_density(alpha=0.3, fill="red") +
geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x=feature, y = "Density")
print(plt)
}
Multiple histogram:
plot_multi_histogram <- function(df, feature, label_column) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
geom_density(alpha=0.7) +
geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x=feature, y = "Density")
plt + guides(fill=guide_legend(title=label_column))
}
Usage:
Simply pass your data frame into the above functions along with desired arguments:
plot_histogram(iris, 'Sepal.Width')
plot_multi_histogram(iris, 'Sepal.Width', 'Species')
The extra parameter in plot_multi_histogram is the name of the column containing the category labels.
We can see this more dramatically by creating a dataframe with many different distribution means:
a <-data.frame(n=rnorm(1000, mean = 1), category=rep('A', 1000))
b <-data.frame(n=rnorm(1000, mean = 2), category=rep('B', 1000))
c <-data.frame(n=rnorm(1000, mean = 3), category=rep('C', 1000))
d <-data.frame(n=rnorm(1000, mean = 4), category=rep('D', 1000))
e <-data.frame(n=rnorm(1000, mean = 5), category=rep('E', 1000))
f <-data.frame(n=rnorm(1000, mean = 6), category=rep('F', 1000))
many_distros <- do.call('rbind', list(a,b,c,d,e,f))
Passing data frame in as before (and widening chart using options):
options(repr.plot.width = 20, repr.plot.height = 8)
plot_multi_histogram(many_distros, 'n', 'category')
To add a separate vertical line for each distribution:
plot_multi_histogram <- function(df, feature, label_column, means) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
geom_density(alpha=0.7) +
geom_vline(xintercept=means, color="black", linetype="dashed", size=1)
labs(x=feature, y = "Density")
plt + guides(fill=guide_legend(title=label_column))
}
The only change over the previous plot_multi_histogram function is the addition of means to the parameters, and changing the geom_vline line to accept multiple values.
Usage:
options(repr.plot.width = 20, repr.plot.height = 8)
plot_multi_histogram(many_distros, "n", 'category', c(1, 2, 3, 4, 5, 6))
Result:
Since I set the means explicitly in many_distros I can simply pass them in. Alternatively you can simply calculate these inside the function and use that way.