ggplot2 boxplot: horizontal bar at median? - r

I would like to make a ggplot2 boxplot more meaningful by adding a thick bar at the median (so that if the median is equal to either of the lower or upper quartiles, it can be detected to which it is equal). I came across a recent post of Kohske:
Can I get boxplot notches in ggplot2?
but I didn't know how to give the "crossbar" a "height". Then I tried
to use a rectangle but it didn't work either. Here is a minimal example:
require(ggplot2)
require(reshape2)
require(plyr)
set.seed(1)
## parameters
p1 <- c(5, 20, 100)
p2 <- c("f1", "f2", "f3", "f4", "f5")
p3 <- c("g1","g2","g3","g4","g5")
N <- 1000
## lengths
l1 <- length(p1)
l2 <- length(p2)
l3 <- length(p3)
## build result array containing the measurements
arr <- array(rep(NA, l1*l2*l3*N), dim=c(l1, l2, l3, N),
dimnames=list(
p1=p1,
p2=p2,
p3=p3,
N=1:N))
for(i in 1:l1){
for(j in 1:l2){
for(k in 1:l3){
arr[i,j,k,] <- i+j+k+runif(N, min=-4, max=4)
}
}
}
arr <- arr + rexp(3*5*5*N)
## create molten data
mdf <- melt(arr, formula = . ~ p1 + p2 + p3 + N) # create molten data frame
## confidence interval calculated by `boxplot.stats`
f <- function(x){
ans <- boxplot.stats(x)
data.frame(x=x, y=ans$stats[3], ymin=ans$conf[1], ymax=ans$conf[2])
}
## (my poor) trial
ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="rectangle", colour=NA, fill="black",
xmin=x-0.36, xmax=x+0.36, ymin=max(y-0.2, ymin), ymax=min(y+0.2,
ymax)) + facet_grid(p2 ~ p1, scales = "free_y")
**SOLUTION** (after the discussion with Kohske below):
f <- function(x, height){
ans <- median(x)
data.frame(y=ans, ymin=ans-height/2, ymax=ans+height/2)
}
p <- ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="crossbar", height=0.5, colour=NA,
fill="black", width=0.78) +
facet_grid(p2 ~ p1, scales = "free_y")
pdf()
print(p)
dev.off()
**UPDATE** Hmmm... it's not that trivial. The following example shows that the "height" of the crossbar should be adapted to the y-axis scale, otherwise it might be overseen.
require(ggplot2)
require(reshape2)
require(plyr)
set.seed(1)
## parameters
p1 <- c(5, 20, 100)
p2 <- c("f1", "f2", "f3", "f4", "f5")
p3 <- c("g1","g2","g3","g4","g5")
N <- 1000
## lengths
l1 <- length(p1)
l2 <- length(p2)
l3 <- length(p3)
## build result array containing the measurements
arr <- array(rep(NA, l1*l2*l3*N), dim=c(l1, l2, l3, N),
dimnames=list(
p1=p1,
p2=p2,
p3=p3,
N=1:N))
for(i in 1:l1){
for(j in 1:l2){
for(k in 1:l3){
arr[i,j,k,] <- i+j^4+k+runif(N, min=-4, max=4)
}
}
}
arr <- arr + rexp(3*5*5*N)
arr[1,2,5,] <- arr[1,2,5,]+30
arr[1,5,3,] <- arr[1,5,3,]+100
## create molten data
mdf <- melt(arr, formula = . ~ p1 + p2 + p3 + N) # create molten data frame
f <- function(x, height){
ans <- median(x)
data.frame(y=ans, ymin=ans-height/2, ymax=ans+height/2)
}
## plot
p <- ggplot(mdf, aes(x=p3, y=value)) + geom_boxplot(outlier.shape=1) +
stat_summary(fun.data=f, geom="crossbar", height=0.7, colour=NA,
fill="black", width=0.78) +
facet_grid(p2 ~ p1, scales = "free_y")
pdf()
print(p)
dev.off()

here is an example:
f <- function(x, height) {
ans <- median(x)
data.frame(ymin = ans-height/2, ymax = ans+height/2, y = ans)
}
df <- data.frame(x=gl(2,6), y=c(1,1,1,1,3,3, 1,1,3,3,3,3))
ggplot(df, aes(x, y)) + geom_boxplot() +
stat_summary(fun.data = f, geom = "crossbar", height = 0.1,
colour = NA, fill = "skyblue", width = 0.8, alpha = 0.5)
if you just want to change the apparence, then here is a quick hack, I don't recommend though,
df <- data.frame(x=gl(2,6), y=c(c(1,1,1,1,3,3), c(1,1,3,3,3,3)*10))
ggplot(df, aes(x, y)) + geom_boxplot() + facet_grid(x~.)
gs <- grid.gget("geom_boxplot", grep = T)
if (inherits(gs, "grob")) gs <- list(gs)
gss <- llply(gs, function(g) g$children[[length(g$children)]])
l_ply(gss, function(g) grid.edit(g$name, grep=T, just = c("left", "center"), height = unit(0.05, "native"), gp = gpar(fill = "skyblue", alpha = 0.5, col = NA)))

Related

List of plots generated in ggplot2 using scale_color_gradientn have wrong coloring

I'm attempting to use library(scales) and scale_color_gradientn() to create a custom mapping of colors to a continuous variable, in an attempt to limit the effect of outliers on the coloring of my plot. This works for a single plot, but does not work when I use a loop to generate several plots and store them in a list.
Here is a minimal working example:
library(ggplot2)
library(scales)
data1 <- as.data.frame(cbind(x = rnorm(100),
y = rnorm(100),
v1 = rnorm(100, mean = 2, sd = 1),
v2 = rnorm(100, mean = -2, sd = 1)))
#add outliers
data1[1,"v1"] <- 200
data1[2,"v1"] <- -200
data1[1,"v2"] <- 50
data1[2,"v2"] <- -50
#define color palette
cols <- colorRampPalette(c("#3540FF","black","#FF3535"))(n = 100)
#simple color scale
col2 <- scale_color_gradient2(low = "#3540FF",
mid = "black",
high = "#FF3535"
)
#outlier-adjusted color scale
{
aa <- min(data1$v1)
bb <- quantile(data1$v1, 0.05)
cc <- quantile(data1$v1, 0.95)
dd <- max(data1$v1)
coln <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd))
)
}
Plots:
1. Plot with simple scales - outliers cause scales to stretch out.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
col2
2. Plot with outlier-adjusted scales - correct color scaling.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
coln
3. The scales for v1 do not work for v2 as the data is different.
ggplot(data1, aes(x = x, y = y, color = v2))+
geom_point()+
coln
#loop to produce list of plots each with own scale
{
plots <- list()
k <- 1
for (i in c("v1","v2")){
aa <- min(data1[,i])
bb <- quantile(data1[,i],0.05)
cc <- quantile(data1[,i], 0.95)
dd <- max(data1[,i])
colm <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd)))
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i
))+
geom_point()+
colm
k <- k + 1
}
}
4. First plot has the wrong scales.
plots[[1]]
5. Second plot has the correct scales.
plots[[2]]
So I'm guessing this has something to do with the scale_color_gradientn() function being called when the plotting takes place, rather than within the loop.
If anyone can help with this, it'd be much appreciated. In base R I would bin the continuous data and assigning hex colors into a vector used for fill color, but I'm unsure how I can apply this within ggplot.
You need to use a closure (function with associated environment):
{
plots <- list()
k <- 1
for (i in c("v1", "v2")){
colm <- function() {
aa <- min(data1[, i])
bb <- quantile(data1[, i], 0.05)
cc <- quantile(data1[, i], 0.95)
dd <- max(data1[, i])
scale_color_gradientn(colors = cols[c(1, 5, 95, 100)],
values = rescale(c(aa, bb, cc, dd),
limits = c(aa, dd)))
}
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i)) +
geom_point() +
colm()
k <- k + 1
}
}
plots[[1]]
plots[[2]]

R: ggplot2 and loop with data.table

I have to make 4 plots which differ only for y and ylab.
I start from a data.table dt which is
set.seed(123)
dt <- data.table(a = rnorm(20),
b = rnorm(20),
c = rnorm(20),
d = rnorm(20),
e = rnorm(20))
Every single plot should be a scatterplot with row numbers as x vs y values. Additionally, I want to plot some hline at median(y) + h*mad(y) where h = c(0, -2, 2, -3, 3)
This plot should be repeated for columns a, c, d and e of dt.
I came up with this bit of code
# Defining y labels #
ylabels <- c(bquote(phantom(.)^100*A~"/"*phantom(.)^200*A),
bquote(phantom(.)^101*C~"/"*phantom(.)^201*B),
bquote(phantom(.)^102*D~"/"*phantom(.)^202*D),
bquote(phantom(.)^103*E~"/"*phantom(.)^202*E))
# Selecting columns of dt
ydata <- names(dt)[c(1, 3, 4, 5)]
h <- c(0, -2, 2, -3, 3)
hcol <- c("#009E73", "#E69F00", "#E69F00", "red", "red")
# The for cycle should create the 4 plots and assign them to a list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(aes_string(yintercept = median(ydata[i]) +
h * mad(ydata[i])), color = hcol) +
xlab("Replicate") +
ylab(ylabels[i]) +
scale_x_continuous(breaks = seq(1, dt[,.N])))
plots[[i]] <- p1 # add each plot into plot list
}
Then plots will be fed to the multiplot function from Cookbook for R.
However my loop doesn't work properly because it fails to calculate the median and mad values.
Do you have any suggestions to make the code work?
# data.table with the median +- h* mad values
hline.values <- dt[, lapply(.SD, function(x) median(x) + h * mad(x)),
.SDcols = ydata]
# new empty list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(data = hline.values,
aes_string(yintercept = ydata[i])) +
# Axis labels and theme
xlab("Replicate") +
ylab(ylabels[[i]]) +
scale_x_continuous(breaks = seq(1, dt[, .N]))
plots[[i]] <- p1
}

How can I add significant brackets between facets of a faceted R plot using ggplot?

I’m using the following code to plot longitudinal data with the facet_grid() option. I would like to indicate significant group differences between the facet grids using brackets and asterisks. However, so far I could only add text/lines within the individual grids, and not between them.
for(i in seq_along(varlist)){
p <- ggplot(data = Plot, aes(x = Timepoint , y = eval(parse(text = varlist[i])),
group = Sub_ID, colour = Subgroup)) + geom_point() +
geom_line(linetype = "dashed")
r <- p + stat_smooth(aes(group = 1, method = "lm")) + stat_summary(aes(group = 1),
geom = "point", fun.y = mean, shape = 17, size = 5) + facet_grid(. ~ Subgroup)
ggsave(filename=paste(varlist[i],"_by_subgroup.jpg", sep=""),width = 10, height = 7.5)
}
Load the libraries
require(data.table)
require(ggplot2)
require(gtable)
Make toy data
data0 <- data.table(iris)[,list(Mean.Sepal.Length=mean(.SD[,Sepal.Length]),Mean.Petal.Length=mean(.SD[,Petal.Length])),by=list(Species)]
data1 <- melt(data0,id.vars="Species")
## ## Draw the bars
p <- ggplot(data=data1,aes(x=variable,y=value,fill=variable)) +
geom_bar(stat="identity") +
facet_grid(~Species) +
scale_x_discrete(breaks=NULL)
p
Draw the brackets and asterisks
## make function to rescale the coordinates to npc
scale_to_npc <- function(x, range) scales::rescale(c(range, x), c(0,1))[-c(1,2)]
scale_x <- function(x,facet,ranges){scale_to_npc(x,ranges[[facet]][["x.range"]])}
scale_y <- function(y,facet,ranges){scale_to_npc(y,ranges[[facet]][["y.range"]])}
## build grobs and get the ranges
gb <- ggplot_build(p)
g <- ggplot_gtable(gb)
## gtable_show_layout(g)
ranges <- gb$panel$ranges
## get and rescale the coordinates
y1 <- data1[variable=="Mean.Petal.Length",min(value)]
y3 <- data1[,max(value)]
y4 <- data1[variable=="Mean.Petal.Length",max(value)]
data2 <- data.frame(x.=c(2,2,2,2,1.5),y.=c(y1,y3*1.01,y3*1.01,y4,y3*1.01),facet=c(1,1,3,3,2))
data2b <- data.frame(
x=mapply(scale_x,data2[,1],data2[,3],MoreArgs=list(ranges =ranges)),
y=mapply(scale_y,data2[,2],data2[,3],MoreArgs=list(ranges=ranges))
)
## draw the brackets and asterisks
g <- gtable_add_grob(g, moveToGrob(data2b[1,1],data2b[1,2]),t=4,l=4,b=4,r=4)
g <- gtable_add_grob(g, lineToGrob(data2b[2,1],data2b[2,2]),t=4.5,l=4,b=4,r=4)
g <- gtable_add_grob(g, moveToGrob(data2b[2,1],data2b[2,2]),t=4.5,l=4,b=4,r=4)
g <- gtable_add_grob(g, lineToGrob(data2b[3,1],data2b[3,2]),t=4,l=8,b=4,r=8)
g <- gtable_add_grob(g, moveToGrob(data2b[3,1],data2b[3,2]),t=4,l=8,b=4,r=8)
g <- gtable_add_grob(g, lineToGrob(data2b[4,1],data2b[4,2]),t=4.5,l=8,b=4,r=8)
g <- gtable_add_grob(g, textGrob("***",data2b[5,1],data2b[5,2]),t=4,l=4,b=4,r=8)
## turn clip off to allow the line across panels
g$layout$clip <- "off"
grid.newpage()
grid.draw(g)

Putting expressions (plotmath) in the legend key labels

How can I put expressions (plotmath) into the legend key labels of the following plot?
I am aware of How to use Greek symbols in ggplot2? and the link therein, but whenever I use scale_..._manual function, I obtain a second legend (see below).
require(ggplot2)
require(reshape2)
require(plyr)
## parameters
d <- c(2, 5, 10, 20, 50, 100)
tau <- c("t1", "t2", "t3")
fam <- c("f1", "f2", "f3", "f4", "f5")
meth <- c("m1", "m2", "m3", "m4")
## lengths
nd <- length(d)
ntau <- length(tau)
nfam <- length(fam)
nmeth <- length(meth)
## build result array containing the measurements
arr <- array(rep(NA, nd*ntau*nfam*nmeth), dim=c(nd, ntau, nfam, nmeth),
dimnames=list(d=d, tau=tau, fam=fam, meth=meth))
for(i in 1:nd){
for(j in 1:ntau){
for(k in 1:nfam){
for(l in 1:nmeth){
arr[i,j,k,l] <- i+j+k+l+runif(1)
}
}
}
}
## create molten data
mdf <- reshape2:::melt.array(arr, formula = . ~ d + tau + fam + meth) # create molten data frame
mdf$tau. <- factor(mdf$tau, levels=tau, labels=paste("tau==", tau, sep="")) # expression for tau
mdf$fam. <- factor(mdf$fam, levels=fam, labels=paste("alpha==", fam, sep="")) # expression for family
meth.labs <- lapply(1:nmeth, function(i) bquote(gamma==.(i))) # expression for methods
## plot
ggplot(mdf, aes(x=d, y=value, shape=meth, linetype=meth)) + geom_line() +
geom_point() + facet_grid(fam. ~ tau., scales="free_y", labeller=label_parsed) +
## scale_linetype_manual(values=1:4, breaks=meth, labels=meth.labs) + # problem: adds another legend
scale_x_continuous(trans="log10", breaks=d, labels=d) +
scale_y_continuous(trans="log10")
If I use both scale_*_manual functions I get a single legend with the expression:
ggplot(mdf, aes(x=d, y=value, shape=meth, linetype=meth)) + geom_line() +
geom_point() + facet_grid(fam. ~ tau., scales="free_y", labeller=label_parsed) +
## scale_linetype_manual(values=1:4, breaks=meth, labels=meth.labs) + # problem: adds another legend
scale_x_continuous(trans="log10", breaks=d, labels=d) +
scale_y_continuous(trans="log10") +
scale_linetype_manual(breaks = c('m1','m2','m3','m4'),values = 1:4,labels = meth.labs) +
scale_shape_manual(breaks = c('m1','m2','m3','m4'),values = 1:4,labels = meth.labs)

ggplot2:scatterplots for all possible combinations of variables

I want to plot graphs for all possible combinations of variables. My code is below:
set.seed(12345)
a <- data.frame(Glabel=LETTERS[1:7], A=rnorm(7, mean = 0, sd = 1), B=rnorm(7, mean = 0, sd = 1), C=rnorm(7, mean = 0, sd = 1))
T <- data.frame(Tlabel=LETTERS[11:20], A=rnorm(10, mean = 0, sd = 1), B=rnorm(10, mean = 0, sd = 1), C=rnorm(10, mean = 0, sd = 1))
library(ggplot2)
for(i in 2:(ncol(a)-1))
{
for(j in (i+1):ncol(a))
{
r <- 0.08
p <- ggplot(data=a, mapping=aes(x=a[, i], y=a[, j])) + geom_point() + theme_bw()
p <- p + geom_text(data=a, mapping=aes(x=a[, i], y=a[, j], label=Glabel),
size=3, vjust=1.35, colour="black")
p <- p + geom_segment(data = T, aes(xend = T[ ,i], yend=T[ ,j]),
x=0, y=0, colour="black",
arrow=arrow(angle=25, length=unit(0.25, "cm")))
p <- p + geom_text(data=T, aes(x=T[ ,i], y=T[ ,j], label=Tlabel), size=3, vjust=0, colour="red")
dev.new()
print(p)
}
}
This code works fine. But the method used here is not recommended (See #baptiste comment) and does not work in function. I want to know what is the best and recommended way to accomplish this task. Thanks in advance for your help.
Alright this is garbage but the best I could do. It's super inefficient as it recreates a partial data with each loop through lapply. Maybe someone else has something better:
MAT <- outer(names(df)[-1], names(df)[-1], paste)
combs <- sapply(MAT[lower.tri(MAT)], function(x) strsplit(x, " "))
ind <- lapply(combs, function(x) match(x, names(df)))
plotter <- function(cn) { #start junky function
NAMES <- colnames(df)[cn]
df2 <- df[cn]
names(df2)<- c('x1', 'x2')
p <- ggplot(data=df2, aes(x1, x2)) + geom_point() + theme_bw() +
scale_x_continuous(name=NAMES[1]) +
scale_y_continuous(name=NAMES[2])
dev.new()
print(p)
} #end of junky function
lapply(ind, function(x) plotter(cn=x))
EDIT: This is a bit better:
x <- match(names(df)[-1], names(df))
MAT <- outer(x, x, paste)
combs <- t(sapply(MAT[lower.tri(MAT)], function(x) as.numeric(unlist(strsplit(x, " ")))))
plotter <- function(cn) {
NAMES <- colnames(df)[cn]
df2 <- df[cn]
names(df2)<- c('x1', 'x2')
p <- ggplot(data=df2, aes(x1, x2)) + geom_point() + theme_bw() +
scale_x_continuous(name=NAMES[1]) +
scale_y_continuous(name=NAMES[2])
dev.new()
print(p)
}
apply(combs, 1, function(x) plotter(cn=x))

Resources