No outliers in ggplot boxplot with facet_wrap - r

I would like to plot boxplots without outliers with ggplot, giving focus on the boxes and whiskers only
For example:
p1 <- ggplot(diamonds, aes(x=cut, y=price, fill=cut))
p1 + geom_boxplot() + facet_wrap(~clarity, scales="free")
gives facetted boxplots with outliers
I can suppress outliers with outlier.size=NA:
p1 <- ggplot(diamonds, aes(x=cut, y=price, fill=cut))
p1 + geom_boxplot(outlier.size=NA) + facet_wrap(~clarity, scales="free")
which gives
Here, the y-axis scale is the same as in the original plot, just the outliers don't show up. How can I now modify the scale to "zoom in" on each panel according to the whisker ends?
I can reset ylim like this
ylim1 = boxplot.stats(diamonds$price)$stats[c(1, 5)]
and then replot
p1 + geom_boxplot(outlier.size=NA)
+ facet_wrap(~clarity, scales="free")
+ coord_cartesian(ylim = ylim1*1.05)
but this doesn't work on the facets:
Is there a way to "facet_wrap" the boxplots.stats function?
Edit:
I've tried to calculate the boxplot statistics dynamically, but this doesn't seem to work.
give.stats <- function(x){return(boxplot.stats(x)$stats[c(1,5)])}
p1 + geom_boxplot(outlier.size=NA) +
facet_wrap(~clarity, scales="free") +
coord_cartesian(ylim = give.stats)
> Error in min(x, na.rm = na.rm) : invalid 'type' (list) of argument
Any more ideas would be much appreciated.

It can be done with stat_summary and custom statistic calculation function:
calc_boxplot_stat <- function(x) {
coef <- 1.5
n <- sum(!is.na(x))
# calculate quantiles
stats <- quantile(x, probs = c(0.0, 0.25, 0.5, 0.75, 1.0))
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
# set whiskers
outliers <- x < (stats[2] - coef * iqr) | x > (stats[4] + coef * iqr)
if (any(outliers)) {
stats[c(1, 5)] <- range(c(stats[2:4], x[!outliers]), na.rm = TRUE)
}
return(stats)
}
ggplot(diamonds, aes(x=cut, y=price, fill=cut)) +
stat_summary(fun.data = calc_boxplot_stat, geom="boxplot") +
facet_wrap(~clarity, scales="free")
The stats calculation function is generic, thus no need for data manipulation before plotting.
It is also possible to set whiskers to 10% and 90% :
calc_stat <- function(x) {
coef <- 1.5
n <- sum(!is.na(x))
# calculate quantiles
stats <- quantile(x, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
return(stats)
}
ggplot(diamonds, aes(x=cut, y=price, fill=cut)) +
stat_summary(fun.data = calc_stat, geom="boxplot") +
facet_wrap(~clarity, scales="free")

Through outlier.size=NA you make the outliers disappear, this is not an option to ignore the outliers plotting the boxplots. So, the plots are generated considering the (invisible) outliers. There seems to be no option for what you want. In order to make the boxplots as you need them I would calculate the quantiles myself and generate the boxplots based on these quantiles, like in the following example:
stat<-tapply(diamonds$price,list(diamonds$cut,diamonds$clarity),function(x) boxplot.stats(x))
stats<-unlist(tapply(diamonds$price,list(diamonds$cut,diamonds$clarity),function(x) boxplot.stats(x)$stats))
df<-data.frame(
cut=rep(rep(unlist(dimnames(stat)[1]),each=5),length(unlist(dimnames(stat)[2]))),
clarity=rep(unlist(dimnames(stat)[2]),each=25),
price=unlist(tapply(diamonds$price,list(diamonds$cut,diamonds$clarity),function(x) boxplot.stats(x)$stats)))
ggplot(df,aes(x=cut,y=price,fill=cut))+geom_boxplot()+facet_wrap(~clarity,scales="free")
Which gives (note that the orders in the plot are different now):

Ok, I figured out a more easy way to do this by commenting out some lines in the original ggplot boxplot function and calling the modified function.
I am not a programmer, no idea if this is a good or robust thing to do, but it seems to work fine for now.
This is the modified function I am using:
#modified version of geom_boxplot
require(ggplot2)
geom_boxplot_noOutliers <- function (mapping = NULL, data = NULL, stat = "boxplot",
position = "dodge", outlier.colour = NULL,
outlier.shape = NULL, outlier.size = NULL,
notch = FALSE, notchwidth = .5, varwidth = FALSE,
...) {
#outlier_defaults <- ggplot2:::Geom$find('point')$default_aes()
#outlier.colour <- outlier.colour %||% outlier_defaults$colour
#outlier.shape <- outlier.shape %||% outlier_defaults$shape
#outlier.size <- outlier.size %||% outlier_defaults$size
GeomBoxplot_noOutliers$new(mapping = mapping, data = data, stat = stat,
position = position, outlier.colour = outlier.colour,
outlier.shape = outlier.shape, outlier.size = outlier.size, notch = notch,
notchwidth = notchwidth, varwidth = varwidth, ...)
}
GeomBoxplot_noOutliers <- proto(ggplot2:::Geom, {
objname <- "boxplot_noOutliers"
reparameterise <- function(., df, params) {
df$width <- df$width %||%
params$width %||% (resolution(df$x, FALSE) * 0.9)
# if (!is.null(df$outliers)) {
# suppressWarnings({
# out_min <- vapply(df$outliers, min, numeric(1))
# out_max <- vapply(df$outliers, max, numeric(1))
# })
#
# df$ymin_final <- pmin(out_min, df$ymin)
# df$ymax_final <- pmax(out_max, df$ymax)
# }
# if `varwidth` not requested or not available, don't use it
if (is.null(params) || is.null(params$varwidth) || !params$varwidth || is.null(df$relvarwidth)) {
df$xmin <- df$x - df$width / 2
df$xmax <- df$x + df$width / 2
} else {
# make `relvarwidth` relative to the size of the largest group
df$relvarwidth <- df$relvarwidth / max(df$relvarwidth)
df$xmin <- df$x - df$relvarwidth * df$width / 2
df$xmax <- df$x + df$relvarwidth * df$width / 2
}
df$width <- NULL
if (!is.null(df$relvarwidth)) df$relvarwidth <- NULL
df
}
draw <- function(., data, ..., fatten = 2, outlier.colour = NULL, outlier.shape = NULL, outlier.size = 2,
notch = FALSE, notchwidth = .5, varwidth = FALSE) {
common <- data.frame(
colour = data$colour,
size = data$size,
linetype = data$linetype,
fill = alpha(data$fill, data$alpha),
group = data$group,
stringsAsFactors = FALSE
)
whiskers <- data.frame(
x = data$x,
xend = data$x,
y = c(data$upper, data$lower),
yend = c(data$ymax, data$ymin),
alpha = NA,
common)
box <- data.frame(
xmin = data$xmin,
xmax = data$xmax,
ymin = data$lower,
y = data$middle,
ymax = data$upper,
ynotchlower = ifelse(notch, data$notchlower, NA),
ynotchupper = ifelse(notch, data$notchupper, NA),
notchwidth = notchwidth,
alpha = data$alpha,
common)
# if (!is.null(data$outliers) && length(data$outliers[[1]] >= 1)) {
# outliers <- data.frame(
# y = data$outliers[[1]],
# x = data$x[1],
# colour = outlier.colour %||% data$colour[1],
# shape = outlier.shape %||% data$shape[1],
# size = outlier.size %||% data$size[1],
# fill = NA,
# alpha = NA,
# stringsAsFactors = FALSE)
# outliers_grob <- GeomPoint$draw(outliers, ...)
# } else {
outliers_grob <- NULL
# }
ggname(.$my_name(), grobTree(
outliers_grob,
GeomSegment$draw(whiskers, ...),
GeomCrossbar$draw(box, fatten = fatten, ...)
))
}
guide_geom <- function(.) "boxplot_noOutliers"
draw_legend <- function(., data, ...) {
data <- aesdefaults(data, .$default_aes(), list(...))
gp <- with(data, gpar(col=colour, fill=alpha(fill, alpha), lwd=size * .pt, lty = linetype))
gTree(gp = gp, children = gList(
linesGrob(0.5, c(0.1, 0.25)),
linesGrob(0.5, c(0.75, 0.9)),
rectGrob(height=0.5, width=0.75),
linesGrob(c(0.125, 0.875), 0.5)
))
}
default_stat <- function(.) StatBoxplot
default_pos <- function(.) PositionDodge
default_aes <- function(.) aes(weight=1, colour="grey20", fill="white", size=0.5, alpha = NA, shape = 16, linetype = "solid")
required_aes <- c("x", "lower", "upper", "middle", "ymin", "ymax")
})
I saved it as an r file and use source to load it:
library(ggplot2)
library(scales)
#load functions
source("D:/Eigene Dateien/Scripte/R-Scripte/myfunctions/geomBoxplot_noOutliers.r")
Now I can just plot without outliers using geom_boxplot_noOutliers and everything works fine even with facets :-)
p1 <- ggplot(diamonds, aes(x=cut, y=price, fill=cut))
p1 + geom_boxplot_noOutliers() + facet_wrap(~clarity, scales="free")

In your case, I am thinking that limiting the display range could work, since all the outliers are larger than 10000.
p1 + geom_boxplot() + ylim(0,10000)

Related

Can't apply scale_x_log10() to my own geom: it appears on plot incorrectly

I'm trying to understand how ggproto works to write my own geoms.
I wrote geom_myerrorbarh (analogous to geom_errorbarh, but only with x,y, xwidth arguments). The figure below shows that everything works correctly at a linear scale. However, if you use the log10 scale, it is different from geom_errorbarh.
I noticed that when using scale_x_log10(), x=log10(x) is converted first, and then xmin=x-xwidth; xmax=x+xwidth (see setup_data argument). But it should be xmin=log10(x-width); xmax=log10(x+xwidth).
How to solve this problem?
library(grid)
library(ggplot2)
library(patchwork)
theme_set(theme_minimal())
GeomMyerrorbarh <- ggproto("GeomMyerrorbarh", Geom,
required_aes = c("x", "y", "xwidth"),
draw_key = draw_key_path,
setup_data = function(data, params){
transform(data, xmin = x - xwidth, xmax = x + xwidth)
},
draw_group = function(data, panel_scales, coord) {
## Transform the data first
coords <- coord$transform(data, panel_scales)
## Construct a grid grob
grid::segmentsGrob(
x0 = coords$xmin,
x1 = coords$xmax,
y0 = coords$y,
y1 = coords$y,
gp = gpar(lwd = coords$size,
col = coords$colour,
alpha = coords$alpha))
})
geom_myerrorbarh <- function(mapping = NULL, data = NULL, stat = "identity",
position = "identity", na.rm = FALSE,
show.legend = NA, inherit.aes = TRUE, ...) {
ggplot2::layer(
geom = GeomMyerrorbarh, mapping = mapping,
data = data, stat = stat, position = position,
show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, ...)
)
}
df <- data.frame(x = c(1, 2),
y = c(1, 2),
xerr = c(0.1, 0.2))
p1 <- ggplot(df, aes(x, y)) +
geom_point() +
geom_errorbarh(aes(xmin = x - xerr, xmax = x + xerr),
height=0, size=4, alpha=0.2, color='red') +
geom_myerrorbarh(aes(xwidth = xerr)) +
labs(subtitle = 'Linear scale x')
p2 <- p1 +
scale_x_log10() +
labs(subtitle = 'Log10 scale x')
# Plot:
# Red transparent region - geom_errorbarh
# Black line - geom_myerrorbarh
p1 | p2

Grid as bars in ggplot

A common layout in many sites is to draw the grid as shaded bars:
I'm doing this with this function:
grid_bars <- function(data, y, n = 5, fill = "gray90") {
breaks <- pretty(data[[y]], n)
len <- length(breaks)-1
all_bars <- data.frame(
b.id = rep(1:len, 4),
b.x = c(rep(-Inf, len), rep(Inf, len*2), rep(-Inf, len)),
b.y = c(rep(breaks[-length(breaks)], 2), rep(breaks[-1], 2))
)
bars <- all_bars[all_bars$b.id %in% (1:len)[c(FALSE, TRUE)], ]
grid <- list(
geom_polygon(data = bars, aes(b.x, b.y, group = b.id),
fill = fill, colour = fill),
scale_y_continuous(breaks = breaks),
theme(panel.grid = element_blank())
)
return(grid)
}
#-------------------------------------------------
dat <- data.frame(year = 1875:1972,
level = as.vector(LakeHuron))
ggplot(dat, aes(year, level)) +
grid_bars(dat, "level", 10) +
geom_line(colour = "steelblue", size = 1.2) +
theme_classic()
But it needs to specify data and y again. How to take those directly from the ggplot?
After having a look at the options for extending ggplot2 in Hadley Wickham's book on ggplot2 you probably have to set up your own Geom or Stat layer to achieve the desired result. This way you can access the data and aesthetics specified in ggplot() or even pass different data and aesthetics to your fun. Still a newbie in writing extensions for ggplot2 but a first approach may look like so:
library(ggplot2)
# Make bars dataframe
make_bars_df <- function(y, n) {
breaks <- pretty(y, n)
len <- length(breaks) - 1
all_bars <- data.frame(
group = rep(1:len, 4),
x = c(rep(-Inf, len), rep(Inf, len * 2), rep(-Inf, len)),
y = c(rep(breaks[-length(breaks)], 2), rep(breaks[-1], 2))
)
all_bars[all_bars$group %in% (1:len)[c(FALSE, TRUE)], ]
}
# Setup Geom
geom_grid_bars_y <- function(mapping = NULL, data = NULL, stat = "identity",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, n = 5, ...) {
layer(
geom = GeomGridBarsY, mapping = mapping, data = data, stat = stat,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(n = n, ...)
)
}
GeomGridBarsY <- ggproto("GeomGridBarsY", Geom,
required_aes = c("y"),
default_aes = aes(alpha = NA, colour = NA, fill = "gray90", group = NA,
linetype = "solid", size = 0.5, subgroup = NA),
non_missing_aes = aes("n"),
setup_data = function(data, params) {
transform(data)
},
draw_group = function(data, panel_scales, coord, n = n) {
bars <- make_bars_df(data[["y"]], n)
# setup data for GeomPolygon
## If you want this to work with facets you have to take care of the PANEL
bars$PANEL <- factor(1)
# Drop x, y, group from data
d <- data[ , setdiff(names(data), c("x", "y", "group"))]
d <- d[!duplicated(d), ]
# Merge information in data to bars
bars <- merge(bars, d, by = "PANEL")
# Set color = fill
bars[["colour"]] <- bars[["fill"]]
# Draw
grid::gList(
ggplot2::GeomPolygon$draw_panel(bars, panel_scales, coord)
)
},
draw_key = draw_key_rect
)
grid_bars <- function(n = 5, fill = "gray90") {
list(
geom_grid_bars_y(n = n, fill = fill),
scale_y_continuous(breaks = scales::pretty_breaks(n = n)),
theme(panel.grid = element_blank())
)
}
dat <- data.frame(year = 1875:1972,
level = as.vector(LakeHuron))
ggplot(dat, aes(year, level)) +
grid_bars(n = 10, fill = "gray95") +
geom_line(colour = "steelblue", size = 1.2) +
theme_classic()
Just for reference:
A first and simple approach to get grid bars one could simply adjust the size of the grid lines via theme() like so:
# Simple approach via theme
ggplot(dat, aes(year, level)) +
geom_line(colour = "steelblue", size = 1.2) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme_classic() +
theme(panel.grid.major.y = element_line(size = 8))
Created on 2020-06-14 by the reprex package (v0.3.0)

Add boxplot and stats to split violin plot in ggplot2 (R)

I love the split violin plot and #jan-glx 's awesome geom_split_violin function created here: Split violin plot with ggplot2.
I would love to add split boxplots and stats to this, as I explain below.
First, to be complete, here are the full data and code.
Data (copied from above link)
set.seed(20160229)
my_data = data.frame(
y=c(rnorm(1000), rnorm(1000, 0.5), rnorm(1000, 1), rnorm(1000, 1.5)),
x=c(rep('a', 2000), rep('b', 2000)),
m=c(rep('i', 1000), rep('j', 2000), rep('i', 1000)))
Code to create geom_split_violin function (copied from above link)
library('ggplot2')
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin,
draw_group = function(self, data, ..., draw_quantiles = NULL) {
data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
grp <- data[1, "group"]
newdata <- plyr::arrange(transform(data, x = if (grp %% 2 == 1) xminv else xmaxv), if (grp %% 2 == 1) y else -y)
newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
newdata[c(1, nrow(newdata) - 1, nrow(newdata)), "x"] <- round(newdata[1, "x"])
if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <=
1))
quantiles <- ggplot2:::create_quantile_segment_frame(data, draw_quantiles)
aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
aesthetics$alpha <- rep(1, nrow(quantiles))
both <- cbind(quantiles, aesthetics)
quantile_grob <- GeomPath$draw_panel(both, ...)
ggplot2:::ggname("geom_split_violin", grid::grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
}
else {
ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
}
})
geom_split_violin <- function(mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ...,
draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE,
show.legend = NA, inherit.aes = TRUE) {
layer(data = data, mapping = mapping, stat = stat, geom = GeomSplitViolin,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(trim = trim, scale = scale, draw_quantiles = draw_quantiles, na.rm = na.rm, ...))
}
My attempt to add boxplots and stats
Here is the code that I used to try to add:
Split boxplots.
P values using wilcox.test stats.
Sample sizes (n).
Code:
library(ggpubr)
give.n <- function(x){return(y = -2.6, label = length(x))}
ggplot(my_data, aes(x, y, fill = m)) +
geom_split_violin() +
geom_boxplot(width = 0.2, notch = TRUE, fill="white", outlier.shape = NA) +
stat_summary(fun.data = give.n, geom = "text") +
stat_compare_means(aes(label = ifelse(p < 1.e-4, sprintf("p = %2.1e",
as.numeric(..p.format..)), sprintf("p = %5.4f",
as.numeric(..p.format..)))), method = "wilcox.test", paired = FALSE) +
stat_summary(fun.data = give.n, geom = "text")
This is the result:
Unfortunately, this throws an error and is not quite where I hoped to get, because it is missing the p values and the sample sizes (n) and the boxplots are not split. I also tried one of #Axeman 's excellent suggestions in another SO answer, but no luck so far.
What I am hoping to achieve is something similar to this (also with p values no longer "NA"):
This seems a big challenge, but I am hoping someone out there might be able to help, as others will probably love this as much as me. Thank you.

Plot only one side/half of the violin plot

I would like to have only one half of violin plots (similar to the plots created by stat_density_ridges from ggridges). A MWE
library(ggplot2)
dframe = data.frame(val = c(), group = c())
for(i in 1:5){
offset = i - 3
dframe = rbind(dframe,
data.frame(val = rnorm(n = 50, mean = 0 - offset), group = i)
)
}
dframe$group = as.factor(dframe$group)
ggplot(data = dframe, aes(x = group, y = val)) +
geom_violin()
produces a plot like this
I though would like to have one looking like this:
Ideally, the plots would also be scaled to like 1.5 to 2 times the width.
There's a neat solution by #David Robinson (original code is from his gists and I did only a couple of modifications).
He creates new layer (GeomFlatViolin) which is based on changing width of the violin plot:
data <- transform(data,
xmaxv = x,
xminv = x + violinwidth * (xmin - x))
This layer also has width argument.
Example:
# Using OPs data
# Get wanted width with: geom_flat_violin(width = 1.5)
ggplot(dframe, aes(group, val)) +
geom_flat_violin()
Code:
library(ggplot2)
library(dplyr)
"%||%" <- function(a, b) {
if (!is.null(a)) a else b
}
geom_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
position = "dodge", trim = TRUE, scale = "area",
show.legend = NA, inherit.aes = TRUE, ...) {
layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomFlatViolin,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
trim = trim,
scale = scale,
...
)
)
}
GeomFlatViolin <-
ggproto("GeomFlatViolin", Geom,
setup_data = function(data, params) {
data$width <- data$width %||%
params$width %||% (resolution(data$x, FALSE) * 0.9)
# ymin, ymax, xmin, and xmax define the bounding rectangle for each group
data %>%
group_by(group) %>%
mutate(ymin = min(y),
ymax = max(y),
xmin = x - width / 2,
xmax = x)
},
draw_group = function(data, panel_scales, coord) {
# Find the points for the line to go all the way around
data <- transform(data,
xmaxv = x,
xminv = x + violinwidth * (xmin - x))
# Make sure it's sorted properly to draw the outline
newdata <- rbind(plyr::arrange(transform(data, x = xminv), y),
plyr::arrange(transform(data, x = xmaxv), -y))
# Close the polygon: set first and last point the same
# Needed for coord_polar and such
newdata <- rbind(newdata, newdata[1,])
ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord))
},
draw_key = draw_key_polygon,
default_aes = aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
alpha = NA, linetype = "solid"),
required_aes = c("x", "y")
)
Package see has also a function geom_violinhalf that seems to do exactly what you want (see right plot below). It behaves mostly like geom_violin(), except that it does not have all arguments geom_violin() has (missing for example draw_quantiles)
library(ggplot2)
library(see)
p <- ggplot(mtcars, aes(factor(cyl), mpg))
p1 <- p + geom_violin()+ ggtitle("geom_violin")
p2 <- p + see::geom_violinhalf()+ ggtitle("see::geom_violinhalf")
## show them next to each other
library(patchwork)
p1+p2
Created on 2020-04-30 by the reprex package (v0.3.0)

ggplot2: display blocks of nested split violins

I have the following dataset:
df <- data.frame(dens = rnorm(5000),
split = as.factor(sample(1:2, 5000, replace = T)),
method = as.factor(sample(c("A","B"), 5000, replace = T)),
counts = sample(c(1, 10, 100, 1000, 10000), 5000, replace = T))
What i am wanting to do is to do split violin plots for splits 1 and 2 within groups A and B for each count (which would be in the logscale, but that is not important for this example). We have four groups for each setting but there is a nested aspect to it.
So, I can do the following:
df$key <- factor(paste(df$split, df$method))
and then:
library(ggplot2)
ggplot(df, aes(x = factor(counts), y = dens, fill = split)) +
geom_violin(aes(fill = key), scale = "width", draw_quantiles = c(0.25, 0.5, 0.75)) + scale_fill_manual(values = cbPalette) + theme_bw()
which gives me the following plot:
But what I want is really the light blue and the dark blue to be the two halves of a split violin plot and the light green and the dark green to be the two halves of another split violin plot and these plots should be bunched together. I would also like the different counts to be more separated from each other, but i feel that I can figure that out.
Note that this question is different than the one I have listed or Split violin plot with ggplot2 because we are bunching two different levels of nested split violin plots for each "Counts".
I was trying to follow enter link description here but
I can not tell how to add such a nested groups setting to the code there and am looking for some advice.
Here is what I have tried:
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin,
draw_group = function(self, data, ..., draw_quantiles = NULL){
# By #YAK: https://stackoverflow.com/questions/35717353/split-violin-plot-with-ggplot2
data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
grp <- data[1,'group']
newdata <- plyr::arrange(transform(data, x = if(grp%%2==1) xminv else xmaxv), if(grp%%2==1) y else -y)
newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
newdata[c(1,nrow(newdata)-1,nrow(newdata)), 'x'] <- round(newdata[1, 'x'])
if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <= 1))
quantiles <- create_quantile_segment_frame(data, draw_quantiles, split = TRUE, grp = grp)
aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
aesthetics$alpha <- rep(1, nrow(quantiles))
both <- cbind(quantiles, aesthetics)
quantile_grob <- GeomPath$draw_panel(both, ...)
ggplot2:::ggname("geom_split_violin", grid::grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
}
else {
ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
}
}
)
create_quantile_segment_frame <- function (data, draw_quantiles, split = FALSE, grp = NULL) {
dens <- cumsum(data$density)/sum(data$density)
ecdf <- stats::approxfun(dens, data$y)
ys <- ecdf(draw_quantiles)
violin.xminvs <- (stats::approxfun(data$y, data$xminv))(ys)
violin.xmaxvs <- (stats::approxfun(data$y, data$xmaxv))(ys)
violin.xs <- (stats::approxfun(data$y, data$x))(ys)
if (grp %% 2 == 0) {
data.frame(x = ggplot2:::interleave(violin.xs, violin.xmaxvs),
y = rep(ys, each = 2), group = rep(ys, each = 2))
} else {
data.frame(x = ggplot2:::interleave(violin.xminvs, violin.xs),
y = rep(ys, each = 2), group = rep(ys, each = 2))
}
}
geom_split_violin <- function (mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ..., draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
layer(data = data, mapping = mapping, stat = stat, geom = GeomSplitViolin, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list(trim = trim, scale = scale, draw_quantiles = draw_quantiles, na.rm = na.rm, ...))
}
library(ggplot2)
ggplot(df, aes(x = factor(counts), y = dens, fill = interaction(split,method))) +
geom_split_violin(draw_quantiles = c(0.25, 0.5, 0.75)) + scale_fill_manual(values=RColorBrewer::brewer.pal(name="Paired",n=4)) + theme_light() + theme(legend.position="bottom")
And here is what I get:
As can be seen, the green images are on top of the blues. How do I get around this? Thanks!
EDIT: Folllowing Axeman's suggestion, I am almost there:
levels(df$split) <- factor(0:3)
library(ggplot2)
ggplot(df, aes(x = interaction(split, counts), y = dens, fill = key)) + geom_split_violin(draw_quantiles = c(0.25, 0.5, 0.75)) + scale_fill_manual(values=RColorBrewer::brewer.pal(name="Paired",n=4)) + theme_light() + theme(legend.position="bottom") + scale_x_discrete(interaction(df$split,df$counts)[-length(interaction(df$split,df$counts))], drop = FALSE)
So almost there!
Would like two fixes: the white space arising from the last interaction between split and counts, and the scale to only have counts for each bunch.
Wonder if these should be separate questions on Stackoverflow.
Almost there!
library(ggplot2)
ggplot(df, aes(x = interaction(split, counts), y = dens, fill = key)) + geom_split_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +scale_fill_manual(values=RColorBrewer::brewer.pal(name="Paired",n=4)) + theme_light() + theme(legend.position="bottom") + scale_x_discrete(limits=levels(interaction(df$split,df$counts))[-length(levels(interaction(df$split,df$counts)))],drop = FALSE)
This yields:
I still need to place the value of counts on the x-axis, in between the two plots.
I think that this question has become too long and the basic parts of this question have been answered. I have put up a new question on how to change the discrete scale. Hopefully, someone will know! Anyway, here is the answer to this question (thanks, Axe!). It is in the edited version of my question.
library(ggplot2)
df <- data.frame(dens = rnorm(5000),
split = factor(sample(1:2, 5000, replace = T)),
method = factor(sample(c("A","B"), 5000, replace = T)),
counts = factor(sample(c(1, 10, 100, 1000, 10000), 5000, replace = T)))
df$key <- factor(paste(df$split, df$method))
levels(df$split) <- factor(0:2)
library(ggplot2)
ggplot(df, aes(x = interaction(split, counts), y = dens, fill = key)) +
geom_split_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
scale_fill_manual(values=RColorBrewer::brewer.pal(name="Paired",n=4)) +
theme_light() +
theme(legend.position="bottom") +
scale_x_discrete(
limits = levels(interaction(df$split,df$counts))[-length(levels(interaction(df$split,df$counts)))],
drop = FALSE,
name = "Counts"
)

Resources