Override lower, upper, etc. in boxplot while grouping - r

Per default, for the lower, middle and upper quantile in geom_boxplot the 25%-, 50%-, and 75%-quantiles are considered. These are computed from y, but can be set manually via the aesthetic arguments lower, upper, middle (providing also x, ymin and ymax and setting stat="identity").
However, doing so, several undesirable effects occur (cf. version 1 in the example code):
The argument group is ignored, so all values of a column are considered in calculations (for instance when computing the lowest quantile for each group)
The resulting identical boxplots are grouped by x, and repeated within the group as often as the specific group value occurs in the data (instead of merging the boxes to a wider one)
outliers are not plotted
By pre-computing the desired values and storing them in a new data frame, one can handle the first two points (cf. version 2 in the example code), while the third point is fixed by identifying the outliers and adding them separately to the chart via geom_point.
Is there a more straight forward way to have the quantiles changed, without having these undesired effects?
Example Code:
set.seed(12)
# Random data in B, grouped by values 1 to 4 in A
u <- data.frame(A = sample.int(4, 100, replace = TRUE), B = rnorm(100))
# Desired arguments
qymax <- 0.9
qymin <- 0.1
qmiddle <- 0.5
qupper <- 0.8
qlower <- 0.2
Version 1: Repeated boxplots per value in A, grouped by A
ggplot(u, aes(x = A, y = B)) +
geom_boxplot(aes(group=A,
lower = quantile(B, qlower),
upper = quantile(B, qupper),
middle = quantile(B, qmiddle),
ymin = quantile(B, qymin),
ymax = quantile(B, qymax) ),
stat="identity")
Version 2: Compute the arguments first for each group. Base R solution
Bgrouped <- lapply(unique(u$A), function(a) u$B[u$A == a])
.lower <- sapply(Bgrouped, function(x) quantile(x, qlower))
.upper <- sapply(Bgrouped, function(x) quantile(x, qupper))
.middle <- sapply(Bgrouped, function(x) quantile(x, qmiddle))
.ymin <- sapply(Bgrouped, function(x) quantile(x, qymin))
.ymax <- sapply(Bgrouped, function(x) quantile(x, qymax))
u <- data.frame(A = unique(u$A),
lower = .lower,
upper = .upper,
middle = .middle,
ymin = .ymin,
ymax = .ymax)
ggplot(u, aes(x = A)) +
geom_boxplot(aes(lower = lower, upper = upper,
middle = middle, ymin = ymin, ymax = ymax ),
stat="identity")

It's not something I'd really do without a lot of justification, as people typically expect the boxplot's min / max / box values to correspond to the same quantile positions, but it can be done.
Data used (with extreme values added to demonstrate outliers):
set.seed(12)
u <- data.frame(A = sample.int(4, 100, replace = TRUE), B = rnorm(100))
u$B[c(30, 70, 76)] <- c(4, -4, -5)
Solution 1: You can pre-compute the values without going by the base R route, & include calculations for outliers in the same step. I'd do it completely within Hadley's tidyverse libraries, which I find neater:
library(dplyr)
library(tidyr)
u %>%
group_by(A) %>%
summarise(lower = quantile(B, qlower),
upper = quantile(B, qupper),
middle = quantile(B, qmiddle),
IQR = diff(c(lower, upper)),
ymin = max(quantile(B, qymin), lower - 1.5 * IQR),
ymax = min(quantile(B, qymax), upper + 1.5 * IQR),
outliers = list(B[which(B > upper + 1.5 * IQR |
B < lower - 1.5 * IQR)])) %>%
ungroup() %>%
ggplot(aes(x = A)) +
geom_boxplot(aes(lower = lower, upper = upper,
middle = middle, ymin = ymin, ymax = ymax ),
stat="identity") +
geom_point(data = . %>%
filter(sapply(outliers, length) > 0) %>%
select(A, outliers) %>%
unnest(),
aes(y = unlist(outliers)))
Solution 2: You can override the actual quantile specifications used by ggplot. The calculations for geom_boxplot()'s quantiles are actually in StatBoxplot's compute_group() function, found here:
compute_group = function(data, scales, width = NULL, na.rm = FALSE, coef = 1.5) {
qs <- c(0, 0.25, 0.5, 0.75, 1)
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
} else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
... (omitted for space)
The qs vector defines the quantile positions. It's not affected by parameters passed to compute_group(), so the only way to change that is to change the definition for compute_group() itself:
# save a copy of the original function, in case you need to revert
original.function <- environment(ggplot2::StatBoxplot$compute_group)$f
# define new function (only the first line for qs is changed, but you'll have to
# copy & paste the whole thing)
new.function <- function (data, scales, width = NULL, na.rm = FALSE, coef = 1.5) {
qs <- c(0.1, 0.2, 0.5, 0.8, 0.9)
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data,
tau = qs)
stats <- as.numeric(stats::coef(mod))
}
else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- data$y < (stats[2] - coef * iqr) | data$y > (stats[4] +
coef * iqr)
if (any(outliers)) {
stats[c(1, 5)] <- range(c(stats[2:4], data$y[!outliers]),
na.rm = TRUE)
}
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
}
else {
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr/sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr/sqrt(n)
df$x <- if (is.factor(data$x))
data$x[1]
else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df
}
Result:
# toggle between the two definitions
environment(StatBoxplot$compute_group)$f <- original.function
ggplot(u, aes(x = A, y = B, group = A)) +
geom_boxplot() +
ggtitle("original definition for calculated quantiles")
environment(StatBoxplot$compute_group)$f <- new.function
ggplot(u, aes(x = A, y = B, group = A)) +
geom_boxplot() +
ggtitle("new definition for calculated quantiles")
Do note that when you change the definition, it affects every ggplot object in your environment. So if you've created a ggplot boxplot object before the definition change, & print it out afterwards, the boxplot will follow the new definition. (For the side-by-side comparison above, I had to convert each ggplot to a grob object immediately, in order to preserve the difference.)

Related

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes zscore breakpoints, calculates the fraction of data between each breakpoint, and colors each breakpoint section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between brekpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) + geom_area() +
scale_x_continuous(breaks= c(seq(-5,5,1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata(vector of how much data is in section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct data between brekpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
geom_area(aes(fill = standard_normal_sections)) +
geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
vjust = 1, size = 8 / .pt, nudge_y = -.01) +
scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))

calling an object from a function in R - error object not found

I am trying to create a variable width column chart, each column representing the a non overlapping steps. the width would be the mean of the step and the height is the standard dev of each step. This was my original code:
df <- data.frame (Steps = c( "Step2", "Step3", "Step4", "Step5", "Step6", "Step7"), Variability = c(sd(dataset$PS02_days_creation_to_validation, na.rm = TRUE), sd(dataset$PS03_days_validation_to_sourcing, na.rm = TRUE), sd(dataset$PS04_days_sourcing_to_confirmation, na.rm = TRUE), sd(dataset$PS05_days_confirmation_to_ship_created, na.rm = TRUE),sd(dataset$PS06_days_ship_created_to_shipped, na.rm = TRUE), sd(dataset$PS07_days_shipped_to_reception,na.rm = TRUE) ), width = c(mean(dataset$PS02_days_creation_to_validation, na.rm = TRUE), mean(dataset$PS03_days_validation_to_sourcing, na.rm = TRUE), mean(dataset$PS04_days_sourcing_to_confirmation, na.rm = TRUE), mean(dataset$PS05_days_confirmation_to_ship_created,na.rm = TRUE), mean(dataset$PS06_days_ship_created_to_shipped,na.rm = TRUE), mean(dataset$PS07_days_shipped_to_reception,na.rm = TRUE) ))
df$w <- cumsum(df$width) #cumulative sums to gt the end points of each column.
df$wm <- df$w - df$width #giving the starting point of each column
library(ggplot2)
p <- ggplot(df, aes(ymin = 0))
p1 <- p + geom_rect(aes(xmin = wm, xmax = w, ymax = Variability, fill = Steps))
p1
This code works fine when all the steps are positive. However when the width of the first step is negative, then my columns would overlap.
i.e. 4 steps with the following width =(25,50,75,100), using the above code creates: end point w= (25,75,150,250) starting point wm= (0,25,75,150). Great!
Now if the the first step's width change to (-25,50,75,100) then the code produces w:(-25,25,100,200) and wm:(0,-25,25,100) which means that the steps are overlapping!! Instead i would like to amend the code to produce: w=(-25,50,125,225) and wm:(0,0,50,125)
An R guru kindly suggested using this function:
widths_to_starts_ends <- function(widths){
widths <- widths[widths != 0]
pos_widths <- widths[widths > 0]
neg_widths <- widths[widths < 0]
pos_ends <- cumsum(pos_widths)
pos_starts <- cumsum(pos_widths) - pos_widths
neg_widths <- abs(neg_widths)
neg_starts <- cumsum(neg_widths)
neg_ends <- cumsum(neg_widths) - neg_widths
neg_starts <- (-1)*neg_starts
neg_ends <- (-1)*neg_ends
return( list(s = c(neg_ends, pos_starts), e = c(neg_starts, pos_ends)) )
}
widths_to_starts_ends(c(-25,50,75,100))
which results in
$s
[1] 0 0 50 125
$e
[1] -25 50 125 225
I would like to embed this function into my original code (to replace the w and wm lines with e and s) and make a graph but i am not sure how to call the objects E and S. this is what im doing:
library(ggplot2)
p <- ggplot(df, aes(ymin = 0))
p1 <- p + geom_rect(aes(xmin = s, xmax = e, ymax = Variability, fill = Steps))
p1
But it says object S is not found. Could you help me please?
s and e appear to be elements of a list, not standalone objects, which I imagine is why they can’t be found. In your function you could instead return a dataframe df containing s and e columns and then make your ggplot call
ggplot(df, aes(xmin = s, xmax = e...)) + geom_rect()
If I understand it correctly, you want to make a continuous series of intervals that are shifted by the first value of your list if it is negative.
Here is something more compact than what you have already to produce such intervals and store them in your dataframe:
df$e <- cumsum(abs(df$width)) + min(0, df$width[1])
df$s <- c(min(0, df$width[1]), df$e[-length(df$e)])
df$s #starting points
#[1] -25 0 50 125
df$e #ending points
#[1] 0 50 125 225
Then, the corresponding plot would be obtained with:
p <- ggplot(df, aes(ymin = 0))
p1 <- p + geom_rect(aes(xmin = s, xmax = e, ymax = Variability, fill = Steps))
p1
Data that I used:
set.seed(123456789)
df <- data.frame (Steps = c( "Step2",
"Step3",
"Step4",
"Step5"),
Variability = c(sd(1:500/sample(1:10, 1)),
sd(1:500/sample(1:10, 1)),
sd(1:500/sample(1:10, 1)),
sd(1:500/sample(1:10, 1))),
width = c(-25,50,75,100))

stat_summary: Including single observations into aggregating function

I would like to "force" an aggregating function in stat_summary to calculate an output value for single observations:
set.seed(1)
value <- c(rep(1:6, each = 3), 7:8)
rel_freq <- sample(x = seq(0, 1, 0.1), size = length(value), replace = TRUE)
example_df <- data.frame(value, rel_freq)
require(ggplot2)
ggplot() +
stat_summary(data = example_df,
mapping = aes(x = as.character(value), y = rel_freq),
fun.data = mean_se)
# Warning message: Removed 2 rows containing missing values (geom_pointrange)
Now what happened here (IMO) is that ggplot removed observations 7 and 8 because the aggregating function in stat_summary doesn't work with single observations? But is there a way to force an output here?
You could write your own little function that extends mean_se to handle the case where the length of x equals 1.
mean_se_tjebo <- function (x, mult = 1) {
x <- stats::na.omit(x)
se <- mult * sqrt(stats::var(x)/length(x))
mean <- mean(x)
if(length(x) != 1) {
data.frame(y = mean, ymin = mean - se, ymax = mean + se)
} else {
data.frame(y = mean, ymin = mean, ymax = mean)
}
}
Now the plot looks as follows
ggplot() +
stat_summary(data = example_df,
mapping = aes(x = as.character(value), y = rel_freq),
fun.data = mean_se_tjebo)

ggplot box plot confusion [duplicate]

I'm trying to use ggplot2 / geom_boxplot to produce a boxplot where the whiskers are defined as the 5 and 95th percentile instead of 0.25 - 1.5 IQR / 0.75 + IQR and outliers from those new whiskers are plotted as usual. I can see that the geom_boxplot aesthetics include ymax / ymin, but it's not clear to me how I put values in here. It seems like:
stat_quantile(quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95))
should be able to help, but I don't know how to relate the results of this stat to set the appropriate geom_boxplot() aesthetics:
geom_boxplot(aes(ymin, lower, middle, upper, ymax))
I've seen other posts where people mention essentially building a boxplot-like object manually, but I'd rather keep the whole boxplot gestalt intact, just revising the meaning of two of the variables being drawn.
geom_boxplot with stat_summary can do it:
# define the summary function
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# sample data
d <- data.frame(x=gl(2,50), y=rnorm(100))
# do it
ggplot(d, aes(x, y)) + stat_summary(fun.data = f, geom="boxplot")
# example with outliers
# define outlier as you want
o <- function(x) {
subset(x, x < quantile(x)[2] | quantile(x)[4] < x)
}
# do it
ggplot(d, aes(x, y)) +
stat_summary(fun.data=f, geom="boxplot") +
stat_summary(fun.y = o, geom="point")
It is now possible to specify the whiskers endpoints in ggplot2_2.1.0. Copying from the examples in ?geom_boxplot:
# It's possible to draw a boxplot with your own computations if you
# use stat = "identity":
y <- rnorm(100)
df <- data.frame(
x = 1,
y0 = min(y),
y25 = quantile(y, 0.25),
y50 = median(y),
y75 = quantile(y, 0.75),
y100 = max(y)
)
ggplot(df, aes(x)) +
geom_boxplot(
aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
stat = "identity"
)
Building on #konvas's answer, beginning in ggplot2.0.x, you can extend ggplot using the ggproto system and define your own stat.
By copying the ggplot2 stat_boxplot code and making a few edits, you can quickly define a new stat (stat_boxplot_custom) that takes the percentiles you want to use as an argument (qs) instead of the coef argument that stat_boxplot uses. The new stat is defined here:
# modified from https://github.com/tidyverse/ggplot2/blob/master/R/stat-boxplot.r
library(ggplot2)
stat_boxplot_custom <- function(mapping = NULL, data = NULL,
geom = "boxplot", position = "dodge",
...,
qs = c(.05, .25, 0.5, 0.75, 0.95),
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBoxplotCustom,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
qs = qs,
...
)
)
}
Then, the layer function is defined. Note that b/c I copied directly from stat_boxplot, you have to access a few internal ggplot2 functions using :::. This includes a lot of stuff copied directly over from StatBoxplot, but the key area is in computing the stats directly from the qs argument: stats <- as.numeric(stats::quantile(data$y, qs)) inside of the compute_group function.
StatBoxplotCustom <- ggproto("StatBoxplotCustom", Stat,
required_aes = c("x", "y"),
non_missing_aes = "weight",
setup_params = function(data, params) {
params$width <- ggplot2:::"%||%"(
params$width, (resolution(data$x) * 0.75)
)
if (is.double(data$x) && !ggplot2:::has_groups(data) && any(data$x != data$x[1L])) {
warning(
"Continuous x aesthetic -- did you forget aes(group=...)?",
call. = FALSE
)
}
params
},
compute_group = function(data, scales, width = NULL, na.rm = FALSE, qs = c(.05, .25, 0.5, 0.75, 0.95)) {
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
} else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- (data$y < stats[1]) | (data$y > stats[5])
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
} else {
# Sum up weights for non-NA positions of y and weight
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr / sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr / sqrt(n)
df$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df
}
)
There is also a gist here, containing this code.
Then, stat_boxplot_custom can be called just like stat_boxplot:
library(ggplot2)
y <- rnorm(100)
df <- data.frame(x = 1, y = y)
# whiskers extend to 5/95th percentiles by default
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom()
# or extend the whiskers to min/max
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom(qs = c(0, 0.25, 0.5, 0.75, 1))

Changing whisker definition in geom_boxplot

I'm trying to use ggplot2 / geom_boxplot to produce a boxplot where the whiskers are defined as the 5 and 95th percentile instead of 0.25 - 1.5 IQR / 0.75 + IQR and outliers from those new whiskers are plotted as usual. I can see that the geom_boxplot aesthetics include ymax / ymin, but it's not clear to me how I put values in here. It seems like:
stat_quantile(quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95))
should be able to help, but I don't know how to relate the results of this stat to set the appropriate geom_boxplot() aesthetics:
geom_boxplot(aes(ymin, lower, middle, upper, ymax))
I've seen other posts where people mention essentially building a boxplot-like object manually, but I'd rather keep the whole boxplot gestalt intact, just revising the meaning of two of the variables being drawn.
geom_boxplot with stat_summary can do it:
# define the summary function
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# sample data
d <- data.frame(x=gl(2,50), y=rnorm(100))
# do it
ggplot(d, aes(x, y)) + stat_summary(fun.data = f, geom="boxplot")
# example with outliers
# define outlier as you want
o <- function(x) {
subset(x, x < quantile(x)[2] | quantile(x)[4] < x)
}
# do it
ggplot(d, aes(x, y)) +
stat_summary(fun.data=f, geom="boxplot") +
stat_summary(fun.y = o, geom="point")
It is now possible to specify the whiskers endpoints in ggplot2_2.1.0. Copying from the examples in ?geom_boxplot:
# It's possible to draw a boxplot with your own computations if you
# use stat = "identity":
y <- rnorm(100)
df <- data.frame(
x = 1,
y0 = min(y),
y25 = quantile(y, 0.25),
y50 = median(y),
y75 = quantile(y, 0.75),
y100 = max(y)
)
ggplot(df, aes(x)) +
geom_boxplot(
aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
stat = "identity"
)
Building on #konvas's answer, beginning in ggplot2.0.x, you can extend ggplot using the ggproto system and define your own stat.
By copying the ggplot2 stat_boxplot code and making a few edits, you can quickly define a new stat (stat_boxplot_custom) that takes the percentiles you want to use as an argument (qs) instead of the coef argument that stat_boxplot uses. The new stat is defined here:
# modified from https://github.com/tidyverse/ggplot2/blob/master/R/stat-boxplot.r
library(ggplot2)
stat_boxplot_custom <- function(mapping = NULL, data = NULL,
geom = "boxplot", position = "dodge",
...,
qs = c(.05, .25, 0.5, 0.75, 0.95),
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBoxplotCustom,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
qs = qs,
...
)
)
}
Then, the layer function is defined. Note that b/c I copied directly from stat_boxplot, you have to access a few internal ggplot2 functions using :::. This includes a lot of stuff copied directly over from StatBoxplot, but the key area is in computing the stats directly from the qs argument: stats <- as.numeric(stats::quantile(data$y, qs)) inside of the compute_group function.
StatBoxplotCustom <- ggproto("StatBoxplotCustom", Stat,
required_aes = c("x", "y"),
non_missing_aes = "weight",
setup_params = function(data, params) {
params$width <- ggplot2:::"%||%"(
params$width, (resolution(data$x) * 0.75)
)
if (is.double(data$x) && !ggplot2:::has_groups(data) && any(data$x != data$x[1L])) {
warning(
"Continuous x aesthetic -- did you forget aes(group=...)?",
call. = FALSE
)
}
params
},
compute_group = function(data, scales, width = NULL, na.rm = FALSE, qs = c(.05, .25, 0.5, 0.75, 0.95)) {
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
} else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- (data$y < stats[1]) | (data$y > stats[5])
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
} else {
# Sum up weights for non-NA positions of y and weight
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr / sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr / sqrt(n)
df$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df
}
)
There is also a gist here, containing this code.
Then, stat_boxplot_custom can be called just like stat_boxplot:
library(ggplot2)
y <- rnorm(100)
df <- data.frame(x = 1, y = y)
# whiskers extend to 5/95th percentiles by default
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom()
# or extend the whiskers to min/max
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom(qs = c(0, 0.25, 0.5, 0.75, 1))

Resources