R: adding alpha bags to a 2d or 3d scatterplot - r

I know that in ggplot2 one can add the convex hull to a scatterplot by group as in
library(ggplot2)
library(plyr)
data(iris)
df<-iris
find_hull <- function(df) df[chull(df$Sepal.Length, df$Sepal.Width), ]
hulls <- ddply(df, "Species", find_hull)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = hulls, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
I was wondering though how one could calculate and add alpha bags instead, i.e. the largest convex hull that contains at least a proportion 1-alpha of all the points? Either in 2d (to display with ggplot2) or 3d (to display with rgl).
EDIT: My initial idea was be to keep on "peeling" the convex hull for as along as the criterion of containing at least a given % of points would be satisfied, although in the paper here it seems they use a different algorithm (isodepth, which seems to be implemented in R package depth, in function isodepth and aplpack::plothulls seems also close to what I want (although it produces a full plot as opposed to just the contour), so I think with these I may be sorted. Though these function only works in 2D, and I would also be interested in a 3D extension (to be plotted in rgl). If anyone has any pointers let me know!
EDIT2: with function depth::isodepth I found a 2d solution (see post below), although I am still looking for a 3D solution as well - if anyone would happen to know how to do that, please let me know!

Ha with the help of function depth::isodepth I came up with the following solution - here I find the alpha bag that contains a proportion of at least 1-alpha of all points :
library(mgcv)
library(depth)
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
target=1-alpha
propinside=1
d=1
while (propinside>target) {
p=isodepth(x[,1:2],dpth=d,output=T, mustdith=T)[[1]]
ninside=sum(in.out(p,as.matrix(x[,1:2],ncol=2))*1)
nonedge=sum(sapply(1:nrow(p),function (row)
nrow(merge(round(setNames(as.data.frame(p[row,,drop=F]),names(x)[1:2]),5),as.data.frame(x[,1:2])))>0)*1)-3
propinside=(ninside+nonedge)/n
d=d+1
}
p=isodepth(x[,1:2],dpth=d-1,output=T, mustdith=T)[[1]]
p }
bags <- ddply(df, "Species", find_bag,alpha=alph)
names(bags) <- c("Species",names(df)[1:2])
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
EDIT2:
Using my original idea of convex hull peeling I also came up with the following solution which now works in 2d & 3d; the result is not quite the same is with the isodepth algorithm, but it's pretty close :
# in 2d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
propinside=1
target=1-alpha
x2=x
while (propinside>target) {
propinside=nrow(x2)/n
hull=chull(x2)
x2old=x2
x2=x2[-hull,]
}
x2old[chull(x2old),] }
bags <- ddply(df, "Species", find_bag, alpha=alph)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
# in 3d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,3,5)]
levels=unique(df[,"Species"])
nlevels=length(levels)
zoom=0.8
cex=1
aspectr=c(1,1,0.7)
pointsalpha=1
userMatrix=matrix(c(0.80,-0.60,0.022,0,0.23,0.34,0.91,0,-0.55,-0.72,0.41,0,0,0,0,1),ncol=4,byrow=T)
windowRect=c(0,29,1920,1032)
cols=c("red","forestgreen","blue")
alph=0.05
plotbag = function(x,alpha=alph,grp=1,cols=c("red","forestgreen","blue"),transp=0.2) {
propinside=1
target=1-alpha
x2=x
levels=unique(x2[,ncol(x2)])
x2=x2[x2[,ncol(x2)]==levels[[grp]],]
n=nrow(x2)
while (propinside>target) {
propinside=nrow(x2)/n
hull=unique(as.vector(convhulln(as.matrix(x2[,1:3]), options = "Tv")))
x2old=x2
x2=x2[-hull,]
}
ids=t(convhulln(as.matrix(x2old[,1:3]), options = "Tv"))
rgl.triangles(x2old[ids,1],x2old[ids,2],x2old[ids,3],col=cols[[grp]],alpha=transp,shininess=50)
}
open3d(zoom=zoom,userMatrix=userMatrix,windowRect=windowRect,antialias=8)
for (i in 1:nlevels) {
plot3d(x=df[df[,ncol(df)]==levels[[i]],][,1],
y=df[df[,ncol(df)]==levels[[i]],][,2],
z=df[df[,ncol(df)]==levels[[i]],][,3],
type="s",
col=cols[[i]],
size=cex,
lit=TRUE,
alpha=pointsalpha,point_antialias=TRUE,
line_antialias=TRUE,shininess=50, add=TRUE)
plotbag(df,alpha=alph, grp=i, cols=c("red","forestgreen","blue"), transp=0.3) }
axes3d(color="black",drawfront=T,box=T,alpha=1)
title3d(color="black",xlab=names(df)[[1]],ylab=names(df)[[2]],zlab=names(df)[[3]],alpha=1)
aspect3d(aspectr)

We can modify the aplpack::plothulls function to accept a parameter for the proportion of points to enclose (in aplpack it's set to 50%). Then we can use this modified function to make a custom a geom for ggplot.
Here's the custom geom:
library(ggplot2)
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
#' #inheritParams ggplot2::stat_identity
#' #param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' #rdname ggplot2-ggproto
#' #format NULL
#' #usage NULL
#' #export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
And here's an example of how it can be used:
ggplot(iris, aes(Sepal.Length, Petal.Length, colour = Species, fill = Species)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) # enclose 5% of points

Related

How to highlight high correlations in ggpairs correlation matrix?

I am investigating the distribution of different variables and their correlations. Is there a way to highlight the high correlations? e.g. I can mark correlations greater than 0.8 as red and lower than -0.8 as blue.
enter image description here
As #thefringthing said in their comment, this is not a straightforward task but it is definitely doable.
This solution is based on this question and this answer:
# Load libraries
library(tidyverse)
library(GGally)
# Load some example data
mtcars <- mtcars[,1:6]
# Define function to colour panels according to correlation
cor_func <- function(data, mapping, method, symbol, ...){
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
corr <- cor(x, y, method=method, use='complete.obs')
colFn <- colorRampPalette(c("firebrick", "white", "dodgerblue"),
interpolate ='spline')
rampcols <- colFn(100)
match <- c(rampcols[1:10], rep("#FFFFFF", 80), rampcols[90:100])
fill <- match[findInterval(corr, seq(-1, 1, length = 100))]
ggally_text(
label = paste(symbol, as.character(round(corr, 2))),
mapping = aes(),
xP = 0.5, yP = 0.5,
color = 'black',
...) +
theme_void() +
theme(panel.background = element_rect(fill = fill))
}
plot1 <- ggpairs(mtcars,
upper = list(continuous = wrap(cor_func,
method = 'spearman', symbol = "Corr:\n")),
lower = list(continuous = function(data, mapping, ...) {
ggally_smooth_lm(data = data, mapping = mapping)}),
diag = list(continuous = function(data, mapping, ...) {
ggally_densityDiag(data = data, mapping = mapping)}
))
plot1

How to create a 2d polygon that outlines group data points and overlap analysis of such euclidean or hyperspace volumes [duplicate]

I know that in ggplot2 one can add the convex hull to a scatterplot by group as in
library(ggplot2)
library(plyr)
data(iris)
df<-iris
find_hull <- function(df) df[chull(df$Sepal.Length, df$Sepal.Width), ]
hulls <- ddply(df, "Species", find_hull)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = hulls, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
I was wondering though how one could calculate and add alpha bags instead, i.e. the largest convex hull that contains at least a proportion 1-alpha of all the points? Either in 2d (to display with ggplot2) or 3d (to display with rgl).
EDIT: My initial idea was be to keep on "peeling" the convex hull for as along as the criterion of containing at least a given % of points would be satisfied, although in the paper here it seems they use a different algorithm (isodepth, which seems to be implemented in R package depth, in function isodepth and aplpack::plothulls seems also close to what I want (although it produces a full plot as opposed to just the contour), so I think with these I may be sorted. Though these function only works in 2D, and I would also be interested in a 3D extension (to be plotted in rgl). If anyone has any pointers let me know!
EDIT2: with function depth::isodepth I found a 2d solution (see post below), although I am still looking for a 3D solution as well - if anyone would happen to know how to do that, please let me know!
Ha with the help of function depth::isodepth I came up with the following solution - here I find the alpha bag that contains a proportion of at least 1-alpha of all points :
library(mgcv)
library(depth)
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
target=1-alpha
propinside=1
d=1
while (propinside>target) {
p=isodepth(x[,1:2],dpth=d,output=T, mustdith=T)[[1]]
ninside=sum(in.out(p,as.matrix(x[,1:2],ncol=2))*1)
nonedge=sum(sapply(1:nrow(p),function (row)
nrow(merge(round(setNames(as.data.frame(p[row,,drop=F]),names(x)[1:2]),5),as.data.frame(x[,1:2])))>0)*1)-3
propinside=(ninside+nonedge)/n
d=d+1
}
p=isodepth(x[,1:2],dpth=d-1,output=T, mustdith=T)[[1]]
p }
bags <- ddply(df, "Species", find_bag,alpha=alph)
names(bags) <- c("Species",names(df)[1:2])
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
EDIT2:
Using my original idea of convex hull peeling I also came up with the following solution which now works in 2d & 3d; the result is not quite the same is with the isodepth algorithm, but it's pretty close :
# in 2d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
propinside=1
target=1-alpha
x2=x
while (propinside>target) {
propinside=nrow(x2)/n
hull=chull(x2)
x2old=x2
x2=x2[-hull,]
}
x2old[chull(x2old),] }
bags <- ddply(df, "Species", find_bag, alpha=alph)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
# in 3d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,3,5)]
levels=unique(df[,"Species"])
nlevels=length(levels)
zoom=0.8
cex=1
aspectr=c(1,1,0.7)
pointsalpha=1
userMatrix=matrix(c(0.80,-0.60,0.022,0,0.23,0.34,0.91,0,-0.55,-0.72,0.41,0,0,0,0,1),ncol=4,byrow=T)
windowRect=c(0,29,1920,1032)
cols=c("red","forestgreen","blue")
alph=0.05
plotbag = function(x,alpha=alph,grp=1,cols=c("red","forestgreen","blue"),transp=0.2) {
propinside=1
target=1-alpha
x2=x
levels=unique(x2[,ncol(x2)])
x2=x2[x2[,ncol(x2)]==levels[[grp]],]
n=nrow(x2)
while (propinside>target) {
propinside=nrow(x2)/n
hull=unique(as.vector(convhulln(as.matrix(x2[,1:3]), options = "Tv")))
x2old=x2
x2=x2[-hull,]
}
ids=t(convhulln(as.matrix(x2old[,1:3]), options = "Tv"))
rgl.triangles(x2old[ids,1],x2old[ids,2],x2old[ids,3],col=cols[[grp]],alpha=transp,shininess=50)
}
open3d(zoom=zoom,userMatrix=userMatrix,windowRect=windowRect,antialias=8)
for (i in 1:nlevels) {
plot3d(x=df[df[,ncol(df)]==levels[[i]],][,1],
y=df[df[,ncol(df)]==levels[[i]],][,2],
z=df[df[,ncol(df)]==levels[[i]],][,3],
type="s",
col=cols[[i]],
size=cex,
lit=TRUE,
alpha=pointsalpha,point_antialias=TRUE,
line_antialias=TRUE,shininess=50, add=TRUE)
plotbag(df,alpha=alph, grp=i, cols=c("red","forestgreen","blue"), transp=0.3) }
axes3d(color="black",drawfront=T,box=T,alpha=1)
title3d(color="black",xlab=names(df)[[1]],ylab=names(df)[[2]],zlab=names(df)[[3]],alpha=1)
aspect3d(aspectr)
We can modify the aplpack::plothulls function to accept a parameter for the proportion of points to enclose (in aplpack it's set to 50%). Then we can use this modified function to make a custom a geom for ggplot.
Here's the custom geom:
library(ggplot2)
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
#' #inheritParams ggplot2::stat_identity
#' #param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' #rdname ggplot2-ggproto
#' #format NULL
#' #usage NULL
#' #export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
And here's an example of how it can be used:
ggplot(iris, aes(Sepal.Length, Petal.Length, colour = Species, fill = Species)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) # enclose 5% of points

plot a circle/convex hull arround a given percentage of points

I have
x=rnorm(100)
y=rnorm(100)
plot(x,y)
abline(h=0); abline(v=0)
From point (0,0) and going outwards I would like to draw a contour/circle/ellipse/freehand convex hull that encloses any given percentage of points.
Is there any function or package that can automate this? I have tried the following so far but I can only get a circle with some extrapolation and approximation.
I have tried this so far:
#calculate radius
r<- sqrt(x^2+y^2)
df<-data.frame(radius=seq(0,3,0.1), percentage=NA)
#get the percentage of points that have a smaller radius than i
k<-1
for (i in seq(0,3,0.1)){
df$percentage[k] <- sum(r<i)/length(r)
k<-k+1
}
#extrapolation function
prox.function<- approxfun(df$percentage, df$radius)
#get the radius of the circle that encloses about 50% of
prox.function(.50)
#draw the circle
library(plotrix)
draw.circle(0,0,prox.function(.50))
The radius enclosing a fraction f of the points is:
f <- 0.5 # use half for this example as in the question
sort(r)[ ceiling(f * length(r)) ]
Yes, we can create a new geom for ggplot that will draw a convex hull around any given percentage of all the points in the data. This is similar to the bagplot, and uses some code from the bagplot function in the aplpack package (which is fixed at 50% of points).
Here is the definition of the new geom that allows you to chose what percentage of points to enclose:
library(ggplot2)
# Here's the stat_
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
# Here's the stat_ function
#' #inheritParams ggplot2::stat_identity
#' #param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
# here's the geom_
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' #rdname ggplot2-ggproto
#' #format NULL
#' #usage NULL
#' #export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
Here are some examples. We can stack three convex hulls together with different alpha levels to show where the centre of the data is, and its spread:
ggplot(mpg, aes(displ, hwy, fill = drv, colour = drv)) +
geom_point() +
geom_bag(prop = 0.95) + # enclose 95% of points
geom_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
geom_bag(prop = 0.1, alpha = 0.8) # enclose 5% of points
ggplot(iris, aes(Sepal.Length, Petal.Length, colour = Species, fill = Species)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) # enclose 5% of points

ggplot box plot confusion [duplicate]

I'm trying to use ggplot2 / geom_boxplot to produce a boxplot where the whiskers are defined as the 5 and 95th percentile instead of 0.25 - 1.5 IQR / 0.75 + IQR and outliers from those new whiskers are plotted as usual. I can see that the geom_boxplot aesthetics include ymax / ymin, but it's not clear to me how I put values in here. It seems like:
stat_quantile(quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95))
should be able to help, but I don't know how to relate the results of this stat to set the appropriate geom_boxplot() aesthetics:
geom_boxplot(aes(ymin, lower, middle, upper, ymax))
I've seen other posts where people mention essentially building a boxplot-like object manually, but I'd rather keep the whole boxplot gestalt intact, just revising the meaning of two of the variables being drawn.
geom_boxplot with stat_summary can do it:
# define the summary function
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# sample data
d <- data.frame(x=gl(2,50), y=rnorm(100))
# do it
ggplot(d, aes(x, y)) + stat_summary(fun.data = f, geom="boxplot")
# example with outliers
# define outlier as you want
o <- function(x) {
subset(x, x < quantile(x)[2] | quantile(x)[4] < x)
}
# do it
ggplot(d, aes(x, y)) +
stat_summary(fun.data=f, geom="boxplot") +
stat_summary(fun.y = o, geom="point")
It is now possible to specify the whiskers endpoints in ggplot2_2.1.0. Copying from the examples in ?geom_boxplot:
# It's possible to draw a boxplot with your own computations if you
# use stat = "identity":
y <- rnorm(100)
df <- data.frame(
x = 1,
y0 = min(y),
y25 = quantile(y, 0.25),
y50 = median(y),
y75 = quantile(y, 0.75),
y100 = max(y)
)
ggplot(df, aes(x)) +
geom_boxplot(
aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
stat = "identity"
)
Building on #konvas's answer, beginning in ggplot2.0.x, you can extend ggplot using the ggproto system and define your own stat.
By copying the ggplot2 stat_boxplot code and making a few edits, you can quickly define a new stat (stat_boxplot_custom) that takes the percentiles you want to use as an argument (qs) instead of the coef argument that stat_boxplot uses. The new stat is defined here:
# modified from https://github.com/tidyverse/ggplot2/blob/master/R/stat-boxplot.r
library(ggplot2)
stat_boxplot_custom <- function(mapping = NULL, data = NULL,
geom = "boxplot", position = "dodge",
...,
qs = c(.05, .25, 0.5, 0.75, 0.95),
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBoxplotCustom,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
qs = qs,
...
)
)
}
Then, the layer function is defined. Note that b/c I copied directly from stat_boxplot, you have to access a few internal ggplot2 functions using :::. This includes a lot of stuff copied directly over from StatBoxplot, but the key area is in computing the stats directly from the qs argument: stats <- as.numeric(stats::quantile(data$y, qs)) inside of the compute_group function.
StatBoxplotCustom <- ggproto("StatBoxplotCustom", Stat,
required_aes = c("x", "y"),
non_missing_aes = "weight",
setup_params = function(data, params) {
params$width <- ggplot2:::"%||%"(
params$width, (resolution(data$x) * 0.75)
)
if (is.double(data$x) && !ggplot2:::has_groups(data) && any(data$x != data$x[1L])) {
warning(
"Continuous x aesthetic -- did you forget aes(group=...)?",
call. = FALSE
)
}
params
},
compute_group = function(data, scales, width = NULL, na.rm = FALSE, qs = c(.05, .25, 0.5, 0.75, 0.95)) {
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
} else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- (data$y < stats[1]) | (data$y > stats[5])
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
} else {
# Sum up weights for non-NA positions of y and weight
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr / sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr / sqrt(n)
df$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df
}
)
There is also a gist here, containing this code.
Then, stat_boxplot_custom can be called just like stat_boxplot:
library(ggplot2)
y <- rnorm(100)
df <- data.frame(x = 1, y = y)
# whiskers extend to 5/95th percentiles by default
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom()
# or extend the whiskers to min/max
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom(qs = c(0, 0.25, 0.5, 0.75, 1))

Changing whisker definition in geom_boxplot

I'm trying to use ggplot2 / geom_boxplot to produce a boxplot where the whiskers are defined as the 5 and 95th percentile instead of 0.25 - 1.5 IQR / 0.75 + IQR and outliers from those new whiskers are plotted as usual. I can see that the geom_boxplot aesthetics include ymax / ymin, but it's not clear to me how I put values in here. It seems like:
stat_quantile(quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95))
should be able to help, but I don't know how to relate the results of this stat to set the appropriate geom_boxplot() aesthetics:
geom_boxplot(aes(ymin, lower, middle, upper, ymax))
I've seen other posts where people mention essentially building a boxplot-like object manually, but I'd rather keep the whole boxplot gestalt intact, just revising the meaning of two of the variables being drawn.
geom_boxplot with stat_summary can do it:
# define the summary function
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
# sample data
d <- data.frame(x=gl(2,50), y=rnorm(100))
# do it
ggplot(d, aes(x, y)) + stat_summary(fun.data = f, geom="boxplot")
# example with outliers
# define outlier as you want
o <- function(x) {
subset(x, x < quantile(x)[2] | quantile(x)[4] < x)
}
# do it
ggplot(d, aes(x, y)) +
stat_summary(fun.data=f, geom="boxplot") +
stat_summary(fun.y = o, geom="point")
It is now possible to specify the whiskers endpoints in ggplot2_2.1.0. Copying from the examples in ?geom_boxplot:
# It's possible to draw a boxplot with your own computations if you
# use stat = "identity":
y <- rnorm(100)
df <- data.frame(
x = 1,
y0 = min(y),
y25 = quantile(y, 0.25),
y50 = median(y),
y75 = quantile(y, 0.75),
y100 = max(y)
)
ggplot(df, aes(x)) +
geom_boxplot(
aes(ymin = y0, lower = y25, middle = y50, upper = y75, ymax = y100),
stat = "identity"
)
Building on #konvas's answer, beginning in ggplot2.0.x, you can extend ggplot using the ggproto system and define your own stat.
By copying the ggplot2 stat_boxplot code and making a few edits, you can quickly define a new stat (stat_boxplot_custom) that takes the percentiles you want to use as an argument (qs) instead of the coef argument that stat_boxplot uses. The new stat is defined here:
# modified from https://github.com/tidyverse/ggplot2/blob/master/R/stat-boxplot.r
library(ggplot2)
stat_boxplot_custom <- function(mapping = NULL, data = NULL,
geom = "boxplot", position = "dodge",
...,
qs = c(.05, .25, 0.5, 0.75, 0.95),
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBoxplotCustom,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
qs = qs,
...
)
)
}
Then, the layer function is defined. Note that b/c I copied directly from stat_boxplot, you have to access a few internal ggplot2 functions using :::. This includes a lot of stuff copied directly over from StatBoxplot, but the key area is in computing the stats directly from the qs argument: stats <- as.numeric(stats::quantile(data$y, qs)) inside of the compute_group function.
StatBoxplotCustom <- ggproto("StatBoxplotCustom", Stat,
required_aes = c("x", "y"),
non_missing_aes = "weight",
setup_params = function(data, params) {
params$width <- ggplot2:::"%||%"(
params$width, (resolution(data$x) * 0.75)
)
if (is.double(data$x) && !ggplot2:::has_groups(data) && any(data$x != data$x[1L])) {
warning(
"Continuous x aesthetic -- did you forget aes(group=...)?",
call. = FALSE
)
}
params
},
compute_group = function(data, scales, width = NULL, na.rm = FALSE, qs = c(.05, .25, 0.5, 0.75, 0.95)) {
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
} else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- (data$y < stats[1]) | (data$y > stats[5])
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
} else {
# Sum up weights for non-NA positions of y and weight
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr / sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr / sqrt(n)
df$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df
}
)
There is also a gist here, containing this code.
Then, stat_boxplot_custom can be called just like stat_boxplot:
library(ggplot2)
y <- rnorm(100)
df <- data.frame(x = 1, y = y)
# whiskers extend to 5/95th percentiles by default
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom()
# or extend the whiskers to min/max
ggplot(df, aes(x = x, y = y)) +
stat_boxplot_custom(qs = c(0, 0.25, 0.5, 0.75, 1))

Resources