How to find the geom_curve control points in ggplot - r

I wonder how can I find the control points of geom_curve in ggplot2? e.g.
p <- ggplot(mtcars, aes(wt, mpg)) +
geom_curve(aes(x = 2.62, y = 21.0, xend = 3.57, yend = 15.0),curvature = -0.2, data = df) +
geom_point()
b <- ggplot_build(p)
b$data[[1]]
p$layers[[1]]$geom_params
b$data[[1]] gives the starting and ending points and p$layers[[1]]$geom_params gives the curve information (angle, curvature, ...).
But how can I find the control points, so I can reproduce the graphics?

library(ggplot)
library(grid)
library(stringr)
df <- data.frame(x = 1:3, y = 1:3)
df2 <- data.frame(x = c(1,3), y = c(1,3),
xend = c(2,2), yend = c(2,2))
g <- ggplot(df, aes(x, y)) +
geom_point() +
geom_curve(aes(x = x ,y = y,
xend = xend, yend = yend),
data = df2,
color = c("red", "blue"))
g
getCurve_controlPoints <- function(ggplotObject) {
len_layers <- length(ggplotObject$layers)
layerNames <- lapply(seq_len(len_layers),
function(j) {
className <- class(ggplotObject$layers[[j]]$geom)
className[-which(className %in% c("ggproto" ,"gg", "Geom"))]
})
curveLayerId <- which(sapply(layerNames,
function(layerName){
"GeomCurve" %in% layerName
}) == TRUE
)
gg_build <- ggplot_build(ggplotObject)
# you can also add yes or no in your code
# answer <- utils::menu(c("y", "n"), title="Do you want to draw the ggplot?")
grid.draw(ggplotObject)
grid.force()
gridList <- grid.ls(print = FALSE)
gridList.name <- gridList$name
xspline.name <- gridList.name[which(str_detect(gridList.name, "curve") == TRUE)]
xspline.len <- length(xspline.name)
controlPoints <- lapply(seq_len(length(curveLayerId)),
function (j) {
# curve data
curve_data <- gg_build$data[[curveLayerId[j]]]
# avoid duplicated rows
curve_data <- curve_data[!duplicated(curve_data), ]
n <- dim(curve_data)[1]
# here we go! But wait a second, it seems like the starting and ending position do not match
xsplinePoints <- xsplinePoints(grid.get("xspline", grep=TRUE))
# mapping data to our coordinates
control_data <- lapply(seq_len(n),
function(i){
if (n == 1) {
xy <- lapply(xsplinePoints,
function(coord){
as.numeric(coord)
})
} else {
xy <- lapply(xsplinePoints[[i]],
function(coord){
as.numeric(coord)
})
}
x.start <- curve_data[i, ]$x
x.end <- curve_data[i, ]$xend
y.start <- curve_data[i, ]$y
y.end <- curve_data[i, ]$yend
# mapping to ggplot coordinates
xy_x.diff <- xy$x[length(xy$x)] - xy$x[1]
xy_y.diff <- xy$y[length(xy$y)] - xy$y[1]
# maybe there is a better way?
if(xy_x.diff == 0){
xy_x.diff <- 1e-16
}
if(xy_y.diff == 0){
xy_y.diff <- 1e-16
}
x <- (x.end - x.start) / (xy_x.diff) * (xy$x - xy$x[1]) + x.start
y <- (y.end - y.start) / (xy_y.diff) * (xy$y - xy$y[1]) + y.start
list(x = x, y = y)
})
# grid remove
grid.remove(xspline.name[j], redraw = FALSE)
control_data
})
controlPoints
}
controlPoints <- getCurve_controlPoints(g)
# check the points
plot(controlPoints[[1]][[1]]$x, controlPoints[[1]][[1]]$y,
xlim = c(1,3),
ylim = c(1,3),
xlab = "x",
ylab = "y",
pch = 19)
points(controlPoints[[1]][[2]]$x, controlPoints[[1]][[2]]$y, pch = 19)
I think it works well so far and the ggplot version I used is 3.0.0. If you use any version less than 2.2.1, error may occur.
This idea is suggested by Prof Paul Murrell. Perhaps the easiest way to capture the control points of geom_curve? The cons are that grobs(ggplot object) must be drawn at first, since these points are only generated at the drawing time.

Related

Plot a discontinuous function in R without connecting a "jump"

I'd like to plot a discontinuous function without connecting a jump. For example, in the following plot, I'd like to delete the line connecting (0.5, 0.5) and (0.5, 1.5).
f <- function(x){
(x < .5) * (x) + (x >= .5) * (x + 1)
}
ggplot()+
geom_function(fun = f)
Edit: I'm looking for a solution that works even if the discountinuous point is not a round number, say pi/10.
You could write a little wrapper function which finds discontinuities in the given function and plots them as separate groups:
plot_fun <- function(fun, from = 0, to = 1, by = 0.001) {
x <- seq(from, to, by)
groups <- cut(x, c(-Inf, x[which(abs(diff(fun(x))) > 0.1)], Inf))
df <- data.frame(x, groups, y = fun(x))
ggplot(df, aes(x, y, group = groups)) +
geom_line()
}
This allows
plot_fun(f)
plot_fun(floor, 0, 10)
This answer is based on Allan Cameron's answer, but depicts the jump using open and closed circles. Whether the function is right or left continuous is controlled by an argument.
library("ggplot2")
plot_fun <- function(fun, from = 0, to = 1, by = 0.001, right_continuous = TRUE) {
x <- seq(from, to, by)
tol_vertical <- 0.1
y <- fun(x)
idx_break <- which(abs(diff(y)) > tol_vertical)
x_break <- x[idx_break]
y_break_l <- y[idx_break]
y_break_r <- y[idx_break + 1]
groups <- cut(x, c(-Inf, x_break, Inf))
df <- data.frame(x, groups, y = fun(x))
plot_ <- ggplot(df, aes(x, y, group = groups)) +
geom_line()
# add open and closed points showing jump
dataf_l <- data.frame(x = x_break, y = y_break_l)
dataf_r <- data.frame(x = x_break, y = y_break_r)
shape_open_circle <- 1
# this is the default of shape, but might as well specify.
shape_closed_circle <- 19
shape_size <- 4
if (right_continuous) {
shape_l <- shape_open_circle
shape_r <- shape_closed_circle
} else {
shape_l <- shape_closed_circle
shape_r <- shape_open_circle
}
plot_ <- plot_ +
geom_point(data = dataf_l, aes(x = x, y = y), group = NA, shape = shape_l, size = shape_size) +
geom_point(data = dataf_r, aes(x = x, y = y), group = NA, shape = shape_r, size = shape_size)
return(plot_)
}
Here's the OP's original example:
f <- function(x){
(x < .5) * (x) + (x >= .5) * (x + 1)
}
plot_fun(f)
Here's Allan's additional example using floor, which shows multiple discontinuities:
plot_fun(floor, from = 0, to = 10)
And here's an example showing that the function does not need to be piecewise linear:
f_curved <- function(x) ifelse(x > 0, yes = 0.5*(2-exp(-x)), no = 0)
plot_fun(f_curved, from = -1, to = 5)
You can insert everything inside an ifelse:
f <- function(x){
ifelse(x==0.5,
NA,
(x < .5) * (x) + (x >= .5) * (x + 1))
}
ggplot()+
geom_function(fun = f)

gam plots with ggplot

I need to create some gam plots in ggplot. I can do them with the general plot function, but am unsure how to do with ggplot. Here is my code and plots with the regular plot function. I'm using the College data set from the ISLR package.
train.2 <- sample(dim(College)[1],2*dim(College)[1]/3)
train.college <- College[train.2,]
test.college <- College[-train.2,]
gam.college <- gam(Outstate~Private+s(Room.Board)+s(Personal)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate), data=train.college)
par(mfrow=c(2,2))
plot(gam.college, se=TRUE,col="blue")
See update below old answer.
Old answer:
There is an implementation of GAM plotting using ggplot2 in voxel library. Here is how you would go about it:
library(ISLR)
library(mgcv)
library(voxel)
library(tidyverse)
library(gridExtra)
data(College)
set.seed(1)
train.2 <- sample(dim(College)[1],2*dim(College)[1]/3)
train.college <- College[train.2,]
test.college <- College[-train.2,]
gam.college <- gam(Outstate~Private+s(Room.Board)+s(Personal)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate), data=train.college)
vars <- c("Room.Board", "Personal", "PhD", "perc.alumni","Expend", "Grad.Rate")
map(vars, function(x){
p <- plotGAM(gam.college, smooth.cov = x) #plot customization goes here
g <- ggplotGrob(p)
}) %>%
{grid.arrange(grobs = (.), ncol = 2, nrow = 3)}
after a bunch of errors: In plotGAM(gam.college, smooth.cov = x) :
There are one or more factors in the model fit, please consider plotting by group since plot might be unprecise
To compare to the plot.gam:
par(mfrow=c(2,3))
plot(gam.college, se=TRUE,col="blue")
You might also want to plot the observed values:
map(vars, function(x){
p <- plotGAM(gam.college, smooth.cov = x) +
geom_point(data = train.college, aes_string(y = "Outstate", x = x ), alpha = 0.2) +
geom_rug(data = train.college, aes_string(y = "Outstate", x = x ), alpha = 0.2)
g <- ggplotGrob(p)
}) %>%
{grid.arrange(grobs = (.), ncol = 3, nrow = 2)}
or per group (especially important if you used the by argument (interaction in gam).
map(vars, function(x){
p <- plotGAM(gam.college, smooth.cov = x, groupCovs = "Private") +
geom_point(data = train.college, aes_string(y = "Outstate", x = x, color= "Private"), alpha = 0.2) +
geom_rug(data = train.college, aes_string(y = "Outstate", x = x, color= "Private" ), alpha = 0.2) +
scale_color_manual("Private", values = c("#868686FF", "#0073C2FF")) +
theme(legend.position="none")
g <- ggplotGrob(p)
}) %>%
{grid.arrange(grobs = (.), ncol = 3, nrow = 2)}
Update, 08. Jan. 2020.
I currently think the package mgcViz offers superior functionality compared to the voxel::plotGAMfunction. An example using the above data set and models:
library(mgcViz)
viz <- getViz(gam.college)
print(plot(viz, allTerms = T), pages = 1)
plot customization is similar go ggplot2 syntax:
trt <- plot(viz, allTerms = T) +
l_points() +
l_fitLine(linetype = 1) +
l_ciLine(linetype = 3) +
l_ciBar() +
l_rug() +
theme_grey()
print(trt, pages = 1)
This vignette shows many more examples.

List of plots generated in ggplot2 using scale_color_gradientn have wrong coloring

I'm attempting to use library(scales) and scale_color_gradientn() to create a custom mapping of colors to a continuous variable, in an attempt to limit the effect of outliers on the coloring of my plot. This works for a single plot, but does not work when I use a loop to generate several plots and store them in a list.
Here is a minimal working example:
library(ggplot2)
library(scales)
data1 <- as.data.frame(cbind(x = rnorm(100),
y = rnorm(100),
v1 = rnorm(100, mean = 2, sd = 1),
v2 = rnorm(100, mean = -2, sd = 1)))
#add outliers
data1[1,"v1"] <- 200
data1[2,"v1"] <- -200
data1[1,"v2"] <- 50
data1[2,"v2"] <- -50
#define color palette
cols <- colorRampPalette(c("#3540FF","black","#FF3535"))(n = 100)
#simple color scale
col2 <- scale_color_gradient2(low = "#3540FF",
mid = "black",
high = "#FF3535"
)
#outlier-adjusted color scale
{
aa <- min(data1$v1)
bb <- quantile(data1$v1, 0.05)
cc <- quantile(data1$v1, 0.95)
dd <- max(data1$v1)
coln <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd))
)
}
Plots:
1. Plot with simple scales - outliers cause scales to stretch out.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
col2
2. Plot with outlier-adjusted scales - correct color scaling.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
coln
3. The scales for v1 do not work for v2 as the data is different.
ggplot(data1, aes(x = x, y = y, color = v2))+
geom_point()+
coln
#loop to produce list of plots each with own scale
{
plots <- list()
k <- 1
for (i in c("v1","v2")){
aa <- min(data1[,i])
bb <- quantile(data1[,i],0.05)
cc <- quantile(data1[,i], 0.95)
dd <- max(data1[,i])
colm <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd)))
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i
))+
geom_point()+
colm
k <- k + 1
}
}
4. First plot has the wrong scales.
plots[[1]]
5. Second plot has the correct scales.
plots[[2]]
So I'm guessing this has something to do with the scale_color_gradientn() function being called when the plotting takes place, rather than within the loop.
If anyone can help with this, it'd be much appreciated. In base R I would bin the continuous data and assigning hex colors into a vector used for fill color, but I'm unsure how I can apply this within ggplot.
You need to use a closure (function with associated environment):
{
plots <- list()
k <- 1
for (i in c("v1", "v2")){
colm <- function() {
aa <- min(data1[, i])
bb <- quantile(data1[, i], 0.05)
cc <- quantile(data1[, i], 0.95)
dd <- max(data1[, i])
scale_color_gradientn(colors = cols[c(1, 5, 95, 100)],
values = rescale(c(aa, bb, cc, dd),
limits = c(aa, dd)))
}
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i)) +
geom_point() +
colm()
k <- k + 1
}
}
plots[[1]]
plots[[2]]

How to create a 2d polygon that outlines group data points and overlap analysis of such euclidean or hyperspace volumes [duplicate]

I know that in ggplot2 one can add the convex hull to a scatterplot by group as in
library(ggplot2)
library(plyr)
data(iris)
df<-iris
find_hull <- function(df) df[chull(df$Sepal.Length, df$Sepal.Width), ]
hulls <- ddply(df, "Species", find_hull)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = hulls, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
I was wondering though how one could calculate and add alpha bags instead, i.e. the largest convex hull that contains at least a proportion 1-alpha of all the points? Either in 2d (to display with ggplot2) or 3d (to display with rgl).
EDIT: My initial idea was be to keep on "peeling" the convex hull for as along as the criterion of containing at least a given % of points would be satisfied, although in the paper here it seems they use a different algorithm (isodepth, which seems to be implemented in R package depth, in function isodepth and aplpack::plothulls seems also close to what I want (although it produces a full plot as opposed to just the contour), so I think with these I may be sorted. Though these function only works in 2D, and I would also be interested in a 3D extension (to be plotted in rgl). If anyone has any pointers let me know!
EDIT2: with function depth::isodepth I found a 2d solution (see post below), although I am still looking for a 3D solution as well - if anyone would happen to know how to do that, please let me know!
Ha with the help of function depth::isodepth I came up with the following solution - here I find the alpha bag that contains a proportion of at least 1-alpha of all points :
library(mgcv)
library(depth)
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
target=1-alpha
propinside=1
d=1
while (propinside>target) {
p=isodepth(x[,1:2],dpth=d,output=T, mustdith=T)[[1]]
ninside=sum(in.out(p,as.matrix(x[,1:2],ncol=2))*1)
nonedge=sum(sapply(1:nrow(p),function (row)
nrow(merge(round(setNames(as.data.frame(p[row,,drop=F]),names(x)[1:2]),5),as.data.frame(x[,1:2])))>0)*1)-3
propinside=(ninside+nonedge)/n
d=d+1
}
p=isodepth(x[,1:2],dpth=d-1,output=T, mustdith=T)[[1]]
p }
bags <- ddply(df, "Species", find_bag,alpha=alph)
names(bags) <- c("Species",names(df)[1:2])
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
EDIT2:
Using my original idea of convex hull peeling I also came up with the following solution which now works in 2d & 3d; the result is not quite the same is with the isodepth algorithm, but it's pretty close :
# in 2d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
propinside=1
target=1-alpha
x2=x
while (propinside>target) {
propinside=nrow(x2)/n
hull=chull(x2)
x2old=x2
x2=x2[-hull,]
}
x2old[chull(x2old),] }
bags <- ddply(df, "Species", find_bag, alpha=alph)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
# in 3d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,3,5)]
levels=unique(df[,"Species"])
nlevels=length(levels)
zoom=0.8
cex=1
aspectr=c(1,1,0.7)
pointsalpha=1
userMatrix=matrix(c(0.80,-0.60,0.022,0,0.23,0.34,0.91,0,-0.55,-0.72,0.41,0,0,0,0,1),ncol=4,byrow=T)
windowRect=c(0,29,1920,1032)
cols=c("red","forestgreen","blue")
alph=0.05
plotbag = function(x,alpha=alph,grp=1,cols=c("red","forestgreen","blue"),transp=0.2) {
propinside=1
target=1-alpha
x2=x
levels=unique(x2[,ncol(x2)])
x2=x2[x2[,ncol(x2)]==levels[[grp]],]
n=nrow(x2)
while (propinside>target) {
propinside=nrow(x2)/n
hull=unique(as.vector(convhulln(as.matrix(x2[,1:3]), options = "Tv")))
x2old=x2
x2=x2[-hull,]
}
ids=t(convhulln(as.matrix(x2old[,1:3]), options = "Tv"))
rgl.triangles(x2old[ids,1],x2old[ids,2],x2old[ids,3],col=cols[[grp]],alpha=transp,shininess=50)
}
open3d(zoom=zoom,userMatrix=userMatrix,windowRect=windowRect,antialias=8)
for (i in 1:nlevels) {
plot3d(x=df[df[,ncol(df)]==levels[[i]],][,1],
y=df[df[,ncol(df)]==levels[[i]],][,2],
z=df[df[,ncol(df)]==levels[[i]],][,3],
type="s",
col=cols[[i]],
size=cex,
lit=TRUE,
alpha=pointsalpha,point_antialias=TRUE,
line_antialias=TRUE,shininess=50, add=TRUE)
plotbag(df,alpha=alph, grp=i, cols=c("red","forestgreen","blue"), transp=0.3) }
axes3d(color="black",drawfront=T,box=T,alpha=1)
title3d(color="black",xlab=names(df)[[1]],ylab=names(df)[[2]],zlab=names(df)[[3]],alpha=1)
aspect3d(aspectr)
We can modify the aplpack::plothulls function to accept a parameter for the proportion of points to enclose (in aplpack it's set to 50%). Then we can use this modified function to make a custom a geom for ggplot.
Here's the custom geom:
library(ggplot2)
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
#' #inheritParams ggplot2::stat_identity
#' #param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' #rdname ggplot2-ggproto
#' #format NULL
#' #usage NULL
#' #export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
And here's an example of how it can be used:
ggplot(iris, aes(Sepal.Length, Petal.Length, colour = Species, fill = Species)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) # enclose 5% of points

R: adding alpha bags to a 2d or 3d scatterplot

I know that in ggplot2 one can add the convex hull to a scatterplot by group as in
library(ggplot2)
library(plyr)
data(iris)
df<-iris
find_hull <- function(df) df[chull(df$Sepal.Length, df$Sepal.Width), ]
hulls <- ddply(df, "Species", find_hull)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = hulls, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
I was wondering though how one could calculate and add alpha bags instead, i.e. the largest convex hull that contains at least a proportion 1-alpha of all the points? Either in 2d (to display with ggplot2) or 3d (to display with rgl).
EDIT: My initial idea was be to keep on "peeling" the convex hull for as along as the criterion of containing at least a given % of points would be satisfied, although in the paper here it seems they use a different algorithm (isodepth, which seems to be implemented in R package depth, in function isodepth and aplpack::plothulls seems also close to what I want (although it produces a full plot as opposed to just the contour), so I think with these I may be sorted. Though these function only works in 2D, and I would also be interested in a 3D extension (to be plotted in rgl). If anyone has any pointers let me know!
EDIT2: with function depth::isodepth I found a 2d solution (see post below), although I am still looking for a 3D solution as well - if anyone would happen to know how to do that, please let me know!
Ha with the help of function depth::isodepth I came up with the following solution - here I find the alpha bag that contains a proportion of at least 1-alpha of all points :
library(mgcv)
library(depth)
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
target=1-alpha
propinside=1
d=1
while (propinside>target) {
p=isodepth(x[,1:2],dpth=d,output=T, mustdith=T)[[1]]
ninside=sum(in.out(p,as.matrix(x[,1:2],ncol=2))*1)
nonedge=sum(sapply(1:nrow(p),function (row)
nrow(merge(round(setNames(as.data.frame(p[row,,drop=F]),names(x)[1:2]),5),as.data.frame(x[,1:2])))>0)*1)-3
propinside=(ninside+nonedge)/n
d=d+1
}
p=isodepth(x[,1:2],dpth=d-1,output=T, mustdith=T)[[1]]
p }
bags <- ddply(df, "Species", find_bag,alpha=alph)
names(bags) <- c("Species",names(df)[1:2])
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
EDIT2:
Using my original idea of convex hull peeling I also came up with the following solution which now works in 2d & 3d; the result is not quite the same is with the isodepth algorithm, but it's pretty close :
# in 2d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,5)]
alph=0.05
find_bag = function(x,alpha=alph) {
n=nrow(x)
propinside=1
target=1-alpha
x2=x
while (propinside>target) {
propinside=nrow(x2)/n
hull=chull(x2)
x2old=x2
x2=x2[-hull,]
}
x2old[chull(x2old),] }
bags <- ddply(df, "Species", find_bag, alpha=alph)
plot <- ggplot(data = df, aes(x = Sepal.Length, y = Sepal.Width, colour=Species, fill = Species)) +
geom_point() +
geom_polygon(data = bags, alpha = 0.5) +
labs(x = "Sepal.Length", y = "Sepal.Width")
plot
# in 3d
library(plyr)
library(ggplot2)
data(iris)
df=iris[,c(1,2,3,5)]
levels=unique(df[,"Species"])
nlevels=length(levels)
zoom=0.8
cex=1
aspectr=c(1,1,0.7)
pointsalpha=1
userMatrix=matrix(c(0.80,-0.60,0.022,0,0.23,0.34,0.91,0,-0.55,-0.72,0.41,0,0,0,0,1),ncol=4,byrow=T)
windowRect=c(0,29,1920,1032)
cols=c("red","forestgreen","blue")
alph=0.05
plotbag = function(x,alpha=alph,grp=1,cols=c("red","forestgreen","blue"),transp=0.2) {
propinside=1
target=1-alpha
x2=x
levels=unique(x2[,ncol(x2)])
x2=x2[x2[,ncol(x2)]==levels[[grp]],]
n=nrow(x2)
while (propinside>target) {
propinside=nrow(x2)/n
hull=unique(as.vector(convhulln(as.matrix(x2[,1:3]), options = "Tv")))
x2old=x2
x2=x2[-hull,]
}
ids=t(convhulln(as.matrix(x2old[,1:3]), options = "Tv"))
rgl.triangles(x2old[ids,1],x2old[ids,2],x2old[ids,3],col=cols[[grp]],alpha=transp,shininess=50)
}
open3d(zoom=zoom,userMatrix=userMatrix,windowRect=windowRect,antialias=8)
for (i in 1:nlevels) {
plot3d(x=df[df[,ncol(df)]==levels[[i]],][,1],
y=df[df[,ncol(df)]==levels[[i]],][,2],
z=df[df[,ncol(df)]==levels[[i]],][,3],
type="s",
col=cols[[i]],
size=cex,
lit=TRUE,
alpha=pointsalpha,point_antialias=TRUE,
line_antialias=TRUE,shininess=50, add=TRUE)
plotbag(df,alpha=alph, grp=i, cols=c("red","forestgreen","blue"), transp=0.3) }
axes3d(color="black",drawfront=T,box=T,alpha=1)
title3d(color="black",xlab=names(df)[[1]],ylab=names(df)[[2]],zlab=names(df)[[3]],alpha=1)
aspect3d(aspectr)
We can modify the aplpack::plothulls function to accept a parameter for the proportion of points to enclose (in aplpack it's set to 50%). Then we can use this modified function to make a custom a geom for ggplot.
Here's the custom geom:
library(ggplot2)
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
#' #inheritParams ggplot2::stat_identity
#' #param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' #rdname ggplot2-ggproto
#' #format NULL
#' #usage NULL
#' #export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
And here's an example of how it can be used:
ggplot(iris, aes(Sepal.Length, Petal.Length, colour = Species, fill = Species)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) # enclose 5% of points

Resources