jitter if multiple outliers in ggplot2 boxplot - r

I am trying to find a suitable display to illustrate various properties within and across school classes. For each class there is only 15-30 data points (pupils).
Right now i am leaning towards a whisker-less boxplot, showing only 1.,2. and 3. quartile + datapoints more then e.g. 1 population SD +/- the sample median.
This I can do.
However - I need to show this graph to some teachers, in order to gauge what they like most. I'd like to compare my graph with a normal boxplot. But the normal boxplot looks the same if there is only one outlier, or e.g. 5 outliers at the same value. In this case this would be a deal-breaker.
e.g.
test <-structure(list(value = c(3, 5, 3, 3, 6, 4, 5, 4, 6, 4, 6, 4,
4, 6, 5, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 5, 6, 6, 4, 3, 5, 4,
6, 5, 6, 4, 5, 5, 3, 4, 4, 6, 4, 4, 5, 5, 3, 4, 5, 8, 8, 8, 8,
9, 6, 6, 7, 6, 9), places = structure(c(1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L), .Label = c("a", "b"), class = "factor")), .Names = c("value",
"places"), row.names = c(NA, -60L), class = "data.frame")
ggplot(test, aes(x=places,y=value))+geom_boxplot()
Here there are two outliers at ("a",9) - but only one "dot" shown.
So my question: How to jitter the outliers. And - what kind of display would you suggest for this kind of data?

you can redifine the funcition
GeomBoxplot$draw<-function (., data, ..., outlier.colour = "black", outlier.shape = 16,
outlier.size = 2, outlier.jitter=0)
{
defaults <- with(data, data.frame(x = x, xmin = xmin, xmax = xmax,
colour = colour, size = size, linetype = 1, group = 1,
alpha = 1, fill = alpha(fill, alpha), stringsAsFactors = FALSE))
defaults2 <- defaults[c(1, 1), ]
if (!is.null(data$outliers) && length(data$outliers[[1]] >=
1)) {
pp<-position_jitter(width=outlier.jitter,height=0)
p<-pp$adjust(data.frame(x=data$x[rep(1, length(data$outliers[[1]]))], y=data$outliers[[1]]),.scale)
outliers_grob <- GeomPoint$draw(data.frame(x=p$x, y = p$y, colour = I(outlier.colour),
shape = outlier.shape, alpha = 1, size = outlier.size,
fill = NA), ...)
}
else {
outliers_grob <- NULL
}
with(data, ggname(.$my_name(), grobTree(outliers_grob, GeomPath$draw(data.frame(y = c(upper,
ymax), defaults2), ...), GeomPath$draw(data.frame(y = c(lower,
ymin), defaults2), ...), GeomRect$draw(data.frame(ymax = upper,
ymin = lower, defaults), ...), GeomRect$draw(data.frame(ymax = middle,
ymin = middle, defaults), ...))))
}
ggplot(test, aes(x=places,y=value))+geom_boxplot(outlier.jitter=0.05)
This is ad-hoc solution. Of course, in the sense of OOP, you should create a sub-class of GeomBoxplot and override the function. This is easy because ggplot2 is nice.
=== added for example of sub-class definition ===
GeomBoxplotJitterOutlier <- proto(GeomBoxplot, {
draw <- function (., data, ..., outlier.colour = "black", outlier.shape = 16,
outlier.size = 2, outlier.jitter=0) {
# copy the body of function 'draw' above and paste here.
}
objname <- "boxplot_jitter_outlier"
desc <- "Box and whiskers plot with jittered outlier"
guide_geom <- function(.) "boxplot_jitter_outlier"
})
geom_boxplot_jitter_outlier <- GeomBoxplotJitterOutlier$build_accessor()
then you can do with your sub-class:
ggplot(test, aes(x=places,y=value))+geom_boxplot_jitter_outlier(outlier.jitter=0.05)

It seems like the accepted answer doesn't work anymore, since ggplot2 has been updated.
After much search on the net I found the following on: http://comments.gmane.org/gmane.comp.lang.r.ggplot2/3616 -Look at Winston Chang's reply-
He calculates the outliers separately using ddply and then plotts them using
geom_dotplot()
having disabled the outlier output on the geom_boxplot():
geom_boxplot(outlier.colour = NA)
Here is the full code from the URL mentioned above:
# This returns a data frame with the outliers only
find_outliers <- function(y, coef = 1.5) {
qs <- c(0, 0.25, 0.5, 0.75, 1)
stats <- as.numeric(quantile(y, qs))
iqr <- diff(stats[c(2, 4)])
outliers <- y < (stats[2] - coef * iqr) | y > (stats[4] + coef * iqr)
return(y[outliers])
}
library(MASS) # Use the birthwt data set from MASS
# Find the outliers for each level of 'smoke'
library(plyr)
outlier_data <- ddply(birthwt, .(smoke), summarise, lwt = find_outliers(lwt))
# This draws an ordinary box plot
ggplot(birthwt, aes(x = factor(smoke), y = lwt)) + geom_boxplot()
# This draws the outliers using geom_dotplot
ggplot(birthwt, aes(x = factor(smoke), y = lwt)) +
geom_boxplot(outlier.colour = NA) +
#also consider:
# geom_jitter(alpha = 0.5, size = 2)+
geom_dotplot(data = outlier_data, binaxis = "y",
stackdir = "center", binwidth = 4)

Given the small number of data points, you would like to plot all the points not only the outliers. This will help to find out the distribution of points inside your boxplot.
You can do that with geom_jitter, but notice that box_plot already plot dots for the outliers so in order to not display them twice you need to switch off the outliers display of the boxplot with geom_boxplot(outlier.shape = NA).
library("ggplot2")
test <-structure(list(value = c(3, 5, 3, 3, 6, 4, 5, 4, 6, 4, 6, 4, 4, 6, 5, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 5, 6, 6, 4, 3, 5\
, 4, 6, 5, 6, 4, 5, 5, 3, 4, 4, 6, 4, 4, 5, 5, 3, 4, 5, 8, 8, 8, 8, 9, 6, 6, 7, 6, 9), places = structure(c(1L, 2L, 1L, 1L, 1L\
, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, \
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("a", "b"), class =\
"factor")), .Names = c("value", "places"), row.names = c(NA, -60L), class = "data.frame")
# adding a level that you will use latter for giving colors
l <- rep(c(10,20,30,40,50,60), 10)
test$levels<-l
# [1]
# original plot
ggplot(test, aes(x=places,y=value))+geom_boxplot()
# [2]
# plot with outlier from boxplot and the points jittered to see
# distribution (outliers and the same point from position jitter would be
# counted twice for each different height)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot() + geom_jitter(position=position_jitter(width=0.1, height=0))
# [3]
# make wider the jitter to avoid overplotting because there are a lot
# of points with the same value, also remove the outliers from boxplot
# (they are plotted with the geom_jitter anyway)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0))
# [4]
# adding colors to the points to see if there is a sub-pattern in the distribution
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0), aes(colour=levels))
# [5]
# adding a bit of vertical jittering
# jittering (a good option for a less discrete datasets)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0.05), aes(colour=levels))
# [6]
# finally remember that position_jitter makes a jittering of a 40% of
# the resolution of the data, so if you forget the height=0 you will
# have a total different picture
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.2))

Does this get you what you are looking for? The limit to where the jitter begins is not automatic, but it is a start.
g = ggplot(test, aes(x = places,y = value))
g + geom_boxplot(outlier.colour = rgb(0,0,0,0)) + geom_point(data = test[test$value > 8,], position = position_jitter(width = .4))

Code abode doesn't work anymore. For current version of ggplot2 I used the following class:
DrawGeomBoxplotJitterOutlier <- function(data, panel_params, coord, ...,
outlier.jitter.width=NULL,
outlier.jitter.height=0,
outlier.colour = NULL,
outlier.fill = NULL,
outlier.shape = 19,
outlier.size = 1.5,
outlier.stroke = 0.5,
outlier.alpha = NULL) {
boxplot_grob <- ggplot2::GeomBoxplot$draw_group(data, panel_params, coord, ...)
point_grob <- grep("geom_point.*", names(boxplot_grob$children))
if (length(point_grob) == 0)
return(boxplot_grob)
ifnotnull <- function(x, y) ifelse(is.null(x), y, x)
if (is.null(outlier.jitter.width)) {
outlier.jitter.width <- (data$xmax - data$xmin) / 2
}
x <- data$x[1]
y <- data$outliers[[1]]
if (outlier.jitter.width > 0 & length(y) > 1) {
x <- jitter(rep(x, length(y)), amount=outlier.jitter.width)
}
if (outlier.jitter.height > 0 & length(y) > 1) {
y <- jitter(y, amount=outlier.jitter.height)
}
outliers <- data.frame(
x = x, y = y,
colour = ifnotnull(outlier.colour, data$colour[1]),
fill = ifnotnull(outlier.fill, data$fill[1]),
shape = ifnotnull(outlier.shape, data$shape[1]),
size = ifnotnull(outlier.size, data$size[1]),
stroke = ifnotnull(outlier.stroke, data$stroke[1]),
fill = NA,
alpha = ifnotnull(outlier.alpha, data$alpha[1]),
stringsAsFactors = FALSE
)
boxplot_grob$children[[point_grob]] <- ggplot2::GeomPoint$draw_panel(outliers, panel_params, coord)
return(boxplot_grob)
}
GeomBoxplotJitterOutlier <- ggplot2::ggproto("GeomBoxplotJitterOutlier",
ggplot2::GeomBoxplot,
draw_group = DrawGeomBoxplotJitterOutlier)
geom_boxplot_jitter_outlier <- function(mapping = NULL, data = NULL,
stat = "boxplot", position = "dodge",
..., outlier.jitter.width=0,
outlier.jitter.height=NULL,
na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
geom = GeomBoxplotJitterOutlier, mapping = mapping, data = data,
stat = stat, position = position, show.legend = show.legend,
inherit.aes = inherit.aes, params = list(na.rm = na.rm,
outlier.jitter.width=outlier.jitter.width,
outlier.jitter.height=outlier.jitter.height, ...))
}

Related

Order Grouped Geom_lines in ggplot

I am wishing to show multiple geom_points in order of "season_pts" from each group "drafted_qbs". The issue though is that I'm not sure what to assign the other variable. I have a "team" column which is just the row number of each group but that will only order the first grouping "2".
Any way of laying on the same graph (not interested in faceting) each groups "fantasy_pts" in order of points would be helpful.
Data
structure(list(team = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L),
season_pts = c(447.44, 381.62, 416.6, 367.96, 419.92, 490.78,
501.66, 458.56, 484.48, 458.36, 518, 495.7, 511.34, 499.68,
536.42, 522.92, 536.92, 518.46, 538.06, 525.96, 541.84, 523.26,
542.98, 527.4, 527), drafted_qbs = c(2, 2, 2, 2, 2, 3, 3,
3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -25L))
It usually helps to ask yourself "what is it I am trying to show with this plot?"
If you are trying to show that an increased number of drafted QBs tends to produce an increased number of points, then you can do something like this:
ggplot(df, aes(drafted_qbs, season_pts)) +
geom_point(size = 4, aes(color = factor(team))) +
geom_smooth(color = 'gray20', size = 0.5, linetype = 2, alpha = 0.15) +
scale_color_brewer(palette = 'Set1') +
theme_light(base_size = 16) +
labs(x = 'Drafted QBs', y = 'Season Points', color = 'Team') +
theme(panel.grid.minor.x = element_blank())
If you want to show that not all teams are affected equally by this effect, then something like this might be preferable:
ggplot(df, aes(team, season_pts, color = drafted_qbs)) +
geom_point(size = 4, alpha = 0.5) +
scale_color_gradient(low = 'red3', high = 'blue3') +
theme_light(base_size = 16) +
labs(x = 'Team', y = 'Season Points', color = 'Drafted QBs') +
theme(panel.grid.minor.x = element_blank())

set x-axis display range

The example data:
nltt <- structure(list(time = c(0, 1.02579504946471, 1.66430039972234,
1.67435173974229, 1.82009140034063, 1.95574135900067, 2.06963147976273,
2.64869249896209, 3.10438864450297, 0, 0.56927280073675, 1.94864234867127,
3.40490224199942, 0, 0.444318793403606, 1.34697160089298, 5.86547288923207,
0, 1.10824151737219, 1.77801220982399, 1.82246583876729, 2.18034182214015,
2.33051663760657, 3.01615794541527, 0, 0.101501512884473, 0.98261402255534,
1.04492141817475, 1.16554239437694, 1.25441082400256, 1.25777371029976,
1.62464049949719, 1.87253384965378, 1.91118229908154, 1.94105777022533,
2.17755553127212, 2.37899716574036, 2.85003451051712, 3.16711386665322
), num = c(2, 3, 4, 5, 4, 5, 6, 5, 6, 2, 3, 4, 5, 2, 3, 2, 3,
2, 3, 4, 5, 6, 7, 6, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11,
12, 13, 14), rep = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L)), row.names = c(NA,
-39L), class = "data.frame")
The example code:
ggplot2::ggplot(nltt, aes(time, num, group = as.factor(rep), color = as.factor(rep))) +
ggplot2::geom_line() + ggplot2::coord_cartesian(xlim = c(0, 3)) +
ggplot2::theme(legend.position = "none") + ggplot2::xlab("age")
The example plot:
I would like each line in the plot to stop precisely at x = 3, but adding coord_cartesian(xlim = c(0, 3)) does not achieve my goal because the lines continues in the right padding area. How can I limit the lines in the range of [0, 3] without truncating my raw data?
Up front: two answers, the first removes the margins and requires no change to the data; the second preserves the margins, which requires one to modify the data in-place.
Remove the margin(s)
The default behavior is to expand the margins a little. While there is the ggplot2::expansion to control the multiplicative and additive components of the expansion, it can only be used in scale_continuous which, as you know, will result in loss (NA) of out-of-bound points.
If you can accept losing the boundary on the left as well, though, you can add expand=FALSE to your coord_cartesian and get your desired results:
ggplot2::ggplot(nltt, aes(time, num, group = as.factor(rep), color = as.factor(rep))) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(xlim = c(0, 3), expand = FALSE) +
ggplot2::theme(legend.position = "none") +
ggplot2::xlab("age")
If you want to retain the left margin, though, you can force it by adjusting the xlim=, realizing that the default is around expansion(mult=0.05, add=0):
ggplot2::ggplot(nltt, aes(time, num, group = as.factor(rep), color = as.factor(rep))) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(xlim = c(-0.15, 3), expand = FALSE) +
ggplot2::theme(legend.position = "none") +
ggplot2::xlab("age")
Interpolate and truncate, external to ggplot
ggplot2::scale_x_continuous(..., oob=) supports several mechanisms for dealing with out-of-bounds data, including:
the default censor (replaces with NA), which doesn't work since we don't have data at time=3
scales::squish that will take (for example) x=4 and squish it back to x=3; the unfortunate side-effect of this is that it is univariate (it does not attempt to change the corresponding y= value), so the slopes of the squished line segments will be steeper, and (in my mind at least) this corrupts the data and vis;
a user-defined function that is passed the values and the associated limits; unfortunately, it is also univariate, so we're stuck with the same data/vis slope-corruption as the previous bullet.
This brings me to the suggestion to interpolate the data yourself before passing to ggplot. I'll demo with dplyr but it can be done easily with base R or other dialects as well.
library(dplyr)
group_by(nltt, rep) %>%
## step 1: interpolate, returns *just* time=3 data, nothing more
summarize(as.data.frame(setNames(approx(time, num, xout = 3), c("time", "num")))) %>%
## step 2: combine with the original data
bind_rows(nltt) %>%
## step 3: remove data over 3
dplyr::filter(time <= 3) %>%
ggplot(aes(time, num, group = as.factor(rep), color = as.factor(rep))) +
ggplot2::geom_line() + ggplot2::coord_cartesian(xlim = c(0, 3)) +
ggplot2::theme(legend.position = "none") +
ggplot2::xlab("age")

ggplot legend -- how to make the line segment longer? [duplicate]

Considering the data.frame "dfc":
dfc <- structure(list(lag = c(-5L, -4L, -3L, -2L, -1L, 0L, 1L, 2L, 3L,
4L, 5L, -5L, -4L, -3L, -2L, -1L, 0L, 1L, 2L, 3L, 4L, 5L), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("var1", "var2"), class = "factor"),
value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0.6, 1.2, 1.8,
2.4, 3, 3.6, 4.2, 4.8, 5.4, 6, 6.6), size = c(2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), .Names = c("lag",
"variable", "value", "size"), row.names = c(NA, -22L), class = "data.frame")
Which looks like:
head(dfc)
lag variable value size
1 -5 var1 1 2
2 -4 var1 2 2
3 -3 var1 3 2
4 -2 var1 4 2
5 -1 var1 5 2
6 0 var1 6 2
I would like to plot "value" against "lag" with different line styles (depending on "variable) and widths (depending on "size"). I achieved with this piece of code:
ggplot(dfc) +
geom_line(aes(x = lag, y = value, linetype = variable, size = size)) +
scale_size(range=c(1, 2), guide=FALSE) +
scale_linetype_manual(values = 1:2, labels = c("Name of var1 ", "Name of var2")) +
theme(legend.title = element_blank(), legend.position = "bottom",
legend.direction = "horizontal")
Giving this plot:
But I cannot figure out how, at the same, to change the size of the line inside the legend, accordingly, and to keep the renaming.
z <- ggplot(dfc) +
geom_line(aes(x = lag, y = value, linetype = variable, size = size)) +
scale_size(range=c(1, 2), guide=FALSE)
z <- z + theme(legend.key.width = unit(5,"cm"))
z <- z + guides(linetype = guide_legend(override.aes = list(size = 2)))
z
I have added the guides() function to my previous answer.
After increasing the width of the legend.key.width, size of the lines in the legend can be changed by using the override.aes parameter and assigning it to the linetype variable.
This can be other way than what #cogitovita proposed.
Since dfc is grouped by $variable and $size is constant within each group. Try extract $size and set it manually.
var_size = c('var1'=2, 'var2'=1)
ggplot(dfc) +
geom_line(aes(x = lag, y = value, linetype = variable, size=variable)) +
scale_size_manual(values=var_size)
library(ggplot2)
library(grid)
z <- ggplot(dfc) + geom_line(aes(x = lag, y = value, linetype = variable, size = size)) +
scale_size(range=c(1, 2), guide=FALSE)
z <- z + theme(legend.key.width = unit(5,"cm"))
Width of the Legend is changed using legend.key.width parameter in the ggplot2 library package
You can change the numeric value of the width. Loading the grid library is important. Else the 'unit' is not recognized by R.

Remove box and points in legend

How do I remove the the box, ribbon color, and points in the legend? I would just like a straight line representing each color of the color. I've tried using guides(), but it's not changing.
Sample data:
pdat1 <- structure(list(type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("10-year",
"20-year", "30-year"), class = "factor"), effect = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), value = c(0,
-21.89, -27.36, -33.75, -40.57, -47.32, 0, -23, -28.31, -34.96,
-42.6, -50.81, 0, -16.9, -22.25, -28.87, -36.4, -44.52, 0, -10.24,
-16.8, -24.74, -33.52, -42.55, 0, -10.24, -16.8, -24.74, -33.52,
-42.55, 0, -10.24, -16.8, -24.74, -33.52, -42.55), temp = c(0,
1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5), value_max = c(2.91,
-19.02, -24.42, -30.88, -37.63, -44.35, 2.9, -20.09, -25.36,
-32.05, -39.67, -47.87, 2.97, -14.02, -19.27, -25.89, -33.49,
-41.58, 2.42, -7.74, -14.34, -22.27, -31.06, -40.02, 2.45, -7.8,
-14.36, -22.26, -31.07, -40.07, 2.46, -7.71, -14.23, -22.23,
-31.02, -40.05), value_min = c(-2.91, -24.76, -30.3, -36.63,
-43.5, -50.3, -2.9, -25.91, -31.27, -37.87, -45.52, -53.75, -2.97,
-19.77, -25.24, -31.85, -39.32, -47.46, -2.42, -12.74, -19.26,
-27.21, -35.98, -45.08, -2.45, -12.68, -19.24, -27.22, -35.96,
-45.02, -2.46, -12.77, -19.37, -27.25, -36.02, -45.05)), class = "data.frame", row.names = c(NA,
-36L), .Names = c("type", "effect", "value", "temp", "value_max",
"value_min"))
Plot Code
library(ggplot2)
ggplot(pdat1) +
geom_ribbon(aes(ymax = value_max, ymin = value_min, x = temp, linetype = NA, color = effect, fill = effect), fill = "#C0CCD9", alpha = 0.5 ) +
geom_line(aes(x = temp, y = value, color = effect, group = effect)) +
geom_point(aes(x = temp, y = value, color = effect), size = 0.5) +
ylab("Y") +
xlab("X") +
guides(color = guide_legend(keywidth = 2,
keyheight = 1,
override.aes = list(linetype = c(1, 1),
size = 1,
shape = c(0, 0)))) +
facet_wrap(~type)
Your ggplot code is a little bit messy, particularly for the ribbon. For example the fill aestetic is both mapped to the effect variable and set to a color value (#C0CCD9).
To remove the boxes in the legend key you need to use legend.key in theme but it works only after cleaning your ggplot code.
To avoid unnecessary repetitions I have moved severeal aestetics to the first ggplot call so that ggplot use them as default for the subsequent geom_XX calls.
ggplot(pdat1, aes(x = temp, y = value, group = effect)) +
geom_ribbon(aes(ymax = value_max, ymin = value_min), fill = "#C0CCD9", alpha = 0.5 ) +
geom_line(aes(color = effect)) +
geom_point(aes(color = effect), size = 0.5) +
ylab("Y") + xlab("X") +
guides(color = guide_legend(keywidth = 2, keyheight = 1,
override.aes = list(size = 1, shape = NA))) +
facet_wrap(~type) +
theme_bw() +
theme(legend.key = element_rect(fill = NA, color = NA))

Dodge/jitter the position of outliers in geom_boxplot (ggplot2) [duplicate]

I am trying to find a suitable display to illustrate various properties within and across school classes. For each class there is only 15-30 data points (pupils).
Right now i am leaning towards a whisker-less boxplot, showing only 1.,2. and 3. quartile + datapoints more then e.g. 1 population SD +/- the sample median.
This I can do.
However - I need to show this graph to some teachers, in order to gauge what they like most. I'd like to compare my graph with a normal boxplot. But the normal boxplot looks the same if there is only one outlier, or e.g. 5 outliers at the same value. In this case this would be a deal-breaker.
e.g.
test <-structure(list(value = c(3, 5, 3, 3, 6, 4, 5, 4, 6, 4, 6, 4,
4, 6, 5, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 5, 6, 6, 4, 3, 5, 4,
6, 5, 6, 4, 5, 5, 3, 4, 4, 6, 4, 4, 5, 5, 3, 4, 5, 8, 8, 8, 8,
9, 6, 6, 7, 6, 9), places = structure(c(1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L), .Label = c("a", "b"), class = "factor")), .Names = c("value",
"places"), row.names = c(NA, -60L), class = "data.frame")
ggplot(test, aes(x=places,y=value))+geom_boxplot()
Here there are two outliers at ("a",9) - but only one "dot" shown.
So my question: How to jitter the outliers. And - what kind of display would you suggest for this kind of data?
you can redifine the funcition
GeomBoxplot$draw<-function (., data, ..., outlier.colour = "black", outlier.shape = 16,
outlier.size = 2, outlier.jitter=0)
{
defaults <- with(data, data.frame(x = x, xmin = xmin, xmax = xmax,
colour = colour, size = size, linetype = 1, group = 1,
alpha = 1, fill = alpha(fill, alpha), stringsAsFactors = FALSE))
defaults2 <- defaults[c(1, 1), ]
if (!is.null(data$outliers) && length(data$outliers[[1]] >=
1)) {
pp<-position_jitter(width=outlier.jitter,height=0)
p<-pp$adjust(data.frame(x=data$x[rep(1, length(data$outliers[[1]]))], y=data$outliers[[1]]),.scale)
outliers_grob <- GeomPoint$draw(data.frame(x=p$x, y = p$y, colour = I(outlier.colour),
shape = outlier.shape, alpha = 1, size = outlier.size,
fill = NA), ...)
}
else {
outliers_grob <- NULL
}
with(data, ggname(.$my_name(), grobTree(outliers_grob, GeomPath$draw(data.frame(y = c(upper,
ymax), defaults2), ...), GeomPath$draw(data.frame(y = c(lower,
ymin), defaults2), ...), GeomRect$draw(data.frame(ymax = upper,
ymin = lower, defaults), ...), GeomRect$draw(data.frame(ymax = middle,
ymin = middle, defaults), ...))))
}
ggplot(test, aes(x=places,y=value))+geom_boxplot(outlier.jitter=0.05)
This is ad-hoc solution. Of course, in the sense of OOP, you should create a sub-class of GeomBoxplot and override the function. This is easy because ggplot2 is nice.
=== added for example of sub-class definition ===
GeomBoxplotJitterOutlier <- proto(GeomBoxplot, {
draw <- function (., data, ..., outlier.colour = "black", outlier.shape = 16,
outlier.size = 2, outlier.jitter=0) {
# copy the body of function 'draw' above and paste here.
}
objname <- "boxplot_jitter_outlier"
desc <- "Box and whiskers plot with jittered outlier"
guide_geom <- function(.) "boxplot_jitter_outlier"
})
geom_boxplot_jitter_outlier <- GeomBoxplotJitterOutlier$build_accessor()
then you can do with your sub-class:
ggplot(test, aes(x=places,y=value))+geom_boxplot_jitter_outlier(outlier.jitter=0.05)
It seems like the accepted answer doesn't work anymore, since ggplot2 has been updated.
After much search on the net I found the following on: http://comments.gmane.org/gmane.comp.lang.r.ggplot2/3616 -Look at Winston Chang's reply-
He calculates the outliers separately using ddply and then plotts them using
geom_dotplot()
having disabled the outlier output on the geom_boxplot():
geom_boxplot(outlier.colour = NA)
Here is the full code from the URL mentioned above:
# This returns a data frame with the outliers only
find_outliers <- function(y, coef = 1.5) {
qs <- c(0, 0.25, 0.5, 0.75, 1)
stats <- as.numeric(quantile(y, qs))
iqr <- diff(stats[c(2, 4)])
outliers <- y < (stats[2] - coef * iqr) | y > (stats[4] + coef * iqr)
return(y[outliers])
}
library(MASS) # Use the birthwt data set from MASS
# Find the outliers for each level of 'smoke'
library(plyr)
outlier_data <- ddply(birthwt, .(smoke), summarise, lwt = find_outliers(lwt))
# This draws an ordinary box plot
ggplot(birthwt, aes(x = factor(smoke), y = lwt)) + geom_boxplot()
# This draws the outliers using geom_dotplot
ggplot(birthwt, aes(x = factor(smoke), y = lwt)) +
geom_boxplot(outlier.colour = NA) +
#also consider:
# geom_jitter(alpha = 0.5, size = 2)+
geom_dotplot(data = outlier_data, binaxis = "y",
stackdir = "center", binwidth = 4)
Given the small number of data points, you would like to plot all the points not only the outliers. This will help to find out the distribution of points inside your boxplot.
You can do that with geom_jitter, but notice that box_plot already plot dots for the outliers so in order to not display them twice you need to switch off the outliers display of the boxplot with geom_boxplot(outlier.shape = NA).
library("ggplot2")
test <-structure(list(value = c(3, 5, 3, 3, 6, 4, 5, 4, 6, 4, 6, 4, 4, 6, 5, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 5, 6, 6, 4, 3, 5\
, 4, 6, 5, 6, 4, 5, 5, 3, 4, 4, 6, 4, 4, 5, 5, 3, 4, 5, 8, 8, 8, 8, 9, 6, 6, 7, 6, 9), places = structure(c(1L, 2L, 1L, 1L, 1L\
, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, \
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("a", "b"), class =\
"factor")), .Names = c("value", "places"), row.names = c(NA, -60L), class = "data.frame")
# adding a level that you will use latter for giving colors
l <- rep(c(10,20,30,40,50,60), 10)
test$levels<-l
# [1]
# original plot
ggplot(test, aes(x=places,y=value))+geom_boxplot()
# [2]
# plot with outlier from boxplot and the points jittered to see
# distribution (outliers and the same point from position jitter would be
# counted twice for each different height)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot() + geom_jitter(position=position_jitter(width=0.1, height=0))
# [3]
# make wider the jitter to avoid overplotting because there are a lot
# of points with the same value, also remove the outliers from boxplot
# (they are plotted with the geom_jitter anyway)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0))
# [4]
# adding colors to the points to see if there is a sub-pattern in the distribution
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0), aes(colour=levels))
# [5]
# adding a bit of vertical jittering
# jittering (a good option for a less discrete datasets)
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.3, height=0.05), aes(colour=levels))
# [6]
# finally remember that position_jitter makes a jittering of a 40% of
# the resolution of the data, so if you forget the height=0 you will
# have a total different picture
dev.new()
ggplot(data=test, aes(x=places, y=value)) + geom_boxplot(outlier.shape = NA) +
geom_jitter(position=position_jitter(width=0.2))
Does this get you what you are looking for? The limit to where the jitter begins is not automatic, but it is a start.
g = ggplot(test, aes(x = places,y = value))
g + geom_boxplot(outlier.colour = rgb(0,0,0,0)) + geom_point(data = test[test$value > 8,], position = position_jitter(width = .4))
Code abode doesn't work anymore. For current version of ggplot2 I used the following class:
DrawGeomBoxplotJitterOutlier <- function(data, panel_params, coord, ...,
outlier.jitter.width=NULL,
outlier.jitter.height=0,
outlier.colour = NULL,
outlier.fill = NULL,
outlier.shape = 19,
outlier.size = 1.5,
outlier.stroke = 0.5,
outlier.alpha = NULL) {
boxplot_grob <- ggplot2::GeomBoxplot$draw_group(data, panel_params, coord, ...)
point_grob <- grep("geom_point.*", names(boxplot_grob$children))
if (length(point_grob) == 0)
return(boxplot_grob)
ifnotnull <- function(x, y) ifelse(is.null(x), y, x)
if (is.null(outlier.jitter.width)) {
outlier.jitter.width <- (data$xmax - data$xmin) / 2
}
x <- data$x[1]
y <- data$outliers[[1]]
if (outlier.jitter.width > 0 & length(y) > 1) {
x <- jitter(rep(x, length(y)), amount=outlier.jitter.width)
}
if (outlier.jitter.height > 0 & length(y) > 1) {
y <- jitter(y, amount=outlier.jitter.height)
}
outliers <- data.frame(
x = x, y = y,
colour = ifnotnull(outlier.colour, data$colour[1]),
fill = ifnotnull(outlier.fill, data$fill[1]),
shape = ifnotnull(outlier.shape, data$shape[1]),
size = ifnotnull(outlier.size, data$size[1]),
stroke = ifnotnull(outlier.stroke, data$stroke[1]),
fill = NA,
alpha = ifnotnull(outlier.alpha, data$alpha[1]),
stringsAsFactors = FALSE
)
boxplot_grob$children[[point_grob]] <- ggplot2::GeomPoint$draw_panel(outliers, panel_params, coord)
return(boxplot_grob)
}
GeomBoxplotJitterOutlier <- ggplot2::ggproto("GeomBoxplotJitterOutlier",
ggplot2::GeomBoxplot,
draw_group = DrawGeomBoxplotJitterOutlier)
geom_boxplot_jitter_outlier <- function(mapping = NULL, data = NULL,
stat = "boxplot", position = "dodge",
..., outlier.jitter.width=0,
outlier.jitter.height=NULL,
na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE) {
ggplot2::layer(
geom = GeomBoxplotJitterOutlier, mapping = mapping, data = data,
stat = stat, position = position, show.legend = show.legend,
inherit.aes = inherit.aes, params = list(na.rm = na.rm,
outlier.jitter.width=outlier.jitter.width,
outlier.jitter.height=outlier.jitter.height, ...))
}

Resources