geom_boxplot with mapped, variable widths per bar? - r

I would like to be able to map the width of each of the boxplots in a plot to a variable, or otherwise specify it. Let's say I want the relative widths of the boxes in the figure below to be 1, 2, 3. Setting varwidth won't help me since the actual numbers of observations are the same for each bar.
I have the beginnings of a horrible hacky solution I can post, but would welcome something actually good!
library(ggplot2)
set.seed(101)
dd <- data.frame(f = factor(rep(LETTERS[1:3], each = 10)),
y = rnorm(30))
g1 <- ggplot(dd, aes(f,y)) + geom_boxplot()
print(g1)

My basic idea (which would take more work to make it nice) is to ggplot_build(); hack the relevant elements in the data for the layer; and redraw the plot: example below. Obviously not as nice as having a real mapping/scaling system but maybe OK for simple cases ...
rel_wid <- c(1, 2, 3)
g1B <- ggplot_build(g1)
newdat <- g1B$data[[1]]
wids <- mean(newdat$new_width)*rel_wid/mean(rel_wid)
newdat <- within(newdat,
{
xmin <- newx - wids/2
xmax <- newx + wids/2
})
g2 <- g1B
g2$data[[1]] <- newdat
library(grid)
grid.draw(ggplot_gtable(g2))

A bit more concise and does it all in one gulp, without having to build the plot first:
library(ggplot2)
set.seed(101)
dd <- data.frame(f = factor(rep(LETTERS[1:3], each = 10)), y = rnorm(30))
ggplot(dd, aes(f,y)) +
Map(\(a, b) geom_boxplot(data = a, width = b), split(dd, dd$f), 1:3 * 0.35)
Created on 2023-02-07 with reprex v2.0.2

Related

Combine multiple facet strips across columns in ggplot2 facet_wrap

I am trying to combine facet strips across two adjacent panels (there is always two adjacent ones with the same first ID variable, but with two different scenarios, let's call them "A" and "B"). I am not particularly wedded to the gtable + grid solution I tried, but sadly I cannot use the facet_nested() from the ggh4x package (I cannot install it on my company's server due to various restrictions that are in place and needed dependencies - I looked at using only the relevant code, but that again is not easy due to the dependencies).
A minimum viable example of the basic plot I want to make easier to read by indicating which panels "belong together" by combining the top facet strips looks like this:
library(tidyverse)
library(gtable)
library(grid)
idx = 1:16
p1 = expand_grid(id=idx, id2=c("A", "B"), x=1:10) %>%
mutate(y=rnorm(n=n())) %>%
ggplot(aes(x=x,y=y)) +
geom_jitter() +
facet_wrap(~id + id2, nrow = 4, ncol=8)
The strips with the "1"s, the ones with the "2"s etc. should be combined (in reality it's a somewhat longer text, but this is just for illustration). I was trying to adapt an answer for a similar scenario (https://stackoverflow.com/a/40316170/7744356 - thank you #markus for finding it again), but this is what I tried. As you can see below, the height of what I produce seems wrong. I assume this must be some trivial thing I am overlooking/not understanding.
# Combine strips for a ID
g <- ggplot_gtable(ggplot_build(p1))
strip <- gtable_filter(g, "strip-t", trim = FALSE)
stript <- which(grepl('strip-t', g$layout$name))
stript2 = stript[idx*2-1]
top <- strip$layout$t[idx*2-1]
# # Using the $b below instead of b = top[i]+1, also seems not to work
#bot <- strip$layout$b[idx*2-1]
l <- strip$layout$l[idx*2-1]
r <- strip$layout$r[idx*2]
mat <- matrix(vector("list",
length = length(idx)*3),
nrow = length(idx))
mat[] <- list(zeroGrob())
res <- gtable_matrix("toprow", mat,
unit(c(1, 0, 1), "null"),
unit( rep(1, length(idx)),
"null"))
for (i in 1:length(stript2)){
if (i==1){
zz <- res %>%
gtable_add_grob(g$grobs[[stript2[i]]]$grobs[[1]], 1, 1, 1, 3) %>%
gtable_add_grob(g, .,
t = top[i],
l = l[i],
b = top[i]+1,
r = r[i],
name = c("add-strip"))
} else {
zz <- res %>%
gtable_add_grob(g$grobs[[stript2[i]]]$grobs[[1]], 1, 1, 1, 3) %>%
gtable_add_grob(zz, .,
t = top[i],
l = l[i],
b = top[i]+1,
r = r[i],
name = c("add-strip"))
}
}
grid::grid.draw(zz)
------------ Update with a ggh4x implementation -----------------
This may solve this type of problem for many, but has its downsides (e.g. axes alignment across rows gets a bit manual, probably need to manually remove x-axes and ensure the limits are the same, add a unified y-axis label, requires installation of a package from github: devtools::install_github("teunbrand/ggh4x#v0.1") for a specific version, plus cowplot interacts badly with e.g. ggtern). So I'd love it, if someone still managed to do a pure gtable + grid version.
library(tidyverse)
library(ggh4x)
library(cowplot)
plots = expand_grid(id=idx, id2=c("A", "B"), x=1:10) %>%
mutate(y=rnorm(n=n()),
plotrow=(id-1)%/%4+1) %>%
group_by(plotrow) %>%
group_map( ~ ggplot(data=.,
aes(x=x,y=y)) +
geom_jitter() +
facet_nested( ~ id + id2, ))
plot_grid(plotlist = plots, nrow = 4, ncol=1)
I'm a bit late to this game, but ggh4x now has a facet_nested_wrap() implementation that should greatly simplify this problem (disclaimer: I wrote ggh4x).
library(tidyverse)
library(ggh4x)
idx = 1:16
p1 = expand_grid(id=idx, id2=c("A", "B"), x=1:10) %>%
mutate(y=rnorm(n=n())) %>%
ggplot(aes(x=x,y=y)) +
geom_jitter() +
facet_nested_wrap(~id + id2, nrow = 4, ncol=8)
p1
Created on 2020-08-12 by the reprex package (v0.3.0)
Keep in mind that there might still be a few bugs in this. Also, I'm aware that this doesn't help the OP because his package versions are constrained, but I thought I mention this here anyway.
Here's a reprex of a somewhat pedestrian way to do it in grid. I have made the "parent" facet somewhat darker to emphasise the nesting, but if you prefer the color to match just change the rectGrob fill color to "gray85".
# Set up plot as per example
library(tidyverse)
library(gtable)
library(grid)
idx = 1:16
p1 = expand_grid(id=idx, id2=c("A", "B"), x=1:10) %>%
mutate(y=rnorm(n=n())) %>%
ggplot(aes(x=x,y=y)) +
geom_jitter() +
facet_wrap(~id + id2, nrow = 4, ncol=8)
g <- ggplot_gtable(ggplot_build(p1))
# Code to produce facet strips
stript <- grep("strip", g$layout$name)
grid_cols <- sort(unique(g$layout[stript,]$l))
t_vals <- rep(sort(unique(g$layout[stript,]$t)), each = length(grid_cols)/2)
l_vals <- rep(grid_cols[seq_along(grid_cols) %% 2 == 1], length = length(t_vals))
r_vals <- rep(grid_cols[seq_along(grid_cols) %% 2 == 0], length = length(t_vals))
labs <- levels(as.factor(p1$data$id))
for(i in seq_along(labs))
{
filler <- rectGrob(y = 0.7, height = 0.6, gp = gpar(fill = "gray80", col = NA))
tg <- textGrob(label = labs[i], y = 0.75, gp = gpar(cex = 0.8))
g <- gtable_add_grob(g, filler, t = t_vals[i], l = l_vals[i], r = r_vals[i],
name = paste0("filler", i))
g <- gtable_add_grob(g, tg, t = t_vals[i], l = l_vals[i], r = r_vals[i],
name = paste0("textlab", i))
}
grid.newpage()
grid.draw(g)
And to demonstrate changing the rectGrob to 50% height and "gray85":
Or if you wanted you could assign a different fill for each cycle of the loop:
Obviously the above method might take a few tweaks to fit other plots with different numbers of levels etc.
Created on 2020-07-04 by the reprex package (v0.3.0)
Maybe this can not tackle the issue, but I would like to post because it could help to present results in a different plot keeping the same structure. You will have to define the number of columns for the plot in plot_layout(ncol = 4). This code uses patchwork package. Hope this can be useful.
library(tidyverse)
library(gtable)
library(grid)
library(patchwork)
idx = 1:16
#Data
p1 = expand_grid(id=idx, id2=c("A", "B"), x=1:10) %>%
mutate(y=rnorm(n=n()))
#Split data
List <- split(p1,p1$id)
#Sketch function
myplot <- function(x)
{
d <- ggplot(x,aes(x=x,y=y)) +
geom_jitter() +
facet_wrap(~id2, nrow = 1, ncol=2)+
ggtitle(unique(x$id))+
theme(plot.title = element_text(hjust = 0.5))
return(d)
}
#List of plots
Lplots <- lapply(List,myplot)
#Concatenate plots
#Create chain for plots
chain <- paste0('Lplots[[',1:length(Lplots),']]',collapse = '+')
#Evaluate the object and create the plot
Plot <- eval(parse(text = chain))+plot_layout(ncol = 4)+
plot_annotation(title = 'A nice plot')&theme(plot.title = element_text(hjust=0.5))
#Display
Plot
You will end up with a plot like this:

R box plot with all data points ordered from low to high

In R, I would like to create a box plot that also shows all data points. There are numerous posts and websites where you can find this information, but they all seem to show the data points in ‘jitter’ or ‘random’ style. Here is an example code using the ToothGrowth dataset with ggplot2 in R.
library(datasets)
data(ToothGrowth)
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
library(ggplot2)
ggplot(ToothGrowth, aes(x=dose, y=len)) +
geom_boxplot(notch = TRUE) +
geom_jitter(position=position_jitter(0.2))
However, I would like to have the data points ordered from the lowest at the lower-left to highest at the top-right. Please see example in this link:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3966983/figure/F1/ (freely accessible). Specifically, I refer to Figure 1a, top portion (‘Purity’).
Might anyone have suggestions? I would much appreciate it. Thank you.
I don't know if this is what you are after but maybe you can inspire yourself from the code below.
set.seed(1234)
n <- 20
x <- rnorm(n)
boxplot(x)
points(seq(0.75, 1.25, length.out = n), sort(x))
df1 <- sapply(1:4, function(i) rnorm(n, mean = i))
df1 <- as.data.frame(df1)
df1 <- reshape2::melt(df1)
boxplot(value ~ variable, df1)
sp <- split(df1, df1$variable)
for(i in 1:4){
points(seq(i - 0.25, i + 0.25, length.out = n), sort(sp[[i]]$value))
}
Edit.
A ggplot2 solution uses a similar trick to define the points' x axis coordinates. The only thing "strange", is to rely on R's internal representation of factors as consecutive integers starting at 1. Note that this must be seen as a hack, but as a reliable one, I don't believe it will ever change.
library(ggplot2)
library(tidyverse)
df1 %>%
group_by(variable) %>%
arrange(value) %>%
mutate(xcoord = seq(-0.25, 0.25, length.out = n())) %>%
ggplot(aes(x = variable, y = value, group = variable)) +
geom_boxplot() +
geom_point(aes(x = xcoord + as.integer(variable)))

created a nested cdf that doesn't reach 1

Here is some workable example of data I wish to plot:
set.seed(123)
x <- rweibull(n = 2000, shape = 2, scale = 10)
x <- round(x, digits = 0)
x <- sort(x, decreasing = FALSE)
y <- c(rep(0.1, times = 500),rep(0.25, times = 500),rep(0.4, times = 500),rep(0.85, times = 500))
z <- rbinom(n=2000, size=1, prob=y)
df1 <- data.frame(x,z)
I want to plot the overal fequency of z across x.
unlike a typical cdf, the function should not reach 1.0, but instead
sum(df1$z)/length(df1$z)
a ymax of 0.36 (721/2000).
using ggplot2 we can create a cdf of x with the following command:
library(ggplot2)
ggplot(df1, aes(x)) + stat_ecdf()
But i want to extend this plot to show the cumulative percentage of z (as a function of 'x')
The end result should like like
EDIT
with some very poor data manipulation I am able to generate the something similiar to a cdf plot, but there must be a more beautiful and easy method using various packages and ggplot
mytable <- table(df1$x, df1$z)
mydf <- as.data.frame.matrix(mytable)
colnames(mydf) <- c("z_no", "z_yes")
mydf$A <- 1:length(mydf$z_no)
mydf$sum <- cumsum(mydf$z_yes)
mydf$dis <- mydf$sum/length(z)
plot(mydf$A, mydf$dis)
You can use the package dplyr to process the data as follows:
library(dplyr)
plot_data <- group_by(df1, x) %>%
summarise(z_num = sum(z)) %>%
mutate(cum_perc_z = cumsum(z_num)/nrow(df1))
This gives the same result as the data processing that you describe in your edit. Note, however, that I get sum(df1$z) = 796 and the maximal y value is thus 796/2000 = 0.398.
For the plot, you can use geom_step() to have a step function and add the horizontal line with geom_hline():
ggplot(plot_data, aes(x = x, y = cum_perc_z)) +
geom_step(colour = "red", size = 0.8) +
geom_hline(yintercept = max(plot_data$cum_perc_z))

Add second x-axis in ggplot2

In the "graphics" package one can add a second x-axis (indicating the percentiles of the distribution) to a histogram as follows:
x <- rnorm(1000)
hist(x, main="", xlab="Bias")
perc <- quantile(x, seq(from=.00, to=1, by=.1))
axis(1,at=perc,labels=c("0","10%","20%","30%","40%","50%","60%","70%","80%","90%","100%"),cex=0.5, pos= -90)
That looks awkward, of course. So how can I modify the following ggplot2 code to add a second x-axis, shwing the percentiles, while the first x-axis should indicate the raw values?:
library(ggplot2)
theme_classic(base_size = 12, base_family = "")
x <- rnorm(1000)
qplot(x, main="", xlab="Bias")
perc <- quantile(x, seq(from=.00, to=1, by=.1))
Any help? Many thanks in advance!
I'm not entirely certain what you're after, since your first example doesn't actually produce what you describe.
But in terms of simply adding the percentage along with the raw value along the x axis, the easiest strategy would probably be to simply combine the two with a line break in a single set of labels:
dat <- data.frame(x = rnorm(1000))
perc <- quantile(dat$x,seq(from = 0,to = 1,by = 0.1))
l <- paste(round(perc,1),names(perc),sep = "\n")
> ggplot(dat,aes(x = x)) +
geom_histogram() +
scale_x_continuous(breaks = perc,labels = l)
Here's another approach which uses annotate(...) and does not require that the two scales have the same breaks.
library(ggplot2)
library(grid)
set.seed(123)
x <- rnorm(1000)
perc <- quantile(x, seq(from=.00, to=1, by=.1))
labs <- gsub("\\%","",names(perc)) # strip "%" from names
yval <- hist(x,breaks=30,plot=F)$count
yrng <- diff(range(yval))
g1 <- ggplot() +
geom_histogram(aes(x=x))+
xlim(range(x))+
coord_cartesian(ylim=c(0,1.1*max(yval)))+
labs(x="")+
annotate(geom = "text", x = perc, y = -0.1*yrng, label = labs, size=4) +
annotate(geom = "text", x=0, y=-0.16*yrng, label="Bias", size=4.5)+
theme(plot.margin = unit(c(1, 1, 2, 1), "lines"))
g2 <- ggplot_gtable(ggplot_build(g1))
g2$layout$clip[g2$layout$name == "panel"] <- "off"
grid.draw(g2)
This adds the second x-axis and the label using annotate(...). The last three lines of code turn off clipping of the viewport. Otherwise the annotations aren't visible.
Credit to #Henrik for his answer to this question.

How to produce a meaningful draftsman/correlation plot for discrete values

One of my favorite tools for exploratory analysis is pairs(), however in the case of a limited number of discrete values, it falls flat as the dots all align perfectly. Consider the following:
y <- t(rmultinom(n=1000,size=4,prob=rep(.25,4)))
pairs(y)
It doesn't really give a good sense of correlation. Is there an alternative plot style that would?
If you change y to a data.frame you can add some 'jitter' and with the col option you can set the transparency level (the 4th number in rgb):
y <- data.frame(y)
pairs(sapply(y,jitter), col = rgb(0,0,0,.2))
Or you could use ggplot2's plotmatrix:
library(ggplot2)
plotmatrix(y) + geom_jitter(alpha = .2)
Edit: Since plotmatrix in ggplot2 is deprecated use ggpairs (GGally package mentioned in #hadley's comment above)
library(GGally)
ggpairs(y, lower = list(params = c(alpha = .2, position = "jitter")))
Here is an example using corrplot:
M <- cor(y)
corrplot.mixed(M)
You can find more examples in the intro
http://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html
Here are a couple of options using ggplot2:
library(ggplot2)
## re-arrange data (copied from plotmatrix function)
prep.plot <- function(data) {
grid <- expand.grid(x = 1:ncol(data), y = 1:ncol(data))
grid <- subset(grid, x != y)
all <- do.call("rbind", lapply(1:nrow(grid), function(i) {
xcol <- grid[i, "x"]
ycol <- grid[i, "y"]
data.frame(xvar = names(data)[ycol], yvar = names(data)[xcol],
x = data[, xcol], y = data[, ycol], data)
}))
all$xvar <- factor(all$xvar, levels = names(data))
all$yvar <- factor(all$yvar, levels = names(data))
return(all)
}
dat <- prep.plot(data.frame(y))
## plot with transparent jittered points
ggplot(dat, aes(x = x, y=y)) +
geom_jitter(alpha=.125) +
facet_grid(xvar ~ yvar) +
theme_bw()
## plot with color representing density
ggplot(dat, aes(x = factor(x), y=factor(y))) +
geom_bin2d() +
facet_grid(xvar ~ yvar) +
theme_bw()
I don't have enough credits yet to comment on #Vincent 's post - when doing
library(GGally)
ggpairs(y, lower = list(params = c(alpha = .2, position = "jitter")))
I get
Error in stop_if_params_exist(obj$params) :
'params' is a deprecated argument. Please 'wrap' the function to supply arguments. help("wrap", package = "GGally")
So it seems, based on the indicated help page, that it would need to be in this case here:
ydf <- as.data.frame(y)
regularPlot <- ggpairs(ydf, lower = list(continuous = wrap(ggally_points, alpha = .2, position = "jitter")))
regularPlot

Resources