Dendrogram with labels on the right side - r

I want a horizontal dendrogram with the variable names on the right side to display correlation coefficients. It would be nice if I could achieve it in some ggplot2-related package, since I want the diagram to be similar looking to my other graphics. scale_x_discrete(position="top) does not work, because then the labels disappear. These are my results so far:
library(ggplot2)
library(dplyr)
library(tidyr)
library(faux)
library(ggdendro)
# data
set.seed(5)
dat <- rnorm_multi(n = 100,
mu = c(0, 20, 20),
sd = c(1, 5, 5),
r = c(0.5, 0.5, 0.25),
varnames = c("A", "B", "C"),
empirical = FALSE)
# make correlation matrix
cor_matrix_before <- cor(dat, method="spearman")
# make dendrogram
tree <- hclust(as.dist(1 - cor_matrix_before**2))
ggdendrogram(tree) +
theme_light() +
theme(text = element_text(size=16)) +
xlab("") +
ylab("Spearmans rho squared") +
scale_y_reverse(breaks=seq(0,1,0.25), labels=rev(seq(0,1,0.25))) +
geom_hline(yintercept=0.7*0.7, col = "red") +
coord_flip()
(I stole the preparation of correlated variables from: https://cran.r-project.org/web/packages/faux/vignettes/rnorm_multi.html)
But this would be what I want (just a quick paint-montage):
EDIT: Thanks to #tjebo, this is my final solution (I removed all the parts that I did not need, look at his answer for a more generic answer):
tree <- hclust(as.dist(1 - cor_matrix_before**2))
data <- ggdendro::dendro_data(tree)
ggplot() +
geom_blank()+
geom_segment(data = segment(data), aes_string(x = "x", y = "y", xend = "xend", yend = "yend")) +
geom_hline(yintercept=0.7*0.7, col = "red") +
scale_x_continuous(breaks = seq_along(data$labels$label), labels = data$labels$label, position = "top") +
scale_y_reverse(breaks=seq(0,1,0.25), labels=rev(seq(0,1,0.25))) +
coord_flip() +
theme(axis.text.x = element_text(angle = angle, hjust = 1, vjust = 0.5),
axis.text.y = element_text(angle = angle, hjust = 1),
text = element_text(size=16, family="Calibri")) +
ylab("Spearmans rho squared") +
xlab("") +
theme_light()

If you want to avoid re-inventing the wheel and creating those dendrograms from scratch (i.e., if you wanna make use of high level ggdendrogram), then you won't get around changing the underlying function. ggdendro::ggdendrogram defines both y and x axis. You need to modify them in the function body. See comments in the code below.
library(tidyverse)
library(faux)
library(ggdendro)
set.seed(5)
dat <- rnorm_multi(
n = 100,
mu = c(0, 20, 20),
sd = c(1, 5, 5),
r = c(0.5, 0.5, 0.25),
varnames = c("A", "B", "C"),
empirical = FALSE
)
cor_matrix_before <- cor(dat, method = "spearman")
tree <- hclust(as.dist(1 - cor_matrix_before**2))
## re-define ggdendrogram. I think the easiest is add another argument for the axis position, see "x_lab"
ggdendrogram2 <- function(data, segments = TRUE, labels = TRUE, leaf_labels = TRUE,
rotate = FALSE, theme_dendro = TRUE, x_lab = "bottom", ...) {
dataClass <- if (inherits(data, "dendro")) {
data$class
} else {
class(data)
}
angle <- if (dataClass %in% c("dendrogram", "hclust")) {
ifelse(rotate, 0, 90)
} else {
ifelse(rotate, 90, 0)
}
hjust <- if (dataClass %in% c("dendrogram", "hclust")) {
ifelse(rotate, 1, 1)
} else {
0.5
}
if (!ggdendro::is.dendro(data)) {
data <- ggdendro::dendro_data(data)
}
p <- ggplot() +
geom_blank()
if (segments && !is.null(data$segments)) {
p <- p + geom_segment(data = segment(data), aes_string(
x = "x",
y = "y", xend = "xend", yend = "yend"
))
}
if (leaf_labels && !is.null(data$leaf_labels)) {
p <- p + geom_text(
data = leaf_label(data), aes_string(
x = "x",
y = "y", label = "label"
), hjust = hjust, angle = angle,
...
)
}
if (labels) {
p <- p + scale_x_continuous(
breaks = seq_along(data$labels$label),
labels = data$labels$label,
# and this is where you add x_lab
position = x_lab
)
}
if (rotate) {
p <- p + coord_flip()
p <- p + scale_y_continuous()
} else {
p <- p + scale_y_continuous()
}
if (theme_dendro) {
p <- p + theme_dendro()
}
p <- p + theme(axis.text.x = element_text(
angle = angle,
hjust = 1, vjust = 0.5
)) + theme(axis.text.y = element_text(
angle = angle,
hjust = 1
))
p
}
ggdendrogram2(tree, x_lab = "top", rotate = TRUE)
Created on 2021-07-28 by the reprex package (v2.0.0)

Related

Increase space for long axis labels in radar chart

I want to create a radar chart with ggirahExtra::ggRadar. The problem is that I have long labels and they are clipped. I thought I could create more space between label and plot by adding margin = margin(0,0,2,0, "cm") to element_text in axis.text, but its not working.
Any ideas how to increase the label space are welcome (apart from making the font smaller).
Add: As #tjebo suggests in the comments, it might be easier, or maybe the only way to make it work, to change the underlying functions in ggRadar especially coord_radar. Any suggestions of how to do this are welcome.
library(ggplot2)
library(ggiraphExtra)
dat <- data.frame("Item_A_Long" = 2,
"Item_B_Very_Very_Long"= 0,
"Label_Item_C" = 1,
"Item_D_Label" = 4,
"Another_very_long_label" = 3)
ggRadar(dat,
aes(
x = c(Item_A_Long,
Item_B_Very_Very_Long,
Label_Item_C,
Item_D_Label,
Another_very_long_label)
),
legend.position = "top",
colour = "white",
rescale = FALSE,
use.label = FALSE
) +
scale_y_continuous(expand = c(0,0),
limits = c(0,4)
) +
theme(panel.background = element_rect(fill = "#001957"),
# adding margin = margin(0,0,2,0, "cm") to element_text below does not help
axis.text = element_text(color = "#FFFFFF"),
panel.grid.major.y = element_blank())
Created on 2021-04-30 by the reprex package (v0.3.0)
It's a matter of clipping. The problem is also the white standard background of your drawing device. Below a hacky workaround.
turn off clipping with a modified version of ggiraphExtra::coord_radar as well as ggiraphExtra::ggRadar. Note I have removed a (very) few bits from the original ggRadar function, so if you need all arguments, you'd need to modify the function yourself.
Turn all background elements blue
Superimpose all onto a pure blue background, I am using cowplot.
library(cowplot)
library(ggplot2)
p1 <- ggRadar2(dat,
aes(
x = c(
Item_A_Long,
Item_B_Very_Very_Long,
Label_Item_C,
Item_D_Label,
Another_very_long_label
)
),
colour = "white",
rescale = FALSE,
clip = "off"
) +
theme(
plot.background = element_rect(fill = "#001957", color = "#001957"),
panel.background = element_rect(fill = "#001957"),
# adding margin = margin(0,0,2,0, "cm") to element_text below does not help
axis.text = element_text(color = "#FFFFFF"),
panel.grid.major.y = element_blank()
)
p2 <-
ggplot() +
theme_void()+
theme(panel.background = element_rect(fill = "#001957"))
ggdraw(p2) + draw_plot(p1)
the modified functions
coord_radar2 <- function(theta = "x", start = 0, direction = 1, clip = "off") {
theta <- match.arg(theta, c("x", "y"))
r <- if (theta == "x") {
"y"
} else {
"x"
}
ggproto("CoordRadar", ggplot2::CoordPolar,
theta = theta,
r = r, start = start, clip = clip,
direction = sign(direction), is_linear = function(coord) TRUE
)
}
ggRadar2 <- function(data, mapping = NULL, rescale = TRUE, legend.position = "top",
colour = "red", alpha = 0.3, size = 3, ylim = NULL, scales = "fixed",
use.label = FALSE, interactive = FALSE, clip = "off", ...) {
data <- as.data.frame(data)
(groupname <- setdiff(names(mapping), c("x", "y")))
groupname
mapping
length(groupname)
if (length(groupname) == 0) {
groupvar <- NULL
}
else {
groupvar <- ggiraphExtra:::getMapping(mapping, groupname)
}
groupvar
facetname <- colorname <- NULL
if ("facet" %in% names(mapping)) {
facetname <- ggiraphExtra:::getMapping(mapping, "facet")
}
(colorname <- setdiff(groupvar, facetname))
if ((length(colorname) == 0) & !is.null(facetname)) {
colorname <- facetname
}
data <- ggiraphExtra:::num2factorDf(data, groupvar)
(select <- sapply(data, is.numeric))
if ("x" %in% names(mapping)) {
xvars <- ggiraphExtra:::getMapping(mapping, "x")
xvars
if (length(xvars) < 3) {
warning("At least three variables are required")
}
}
else {
xvars <- colnames(data)[select]
}
(xvars <- setdiff(xvars, groupvar))
if (rescale) {
data <- ggiraphExtra:::rescale_df(data, groupvar)
}
temp <- sjlabelled::get_label(data)
cols <- ifelse(temp == "", colnames(data), temp)
if (is.null(groupvar)) {
id <- ggiraphExtra:::newColName(data)
data[[id]] <- 1
longdf <- reshape2::melt(data, id.vars = id, measure.vars = xvars)
}
else {
cols <- setdiff(cols, groupvar)
longdf <- reshape2::melt(data, id.vars = groupvar, measure.vars = xvars)
}
temp <- paste0("plyr::ddply(longdf,c(groupvar,'variable'), dplyr::summarize,mean=mean(value,na.rm=TRUE))")
df <- eval(parse(text = temp))
colnames(df)[length(df)] <- "value"
df
groupvar
if (is.null(groupvar)) {
id2 <- ggiraphExtra:::newColName(df)
df[[id2]] <- "all"
id3 <- ggiraphExtra:::newColName(df)
df[[id3]] <- 1:nrow(df)
df$tooltip <- paste0(df$variable, "=", round(
df$value,
1
))
df$tooltip2 <- paste0("all")
p <- ggplot(data = df, aes_string(
x = "variable", y = "value",
group = 1
)) +
ggiraph::geom_polygon_interactive(aes_string(tooltip = "tooltip2"),
colour = colour, fill = colour, alpha = alpha
) +
ggiraph::geom_point_interactive(aes_string(
data_id = id3,
tooltip = "tooltip"
), colour = colour, size = size)
}
else {
if (!is.null(colorname)) {
id2 <- ggiraphExtra:::newColName(df)
df[[id2]] <- df[[colorname]]
}
id3 <- ggiraphExtra:::newColName(df)
df[[id3]] <- 1:nrow(df)
df$tooltip <- paste0(
groupvar, "=", df[[colorname]], "<br>",
df$variable, "=", round(df$value, 1)
)
df$tooltip2 <- paste0(groupvar, "=", df[[colorname]])
p <- ggplot(data = df, aes_string(
x = "variable", y = "value",
colour = colorname, fill = colorname, group = colorname
)) +
ggiraph::geom_polygon_interactive(aes_string(tooltip = "tooltip2"),
alpha = alpha
) +
ggiraph::geom_point_interactive(aes_string(
data_id = id3,
tooltip = "tooltip"
), size = size)
}
p
if (!is.null(facetname)) {
formula1 <- as.formula(paste0("~", facetname))
p <- p + facet_wrap(formula1, scales = scales)
}
p <- p + xlab("") + ylab("") + theme(legend.position = legend.position)
p <- p + coord_radar2(clip = clip)
if (!is.null(ylim)) {
p <- p + expand_limits(y = ylim)
}
p
p
}
You can use the labelled package to create labels with line breaks and then set label = TRUE in ggRadar(). You can add more than one break for super long labels.
library(ggplot2)
library(ggiraphExtra)
library(labelled)
dat <- data.frame("Item_A_Long" = 2,
"Item_B_Very_Very_Long"= 0,
"Label_Item_C" = 1,
"Item_D_Label" = 4,
"Another_very_long_label" = 3)
var_label(dat$Item_A_Long ) <- "Item \nA long"
var_label(dat$Item_B_Very_Very_Long ) <- "Item_B_\nVery_\nVery_Long"
var_label(dat$Label_Item_C ) <- "Label_\nItem_C "
var_label(dat$Item_D_Label ) <- "Item_\nD_Label"
var_label(dat$Another_very_long_label ) <- "Another_very_\nlong_label"
ggRadar(dat,
aes(
x = c(Item_A_Long,
Item_B_Very_Very_Long,
Label_Item_C,
Item_D_Label,
Another_very_long_label)
),
legend.position = "top",
colour = "white",
rescale = FALSE,
use.label = TRUE
) +
scale_y_continuous(expand = c(0,0),
limits = c(0,4)
) +
theme(panel.background = element_rect(fill = "#001957"),
# adding margin = margin(0,0,2,0, "cm") to element_text below does not help
axis.text = element_text(color = "#FFFFFF"),
panel.grid.major.y = element_blank())

Automatically writing scatterplots in ggplot2 to a folder

I have a large number of variables and would like to create scatterplots comparing all variables to a single variable. I have been able to do this in base R using lapply, but I cannot complete the same task in ggplot2 using lapply.
Below is an example dataset.
df <- data.frame("ID" = 1:16)
df$A <- c(1,2,3,4,5,6,7,8,9,10,11,12,12,14,15,16)
df$B <- c(5,6,7,8,9,10,13,15,14,15,16,17,18,18,19,20)
df$C <- c(11,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
I define the variables I would like to generate scatterplots with, using the code below:
df_col_names <- df %>% select(A:C) %>% colnames(.)
Below is how I have been able to successfully complete the task of plotting all variables against variable A, using lapply in base R:
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
plot(df$A, df[[x]],
pch=19,
cex = 1.5,
ylab = x,
ylim = c(0, 20),
xlim = c(0, 20))
dev.off()
})
Below is my attempt at completing the task in ggplot2 without any success. It generates the tiff images, although they are empty.
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
ggplot(df) +
geom_point(data = df,
aes(x = A, y = df_col_names[[x]], size = 3)) +
geom_smooth(aes(x = A, y = df_col_names[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14)
dev.off()
})
It works for me with ggsave. Also note that you are passing string column names to ggplot so use .data to refer to actual column values.
library(ggplot2)
lapply(df_col_names, function(x) {
ggplot(df) +
geom_point( aes(x = A, y = .data[[x]], size = 3)) +
geom_smooth(aes(x = A, y = .data[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14) -> plt
ggsave(sprintf("%s.tiff", x), plt)
})

How to plot 'outside' of plotting area using ggplot in R?

I recently asked this question. However, I am asking a separate question now as the scope of my new question falls outside the range of the last question.
I am trying to create a heatmap in ggplot... however, outside of the axis I am trying to plot geom_tile. The issue is I cannot find a consistent way to get it to work. For example, the code I am using to plot is:
library(colorspace)
library(ggplot2)
library(ggnewscale)
library(tidyverse)
asd <- expand_grid(paste0("a", 1:9), paste0("b", 1:9))
df <- data.frame(
a = asd$`paste0("a", 1:9)`,
b = asd$`paste0("b", 1:9)`,
c = sample(20, 81, replace = T)
)
# From discrete to continuous
df$a <- match(df$a, sort(unique(df$a)))
df$b <- match(df$b, sort(unique(df$b)))
z <- sample(10, 18, T)
# set color palettes
pal <- rev(diverging_hcl(palette = "Blue-Red", n = 11))
palEdge <- rev(sequential_hcl(palette = "Plasma", n = 11))
# plot
ggplot(df, aes(a, b)) +
geom_tile(aes(fill = c)) +
scale_fill_gradientn(
colors = pal,
guide = guide_colorbar(
frame.colour = "black",
ticks.colour = "black"
),
name = "C"
) +
theme_classic() +
labs(x = "A axis", y = "B axis") +
new_scale_fill() +
geom_tile(data = tibble(a = 1:9,
z = z[1:9]),
aes(x = a, y = 0, fill = z, height = 0.3)) +
geom_tile(data = tibble(b = 1:9,
z = z[10:18]),
aes(x = 0, y = b, fill = z, width = 0.3)) +
scale_fill_gradientn(
colors = palEdge,
guide = guide_colorbar(
frame.colour = "black",
ticks.colour = "black"
),
name = "Z"
)+
coord_cartesian(clip = "off", xlim = c(0.5, NA), ylim = c(0.5, NA)) +
theme(aspect.ratio = 1,
plot.margin = margin(10, 15.5, 25, 25, "pt")
)
This produces something like this:
However, I am trying to find a consistent way to plot something more like this (which I quickly made in photoshop):
The main issue im having is being able to manipulate the coordinates of the new scale 'outside' of the plotting area. Is there a way to move the tiles that are outside so I can position them in an area that makes sense?
There are always the two classic options when plotting outside the plot area:
annotate/ plot with coord_...(clip = "off")
make different plots and combine them.
The latter option usually gives much more flexibility and way less headaches, in my humble opinion.
library(colorspace)
library(tidyverse)
library(patchwork)
asd <- expand_grid(paste0("a", 1:9), paste0("b", 1:9))
df <- data.frame(
a = asd$`paste0("a", 1:9)`,
b = asd$`paste0("b", 1:9)`,
c = sample(20, 81, replace = T)
)
# From discrete to continuous
df$a <- match(df$a, sort(unique(df$a)))
df$b <- match(df$b, sort(unique(df$b)))
z <- sample(10, 18, T)
# set color palettes
pal <- rev(diverging_hcl(palette = "Blue-Red", n = 11))
palEdge <- rev(sequential_hcl(palette = "Plasma", n = 11))
# plot
p_main <- ggplot(df, aes(a, b)) +
geom_tile(aes(fill = c)) +
scale_fill_gradientn("C",colors = pal,
guide = guide_colorbar(frame.colour = "black",
ticks.colour = "black")) +
theme_classic() +
labs(x = "A axis", y = "B axis")
p_bottom <- ggplot() +
geom_tile(data = tibble(a = 1:9, z = z[1:9]),
aes(x = a, y = 0, fill = z, height = 0.3)) +
theme_void() +
scale_fill_gradientn("Z",limits = c(0,10),
colors = palEdge,
guide = guide_colorbar(
frame.colour = "black", ticks.colour = "black"))
p_left <- ggplot() +
theme_void()+
geom_tile(data = tibble(b = 1:9, z = z[10:18]),
aes(x = 0, y = b, fill = z, width = 0.3)) +
scale_fill_gradientn("Z",limits = c(0,10),
colors = palEdge,
guide = guide_colorbar( frame.colour = "black", ticks.colour = "black"))
p_left + p_main +plot_spacer()+ p_bottom +
plot_layout(guides = "collect",
heights = c(1, .1),
widths = c(.1, 1))
Created on 2021-02-21 by the reprex package (v1.0.0)

Produce an inset in each facet of an R ggplot while preserving colours of the original facet content

I would like to produce a graphic combining four facets of a graph with insets in each facet showing a detail of the respective plot. This is one of the things I tried:
#create data frame
n_replicates <- c(rep(1:10,15),rep(seq(10,100,10),15),rep(seq(100,1000,100),15),rep(seq(1000,10000,1000),15))
sim_years <- rep(sort(rep((1:15),10)),4)
sd_data <- rep (NA,600)
for (i in 1:600) {
sd_data[i]<-rnorm(1,mean=exp(0.1 * sim_years[i]), sd= 1/n_replicates[i])
}
max_rep <- sort(rep(c(10,100,1000,10000),150))
data_frame <- cbind.data.frame(n_replicates,sim_years,sd_data,max_rep)
#do first basic plot
library(ggplot2)
plot1<-ggplot(data=data_frame, aes(x=sim_years,y=sd_data,group =n_replicates, col=n_replicates)) +
geom_line() + theme_bw() +
labs(title ="", x = "year", y = "sd")
plot1
#make four facets
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(
`10` = "2, 3, ..., 10 replicates",
`100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates",
`10000` = "1000, 2000, ..., 10000 replicates"
)
plot2 <- plot1 +
facet_wrap( ~ max_rep, ncol=2, labeller = as_labeller(facet_names)) +
scale_colour_gradientn(name = "number of replicates", trans = "log",
breaks = my_breaks, labels = my_breaks, colours = rainbow(20))
plot2
#extract inlays (this is where it goes wrong I think)
library(ggpmisc)
library(tibble)
library(dplyr)
inset <- tibble(x = 0.01, y = 10.01,
plot = list(plot2 +
facet_wrap( ~ max_rep, ncol=2, labeller = as_labeller(facet_names)) +
coord_cartesian(xlim = c(13, 15),
ylim = c(3, 5)) +
labs(x = NULL, y = NULL, color = NULL) +
scale_colour_gradient(guide = FALSE) +
theme_bw(10)))
plot3 <- plot2 +
expand_limits(x = 0, y = 0) +
geom_plot_npc(data = inset, aes(npcx = x, npcy = y, label = plot)) +
annotate(geom = "rect",
xmin = 13, xmax = 15, ymin = 3, ymax = 5,
linetype = "dotted", fill = NA, colour = "black")
plot3
That leads to the following graphic:
As you can see, the colours in the insets are wrong, and all four of them appear in each of the facets even though I only want the corresponding inset of course. I read through a lot of questions here (to even get me this far) and also some examples in the ggpmisc user guide but unfortunately I am still a bit lost on how to achieve what I want. Except maybe to do it by hand extracting four insets and then combining them with plot2. But I hope there will be a better way to do this. Thank you for your help!
Edit: better graphic now thanks to this answer, but problem remains partially unsolved:
The following code does good insets, but unfortunately the colours are not preserved. As in the above version each inset does its own rainbow colours anew instead of inheriting the partial rainbow scale from the facet it belongs to. Does anyone know why and how I could change this? In comments I put another (bad) attempt at solving this, it preserves the colors but has the problem of putting all four insets in each facet.
library(ggpmisc)
library(tibble)
library(dplyr)
# #extract inlays: good colours, but produces four insets.
# fourinsets <- tibble(#x = 0.01, y = 10.01,
# x = c(rep(0.01, 4)),
# y = c(rep(10.01, 4)),
# plot = list(plot2 +
# facet_wrap( ~ max_rep, ncol=2) +
# coord_cartesian(xlim = c(13, 15),
# ylim = c(3, 5)) +
# labs(x = NULL, y = NULL, color = NULL) +
# scale_colour_gradientn(name = "number of replicates", trans = "log", guide = FALSE,
# colours = rainbow(20)) +
# theme(
# strip.background = element_blank(),
# strip.text.x = element_blank()
# )
# ))
# fourinsets$plot
library(purrr)
pp <- map(unique(data_frame$max_rep), function(x) {
plot2$data <- plot2$data %>% filter(max_rep == x)
plot2 +
coord_cartesian(xlim = c(12, 14),
ylim = c(3, 4)) +
labs(x = NULL, y = NULL) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
axis.text=element_blank(),
axis.ticks=element_blank()
)
})
#pp[[2]]
inset_new <- tibble(x = c(rep(0.01, 4)),
y = c(rep(10.01, 4)),
plot = pp,
max_rep = unique(data_frame$max_rep))
final_plot <- plot2 +
geom_plot_npc(data = inset_new, aes(npcx = x, npcy = y, label = plot, vp.width = 0.3, vp.height =0.6)) +
annotate(geom = "rect",
xmin = 12, xmax = 14, ymin = 3, ymax = 4,
linetype = "dotted", fill = NA, colour = "black")
#final_plot
final_plot then looks like this:
I hope this clarifies the problem a bit. Any ideas are very welcome :)
Modifying off #user63230's excellent answer:
pp <- map(unique(data_frame$max_rep), function(x) {
plot2 +
aes(alpha = ifelse(max_rep == x, 1, 0)) +
coord_cartesian(xlim = c(12, 14),
ylim = c(3, 4)) +
labs(x = NULL, y = NULL) +
scale_alpha_identity() +
facet_null() +
theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
axis.text=element_blank(),
axis.ticks=element_blank()
)
})
Explanation:
Instead of filtering the data passed into plot2 (which affects the mapping of colours), we impose a new aesthetic alpha, where lines belonging to the other replicate numbers are assigned 0 for transparency;
Use scale_alpha_identity() to tell ggplot that the alpha mapping is to be used as-is: i.e. 1 for 100%, 0 for 0%.
Add facet_null() to override plot2's existing facet_wrap, which removes the facet for the inset.
Everything else is unchanged from the code in the question.
I think this will get you started although its tricky to get the size of the inset plot right (when you include a legend).
#set up data
library(ggpmisc)
library(tibble)
library(dplyr)
library(ggplot2)
# create data frame
n_replicates <- c(rep(1:10, 15), rep(seq(10, 100, 10), 15), rep(seq(100,
1000, 100), 15), rep(seq(1000, 10000, 1000), 15))
sim_years <- rep(sort(rep((1:15), 10)), 4)
sd_data <- rep(NA, 600)
for (i in 1:600) {
sd_data[i] <- rnorm(1, mean = exp(0.1 * sim_years[i]), sd = 1/n_replicates[i])
}
max_rep <- sort(rep(c(10, 100, 1000, 10000), 150))
data_frame <- cbind.data.frame(n_replicates, sim_years, sd_data, max_rep)
# make four facets
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(`10` = "2, 3, ..., 10 replicates", `100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates", `10000` = "1000, 2000, ..., 10000 replicates")
Get overall plot:
# overall facet plot
overall_plot <- ggplot(data = data_frame, aes(x = sim_years, y = sd_data, group = n_replicates, col = n_replicates)) +
geom_line() +
theme_bw() +
labs(title = "", x = "year", y = "sd") +
facet_wrap(~max_rep, ncol = 2, labeller = as_labeller(facet_names)) +
scale_colour_gradientn(name = "number of replicates", trans = "log", breaks = my_breaks, labels = my_breaks, colours = rainbow(20))
#plot
overall_plot
which gives:
Then from the overall plot you want to extract each plot, see here. We can map over the list to extract one at a time:
pp <- map(unique(data_frame$max_rep), function(x) {
overall_plot$data <- overall_plot$data %>% filter(max_rep == x)
overall_plot + # coord_cartesian(xlim = c(13, 15), ylim = c(3, 5)) +
labs(x = NULL, y = NULL) +
theme_bw(10) +
theme(legend.position = "none")
})
If we look at one of these (I've removed the legend) e.g.
pp[[1]]
#pp[[2]]
#pp[[3]]
#pp[[4]]
Gives:
Then we want to add these inset plots into a dataframe so that each plot has its own row:
inset <- tibble(x = c(rep(0.01, 4)),
y = c(rep(10.01, 4)),
plot = pp,
max_rep = unique(data_frame$max_rep))
Then merge this into the overall plot:
overall_plot +
expand_limits(x = 0, y = 0) +
geom_plot_npc(data = inset, aes(npcx = x, npcy = y, label = plot, vp.width = 0.8, vp.height = 0.8))
Gives:
Here is a solution based on Z. Lin's answer, but using ggforce::facet_wrap_paginate() to do the filtering and keeping colourscales consistent.
First, we can make the 'root' plot containing all the data with no facetting.
library(ggpmisc)
library(tibble)
library(dplyr)
n_replicates <- c(rep(1:10,15),rep(seq(10,100,10),15),rep(seq(100,1000,100),15),rep(seq(1000,10000,1000),15))
sim_years <- rep(sort(rep((1:15),10)),4)
sd_data <- rep (NA,600)
for (i in 1:600) {
sd_data[i]<-rnorm(1,mean=exp(0.1 * sim_years[i]), sd= 1/n_replicates[i])
}
max_rep <- sort(rep(c(10,100,1000,10000),150))
data_frame <- cbind.data.frame(n_replicates,sim_years,sd_data,max_rep)
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(
`10` = "2, 3, ..., 10 replicates",
`100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates",
`10000` = "1000, 2000, ..., 10000 replicates"
)
base <- ggplot(data=data_frame,
aes(x=sim_years,y=sd_data,group =n_replicates, col=n_replicates)) +
geom_line() +
theme_bw() +
scale_colour_gradientn(
name = "number of replicates",
trans = "log10", breaks = my_breaks,
labels = my_breaks, colours = rainbow(20)
) +
labs(title ="", x = "year", y = "sd")
Next, the main plot will be just the root plot with facet_wrap().
main <- base + facet_wrap(~ max_rep, ncol = 2, labeller = as_labeller(facet_names))
Then the new part is to use facet_wrap_paginate with nrow = 1 and ncol = 1 for every max_rep, which we'll use as insets. The nice thing is that this does the filtering and it keeps colour scales consistent with the root plot.
nmax_rep <- length(unique(data_frame$max_rep))
insets <- lapply(seq_len(nmax_rep), function(i) {
base + ggforce::facet_wrap_paginate(~ max_rep, nrow = 1, ncol = 1, page = i) +
coord_cartesian(xlim = c(12, 14), ylim = c(3, 4)) +
guides(colour = "none", x = "none", y = "none") +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title = element_blank(),
plot.background = element_blank())
})
insets <- tibble(x = rep(0.01, nmax_rep),
y = rep(10.01, nmax_rep),
plot = insets,
max_rep = unique(data_frame$max_rep))
main +
geom_plot_npc(data = insets,
aes(npcx = x, npcy = y, label = plot,
vp.width = 0.3, vp.height = 0.6)) +
annotate(geom = "rect",
xmin = 12, xmax = 14, ymin = 3, ymax = 4,
linetype = "dotted", fill = NA, colour = "black")
Created on 2020-12-15 by the reprex package (v0.3.0)

Incorrect box widths of ggplot boxplot with continuous x axis?

I am plotting the same data once as geom_point() and once as geom_boxplot(), but the width of my boxplots seems to be off. The largest x is at 292, but the corresponding box is smaller than 285. How can i get this to the correct size?
Here is a minimal example of my data:
x = data.frame(cluster = c("c1","c2","c3","c4","c5","c6"),
elements = c(292,277,170,160,153,141),
divergence = c(0.08344564,0.12130600,0.05564219,0.12826086,0.05386341,0.09620389))
x.160 = x[x$elements >= 160,]
x.160$Size = "160+"
x.60 = x[x$elements >= 60 & x$elements < 160,]
x.60$Size = "60-160"
binnedX = rbind(x.60,x.160)
p1a = ggplot(data = binnedX, aes(x = elements, y = divergence, group = Size))+
geom_hline(yintercept = mean(binnedX$divergence), color = "black", linetype=2)+
geom_point(aes(color = Size))+
scale_x_continuous(breaks = c(seq(0,300,15))) +
scale_y_continuous(breaks = seq(0.00,0.25,0.05))+
scale_color_brewer(palette = "Spectral") +
ggtitle("element sequence divergence by cluster [clustalO, 100bp]") +
labs(x="Elements per cluster", y="Divergence")+
theme_linedraw(base_size = 18)+
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(margin = margin(15,15,15,15,"pt")),
axis.title.y = element_text(margin = margin(15,15,15,15,"pt")))
p2a = ggplot(data = binnedX, aes(x = elements, y = divergence, group = Size))+
geom_hline(yintercept = mean(binnedX$divergence), color = "Red", linetype=2)+
geom_boxplot(aes(fill = Size)) +
scale_x_continuous(breaks = c(seq(0,300,15)))+
scale_y_continuous(breaks = seq(0.00,0.25,0.05))+
scale_fill_brewer(palette = "Spectral") +
labs(x="Elements per cluster", y="Divergence")+
theme_linedraw(base_size = 18)+
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(margin = margin(15,15,15,15,"pt")),
axis.title.y = element_text(margin = margin(15,15,15,15,"pt")),
axis.text.x = element_text(angle=0))
multiplot(p1a,p2a)
When we create box plots by group, the width of each box is hard coded as 90% of the group's data range. We can see this from the compute_group() function in StatBoxplot:
# reproducing lines 87-88 of stat-boxplot.r
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 0.9
We can override this by defining a modified compute_group() function based on StatBoxplot$compute_group:
modified.function <- function(data, scales, width = NULL, na.rm = FALSE, coef = 1.5) {
qs <- c(0, 0.25, 0.5, 0.75, 1)
if (!is.null(data$weight)) {
mod <- quantreg::rq(y ~ 1, weights = weight, data = data, tau = qs)
stats <- as.numeric(stats::coef(mod))
}
else {
stats <- as.numeric(stats::quantile(data$y, qs))
}
names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
iqr <- diff(stats[c(2, 4)])
outliers <- data$y < (stats[2] - coef * iqr) | data$y > (stats[4] + coef * iqr)
if (any(outliers)) {
stats[c(1, 5)] <- range(c(stats[2:4], data$y[!outliers]), na.rm = TRUE)
}
if (length(unique(data$x)) > 1)
width <- diff(range(data$x)) * 1 # instead of 0.9
df <- as.data.frame(as.list(stats))
df$outliers <- list(data$y[outliers])
if (is.null(data$weight)) {
n <- sum(!is.na(data$y))
}
else {
n <- sum(data$weight[!is.na(data$y) & !is.na(data$weight)])
}
df$notchupper <- df$middle + 1.58 * iqr/sqrt(n)
df$notchlower <- df$middle - 1.58 * iqr/sqrt(n)
df$x <- if (is.factor(data$x))
data$x[1]
else mean(range(data$x))
df$width <- width
df$relvarwidth <- sqrt(n)
df}
Create a modified stat class based off StatBoxplot, as well as the corresponding layer function for it based off stat_boxplot:
StatBoxplot2 <- ggproto(`_class` = "StatBoxplot2", # class name
`_inherit` = StatBoxplot, # inherit from StatBoxplot
compute_group = modified.function) # override its compute_group function
stat_boxplot2 <- function(mapping = NULL, data = NULL, geom = "boxplot", position = "dodge2", ...,
coef = 1.5, na.rm = FALSE, show.legend = NA, inherit.aes = TRUE){
layer(data = data, mapping = mapping,
stat = StatBoxplot2, # use StatBoxplot2 rather than StatBoxplot
geom = geom, position = position,
show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, coef = coef, ...))
}
Compare a boxplot that uses the default stat = "boxplot", with one that uses our modified stat = "boxplot2":
# Base plot with vertical dashed lines to indicate each point's position, & theme_classic for a
# less cluttered background. I also changed the palette as Spectral's yellow was really hard to see.
p <- ggplot(data = binnedX,
aes(x = elements, y = divergence, fill = Size))+
geom_point(aes(color = Size), size = 3) +
geom_vline(aes(xintercept = elements), linetype = "dashed") +
scale_fill_brewer(palette = "Set1") +
scale_color_brewer(palette = "Set1", guide = FALSE) +
theme_classic()
gridExtra::grid.arrange(p + ggtitle("original") + geom_boxplot(alpha = 0.5),
p + ggtitle("modified") + geom_boxplot(alpha = 0.5, stat = "boxplot2"))

Resources