Making a specific quantile plot in R - r

I am very intrigued by the following visulization (Decile term)
And I wonder how it would be possible to do it in R.
There is of course histograms and density plots, but they do not make such a nice visualization. Especially, I would like to know if it possible to do it with ggplot/tidyverse.
edit in response to the comment
library(dplyr)
library(ggplot2)
someData <- data_frame(x = rnorm(1000))
ggplot(someData, aes(x = x)) +
geom_histogram()
this produces a histogram (see http://www.r-fiddle.org/#/fiddle?id=LQXazwMY&version=1)
But how I can get the coloful bars? How to implement the small rectangles? (The arrows are less relevant).

You have to define a number of breaks, and use approximate deciles that match those histogram breaks. Otherwise, two deciles will end up in one bar.
d <- data_frame(x = rnorm(1000))
breaks <- seq(min(d$x), max(d$x), length.out = 50)
quantiles <- quantile(d$x, seq(0, 1, 0.1))
quantiles2 <- sapply(quantiles, function(x) breaks[which.min(abs(x - breaks))])
d$bar <- as.numeric(as.character(cut(d$x, breaks, na.omit((breaks + dplyr::lag(breaks)) / 2))))
d$fill <- cut(d$x, quantiles2, na.omit((quantiles2 + dplyr::lag(quantiles2)) / 2))
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1])
Or with more distinct colors:
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1]) +
scale_fill_brewer(type = 'qual', palette = 3) # The only qual pallete with enough colors
Add some styling and increase the breaks to 100:
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1], size = 0.3) +
scale_fill_brewer(type = 'qual', palette = 3) +
theme_classic() +
coord_fixed(diff(breaks)[1], expand = FALSE) + # makes square blocks
labs(x = 'x', y = 'count')
And here is a function to make that last one:
decile_histogram <- function(data, var, n_breaks = 100) {
breaks <- seq(min(data[[var]]), max(data[[var]]), length.out = n_breaks)
quantiles <- quantile(data[[var]], seq(0, 1, 0.1))
quantiles2 <- sapply(quantiles, function(x) breaks[which.min(abs(x - breaks))])
data$bar <- as.numeric(as.character(
cut(data[[var]], breaks, na.omit((breaks + dplyr::lag(breaks)) / 2)))
)
data$fill <- cut(data[[var]], quantiles2, na.omit((quantiles2 + dplyr::lag(quantiles2)) / 2))
ggplot2::ggplot(data, ggplot2::aes(bar, y = 1, fill = fill)) +
ggplot2::geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1], size = 0.3) +
ggplot2::scale_fill_brewer(type = 'qual', palette = 3) +
ggplot2::theme_classic() +
ggplot2::coord_fixed(diff(breaks)[1], expand = FALSE) +
ggplot2::labs(x = 'x', y = 'count')
}
Use as:
d <- data.frame(x = rnorm(1000))
decile_histogram(d, 'x')

Related

Controlling ggplot2 legend order with new_scale_color() of ggnewscale

I can't seem to arrange the order of legend items when using the new_scale_color() feature of the ggnewscale package. Here's a minimal example:
n <- 10
sd_res <- 1
beta0 <- 0
beta1 <- 1
x <- runif(n, 0, 10)
y <- beta0 + beta1*x + rnorm(n, 0, sd_res)
dat <- data.frame(x,y)
ggplot() + theme_minimal() + labs(x = '', y = '') +
geom_abline(aes(color = 'x', linetype = 'x', slope = 1, intercept = 0)) +
geom_abline(aes(color = 'y', linetype = 'y', slope = 2, intercept = 0)) +
scale_color_manual(name = '', values = c('x' = 'orange', 'y' = 'black')) +
scale_linetype_manual(name = '', values = c('x' = 1, 'y' = 2)) +
new_scale_color() +
geom_point(data=dat, aes(x=x, y=y, shape = 'z', size = 'z', color = 'z'), alpha = 1) +
scale_size_manual(name = '', values = c('z' = 1.5)) +
scale_shape_manual(name = '', values = c('z' = 16)) +
scale_color_manual(name = '', values = c('z' = 'black')) +
theme(legend.position = 'bottom')
What I'd like is for the z-item to appear before the x- and y-items in the legend.
You could use guides with guide_legend and specify the order of your aes. Here I set the order of your "z" to 1 which will set it as the first legend like this:
n <- 10
sd_res <- 1
beta0 <- 0
beta1 <- 1
x <- runif(n, 0, 10)
y <- beta0 + beta1*x + rnorm(n, 0, sd_res)
dat <- data.frame(x,y)
library(ggplot2)
library(ggnewscale)
p <- ggplot() + theme_minimal() + labs(x = '', y = '') +
geom_abline(aes(color = 'x', linetype = 'x', slope = 1, intercept = 0)) +
geom_abline(aes(color = 'y', linetype = 'y', slope = 2, intercept = 0)) +
scale_color_manual(name = '', values = c('x' = 'orange', 'y' = 'black')) +
scale_linetype_manual(name = '', values = c('x' = 1, 'y' = 2)) +
new_scale_color() +
geom_point(data=dat, aes(x=x, y=y, shape = 'z', size = 'z', color = 'z'), alpha = 1) +
scale_size_manual(name = '', values = c('z' = 1.5)) +
scale_shape_manual(name = '', values = c('z' = 16)) +
scale_color_manual(name = '', values = c('z' = 'black')) +
theme(legend.position = 'bottom') +
guides(size = guide_legend(order = 1),
color = guide_legend(order = 1),
shape = guide_legend(order = 1))
p
Created on 2022-08-26 with reprex v2.0.2
When using ggnewscale, I don't recommend using guides() to define guides, since the name of the aesthetics change internally. It's better to use the guide argument of the scale_ function
Example from ggnewscale documentation:
library(ggplot2)
library(ggnewscale)
# Equivalent to melt(volcano)
topography <- expand.grid(x = 1:nrow(volcano),
y = 1:ncol(volcano))
topography$z <- c(volcano)
# point measurements of something at a few locations
set.seed(42)
measurements <- data.frame(x = runif(30, 1, 80),
y = runif(30, 1, 60),
thing = rnorm(30))
ggplot(mapping = aes(x, y)) +
geom_contour(data = topography, aes(z = z, color = stat(level))) +
# Color scale for topography
scale_color_viridis_c(option = "D",
guide = guide_colorbar(order = 2)) +
# geoms below will use another color scale
new_scale_color() +
geom_point(data = measurements, size = 3, aes(color = thing)) +
# Color scale applied to geoms added after new_scale_color()
scale_color_viridis_c(option = "A",
guide = guide_colorbar(order = 8))
Created on 2022-08-26 by the reprex package (v2.0.1)

How to manually change line size and alpha values for ggplot2 lines (separated by factor)?

I want to create a graph where I can change the line size for each line c(1,2,3) and the alpha values for each line c(0.5,0.6,0.7). I tried to use scale_size_manual but it didn't make any difference. Any ideas on how to proceed?
var <- c("T","T","T","M","M","M","A","A","A")
val <- rnorm(12,4,5)
x <- c(1:12)
df <- data.frame(var,val,x)
ggplot(aes(x= x , y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey","blue","black")) + geom_smooth(aes(x = x, y = val), formula = "y ~ x", method = "loess",se = FALSE, size = 1) + scale_x_continuous(breaks=seq(1, 12, 1), limits=c(1, 12)) + scale_size_manual(values = c(1,2,3))
To set the size and alpha values for your lines you have to map on aesthetics. Otherwise scale_size_manual will have no effect:
library(ggplot2)
ggplot(aes(x = x, y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey", "blue", "black")) +
geom_smooth(aes(x = x, y = val, size = var, alpha = var), formula = "y ~ x", method = "loess", se = FALSE) +
scale_x_continuous(breaks = seq(1, 12, 1), limits = c(1, 12)) +
scale_size_manual(values = c(1, 2, 3)) +
scale_alpha_manual(values = c(.5, .6, .7))

Automatically writing scatterplots in ggplot2 to a folder

I have a large number of variables and would like to create scatterplots comparing all variables to a single variable. I have been able to do this in base R using lapply, but I cannot complete the same task in ggplot2 using lapply.
Below is an example dataset.
df <- data.frame("ID" = 1:16)
df$A <- c(1,2,3,4,5,6,7,8,9,10,11,12,12,14,15,16)
df$B <- c(5,6,7,8,9,10,13,15,14,15,16,17,18,18,19,20)
df$C <- c(11,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
I define the variables I would like to generate scatterplots with, using the code below:
df_col_names <- df %>% select(A:C) %>% colnames(.)
Below is how I have been able to successfully complete the task of plotting all variables against variable A, using lapply in base R:
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
plot(df$A, df[[x]],
pch=19,
cex = 1.5,
ylab = x,
ylim = c(0, 20),
xlim = c(0, 20))
dev.off()
})
Below is my attempt at completing the task in ggplot2 without any success. It generates the tiff images, although they are empty.
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
ggplot(df) +
geom_point(data = df,
aes(x = A, y = df_col_names[[x]], size = 3)) +
geom_smooth(aes(x = A, y = df_col_names[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14)
dev.off()
})
It works for me with ggsave. Also note that you are passing string column names to ggplot so use .data to refer to actual column values.
library(ggplot2)
lapply(df_col_names, function(x) {
ggplot(df) +
geom_point( aes(x = A, y = .data[[x]], size = 3)) +
geom_smooth(aes(x = A, y = .data[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14) -> plt
ggsave(sprintf("%s.tiff", x), plt)
})

Grid as bars in ggplot

A common layout in many sites is to draw the grid as shaded bars:
I'm doing this with this function:
grid_bars <- function(data, y, n = 5, fill = "gray90") {
breaks <- pretty(data[[y]], n)
len <- length(breaks)-1
all_bars <- data.frame(
b.id = rep(1:len, 4),
b.x = c(rep(-Inf, len), rep(Inf, len*2), rep(-Inf, len)),
b.y = c(rep(breaks[-length(breaks)], 2), rep(breaks[-1], 2))
)
bars <- all_bars[all_bars$b.id %in% (1:len)[c(FALSE, TRUE)], ]
grid <- list(
geom_polygon(data = bars, aes(b.x, b.y, group = b.id),
fill = fill, colour = fill),
scale_y_continuous(breaks = breaks),
theme(panel.grid = element_blank())
)
return(grid)
}
#-------------------------------------------------
dat <- data.frame(year = 1875:1972,
level = as.vector(LakeHuron))
ggplot(dat, aes(year, level)) +
grid_bars(dat, "level", 10) +
geom_line(colour = "steelblue", size = 1.2) +
theme_classic()
But it needs to specify data and y again. How to take those directly from the ggplot?
After having a look at the options for extending ggplot2 in Hadley Wickham's book on ggplot2 you probably have to set up your own Geom or Stat layer to achieve the desired result. This way you can access the data and aesthetics specified in ggplot() or even pass different data and aesthetics to your fun. Still a newbie in writing extensions for ggplot2 but a first approach may look like so:
library(ggplot2)
# Make bars dataframe
make_bars_df <- function(y, n) {
breaks <- pretty(y, n)
len <- length(breaks) - 1
all_bars <- data.frame(
group = rep(1:len, 4),
x = c(rep(-Inf, len), rep(Inf, len * 2), rep(-Inf, len)),
y = c(rep(breaks[-length(breaks)], 2), rep(breaks[-1], 2))
)
all_bars[all_bars$group %in% (1:len)[c(FALSE, TRUE)], ]
}
# Setup Geom
geom_grid_bars_y <- function(mapping = NULL, data = NULL, stat = "identity",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, n = 5, ...) {
layer(
geom = GeomGridBarsY, mapping = mapping, data = data, stat = stat,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(n = n, ...)
)
}
GeomGridBarsY <- ggproto("GeomGridBarsY", Geom,
required_aes = c("y"),
default_aes = aes(alpha = NA, colour = NA, fill = "gray90", group = NA,
linetype = "solid", size = 0.5, subgroup = NA),
non_missing_aes = aes("n"),
setup_data = function(data, params) {
transform(data)
},
draw_group = function(data, panel_scales, coord, n = n) {
bars <- make_bars_df(data[["y"]], n)
# setup data for GeomPolygon
## If you want this to work with facets you have to take care of the PANEL
bars$PANEL <- factor(1)
# Drop x, y, group from data
d <- data[ , setdiff(names(data), c("x", "y", "group"))]
d <- d[!duplicated(d), ]
# Merge information in data to bars
bars <- merge(bars, d, by = "PANEL")
# Set color = fill
bars[["colour"]] <- bars[["fill"]]
# Draw
grid::gList(
ggplot2::GeomPolygon$draw_panel(bars, panel_scales, coord)
)
},
draw_key = draw_key_rect
)
grid_bars <- function(n = 5, fill = "gray90") {
list(
geom_grid_bars_y(n = n, fill = fill),
scale_y_continuous(breaks = scales::pretty_breaks(n = n)),
theme(panel.grid = element_blank())
)
}
dat <- data.frame(year = 1875:1972,
level = as.vector(LakeHuron))
ggplot(dat, aes(year, level)) +
grid_bars(n = 10, fill = "gray95") +
geom_line(colour = "steelblue", size = 1.2) +
theme_classic()
Just for reference:
A first and simple approach to get grid bars one could simply adjust the size of the grid lines via theme() like so:
# Simple approach via theme
ggplot(dat, aes(year, level)) +
geom_line(colour = "steelblue", size = 1.2) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme_classic() +
theme(panel.grid.major.y = element_line(size = 8))
Created on 2020-06-14 by the reprex package (v0.3.0)

Change colours of a ggplot object created by a function

I am using the fgsea library for some analyses, in particular I use the plotEnrichment function a lot. This function returns a ggplot object with all the layers, but I'd like to change the curve it shows from bright to something else. This code
library(fgsea)
data(examplePathways)
data(exampleRanks)
p = plotEnrichment(examplePathways[["5991130_Programmed_Cell_Death"]], exampleRanks)
will return this plot
Is there any way to change the colour once its created?
Note: I am pretty sure there are ways to do this fairly easily, but I did not create the plot so I don't know what each layer is called or how they were created.
As per BrianFisher's recommendations, I tried
p + scale_color_brewer(palette="GnBu")
p + scale_color_manual(values=c("blue","red"))
But they did not change anything on the plot, as far as I could tell.
Another way to achieve this is by changing the ggplot object directly by using the following code:
## change the aes parameter in the object
p$layers[[5]]$aes_params$colour <- 'blue'
## then plot p
p
This yields the following graph:
A short walk-through
This technique has proven useful to me on numerous occasions. Hence, some more detail:
p$layers gives us the info we need to dig further: we need to access the geom_line configuration. So, after consulting the info below, we choose to continue with p$layers[[5]]
> p$layers
[[1]]
geom_point: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
[[2]]
mapping: yintercept = ~yintercept
geom_hline: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
[[3]]
mapping: yintercept = ~yintercept
geom_hline: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
[[4]]
mapping: yintercept = ~yintercept
geom_hline: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
[[5]]
geom_line: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
[[6]]
mapping: x = ~x, y = ~-diff/2, xend = ~x, yend = ~diff/2
geom_segment: arrow = NULL, arrow.fill = NULL, lineend = butt, linejoin = round, na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
If we add an $ after p$layers[[5]], we get the possible choices to extend the code (in RStudio) like in the picture below:
We choose aes_params and add a new $. At that moment, the only choice is colour. We are at the endpoint: here we can set colour of the geom_line.
So, now you know where the hacky, mysterious code came from; and here it is for the very last time:
p$layers[[5]]$aes_params$colour <- 'blue'
If you look at plotEnrichment:
plotEnrichment
function (pathway, stats, gseaParam = 1, ticksSize = 0.2)
{
rnk <- rank(-stats)
ord <- order(rnk)
statsAdj <- stats[ord]
statsAdj <- sign(statsAdj) * (abs(statsAdj)^gseaParam)
statsAdj <- statsAdj/max(abs(statsAdj))
pathway <- unname(as.vector(na.omit(match(pathway, names(statsAdj)))))
pathway <- sort(pathway)
gseaRes <- calcGseaStat(statsAdj, selectedStats = pathway,
returnAllExtremes = TRUE)
bottoms <- gseaRes$bottoms
tops <- gseaRes$tops
n <- length(statsAdj)
xs <- as.vector(rbind(pathway - 1, pathway))
ys <- as.vector(rbind(bottoms, tops))
toPlot <- data.frame(x = c(0, xs, n + 1), y = c(0, ys, 0))
diff <- (max(tops) - min(bottoms))/8
x = y = NULL
g <- ggplot(toPlot, aes(x = x, y = y)) + geom_point(color = "green",
size = 0.1) + geom_hline(yintercept = max(tops), colour = "red",
linetype = "dashed") + geom_hline(yintercept = min(bottoms),
colour = "red", linetype = "dashed") + geom_hline(yintercept = 0,
colour = "black") + geom_line(color = "green") + theme_bw() +
geom_segment(data = data.frame(x = pathway), mapping = aes(x = x,
y = -diff/2, xend = x, yend = diff/2), size = ticksSize) +
theme(panel.border = element_blank(), panel.grid.minor = element_blank()) +
labs(x = "rank", y = "enrichment score")
g
}
The color is hardcoded in geom_line(color = "green"), partly because there is no column in the data.frame that specifies the color. so you have two options:
a) Plot over it
p + geom_line(color="steelblue")
b) change the function and save it as another function (e.g plotEnr below)
plotEnr = function (pathway, stats, gseaParam = 1, ticksSize = 0.2)
{
LINECOL = "red"
rnk <- rank(-stats)
ord <- order(rnk)
statsAdj <- stats[ord]
statsAdj <- sign(statsAdj) * (abs(statsAdj)^gseaParam)
statsAdj <- statsAdj/max(abs(statsAdj))
pathway <- unname(as.vector(na.omit(match(pathway, names(statsAdj)))))
pathway <- sort(pathway)
gseaRes <- calcGseaStat(statsAdj, selectedStats = pathway,
returnAllExtremes = TRUE)
bottoms <- gseaRes$bottoms
tops <- gseaRes$tops
n <- length(statsAdj)
xs <- as.vector(rbind(pathway - 1, pathway))
ys <- as.vector(rbind(bottoms, tops))
toPlot <- data.frame(x = c(0, xs, n + 1), y = c(0, ys, 0))
diff <- (max(tops) - min(bottoms))/8
x = y = NULL
g <- ggplot(toPlot, aes(x = x, y = y)) + geom_point(size = 0.1) +
geom_hline(yintercept = max(tops), colour = "red",
linetype = "dashed") +
geom_hline(yintercept = min(bottoms),
colour = "red", linetype = "dashed") + geom_hline(yintercept = 0,
colour = "black") + geom_line(col=LINECOL) + theme_bw() +
geom_segment(data = data.frame(x = pathway), mapping = aes(x = x,
y = -diff/2, xend = x, yend = diff/2), size = ticksSize) +
theme(panel.border = element_blank(), panel.grid.minor = element_blank()) +
labs(x = "rank", y = "enrichment score")
g
}
plotEnr(examplePathways[["5991130_Programmed_Cell_Death"]], exampleRanks)

Resources