Histogram with normal Distribution in R using ggplot2 for illustrations - r

I'm trying to plot a histogram with ggplot2.
I wrote a simple code for this in R
dnorm.count <- function(x, mean = 0, sd = 1, log = FALSE, n = 1, binwidth = 1){
n * binwidth * dnorm(x = x, mean = mean, sd = sd, log = log)
}
mtcars %>%
ggplot(aes(x = mpg)) +
geom_histogram(bins =60,color = "white", fill = "#9FE367",boundary = 0.5) +
geom_vline(aes(xintercept = mean(mpg)),
linetype="dashed",
size = 1.6,
color = "#FF0000")+
geom_text(aes(label = ..count..), stat= "count",vjust = -0.6)+
stat_function(fun = dnorm.count, color = "#6D67E3",
args = list(mean= mean(mtcars$mpg),
sd = sd(mtcars$mpg),
n = nrow(mtcars)),
lwd = 1.2) +
scale_y_continuous(labels = comma, name = "Frequency") +
scale_x_continuous(breaks=seq(0,max(mtcars$mpg)))+
geom_text(aes(label = paste0("mean = ", round(mean(mtcars$mpg), 2)),
x = mean(mtcars$mpg)*1.2,
y = mean(mtcars$mpg)/5))+
geom_vline(aes(xintercept = sd(mpg)), linetype="dashed",size = 1.6, color = "#FF0000")
What I got is this!
The question is how do I Plot the histogram similar to this
using ggplot2 and is it possible to convert the code to R function?
Edit: For the better explanation of what I'm trying to do:
I wanna create a Histogram exactly the same as the one attached for reference using ggplot2 and then I wanna create a function for the same to reduce the coding. Use any package+ggplot2 you like. The histograms should have lines depicting the standard deviation & mean like the one in reference. If possible depict the standard deviation in the plot as the reference image, that's what I'm trying to achieve.

If your question how to plot histograms like the one you attached in your last figure, this 9 lines of code produce a very similar result.
library(magrittr) ; library(ggplot2)
set.seed(42)
data <- rnorm(1e5)
p <- data %>%
as.data.frame() %>%
ggplot(., aes(x = data)) +
geom_histogram(fill = "white", col = "black", bins = 30 ) +
geom_density(aes( y = 0.3 *..count..)) +
labs(x = "Statistics", y = "Probability/Density") +
theme_bw() + theme(axis.text = element_blank())
You could use annotate() to add symbols or text and geom_segment to show the intervals on the plot like this:
p + annotate(x = sd(data)/2 , y = 8000, geom = "text", label = "σ", size = 10) +
annotate(x = sd(data) , y = 6000, geom = "text", label = "2σ", size = 10) +
annotate(x = sd(data)*1.5 , y = 4000, geom = "text", label = "3σ", size = 10) +
geom_segment(x = 0, xend = sd(data), y = 7500, yend = 7500) +
geom_segment(x = 0, xend = sd(data)*2, y = 5500, yend = 5500) +
geom_segment(x = 0, xend = sd(data)*3, y = 3500, yend = 3500)
This chunk of code would give you something like this:

Related

R: How to set full transparency in a quantile line in geom_density_ridges

First of all, some data similar to what I am working with.
rawdata <- data.frame(Score = rnorm(1000, seq(1, 0, length.out = 10), sd = 1),
Group = rep(LETTERS[1:3], 10000))
rawdata$Score <- ifelse(rawdata$Group == "A", rawdata$Score+2,rawdata$Score)
rawdata$Score <- ifelse(rawdata$Group == "C", rawdata$Score-2,rawdata$Score)
stdev <- c(10.78,10.51,9.42)
col <- c("#004d8d", "#cc2701", "#e5b400")
Now, the code of my geom_density_ridges with quantile lines, which in this case they will be white.
p <- ggplot(rawdata, aes(x = Score, y = Group)) +
scale_y_discrete() +
geom_rect(inherit.aes = FALSE, mapping = aes(ymin = 0, ymax = Inf, xmin = -0.1 * min(stdev), xmax = 0.1 * max(stdev)),
fill = "grey", alpha = 0.5) +
geom_density_ridges(scale = -0.5, size = 1, alpha=0.5, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.975),
vline_color = "white", aes(fill = Group)) +
scale_color_manual(values = col) +
scale_fill_manual(values = col) +
labs(title="Toy Graph", y="Group", x="Value") +
coord_flip(xlim = c(-8, 8), ylim = NULL, expand = TRUE, clip = "on")
p
An we obtain the following plot, which is perfectly adjusted to expectation.
Now I was wondering if there was a way to make only this little white quantile line transparent to the background. I tried first to set the vline_color = "transparent" and leaving the aes(fill = Group) at the end of geom_density_ridges at the logic that options where drew in order but it gets transparent not to the different shades of grey background but to the density fill (so the quantile line disappears), which is not what I am trying to achieve.
Thanks in advance for your ideas!
Colors can be modified with scales::alpha. This can be passed to your color argument.
library(ggridges)
library(ggplot2)
rawdata <- data.frame(Score = rnorm(1000, seq(1, 0, length.out = 10), sd = 1),
Group = rep(LETTERS[1:3], 10000))
rawdata$Score <- ifelse(rawdata$Group == "A", rawdata$Score+2,rawdata$Score)
rawdata$Score <- ifelse(rawdata$Group == "C", rawdata$Score-2,rawdata$Score)
stdev <- c(10.78,10.51,9.42)
col <- c("#004d8d", "#cc2701", "#e5b400")
ggplot(rawdata, aes(x = Score, y = Group)) +
scale_y_discrete() +
geom_rect(inherit.aes = FALSE, mapping = aes(ymin = 0, ymax = Inf, xmin = -0.1 * min(stdev), xmax = 0.1 * max(stdev)),
fill = "grey", alpha = 0.5) +
geom_density_ridges(scale = -0.5, size = 1, alpha=0.5, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.975),
### The only change is here
vline_color = alpha("white", .5), aes(fill = Group)) +
scale_color_manual(values = col) +
scale_fill_manual(values = col) +
labs(title="Toy Graph", y="Group", x="Value") +
coord_flip(xlim = c(-8, 8), ylim = NULL, expand = TRUE, clip = "on")
#> Picking joint bandwidth of 0.148
#> Warning: Using the `size` aesthietic with geom_segment was deprecated in ggplot2 3.4.0.
#> ℹ Please use the `linewidth` aesthetic instead.
Created on 2022-11-14 with reprex v2.0.2
No, if you make something transparent you will see what's underneath, which is the density plot.
However, you can replicate the visual effect of "seeing through to the background" by simply setting the line colour to the same as the background.
Your grey rectangle is currently plotted underneath the density plots, therefore the "background" doesn't have a single colour. This can be solved by plotting it on top instead. Instead of a 50% grey with 50% alpha, you can replicate the same effect with a 0% grey (aka black) with a 25% alpha. Move the geom_rect later than the density plots and it will be layered on top.
Finally, your geom_rect is being called once for each row of raw_data, since it inherits the same data as the main plot. You probably don't want that, so specify a (dummy) data source instead.
ggplot(rawdata, aes(x = Score, y = Group)) +
scale_y_discrete() +
geom_density_ridges(scale = -0.5, size = 1, alpha=0.5, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.975),
vline_color = "grey90", aes(fill = Group)) +
scale_color_manual(values = col) +
scale_fill_manual(values = col) +
labs(title="Toy Graph", y="Group", x="Value") +
geom_rect(data=data.frame(), inherit.aes = FALSE, mapping = aes(
ymin = 0, ymax = Inf, xmin = -0.1 * min(stdev), xmax = 0.1 * max(stdev)
), fill = "black", alpha = 0.25) +
coord_flip(xlim = c(-8, 8), ylim = NULL, expand = TRUE, clip = "on")
Note: I'm not sure the background colour is really "grey90", I've eyeballed it. You may want to specify it explicitly with theme if you want to be exact.
If you want literal see-through portions of your density curves, you will need to make the gaps yourself:
library(tidyverse)
rawdata %>%
mutate(GroupNum = as.numeric(as.factor(Group))) %>%
group_by(GroupNum, Group) %>%
summarise(yval = first(GroupNum) - density(Score)$y,
xval = density(Score)$x,
q025 = quantile(Score, 0.025),
q975 = quantile(Score, 0.975)) %>%
mutate(Q = ifelse(xval < q025, 'low', ifelse(xval > q975, 'hi', 'mid'))) %>%
ggplot(aes(xval, yval, group = interaction(Group, Q))) +
geom_line(size = 1) +
geom_ribbon(aes(ymax = GroupNum, ymin = yval, fill = Group),
color = NA, alpha = 0.5, outline.type = 'full',
data = . %>% filter(abs(q025 - xval) > 0.03 &
abs(q975 - xval) > 0.03)) +
coord_flip() +
scale_fill_manual(values = col) +
scale_y_continuous(breaks = 1:3, labels = levels(factor(rawdata$Group)),
name = 'Group') +
labs(x = 'Score')

Replicating a color-coded spaghetti plot [duplicate]

This question already has answers here:
Create a split violin plot with paired points and proper orientation
(2 answers)
Closed 10 months ago.
This post was edited and submitted for review 10 months ago and failed to reopen the post:
Original close reason(s) were not resolved
In this article: https://www.nature.com/articles/s41591-022-01744-z.epdf
I noticed an interesting plot:
2
Is there a simple way to do this in R?
EDIT: I am aware there are similar questions but none deal with the color-coding scheme that marks the improved / worsened cases.
The see package has a half violin geom like that:
ggplot(data = data.frame(id = rep(1:10, 2),
time = rep(c("A", "B"), each = 10),
value = runif(20)),
aes(time, value)) +
see::geom_violinhalf(aes(group = time, fill = time),
trim = FALSE, flip = 1, alpha = 0.2) +
geom_point(aes(color = time)) +
geom_line(aes(group = id))
You can get arbitrarily close to a chosen chart using ggplot:
ggplot(df, aes(xval, modularity, color = group)) +
geom_polygon(data = densdf, aes( x = y, y = x, fill = group), colour = NA) +
scale_fill_manual(values = c('#c2c2c2', '#fbc5b4')) +
scale_color_manual(values = c('#676767', '#ef453e')) +
geom_path(data = densdf, aes(x = y, y = x), size = 2) +
geom_segment(color = '#c2c2c2', inherit.aes = FALSE, size = 1.5,
data = df2[df2$`Post-treatment` > df2$Baseline,], alpha = 0.8,
aes(x = 1, xend = 2, y = Baseline, yend = `Post-treatment`)) +
geom_segment(color = '#ef453e', inherit.aes = FALSE, size = 1.5, alpha = 0.8,
data = df2[df2$`Post-treatment` < df2$Baseline,],
aes(x = 1, xend = 2, y = Baseline, yend = `Post-treatment`)) +
geom_point(size = 3) +
theme_classic() +
scale_x_continuous(breaks = 1:2, labels = c('Baseline', 'Post-treatment'),
name = '', expand = c(0.3, 0)) +
theme(legend.position = 'none',
text = element_text(size = 18, face = 2),
panel.background = element_rect(fill = NA, color = 'black', size = 1.5))
As long as you are prepared to do some work getting your data into the right format:
set.seed(4)
mod <- c(rnorm(16, 2.5, 0.25))
df <- data.frame(modularity = c(mod, mod + rnorm(16, -0.25, 0.2)),
xval = rep(c(1, 2), each = 16),
group = rep(c('Baseline', 'Post-treatment'), each = 16),
id = factor(rep(1:16, 2)))
df2 <- df %>% tidyr::pivot_wider(id_cols = id, names_from = group,
values_from = modularity)
BLdens <- as.data.frame(density(df$modularity[1:16])[c('x', 'y')])
PTdens <- as.data.frame(density(df$modularity[17:32])[c('x', 'y')])
BLdens$y <- 1 - 0.25 * BLdens$y
PTdens$y <- 2 + 0.25 * PTdens$y
densdf <- rbind(BLdens, PTdens)
densdf$group <- rep(c('Baseline', 'Post-treatment'), each = nrow(BLdens))

Warning message 'mapping' is not used by stat_function() in R

While completing a project for understanding central limit theorem for exponential distribution, I ran into an annoying error message when plotting simulated vs theoretical distributions. When I run the code below, I get an error: 'mapping' is not used by stat_function().
By mapping I assume the error is referring to the aes parameter, which I later map to color red using scale_color_manual in order to show it in a legend.
My question is two-fold: why is this error happening? and is there a more efficient way to create a legend without using scale_color_manual?
Thank you!
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
It's not an error, it's a warning:
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
#> Warning: `mapping` is not used by stat_function()
Created on 2020-05-01 by the reprex package (v0.3.0)
You can suppress the warning by calling geom_line(stat = "function") rather than stat_function():
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
geom_line(stat = "function", fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
Created on 2020-05-01 by the reprex package (v0.3.0)
In my opinion, the warning is erroneous, and an issue has been filed about this problem: https://github.com/tidyverse/ggplot2/issues/3611
However, it's not that easy to solve, and therefore as of now the warning is there.
I'm unable to recreate your issue -- when I run your code a plot is generated (below), which suggests the issue is likely to do you with your environment. A general 'solution' is to clear your workspace using the menu dropdown or similar: Session -> Clear workspace..., then re-run your code.
For refactoring the color issue, you can simplify scale_color_manual to
scale_color_manual("Legend", values = c('blue','red')), but how it is now, is a bit better in my view. Anything beyond that has more to do with changing the data structure and mapping.
Apologies, I don't have the rep to make a comment.

Making a specific quantile plot in R

I am very intrigued by the following visulization (Decile term)
And I wonder how it would be possible to do it in R.
There is of course histograms and density plots, but they do not make such a nice visualization. Especially, I would like to know if it possible to do it with ggplot/tidyverse.
edit in response to the comment
library(dplyr)
library(ggplot2)
someData <- data_frame(x = rnorm(1000))
ggplot(someData, aes(x = x)) +
geom_histogram()
this produces a histogram (see http://www.r-fiddle.org/#/fiddle?id=LQXazwMY&version=1)
But how I can get the coloful bars? How to implement the small rectangles? (The arrows are less relevant).
You have to define a number of breaks, and use approximate deciles that match those histogram breaks. Otherwise, two deciles will end up in one bar.
d <- data_frame(x = rnorm(1000))
breaks <- seq(min(d$x), max(d$x), length.out = 50)
quantiles <- quantile(d$x, seq(0, 1, 0.1))
quantiles2 <- sapply(quantiles, function(x) breaks[which.min(abs(x - breaks))])
d$bar <- as.numeric(as.character(cut(d$x, breaks, na.omit((breaks + dplyr::lag(breaks)) / 2))))
d$fill <- cut(d$x, quantiles2, na.omit((quantiles2 + dplyr::lag(quantiles2)) / 2))
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1])
Or with more distinct colors:
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1]) +
scale_fill_brewer(type = 'qual', palette = 3) # The only qual pallete with enough colors
Add some styling and increase the breaks to 100:
ggplot(d, aes(bar, y = 1, fill = fill)) +
geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1], size = 0.3) +
scale_fill_brewer(type = 'qual', palette = 3) +
theme_classic() +
coord_fixed(diff(breaks)[1], expand = FALSE) + # makes square blocks
labs(x = 'x', y = 'count')
And here is a function to make that last one:
decile_histogram <- function(data, var, n_breaks = 100) {
breaks <- seq(min(data[[var]]), max(data[[var]]), length.out = n_breaks)
quantiles <- quantile(data[[var]], seq(0, 1, 0.1))
quantiles2 <- sapply(quantiles, function(x) breaks[which.min(abs(x - breaks))])
data$bar <- as.numeric(as.character(
cut(data[[var]], breaks, na.omit((breaks + dplyr::lag(breaks)) / 2)))
)
data$fill <- cut(data[[var]], quantiles2, na.omit((quantiles2 + dplyr::lag(quantiles2)) / 2))
ggplot2::ggplot(data, ggplot2::aes(bar, y = 1, fill = fill)) +
ggplot2::geom_col(position = 'stack', col = 1, show.legend = FALSE, width = diff(breaks)[1], size = 0.3) +
ggplot2::scale_fill_brewer(type = 'qual', palette = 3) +
ggplot2::theme_classic() +
ggplot2::coord_fixed(diff(breaks)[1], expand = FALSE) +
ggplot2::labs(x = 'x', y = 'count')
}
Use as:
d <- data.frame(x = rnorm(1000))
decile_histogram(d, 'x')

How to show percent labels on histogram bars using ggplot2

I have seen lots of question regarding converting count on y axis into percent but must of them are in bar plot.
I want to do similar thing in histogram but not able to show the labels on the bar clearly. Please tell me where I am doing wrong.
x = runif(100, min = 0, max = 10)
data1 <- data.frame(x = x)
ggplot(aes(x = x), data = data1)+
geom_histogram(aes(y = (..count..)/sum(..count..)), bins = 10, breaks =
seq(0,10,1), fill = "blue", col = "black")+
geom_text(aes(y = ((..count..)/sum(..count..)),
label = scales::percent((..count..)/sum(..count..))),
stat = "count", vjust = -10)+
scale_y_continuous(labels = scales::percent)
Output:
Use scale_y_continous with breaks and labels will solve your problem.
data1 <- data.frame (x = runif(100, min = 0, max = 10))
ggplot(aes(x=x), data1) + stat_bin(aes(y = ..count..))
ggplot(data1, aes(x = x)) + geom_histogram(fill = "blue", col = "black")+ scale_y_continuous(breaks = seq(0,10,1),labels = paste(seq(0, 10, by = 1) / 100, "%", sep = ""))+geom_text(aes(y = (..count..),label = scales::percent((..count..)/sum(..count..))), stat="bin",colour="green",vjust=2)
or, you can specify where you would like to add the percentage like this:
geom_text(aes(y = (..count..)+0.5))
of course you can change the color as well. from,
stat="bin",colour="your prefer color "
Also you can change the width of the bins as follows:
geom_histogram(fill = "blue", col = "black", binwidth = 0.5)

Resources