I am trying to just mark the max and min of each x-axis in a faceted ggplot. I have several facets with different x scales and the same y scale, and the x axis tick labels overlap each other. Rather than having to manually determine the limits and breaks for each facet x axis, I am looking for a way to just label the min and max values for each.
Code using example data of the CO2 dataset (see ?CO2):
CO2$num <- 1:nrow(CO2)
library(reshape2)
CO2.melt <- melt(CO2,
id.var=c("Type",
"Plant",
"Treatment",
"num"))
CO2.melt <- CO2.melt[order(CO2.melt$num),]
library(ggplot2)
ggplot(CO2.melt,
aes(x = value,
y = num)) +
geom_path(aes(color = Treatment)) +
facet_wrap( ~ variable, scales = "free_x",nrow=1)
Purpose is to replicate well log displays such as this one.
When you want to implemented this for the tick-labels, the use of scales = "free_x" in a faceted plot makes this hard to automate this. However, with a bit of tinkering and the help of several other packages, you could also use the following approach:
1) Summarise the data in order to get an idea which tick-labels / breaks you need on the x-axis:
library(data.table)
minmax <- melt(setDT(CO2.melt)[, .(min.val = min(value), max.val = max(value),
floor.end = 10*ceiling(min(value)/10),
ceil.end = 10*floor((max(value)-1)/10)),
variable][],
measure.vars = patterns('.val','.end'),
variable.name = 'var',
value.name = c('minmax','ends'))
which gives:
> minmax
variable var minmax ends
1: conc 1 95.0 100
2: uptake 1 7.7 10
3: conc 2 1000.0 990
4: uptake 2 45.5 40
2) Create break vecors for each facet:
brks1 <- c(95,250,500,750,1000)
brks2 <- c(7.7,10,20,30,40,45.5)
3) Create the facets:
p1 <- ggplot(CO2.melt[CO2.melt$variable=="conc",],
aes(x = value, y = num, colour = Treatment)) +
geom_path() +
scale_x_continuous(breaks = brks1) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(colour = c('red','black')[c(1,2,2,2,1)],
face = c('bold','plain')[c(1,2,2,2,1)]),
axis.title = element_blank(),
panel.grid.major = element_line(colour = "grey60"),
panel.grid.minor = element_blank())
p2 <- ggplot(CO2.melt[CO2.melt$variable=="uptake",],
aes(x = value, y = num, colour = Treatment)) +
geom_path() +
scale_x_continuous(breaks = brks2) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(colour = c('red','black')[c(1,2,2,2,2,1)],
face = c('bold','plain')[c(1,2,2,2,2,1)]),
axis.title = element_blank(),
panel.grid.major = element_line(colour = "grey60"),
panel.grid.minor = element_blank())
4) Extract the legend into a separate object:
library(grid)
library(gtable)
fill.legend <- gtable_filter(ggplot_gtable(ggplot_build(p2)), "guide-box")
legGrob <- grobTree(fill.legend)
5) Create the final plot:
library(gridExtra)
grid.arrange(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
legGrob, ncol=3, widths = c(4,4,1))
which results in:
A possible alternative solution to do this automatically, is either use geom_text or geom_label. An example to show how you can achieve this:
# create a summary
library(dplyr)
library(tidyr)
minmax <- CO2.melt %>%
group_by(variable) %>%
summarise(minx = min(value), maxx = max(value)) %>%
gather(lbl, val, -1)
# create the plot
ggplot(CO2.melt, aes(x = value, y = num, color = Treatment)) +
geom_path() +
geom_text(data = minmax,
aes(x = val, y = -3, label = val),
colour = "red", fontface = "bold", size = 5) +
facet_wrap( ~ variable, scales = "free_x", nrow=1) +
theme_minimal()
which gives:
You can also get the minimum and maximum values on the fly inside ggplot (credit to #eipi10). Another example using geom_label:
ggplot(CO2.melt, aes(x = value, y = num, color = Treatment)) +
geom_path() +
geom_label(data = CO2.melt %>%
group_by(variable) %>%
summarise(minx = min(value), maxx = max(value)) %>%
gather(lbl, val, -1),
aes(x = val, y = -3, label = val),
colour = "red", fontface = "bold", size = 5) +
facet_wrap( ~ variable, scales = "free_x", nrow=1) +
theme_minimal()
which gives:
Edit Updating to ggplot2 ver 3.0.0
This approach modifies the labels in the ggplot build data (i.e., ggplot_build(plot)). I've removed the x-axis expansions so that the maximum and minimum values fall on the panel boundaries.
# Packages
library(grid)
library(ggplot2)
library(reshape2)
# Data
CO2$num <- 1:nrow(CO2)
library(reshape2)
CO2.melt <- melt(CO2,
id.var=c("Type",
"Plant",
"Treatment",
"num"))
CO2.melt <- CO2.melt[order(CO2.melt$num),]
# Plot
(p <- ggplot(CO2.melt,
aes(x = value,
y = num)) +
scale_x_continuous(expand = c(0, 0)) +
geom_path(aes(color = Treatment)) +
facet_wrap( ~ variable, scales = "free_x", nrow=1))
# Get the build data
gb <- ggplot_build(p)
# Get number of panels
panels = length(gb$layout$panel_params)
# Get x tick mark labels
x.labels = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.labels)
# Get range of x values
x.range = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.range)
# Get position of x tick mark labels
x.pos = lapply(1:panels, function(N) gb$layout$panel_params[[N]]$x.major)
# Get new x tick mark labels - includes max and min
new.labels = lapply(1:panels, function(N) as.character(sort(unique(c(as.numeric(x.labels[[N]]), x.range[[N]])))))
# Tag min and max values with "min" and "max"
new.labelsC = new.labels
minmax = c("min", "max")
new.labelsC = lapply(1:panels, function(N) {
x = c(new.labelsC[[N]][1], new.labelsC[[N]][length(new.labels[[N]])])
x = paste0(x, "\n", minmax)
c(x[1], new.labelsC[[N]][2:(length(new.labels[[N]])-1)], x[2])
} )
# # Get position of new labels
new.pos = lapply(1:panels, function(N) (as.numeric(new.labels[[N]]) - x.range[[N]][1])/(x.range[[N]][2] - x.range[[N]][1]))
# Put them back into the build data
for(i in 1:panels) {
gb$layout$panel_params[[i]]$x.labels = new.labelsC[[i]]
gb$layout$panel_params[[i]]$x.major_source = as.numeric(new.labels[[i]])
gb$layout$panel_params[[i]]$x.major = new.pos[[i]]
}
# Get the ggplot grob
gp = ggplot_gtable(gb)
# Add some additional space between the panels
pos = gp$layout$l[grep("panel", gp$layout$name)] # Positions of the panels
for(i in 1:(panels-1)) gp$widths[[pos[i]+1]] = unit(1, "cm")
# Colour the min and max labels using `grid` editing functions
for(i in 1:panels) {
gp = editGrob(grid.force(gp), gPath(paste0("axis-b-", i), "axis", "axis", "GRID.text"),
grep = TRUE, gp = gpar(col = c("red", rep("black", length(new.labels[[i]])-2), "red")))
}
# Draw it
grid.newpage()
grid.draw(gp)
Related
I've got a plot that looks like the output of the following code using the iris data
require(tidyverse)
require(purrr)
require(forcats) # Useful for ordering facets found at [here][1]
# Make some long data and set a custom sorting order using some of t
tbl <- iris %>%
pivot_longer(., cols = 1:4, names_to = "Msr", values_to = "Vls") %>%
mutate(Msr = factor(Msr)) %>%
mutate(plot_fct = fct_cross(Species, Msr)) %>%
mutate(plot_fct = fct_reorder(plot_fct, Vls))
# A functioning factory for minor log breaks found [here][1] (very helpful)
minor_breaks_log <- function(base) {
# Prevents lazy evaluation
force(base)
# Wrap calculation in a function that the outer function returns
function(limits) {
ggplot2:::calc_logticks(
base = base,
minpow = floor(log(limits[1], base = base)),
maxpow = ceiling(log(limits[2], base = base))
)$value
}
}
# Plot the images
ggplot(data = tbl, aes(x =plot_fct, y = Vls, fill = Species)) +
geom_violin() +
coord_flip() + # swap coords
scale_y_log10(labels = function(x) sprintf("%g", x),
minor_breaks = minor_breaks_log(10)) + # format for labels # box fills
theme_bw(base_size = 12) +
annotation_logticks(base = 10, sides = "b") +
facet_wrap(~Species, nrow = 1, scales = "free")
I would now like to list the number of observations per violin on the right side of each facet just inside the maximum border, which I'm sure is possible but cannot seem to find an example that does this sort of labeling, with violins and facets.
ggplot(data = tbl, aes(y = plot_fct, fill = Species)) +
geom_violin(aes(x = Vls)) +
geom_text(aes(label = after_stat(count)), hjust = 1,
stat = "count", position = "fill") +
scale_x_log10(labels = function(x) sprintf("%g", x),
minor_breaks = minor_breaks_log(10)) + # format for labels # box fills
theme_bw(base_size = 12) +
annotation_logticks(base = 10, sides = "b") +
facet_wrap(~Species, nrow = 1, scales = "free")
I have a matrix with many zero elements. The column names are labeled on the horizontal axis. I'd like to show explictly the nonzero elements as the bias from the vertical line for each column.
So how should construct a figure such as the example using ggplot2?
An example data can be generated as follow:
set.seed(2018)
N <- 5
p <- 40
dat <- matrix(0.0, nrow=p, ncol=N)
dat[2:7, 1] <- 4*rnorm(6)
dat[4:12, 2] <- 2.6*rnorm(9)
dat[25:33, 3] <- 2.1*rnorm(9)
dat[19:26, 4] <- 3.3*rnorm(8)
dat[33:38, 5] <- 2.9*rnorm(6)
colnames(dat) <- letters[1:5]
print(dat)
Here is another option using facet_wrap and geom_col with theme_minimal.
library(tidyverse)
dat %>%
as.data.frame() %>%
rowid_to_column("row") %>%
gather(key, value, -row) %>%
ggplot(aes(x = row, y = value, fill = key)) +
geom_col() +
facet_wrap(~ key, ncol = ncol(dat)) +
coord_flip() +
theme_minimal()
To further increase the aesthetic similarity to the plot in your original post we can
move the facet strips to the bottom,
rotate strip labels,
add "zero lines" in matching colours,
remove the fill legend, and
get rid of the x & y axis ticks/labels/title.
library(tidyverse)
dat %>%
as.data.frame() %>%
rowid_to_column("row") %>%
gather(key, value, -row) %>%
ggplot(aes(x = row, y = value, fill = key)) +
geom_col() +
geom_hline(data = dat %>%
as.data.frame() %>%
gather(key, value) %>%
count(key) %>%
mutate(y = 0),
aes(yintercept = y, colour = key), show.legend = F) +
facet_wrap(~ key, ncol = ncol(dat), strip.position = "bottom") +
coord_flip() +
guides(fill = FALSE) +
theme_minimal() +
theme(
strip.text.x = element_text(angle = 45),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank())
It would be much easier if you can provide some sample data. Thus I needed to create them and there is no guarantee that this will work for your purpose.
set.seed(123)
# creating some random sample data
df <- data.frame(id = rep(1:100, each = 3),
x = rnorm(300),
group = rep(letters[1:3], each = 100),
bias = sample(0:1, 300, replace = T, prob = c(0.7, 0.3)))
# introducing bias
df$bias <- df$bias*rnorm(nrow(df))
# calculate lower/upper bias for errorbar
df$biaslow <- apply(data.frame(df$bias), 1, function(x){min(0, x)})
df$biasupp <- apply(data.frame(df$bias), 1, function(x){max(0, x)})
Then I used kind of hack to be able to print groups in sufficient distance to make them not overlapped. Based on group I shifted bias variable and also lower and upper bias.
# I want to print groups in sufficient distance
df$bias <- as.numeric(df$group)*5 + df$bias
df$biaslow <- as.numeric(df$group)*5 + df$biaslow
df$biasupp <- as.numeric(df$group)*5 + df$biasupp
And now it is possible to plot it:
library(ggplot2)
ggplot(df, aes(x = x, col = group)) +
geom_errorbar(aes(ymin = biaslow, ymax = biasupp), width = 0) +
coord_flip() +
geom_hline(aes(yintercept = 5, col = "a")) +
geom_hline(aes(yintercept = 10, col = "b")) +
geom_hline(aes(yintercept = 15, col = "c")) +
theme(legend.position = "none") +
scale_y_continuous(breaks = c(5, 10, 15), labels = letters[1:3])
EDIT:
To incorporate special design you can add
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
to your plot.
EDIT2:
To incorporate several horizontal lines, you can create different dataset:
df2 <- data.frame(int = unique(as.numeric(df$group)*5),
gr = levels(df$group))
And use
geom_hline(data = df2, aes(yintercept = int, col = gr))
instead of copy/pasting geom_hline for each group level.
I would like to create a dot plot with percentiles, which looks something like this-
Here is the ggplot2 code I used to create the dot plot. There are two things I'd like to change:
I can plot the percentile values on the y-axis but I want these
values on the x-axis (as shown in the graph above). Note that
the coordinates are flipped.
The axes don't display label for the
minimum value (for example the percentile axis labels start at 25
when they should start at 0 instead.)
# loading needed libraries
library(tidyverse)
library(ggstatsplot)
# creating dataframe with mean mileage per manufacturer
cty_mpg <- ggplot2::mpg %>%
dplyr::group_by(.data = ., manufacturer) %>%
dplyr::summarise(.data = ., mileage = mean(cty, na.rm = TRUE)) %>%
dplyr::rename(.data = ., make = manufacturer) %>%
dplyr::arrange(.data = ., mileage) %>%
dplyr::mutate(.data = ., make = factor(x = make, levels = .$make)) %>%
dplyr::mutate(
.data = .,
percent_rank = (trunc(rank(mileage)) / length(mileage)) * 100
) %>%
tibble::as_data_frame(x = .)
# plot
ggplot2::ggplot(data = cty_mpg, mapping = ggplot2::aes(x = make, y = mileage)) +
ggplot2::geom_point(col = "tomato2", size = 3) + # Draw points
ggplot2::geom_segment(
mapping = ggplot2::aes(
x = make,
xend = make,
y = min(mileage),
yend = max(mileage)
),
linetype = "dashed",
size = 0.1
) + # Draw dashed lines
ggplot2::scale_y_continuous(sec.axis = ggplot2::sec_axis(trans = ~(trunc(rank(.)) / length(.)) * 100, name = "percentile")) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "City mileage by car manufacturer",
subtitle = "Dot plot",
caption = "source: mpg dataset in ggplot2"
) +
ggstatsplot::theme_ggstatsplot()
Created on 2018-08-17 by the reprex package (v0.2.0.9000).
I am not 100% sure to have understood what you really want, but below is my attempt to reproduce the first picture with mpg data:
require(ggplot2)
data <- aggregate(cty~manufacturer, mpg, FUN = mean)
data <- data.frame(data[order(data$cty), ], rank=1:nrow(data))
g <- ggplot(data, aes(y = rank, x = cty))
g <- g + geom_point(size = 2)
g <- g + scale_y_continuous(name = "Manufacturer", labels = data$manufacturer, breaks = data$rank,
sec.axis = dup_axis(name = element_blank(),
breaks = seq(1, nrow(data), (nrow(data)-1)/4),
labels = 25 * 0:4))
g <- g + scale_x_continuous(name = "Mileage", limits = c(10, 25),
sec.axis = dup_axis(name = element_blank()))
g <- g + theme_classic()
g <- g + theme(panel.grid.major.y = element_line(color = "black", linetype = "dotted"))
print(g)
That produces:
data <- aggregate(cty~manufacturer, mpg, FUN = mean)
data <- data.frame(data[order(data$cty), ], rank=1:nrow(data))
These two lines generate the data for the graph. Basically we need the manufacturers, the mileage (average of cty by manufacturer) and the rank.
g <- g + scale_y_continuous(name = "Manufacturer", labels = data$manufacturer, breaks = data$rank,
sec.axis = dup_axis(name = element_blank(),
breaks = seq(1, nrow(data), (nrow(data)-1)/4),
labels = 25 * 0:4))
Note that here the scale is using rank and not the column manufacturer. To display the name of the manufacturers, you must use the labels property and you must force the breaks to be for every values (see property breaks).
The second y-axis is generated using the sec.axis property. This is very straight-forward using dup_axis that easily duplicate the axis. By replacing the labels and the breaks, you can display the %-value.
g <- g + theme(panel.grid.major.y = element_line(color = "black", linetype = "dotted"))
The horizontal lines are just the major grid. This is much easier to manipulate than geom_segments in my opinion.
Regarding your question 1, you can flip the coordinates easily using coord_flip, with minor adjustments. Replace the following line:
g <- g + theme(panel.grid.major.y = element_line(color = "black", linetype = "dotted")
By the following two lines:
g <- g + coord_flip()
g <- g + theme(panel.grid.major.x = element_line(color = "black", linetype = "dotted"),
axis.text.x = element_text(angle = 90, hjust = 1))
Which produces:
Regarding your question 2, the problem is that the value 0% is outside the limits. You can solve this issue by changing the way you calculate the percentage (starting from zero and not from one), or you can extend the limit of your plot to include the value zero, but then no point will be associated to 0%.
I want to add two bar charts to the top and right of a heatmap representing the marginal distributions along the two dimensions of the bivariate distribution that the heatmap represents.
Here is some code:
library(gridExtra)
library(ggExtra)
library(cowplot)
# generate some data
df_hm = cbind(
expand.grid(
rows = sample(letters, 10),
cols = sample(LETTERS, 10)
),
value = rnorm(100)
)
# plot the heatmap
gg_hm = df_hm %>%
ggplot(aes(x = rows, y = cols, fill = value)) +
geom_tile() +
theme(legend.position = "bottom")
gg_rows = df_hm %>%
group_by(rows) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = rows,y = value)) +
geom_bar(stat = "identity", position = "dodge")
gg_cols = df_hm %>%
group_by(cols) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = cols, y = value))+
geom_bar(stat = "identity", position = "dodge") +
coord_flip()
gg_empty = df_hm %>%
ggplot(aes(x = cols, y = value)) +
geom_blank() +
theme(axis.text = element_blank(),
axis.title = element_blank(),
line = element_blank(),
panel.background = element_blank())
# try this with grid.arrange
grid.arrange(gg_rows, gg_empty, gg_hm, gg_cols,
ncol = 2, nrow = 2, widths = c(3, 1), heights = c(1, 3))
which produces this:
What I want to be able to do is to move the graphs to align as indicated by the red arrows:
- the y-axis of (1, 1) should line up with the y-axis of (2, 1)
- the x-axis of (2, 1) should line up with the x-axis of (2, 2)
I tried the accepted answer by renato vitolo and the alignments didn't work on my machine. But I subsequently discoverd a much easier solution: the egg package (available on CRAN). egg provides a version of grid.arrange called ggarrange which takes similar arguments but aligns the axes nicely. In the OP's code I just had to add library(egg), library(dplyr), and then replace grid.arrange with ggarrange (having installed egg with install.packages("egg")).
Full code:
library(gridExtra)
library(cowplot)
library(egg)
library(dplyr)
# generate some data
df_hm = cbind(
expand.grid(
rows = sample(letters, 10),
cols = sample(LETTERS, 10)
),
value = rnorm(100)
)
# plot the heatmap
gg_hm = df_hm %>%
ggplot(aes(x = rows, y = cols, fill = value)) +
geom_tile() +
theme(legend.position = "bottom")
gg_rows = df_hm %>%
group_by(rows) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = rows,y = value)) +
geom_bar(stat = "identity", position = "dodge")
gg_cols = df_hm %>%
group_by(cols) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = cols, y = value))+
geom_bar(stat = "identity", position = "dodge") +
coord_flip()
gg_empty = df_hm %>%
ggplot(aes(x = cols, y = value)) +
geom_blank() +
theme(axis.text = element_blank(),
axis.title = element_blank(),
line = element_blank(),
panel.background = element_blank())
ggarrange(
gg_rows, gg_empty, gg_hm, gg_cols,
nrow = 2, ncol = 2, widths = c(3, 1), heights = c(1, 3)
)
Output:
gtable is extremely useful. scales provides tools to format the axis ticks, to achieve alignment between the text y.ticks of the heatmap (X, F, ...) and the numeric y.ticks of the barplot on top, by formatting the former to a fixed width of 5 chars (to be adapted for your specific barplot).
require(ggplot2)
require(gtable)
require(grid)
library(dplyr)
library(scales)
## To format heatmap y.ticks with appropriate width (5 chars),
## to align with gg_rows y.tics
ytickform <- function(x){
lab <- sprintf("%05s",x)
}
set.seed(123)
## generate some data
df_hm = cbind(
expand.grid(
rows = sample(letters, 10),
cols = sample(LETTERS, 10)
),
value = rnorm(100)
)
# plot the heatmap
gg_hm = df_hm %>%
ggplot(aes(x = rows, y = cols, fill = value)) +
geom_tile() +
scale_y_discrete(label=ytickform) +
theme(legend.position = "bottom",
plot.margin = unit(c(3,3,3,3), "mm"))
gg_rows = df_hm %>%
group_by(rows) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = rows,y = value)) +
geom_bar(stat = "identity", position = "dodge") +
theme(plot.margin = unit(c(3,3,3,3), "mm"))
gg_cols = df_hm %>%
group_by(cols) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = cols, y = value))+
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
theme(plot.margin = unit(c(3,3,3,3), "mm"))
## extract legend from heatmap
g <- ggplotGrob(gg_hm)$grobs
legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
## plot heatmap without legend
g <- ggplotGrob(gg_hm + theme(legend.position="none"))
## add column and put column barplot within
g <- gtable_add_cols(g, unit(5,"cm"))
g <- gtable_add_grob(g, ggplotGrob(gg_cols),
t = 1, l=ncol(g), b=nrow(g), r=ncol(g))
## add row and put legend within
g <- gtable_add_rows(g, unit(1,"cm"))
g <- gtable_add_grob(g, legend,
t = nrow(g), l=1, b=nrow(g), r=ncol(g)-1)
## add row on top and put row barplot within
g <- gtable_add_rows(g, unit(5,"cm"), 0)
g <- gtable_add_grob(g, ggplotGrob(gg_rows),
t = 1, l=1, b=1, r=5)
grid.newpage()
grid.draw(g)
References:
Align ggplot2 plots vertically
http://www.cookbook-r.com/Graphs/Axes_(ggplot2)/#tick-mark-label-text-formatters
https://github.com/hadley/ggplot2/wiki/Share-a-legend-between-two-ggplot2-graphs
I've been trying to superimpose a normal curve over my histogram with ggplot 2.
My formula:
data <- read.csv (path...)
ggplot(data, aes(V2)) +
geom_histogram(alpha=0.3, fill='white', colour='black', binwidth=.04)
I tried several things:
+ stat_function(fun=dnorm)
....didn't change anything
+ stat_density(geom = "line", colour = "red")
...gave me a straight red line on the x-axis.
+ geom_density()
doesn't work for me because I want to keep my frequency values on the y-axis, and want no density values.
Any suggestions?
Solution found!
+geom_density(aes(y=0.045*..count..), colour="black", adjust=4)
Think I got it:
library(ggplot2)
set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) +
geom_histogram(aes(y =..density..),
breaks = seq(-50, 50, by = 10),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))
This has been answered here and partially here.
The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.
If you want the y-axis to have frequency counts, there are a number of options:
First simulate some data.
library(ggplot2)
set.seed(1)
dat_hist <- data.frame(
group = c(rep("A", 200), rep("B",150)),
value = c(rnorm(200, 20, 5), rnorm(150,25,10)))
# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))
Option 1: Plot both histogram and density curve as density and then rescale the y axis
This is perhaps the easiest approach for a single histogram.
Using the approach suggested by Carlos, plot both histogram and density curve as density
g <- ggplot(dat_hist, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))
And then rescale the y axis.
ybreaks = seq(0,50,5)
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)
## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))
Option 2: Rescale the density curve using stat_function
With code tidied as per PatrickT's answer.
ggplot(dat_hist, aes(value)) +
geom_histogram(colour = "black", binwidth = bw) +
stat_function(fun = function(x)
dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)
Option 3: Create an external dataset and plot using geom_line.
Unlike the above options, this one works with facets. (EDITED to provide dplyr rather than plyr based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.
library(tidyverse)
dat_hist %>%
group_by(group) %>%
nest(data = c(value)) %>%
mutate(y = map(data, ~ dnorm(
.$value, mean = mean(.$value), sd = sd(.$value)
) * bw * sum(!is.na(.$value)))) %>%
unnest(c(data,y)) %>%
ggplot(aes(x = value)) +
geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
geom_line(aes(y = y)) +
facet_wrap(~ group)
Option 4: Create external functions to edit the data on the fly
A bit over the top perhaps, but might be useful for someone?
## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
.x <- na.omit(data[,x])
if(is.null(xlim))
xlim = c(min(.x), max(.x))
x_range = seq(xlim[1], xlim[2], length.out = 101)
setNames(
data.frame(
x = x_range,
y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
c(x, "y"))
}
## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
dat_hists <- lapply(
split(data, data[, group]), dnorm_scaled,
x = x, binwidth = binwidth, xlim = xlim)
for(g in names(dat_hists))
dat_hists[[g]][, "group"] <- g
setNames(do.call(rbind, dat_hists), c(x, "y", group))
}
## Single histogram
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = bw, colour = "black") +
geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw),
aes(y = y))
## With a single faceting variable
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = 2, colour = "black") +
geom_line(data = ~ dnorm_scaled_group(
., x = "value", group = "group", binwidth = 2, xlim = c(0,50)),
aes(y = y)) +
facet_wrap(~ group)
This is an extended comment on JWilliman's answer. I found J's answer very useful. While playing around I discovered a way to simplify the code. I'm not saying it is a better way, but I thought I would mention it.
Note that JWilliman's answer provides the count on the y-axis and a "hack" to scale the corresponding density normal approximation (which otherwise would cover a total area of 1 and have therefore a much lower peak).
Main point of this comment: simpler syntax inside stat_function, by passing the needed parameters to the aesthetics function, e.g.
aes(x = x, mean = 0, sd = 1, binwidth = 0.3, n = 1000)
This avoids having to pass args = to stat_function and is therefore more user-friendly. Okay, it's not very different, but hopefully someone will find it interesting.
# parameters that will be passed to ``stat_function``
n = 1000
mean = 0
sd = 1
binwidth = 0.3 # passed to geom_histogram and stat_function
set.seed(1)
df <- data.frame(x = rnorm(n, mean, sd))
ggplot(df, aes(x = x, mean = mean, sd = sd, binwidth = binwidth, n = n)) +
theme_bw() +
geom_histogram(binwidth = binwidth,
colour = "white", fill = "cornflowerblue", size = 0.1) +
stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n * binwidth,
color = "darkred", size = 1)
This code should do it:
set.seed(1)
z <- rnorm(1000)
qplot(z, geom = "blank") +
geom_histogram(aes(y = ..density..)) +
stat_density(geom = "line", aes(colour = "bla")) +
stat_function(fun = dnorm, aes(x = z, colour = "blabla")) +
scale_colour_manual(name = "", values = c("red", "green"),
breaks = c("bla", "blabla"),
labels = c("kernel_est", "norm_curv")) +
theme(legend.position = "bottom", legend.direction = "horizontal")
Note: I used qplot but you can use the more versatile ggplot.
Here's a tidyverse informed version:
Setup
library(tidyverse)
Some data
d <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/speed_gender_height.csv")
Preparing data
We'll use a "total" histogram for the whole sample, to that end, we'll need to remove the grouping information from the data.
d2 <-
d |>
select(-gender)
Here's a data set with summary data:
d_summary <-
d %>%
group_by(gender) %>%
summarise(height_m = mean(height, na.rm = T),
height_sd = sd(height, na.rm = T))
d_summary
Plot it
d %>%
ggplot() +
aes() +
geom_histogram(aes(y = ..density.., x = height, fill = gender)) +
facet_wrap(~ gender) +
geom_histogram(data = d2, aes(y = ..density.., x = height),
alpha = .5) +
stat_function(data = d_summary %>% filter(gender == "female"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "female")$height_m,
sd = filter(d_summary,
gender == "female")$height_sd)) +
stat_function(data = d_summary %>% filter(gender == "male"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "male")$height_m,
sd = filter(d_summary,
gender == "male")$height_sd)) +
theme(legend.position = "none",
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(title = "Facetted histograms with overlaid normal curves",
caption = "The grey histograms shows the whole distribution (over) both groups, i.e. females and men") +
scale_fill_brewer(type = "qual", palette = "Set1")