Violin plot with list input - r

I am using the vioplot funciton form the vioplot package, and I would like to feed a list as an input. So this is what I have:
# Violin Plots
library(vioplot)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
vioplot(x1, x2, x3, names=c("4 cyl", "6 cyl", "8 cyl"),
col="gold")
title("Violin Plots of Miles Per Gallon")
And this is what I would like to do:
# Violin Plots
library(vioplot)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
l<-list(x1,x2,x3)
vioplot(l, names=c("4 cyl", "6 cyl", "8 cyl"),
col="gold")
title("Violin Plots of Miles Per Gallon")
But I get this error:
Error in min(data) : invalid 'type' (list) of argument
Can you help? Thanks!

I have modified the vioplot function to accept a list as an input, you can use this vioplot2:
vioplot2<-function (x, ..., range = 1.5, h = NULL, ylim = NULL, names = NULL,
horizontal = FALSE, col = "magenta", border = "black", lty = 1,
lwd = 1, rectCol = "black", colMed = "white", pchMed = 19,
at, add = FALSE, wex = 1, drawRect = TRUE)
{
if(!is.list(x)){
datas <- list(x, ...)
} else{
datas<-x
}
n <- length(datas)
if (missing(at))
at <- 1:n
upper <- vector(mode = "numeric", length = n)
lower <- vector(mode = "numeric", length = n)
q1 <- vector(mode = "numeric", length = n)
q3 <- vector(mode = "numeric", length = n)
med <- vector(mode = "numeric", length = n)
base <- vector(mode = "list", length = n)
height <- vector(mode = "list", length = n)
baserange <- c(Inf, -Inf)
args <- list(display = "none")
if (!(is.null(h)))
args <- c(args, h = h)
for (i in 1:n) {
data <- datas[[i]]
data.min <- min(data)
data.max <- max(data)
q1[i] <- quantile(data, 0.25)
q3[i] <- quantile(data, 0.75)
med[i] <- median(data)
iqd <- q3[i] - q1[i]
upper[i] <- min(q3[i] + range * iqd, data.max)
lower[i] <- max(q1[i] - range * iqd, data.min)
est.xlim <- c(min(lower[i], data.min), max(upper[i],
data.max))
smout <- do.call("sm.density", c(list(data, xlim = est.xlim),
args))
hscale <- 0.4/max(smout$estimate) * wex
base[[i]] <- smout$eval.points
height[[i]] <- smout$estimate * hscale
t <- range(base[[i]])
baserange[1] <- min(baserange[1], t[1])
baserange[2] <- max(baserange[2], t[2])
}
if (!add) {
xlim <- if (n == 1)
at + c(-0.5, 0.5)
else range(at) + min(diff(at))/2 * c(-1, 1)
if (is.null(ylim)) {
ylim <- baserange
}
}
if (is.null(names)) {
label <- 1:n
}
else {
label <- names
}
boxwidth <- 0.05 * wex
if (!add)
plot.new()
if (!horizontal) {
if (!add) {
plot.window(xlim = xlim, ylim = ylim)
axis(2)
axis(1, at = at, label = label)
}
box()
for (i in 1:n) {
polygon(c(at[i] - height[[i]], rev(at[i] + height[[i]])),
c(base[[i]], rev(base[[i]])), col = col, border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(at[c(i, i)], c(lower[i], upper[i]), lwd = lwd,
lty = lty)
rect(at[i] - boxwidth/2, q1[i], at[i] + boxwidth/2,
q3[i], col = rectCol)
points(at[i], med[i], pch = pchMed, col = colMed)
}
}
}
else {
if (!add) {
plot.window(xlim = ylim, ylim = xlim)
axis(1)
axis(2, at = at, label = label)
}
box()
for (i in 1:n) {
polygon(c(base[[i]], rev(base[[i]])), c(at[i] - height[[i]],
rev(at[i] + height[[i]])), col = col, border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(c(lower[i], upper[i]), at[c(i, i)], lwd = lwd,
lty = lty)
rect(q1[i], at[i] - boxwidth/2, q3[i], at[i] +
boxwidth/2, col = rectCol)
points(med[i], at[i], pch = pchMed, col = colMed)
}
}
}
invisible(list(upper = upper, lower = lower, median = med,
q1 = q1, q3 = q3))
}
What you get with your example is this:

I've included the solution above in an function (R package) on Github. Should work just like input for boxplot (including formula input), includes a range of other custom features such as separate colours for each violin.
Install and load with:
install.packages("devtools")
devtools::install_github("TomKellyGenetics/vioplotx")
library("vioplotx")
So just like you would make a boxplot:
# Set up data
data(mtcars)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
l<-list(x1,x2,x3)
# Boxplots
boxplot(l, names=c("4 cyl", "6 cyl", "8 cyl"), col="gold")
title("Boxplots of Miles Per Gallon")
You can use the "vioplotx" do do the same:
So just like you would make a violinplot:
# Violin plots
vioplotx(l, names=c("4 cyl", "6 cyl", "8 cyl"), col="gold")
title("Violin plots of Miles Per Gallon")
This also enables additional customisation with vector inputs:
vioplotx(l, names=c("4 cyl", "6 cyl", "8 cyl"), col=c("cyan", "magenta", "green"))
This is modification of Daniel Adler's "vioplot" package including change based on Federico Giorgi's answer here. This is purely intended to make running such modifications easier.

Here is an another solution, adding colored violins using lapply:
library(vioplot)
# Set up data
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
l <- list(x1,x2,x3)
# Colors
mycol <- c("yellow", "green", "blue")
# Set up plot without violins
plot("", xlim = c(0.5, length(l)+0.5), ylim = c(min(unlist(l)), max(unlist(l))), xaxt = "n", xlab = "", ylab = "Miles Per Gallon")
axis(1, labels = c("4cyl", "6cyl", "8cyl"), at = c(1:length(l)))
# Add violins from list
lapply(seq_along(l), function(x)
vioplot(l[[x]], at = x, col = mycol[x], add = T, box = F)
)
Done!

Related

Why does the curve not plot in the existing graph? [duplicate]

I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F

R: Plotting histogram with bell curve [duplicate]

I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F

How to adjust legend position of interaction.plot and lineplot.CI?

I'm a beginner in coding. I was trying to create an interaction plot. Here's my code:
data is clinicaltrials from the data of the book "Learning Statistics with R."
library(sciplot)
library(lsr)
library(gplots)
lineplot.CI(x.factor = clin.trial$drug,
response = clin.trial$mood.gain,
group = clin.trial$therapy,
ci.fun = ciMean,
xlab = "Drug",
ylab = "Mood Gain")
and it produces the graph like this:
As can be seen in the graph, the legend box is not within my screen.
Also I tried creating another plot using the following code:
interaction.plot(x.factor = clin.trial$drug,
trace.factor = clin.trial$therapy,
response = clin.trial$mood.gain,
fun = mean,
type = "l",
lty = 1, # line type
lwd = 2, # line width
legend = T,
xlab = "Drug", ylab = "Mood Gain",
col = c("#00AFBB", "#E7B800"),
xpd = F,
trace.label = "Therapy")
For this code, I got the graph like this:
In this graph, the legend does not have labels.
Could anyone help me with these problems regarding legend?
You probably plan to save the plot via RStudio GUI. When you resize the plot window with your mouse, you need to run the code again to refresh the legend dimensions.
However, it's advantageous to use a more sophisticated method, e.g. to save it as a png with fixed dimensions like so:
library("sciplot")
library("lsr")
library("gplots")
png("Plot_1.png", height=400, width=500)
lineplot.CI(x.factor=clin.trial$drug,
response=clin.trial$mood.gain,
group=clin.trial$therapy,
ci.fun=ciMean,
xlab="Drug",
ylab="Mood Gain"
)
dev.off()
png("Plot_2.png", height=400, width=500)
interaction.plot(x.factor=clin.trial$drug,
trace.factor=clin.trial$therapy,
response=clin.trial$mood.gain,
fun=mean,
type="l",
lty=1, # line type
lwd=2, # line width
legend=T,
xlab="Drug", ylab="Mood Gain",
col=c("#00AFBB", "#E7B800"),
xpd=F,
trace.label="Therapy")
dev.off()
The plots are saved into your working directory, check getwd() .
Edit
You could also adjust the legend position.
In lineplot.CI you may use arguments; either by using characters just for x, e.g. x.leg="topleft" or both coordinates as numeric x.leg=.8, y.leg=2.2.
interaction.plot does not provide yet this functionality. I provide a hacked version below. Arguments are called xleg and yleg, functionality as above.
See ?legend for further explanations.
interaction.plot <- function (x.factor, trace.factor, response, fun = mean,
type = c("l", "p", "b", "o", "c"), legend = TRUE,
trace.label = deparse(substitute(trace.factor)),
fixed = FALSE, xlab = deparse(substitute(x.factor)),
ylab = ylabel, ylim = range(cells, na.rm = TRUE),
lty = nc:1, col = 1, pch = c(1L:9, 0, letters),
xpd = NULL, leg.bg = par("bg"), leg.bty = "n",
xtick = FALSE, xaxt = par("xaxt"), axes = TRUE,
xleg=NULL, yleg=NULL, ...) {
ylabel <- paste(deparse(substitute(fun)), "of ", deparse(substitute(response)))
type <- match.arg(type)
cells <- tapply(response, list(x.factor, trace.factor), fun)
nr <- nrow(cells)
nc <- ncol(cells)
xvals <- 1L:nr
if (is.ordered(x.factor)) {
wn <- getOption("warn")
options(warn = -1)
xnm <- as.numeric(levels(x.factor))
options(warn = wn)
if (!anyNA(xnm))
xvals <- xnm
}
xlabs <- rownames(cells)
ylabs <- colnames(cells)
nch <- max(sapply(ylabs, nchar, type = "width"))
if (is.null(xlabs))
xlabs <- as.character(xvals)
if (is.null(ylabs))
ylabs <- as.character(1L:nc)
xlim <- range(xvals)
if (is.null(xleg)) {
xleg <- xlim[2L] + 0.05 * diff(xlim)
xlim <- xlim + c(-0.2/nr, if (legend) 0.2 + 0.02 * nch else 0.2/nr) *
diff(xlim)
}
dev.hold()
on.exit(dev.flush())
matplot(xvals, cells, ..., type = type, xlim = xlim, ylim = ylim,
xlab = xlab, ylab = ylab, axes = axes, xaxt = "n",
col = col, lty = lty, pch = pch)
if (axes && xaxt != "n") {
axisInt <- function(x, main, sub, lwd, bg, log, asp,
...) axis(1, x, ...)
mgp. <- par("mgp")
if (!xtick)
mgp.[2L] <- 0
axisInt(1, at = xvals, labels = xlabs, tick = xtick,
mgp = mgp., xaxt = xaxt, ...)
}
if (legend) {
yrng <- diff(ylim)
if (is.null(yleg))
yleg <- ylim[2L] - 0.1 * yrng
if (!is.null(xpd) || {
xpd. <- par("xpd")
!is.na(xpd.) && !xpd. && (xpd <- TRUE)
}) {
op <- par(xpd = xpd)
on.exit(par(op), add = TRUE)
}
# text(xleg, ylim[2L] - 0.05 * yrng, paste(" ",
# trace.label), adj = 0)
if (!fixed) {
ord <- sort.list(cells[nr, ], decreasing = TRUE)
ylabs <- ylabs[ord]
lty <- lty[1 + (ord - 1)%%length(lty)]
col <- col[1 + (ord - 1)%%length(col)]
pch <- pch[ord]
}
legend(xleg, yleg, legend = ylabs, col = col,
title = if (trace.label == "") NULL else trace.label,
pch = if (type %in% c("p", "b"))
pch, lty = if (type %in% c("l", "b"))
lty, bty = leg.bty, bg = leg.bg)
}
invisible()
}
Data:
lk <- "https://learningstatisticswithr.com/data.zip"
tmp <- tempfile()
tmp.dir <- tempdir()
download.file(lk, tmp)
unzip(tmp, exdir=tmp.dir)
load("data/clinicaltrial.Rdata")

Constructing a histogram of exceedances and overlying Generalised Pareto pdf in R [duplicate]

I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F

Fitting a expected normal distribution over a histogram [duplicate]

I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F

Resources