Related
I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F
I have managed to find online how to overlay a normal curve to a histogram in R, but I would like to retain the normal "frequency" y-axis of a histogram. See two code segments below, and notice how in the second, the y-axis is replaced with "density". How can I keep that y-axis as "frequency", as it is in the first plot.
AS A BONUS: I'd like to mark the SD regions (up to 3 SD) on the density curve as well. How can I do this? I tried abline, but the line extends to the top of the graph and looks ugly.
g = d$mydata
hist(g)
g = d$mydata
m<-mean(g)
std<-sqrt(var(g))
hist(g, density=20, breaks=20, prob=TRUE,
xlab="x-variable", ylim=c(0, 2),
main="normal curve over histogram")
curve(dnorm(x, mean=m, sd=std),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
See how in the image above, the y-axis is "density". I'd like to get that to be "frequency".
Here's a nice easy way I found:
h <- hist(g, breaks = 10, density = 10,
col = "lightgray", xlab = "Accuracy", main = "Overall")
xfit <- seq(min(g), max(g), length = 40)
yfit <- dnorm(xfit, mean = mean(g), sd = sd(g))
yfit <- yfit * diff(h$mids[1:2]) * length(g)
lines(xfit, yfit, col = "black", lwd = 2)
You need to find the right multiplier to convert density (an estimated curve where the area beneath the curve is 1) to counts. This can be easily calculated from the hist object.
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
A more complete version, with a normal density and lines at each standard deviation away from the mean (including the mean):
myhist <- hist(mtcars$mpg)
multiplier <- myhist$counts / myhist$density
mydensity <- density(mtcars$mpg)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist)
lines(mydensity)
myx <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out= 100)
mymean <- mean(mtcars$mpg)
mysd <- sd(mtcars$mpg)
normal <- dnorm(x = myx, mean = mymean, sd = mysd)
lines(myx, normal * multiplier[1], col = "blue", lwd = 2)
sd_x <- seq(mymean - 3 * mysd, mymean + 3 * mysd, by = mysd)
sd_y <- dnorm(x = sd_x, mean = mymean, sd = mysd) * multiplier[1]
segments(x0 = sd_x, y0= 0, x1 = sd_x, y1 = sd_y, col = "firebrick4", lwd = 2)
This is an implementation of aforementioned StanLe's anwer, also fixing the case where his answer would produce no curve when using densities.
This replaces the existing but hidden hist.default() function, to only add the normalcurve parameter (which defaults to TRUE).
The first three lines are to support roxygen2 for package building.
#' #noRd
#' #exportMethod hist.default
#' #export
hist.default <- function(x,
breaks = "Sturges",
freq = NULL,
include.lowest = TRUE,
normalcurve = TRUE,
right = TRUE,
density = NULL,
angle = 45,
col = NULL,
border = NULL,
main = paste("Histogram of", xname),
ylim = NULL,
xlab = xname,
ylab = NULL,
axes = TRUE,
plot = TRUE,
labels = FALSE,
warn.unused = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics::hist.default(
x = x,
breaks = breaks,
freq = freq,
include.lowest = include.lowest,
right = right,
density = density,
angle = angle,
col = col,
border = border,
main = main,
ylim = ylim,
xlab = xlab,
ylab = ylab,
axes = axes,
plot = plot,
labels = labels,
warn.unused = warn.unused,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- yfit * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Quick example:
hist(g)
For dates it's bit different. For reference:
#' #noRd
#' #exportMethod hist.Date
#' #export
hist.Date <- function(x,
breaks = "months",
format = "%b",
normalcurve = TRUE,
xlab = xname,
plot = TRUE,
freq = NULL,
density = NULL,
start.on.monday = TRUE,
right = TRUE,
...) {
# https://stackoverflow.com/a/20078645/4575331
xname <- paste(deparse(substitute(x), 500), collapse = "\n")
suppressWarnings(
h <- graphics:::hist.Date(
x = x,
breaks = breaks,
format = format,
freq = freq,
density = density,
start.on.monday = start.on.monday,
right = right,
xlab = xlab,
plot = plot,
...
)
)
if (normalcurve == TRUE & plot == TRUE) {
x <- x[!is.na(x)]
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
if (isTRUE(freq) | (is.null(freq) & is.null(density))) {
yfit <- as.double(yfit) * diff(h$mids[1:2]) * length(x)
}
lines(xfit, yfit, col = "black", lwd = 2)
}
if (plot == TRUE) {
invisible(h)
} else {
h
}
}
Just remove the prob = T, and let it stay at default ie F
I have a scatterplot and wish to color the points by a z value assigned to each point. Then I want to get the legend on the right hand side of the plot to show what colors correspond to what z values using a nice smooth color spectrum.
Here are some x,y,z values you can use so that this is a reproducible example.
x = runif(50)
y = runif(50)
z = runif(50) #determines color of the (x,y) point
I suppose the best answer would be one that is generalized for any color function, but I do anticipate using rainbow()
Translated from this previous question:
library(ggplot2)
d = data.frame(x=runif(50),y=runif(50),z=runif(50))
ggplot(data = d, mapping = aes(x = x, y = y)) + geom_point(aes(colour = z), shape = 19)
If you don't want to use ggplot2 I modified a solution to this provided by someone else, I don't remember who.
scatter_fill <- function (x, y, z,xlim=c(min(x),max(x)),ylim=c(min(y),max(y)),zlim=c(min(z),max(z)),
nlevels = 20, plot.title, plot.axes,
key.title, key.axes, asp = NA, xaxs = "i",
yaxs = "i", las = 1,
axes = TRUE, frame.plot = axes, ...)
{
mar.orig <- (par.orig <- par(c("mar", "las", "mfrow")))$mar
on.exit(par(par.orig))
w <- (3 + mar.orig[2L]) * par("csi") * 2.54
layout(matrix(c(2, 1), ncol = 2L), widths = c(1, lcm(w)))
par(las = las)
mar <- mar.orig
mar[4L] <- mar[2L]
mar[2L] <- 1
par(mar = mar)
# choose colors to interpolate
levels <- seq(zlim[1],zlim[2],length.out = nlevels)
col <- colorRampPalette(c("red","yellow","dark green"))(nlevels)
colz <- col[cut(z,nlevels)]
#
plot.new()
plot.window(xlim = c(0, 1), ylim = range(levels), xaxs = "i", yaxs = "i")
rect(0, levels[-length(levels)], 1, levels[-1L],col=col,border=col)
if (missing(key.axes)) {if (axes){axis(4)}}
else key.axes
box()
if (!missing(key.title))
key.title
mar <- mar.orig
mar[4L] <- 1
par(mar = mar)
# points
plot(x,y,type = "n",xaxt='n',yaxt='n',xlab="",ylab="",xlim=xlim,ylim=ylim,bty="n")
points(x,y,col = colz,xaxt='n',yaxt='n',xlab="",ylab="",bty="n",...)
## options to make mapping more customizable
if (missing(plot.axes)) {
if (axes) {
title(main = "", xlab = "", ylab = "")
Axis(x, side = 1)
Axis(y, side = 2)
}
}
else plot.axes
if (frame.plot)
box()
if (missing(plot.title))
title(...)
else plot.title
invisible()
}
Just run the function first and it is ready to be used. It is quite handy.
# random vectors
vx <- rnorm(40,0,1)
vy <- rnorm(40,0,1)
vz <- rnorm(40,10,10)
scatter_fill(vx,vy,vz,nlevels=15,xlim=c(-1,1),ylim=c(-1,5),zlim=c(-10,10),main="TEST",pch=".",cex=8)
As you can notice, it inherits the usual plot function capabilities.
Another alternative using levelplot in package latticeExtra, with three different colour palettes.
library(latticeExtra)
levelplot(z ~ x + y, panel = panel.levelplot.points, col.regions = heat.colors(50))
levelplot(z ~ x + y, panel = panel.levelplot.points,
col.regions =colorRampPalette(brewer.pal(11,"RdYlGn"))(50))
levelplot(z ~ x + y, panel = panel.levelplot.points, col.regions = rainbow(50))
I have a plot with two logarithmic axes. I'd like to add a circle to a certain position of the plot. I tried to use plotrix, but this does not give options for "log-radius".
# data to plot
x = 10^(-1 * c(5:0))
y = x ^-1.5
#install.packages("plotrix", dependencies=T)
# use require() within functions
library("plotrix")
plot (x, y, log="xy", type="o")
draw.circle(x=1e-2, y=1e2, radius=1e1, col=2)
How can I add a circle to my log-log plot?
As krlmlr suggests, the easiest solution is to slightly modify plotrix::draw.circle(). The log-log coordinate system distorts coordinates of a circle given in the linear scale; to counteract that, you just need to exponentiate the calculated coordinates, as I've done in the lines marked with ## <- in the code below:
library("plotrix")
draw.circle.loglog <-
function (x, y, radius, nv = 100, border = NULL, col = NA, lty = 1,
lwd = 1)
{
xylim <- par("usr")
plotdim <- par("pin")
ymult <- (xylim[4] - xylim[3])/(xylim[2] - xylim[1]) * plotdim[1]/plotdim[2]
angle.inc <- 2 * pi/nv
angles <- seq(0, 2 * pi - angle.inc, by = angle.inc)
if (length(col) < length(radius))
col <- rep(col, length.out = length(radius))
for (circle in 1:length(radius)) {
xv <- exp(cos(angles) * log(radius[circle])) * x[circle] ## <-
yv <- exp(sin(angles) * ymult * log(radius[circle])) * y[circle] ## <-
polygon(xv, yv, border = border, col = col[circle], lty = lty,
lwd = lwd)
}
invisible(list(x = xv, y = yv))
}
# Try it out
x = 10^(-1 * c(5:0))
y = x ^-1.5
plot (x, y, log="xy", type="o")
draw.circle.loglog(x = c(1e-2, 1e-3, 1e-4), y = c(1e2, 1e6, 1e2),
radius = c(2,4,8), col = 1:3)
A work around would be to apply log10 explicitly.
plot (log10(x), log10(y), type="o")
draw.circle(x=log10(1e-2), y=log10(1e2), radius=log10(1e1), col=2)
Edit (using symbols):
plot (x, y, log="xy", type="o",xlim=c(1e-5,1), ylim=c(1,1e8))
par(new=T)
symbols(x=1e-2, y=1e2, circles=1e1, xlim=c(1e-5,1), ylim=c(1,1e8),
xaxt='n', yaxt='n', ann=F, log="xy")
The function draw.circle from the plotrix package looks like that on my system:
> draw.circle
function (x, y, radius, nv = 100, border = NULL, col = NA, lty = 1,
lwd = 1)
{
xylim <- par("usr")
plotdim <- par("pin")
ymult <- (xylim[4] - xylim[3])/(xylim[2] - xylim[1]) * plotdim[1]/plotdim[2]
angle.inc <- 2 * pi/nv
angles <- seq(0, 2 * pi - angle.inc, by = angle.inc)
if (length(col) < length(radius))
col <- rep(col, length.out = length(radius))
for (circle in 1:length(radius)) {
xv <- cos(angles) * radius[circle] + x
yv <- sin(angles) * radius[circle] * ymult + y
polygon(xv, yv, border = border, col = col[circle], lty = lty,
lwd = lwd)
}
invisible(list(x = xv, y = yv))
}
<environment: namespace:plotrix>
What happens here is essentially that the circle is approximated by a polygon of 100 vertices (parameter nv). You can do either of the following:
Create your own version of draw.circle that does the necessary coordinate transformation to "undo" the log transform of the axes.
The function invisibly returns the list of coordinates that are used for plotting.
(If you pass a vector as radius, then only the coordinates of the last circle are returned.) You might be able to apply a transform to those coordinates and call polygon on the result. Pass appropriate values for border, col, lty and/or lwd to hide the polygon drawn by the functions itself.
The first version sounds easier to me. Simply replace the + x by a * x, same for y, inside the for loop, and you're done. Equivalently, for the second version, you subtract x and then multiply by x each coordinate, same for y. EDIT: These transformations are slightly wrong, see Josh's answer for the correct ones.
I've been working on a rather complicated chart in R. I have a wireframe with a surface and points distributed in X,Y,Z space all over (e.g. under the surface and over it).
The problem is that the points that plot don't "look" like they are underneath the surface.
I am trying to figure out how best to visualize this chart to make the points look under the surface. Some sample code for the wireframe & cloud come from here: R-List Posting
The code in an example:
library(lattice)
surf <-
expand.grid(x = seq(-pi, pi, length = 50),
y = seq(-pi, pi, length = 50))
surf$z <-
with(surf, {
d <- 3 * sqrt(x^2 + y^2)
exp(-0.02 * d^2) * sin(d)
})
g <- surf
pts <- data.frame(x =rbind(2,2,2), y=rbind(-2,-2,-2), z=rbind(.5,0,-.5))
wireframe(z ~ x * y, g, aspect = c(1, .5),
drape=TRUE,
scales = list(arrows = FALSE),
pts = pts,
panel.3d.wireframe =
function(x, y, z,
xlim, ylim, zlim,
xlim.scaled, ylim.scaled, zlim.scaled,
pts,
...) {
panel.3dwire(x = x, y = y, z = z,
xlim = xlim,
ylim = ylim,
zlim = zlim,
xlim.scaled = xlim.scaled,
ylim.scaled = ylim.scaled,
zlim.scaled = zlim.scaled,
...)
xx <-
xlim.scaled[1] + diff(xlim.scaled) *
(pts$x - xlim[1]) / diff(xlim)
yy <-
ylim.scaled[1] + diff(ylim.scaled) *
(pts$y - ylim[1]) / diff(ylim)
zz <-
zlim.scaled[1] + diff(zlim.scaled) *
(pts$z - zlim[1]) / diff(zlim)
panel.3dscatter(x = xx,
y = yy,
z = zz,
xlim = xlim,
ylim = ylim,
zlim = zlim,
xlim.scaled = xlim.scaled,
ylim.scaled = ylim.scaled,
zlim.scaled = zlim.scaled,
...)
})
Looking at my example, the points in pts are in actually in vertical line where X,Y =(2,-2) and the z goes from .5 to -.5.
However, to my eye the third point doesn't look like it is under the surface, to it looks like it is at coordinates(2,-3,0).
Is this just my eye mis-interpreting this ?
Does anyone have a suggestion on how to make my points look more "3D" ? Perhaps muting the color of the point to make it look "under the surface" by using some sort of transparency on the surface ?
I tried making the colors of the points different (red for over the surface, blue for under the surface) but that does not really help the graph much.
This might get you started:
library(emdbook)
sfun <- function(x,y) {
d <- 3 * sqrt(x^2 + y^2)
exp(-0.02 * d^2) * sin(d)
}
cc <- curve3d(sfun(x,y),xlim=c(-pi,pi),ylim=c(-pi,pi),n=c(50,50),
sys3d="rgl")
colvec <- colorRampPalette(c("pink","white","lightblue"))(100)
with(cc,persp3d(x,y,z,col=colvec[cut(z,100)],alpha=0.5))
pts <- data.frame(x=c(2,2,2), y=c(-2,-2,-2), z=c(.5,0,-.5))
with(pts,spheres3d(x,y,z,col="blue",radius=0.1))
rgl.snapshot("rgltmp1.png")