Bivariate normal with marginal and conditional densities - r

I am trying to create a figure in R. It consists of the contour plot of a bivariate normal distribution for the vector variable (x,y) along with the marginals f(x), f(y); the conditional distribution f(y|x) and the line through the conditioning value X=x (it will be a simple abline(v=x)).
I already got the contour and the abline:
but I don't know how to continue.
Here is the code I used so far:
bivariate.normal <- function(x, mu, Sigma) {
exp(-.5 * t(x-mu) %*% solve(Sigma) %*% (x-mu)) / sqrt(2 * pi * det(Sigma))
}
mu <- c(0,0)
Sigma <- matrix(c(1,.8,.8,1), nrow=2)
x1 <- seq(-3, 3, length.out=50)
x2 <- seq(-3, 3, length.out=50)
z <- outer(x1, x2, FUN=function(x1, x2, ...){
apply(cbind(x1,x2), 1, bivariate.normal, ...)
}, mu=mu, Sigma=Sigma)
contour(x1, x2, z, col="blue", drawlabels=FALSE, nlevels=4,
xlab=expression(x[1]), ylab=expression(x[2]), lwd=1)
abline(v=.7, col=1, lwd=2, lty=2)
text(2, -2, labels=expression(x[1]==0.7))

It would have been helpful if you had provided the function to calculate the marginal distribution. I may have got the marginal distribution function wrong, but I think this gets you what you want:
par(lwd=2,mgp=c(1,1,0))
# Modified to extract diagonal.
bivariate.normal <- function(x, mu, Sigma)
exp(-.5 * diag(t(x-mu) %*% solve(Sigma) %*% (x-mu))) / sqrt(2 * pi * det(Sigma))
mu <- c(0,0)
Sigma <- matrix(c(1,.8,.8,1), nrow=2)
x1 <- seq(-3, 3, length.out=50)
x2 <- seq(-3, 3, length.out=50)
plot(1:10,axes=FALSE,frame.plot=TRUE,lwd=1)
# z can now be calculated much easier.
z<-bivariate.normal(t(expand.grid(x1,x2)),mu,Sigma)
dim(z)<-c(length(x1),length(x2))
contour(x1, x2, z, col="#4545FF", drawlabels=FALSE, nlevels=4,
xlab=expression(x[1]), ylab=expression(x[2]), lwd=2,xlim=range(x1),ylim=range(x2),frame.plot=TRUE,axes=FALSE,xaxs = "i", yaxs = "i")
axis(1,labels=FALSE,lwd.ticks=2)
axis(2,labels=FALSE,lwd.ticks=2)
abline(v=.7, col=1, lwd=2, lty=2)
text(2, -2, labels=expression(x[1]==0.7))
# Dotted
f<-function(x1,x2) bivariate.normal(t(cbind(x1,x2)),mu,Sigma)
x.s<-seq(from=min(x1),to=max(x1),by=0.1)
vals<-f(x1=0.7,x2=x.s)
lines(vals-abs(min(x1)),x.s,lty=2,lwd=2)
# Marginal probability distribution: http://mpdc.mae.cornell.edu/Courses/MAE714/biv-normal.pdf
# Please check this, I'm not sure it is correct.
marginal.x1<-function(x) exp((-(x-mu[1])^2)/2*(Sigma[1,2]^2)) / (Sigma[1,2]*sqrt(2*pi))
marginal.x2<-function(x) exp((-(x-mu[1])^2)/2*(Sigma[2,1]^2)) / (Sigma[2,1]*sqrt(2*pi))
# Left side solid
vals<-marginal.x2(x.s)
lines(vals-abs(min(x1)),x.s,lty=1,lwd=2)
# Bottom side solid
vals<-marginal.x1(x.s)
lines(x.s,vals-abs(min(x2)),lty=1,lwd=2)

My solution in ggplot2, inspired in this post
rm(list=ls())
options(max.print=999999)
library(pacman)
p_load(tidyverse)
p_load(mvtnorm)
my_mean<-c(25,65)
mycors<-seq(-1,1,by=.25)
sd_vec<-c(5,7)
i<-3
temp_cor<-matrix(c(1,mycors[i],
mycors[i],1),
byrow = T,ncol=2)
V<-sd_vec %*% t(sd_vec) *temp_cor
###data for vertical curve
my_dnorm<- function(x, mean = 0, sd = 1, log = FALSE, new_loc, multplr){
new_loc+dnorm(x, mean , sd, log)*multplr
}
##margina Y distribution
yden<-data.frame(y=seq(48,82,length.out = 100),x=my_dnorm(seq(48,82,length.out = 100),my_mean[2],sd_vec[2],new_loc=8,multplr=100))
##conditional distribution
my_int<-(my_mean[2]-(V[1,2]*my_mean[1]/V[1,1]))
my_slp<-V[1,2]/V[1,1]
givenX<-34
mu_givenX<-my_int+givenX*my_slp
sigma2_givenX<-(1-mycors[i]^2)*V[2,2]
y_givenX_range<-seq(mu_givenX-3*sqrt(sigma2_givenX),mu_givenX+3*sqrt(sigma2_givenX),length.out = 100)
yden_x<-data.frame(y=y_givenX_range, x=my_dnorm(y_givenX_range,mu_givenX,sqrt(sigma2_givenX),new_loc=givenX,multplr=80))
yden_x<-data.frame(y=y_givenX_range, x=my_dnorm(y_givenX_range,mu_givenX,sqrt(sigma2_givenX),new_loc=8,multplr=80))
###data for drawing ellipse
data.grid <- expand.grid(x = seq(my_mean[1]-3*sd_vec[1], my_mean[1]+3*sd_vec[1], length.out=200),
y = seq(my_mean[2]-3*sd_vec[2], my_mean[2]+3*sd_vec[2], length.out=200))
q.samp <- cbind(data.grid, prob = dmvnorm(data.grid, mean = my_mean, sigma = V))
###plot
ggplot(q.samp, aes(x=x, y=y, z=prob)) +
geom_contour() + theme_bw()+
geom_abline(intercept = my_int, slope = my_slp, color="red",
linetype="dashed")+
stat_function(fun = my_dnorm, n = 101, args = list(mean = my_mean[1], sd = sd_vec[1], new_loc=35,multplr=100),color=1) +
geom_path(aes(x=x,y=y), data = yden,inherit.aes = FALSE) +
geom_path(aes(x=x,y=y), data = yden_x,inherit.aes = FALSE,color=1,linetype="dashed") +
geom_vline(xintercept = givenX,linetype="dashed")
Created on 2020-10-31 by the reprex package (v0.3.0)

Related

Understanding "levels" in r contour function of bivariate distribution

I have trouble understanding how to set the levels in the plot of a bivariate distribution in r. The documentation states that I can choose the levels by setting a
numeric vector of levels at which to draw contour lines
Now I would like the contour to show the limit containing 95% of the density or mass. But if, in the example below (adapted from here) I set the vector as a <- c(.95,.90) the code runs without error but the plot is not displayed. If instead, I set the vector as a <- c(.01,.05) the plot is displayed. But I am not sure I understand what the labels "0.01" and "0.05" mean with respect to the density.
library(mnormt)
x <- seq(-5, 5, 0.25)
y <- seq(-5, 5, 0.25)
mu1 <- c(0, 0)
sigma1 <- matrix(c(2, -1, -1, 2), nrow = 2)
f <- function(x, y) dmnorm(cbind(x, y), mu1, sigma1)
z <- outer(x, y, f)
a <- c(.01,.05)
contour(x, y, z, levels = a)
But I am not sure I understand what the labels "0.01" and "0.05" mean with respect to the density.
It means the points where the density is equal 0.01 and 0.05. From help("contour"):
numeric vector of levels at which to draw contour lines.
So it is the function values at which to draw the lines (contours) where the function is equal to those levels (in this case the density). Take a simple example which may help is x + y:
y <- x <- seq(0, 1, length.out = 50)
z <- outer(x, y, `+`)
par(mar = c(5, 5, 1, 1))
contour(x, y, z, levels = c(0.5, 1, 1.5))
Now I would like the contour to show the limit containing 95% of the density or mass.
In your example, you can follow my answer here and draw the exact points:
# input
mu1 <- c(0, 0)
sigma1 <- matrix(c(2, -1, -1, 2), nrow = 2)
# we start from points on the unit circle
n_points <- 100
xy <- cbind(sin(seq(0, 2 * pi, length.out = n_points)),
cos(seq(0, 2 * pi, length.out = n_points)))
# then we scale the dimensions
ev <- eigen(sigma1)
xy[, 1] <- xy[, 1] * 1
xy[, 2] <- xy[, 2] * sqrt(min(ev$values) / max(ev$values))
# then rotate
phi <- atan(ev$vectors[2, 1] / ev$vectors[1, 1])
R <- matrix(c(cos(phi), sin(phi), -sin(phi), cos(phi)), 2)
xy <- tcrossprod(R, xy)
# find the right length. You can change .95 to which ever
# quantile you want
chi_vals <- qchisq(.95, df = 2) * max(ev$values)
s <- sqrt(chi_vals)
par(mar = c(5, 5, 1, 1))
plot(s * xy[1, ] + mu1[1], s * xy[2, ] + mu1[2], lty = 1,
type = "l", xlab = "x", ylab = "y")
The levels indicates where the lines are drawn, with respect to the specific 'z' value of the bivariate normal density. Since max(z) is
0.09188815, levels of a <- c(.95,.90) can't be drawn.
To draw the line delimiting 95% of the mass I used the ellipse() function as suggested in this post (second answer from the top).
library(mixtools)
library(mnormt)
x <- seq(-5, 5, 0.25)
y <- seq(-5, 5, 0.25)
mu1 <- c(0, 0)
sigma1 <- matrix(c(2, -1, -1, 2), nrow = 2)
f <- function(x, y) dmnorm(cbind(x, y), mu1, sigma1)
z <- outer(x, y, f)
a <- c(.01,.05)
contour(x, y, z, levels = a)
ellipse(mu=mu1, sigma=sigma1, alpha = .05, npoints = 250, col="red")
I also found another solution in the book "Applied Multivariate Statistics with R" by Daniel Zelterman.
# Figure 6.5: Bivariate confidence ellipse
library(datasets)
library(MASS)
library(MVA)
#> Loading required package: HSAUR2
#> Loading required package: tools
biv <- swiss[, 2 : 3] # Extract bivariate data
bivCI <- function(s, xbar, n, alpha, m)
# returns m (x,y) coordinates of 1-alpha joint confidence ellipse of mean
{
x <- sin( 2* pi * (0 : (m - 1) )/ (m - 1)) # m points on a unit circle
y <- cos( 2* pi * (0 : (m - 1)) / (m - 1))
cv <- qchisq(1 - alpha, 2) # chisquared critical value
cv <- cv / n # value of quadratic form
for (i in 1 : m)
{
pair <- c(x[i], y[i]) # ith (x,y) pair
q <- pair %*% solve(s, pair) # quadratic form
x[i] <- x[i] * sqrt(cv / q) + xbar[1]
y[i] <- y[i] * sqrt(cv / q) + xbar[2]
}
return(cbind(x, y))
}
### pdf(file = "bivSwiss.pdf")
plot(biv, col = "red", pch = 16, cex.lab = 1.5)
lines(bivCI(var(biv), colMeans(biv), dim(biv)[1], .01, 1000), type = "l",
col = "blue")
lines(bivCI(var(biv), colMeans(biv), dim(biv)[1], .05, 1000),
type = "l", col = "green", lwd = 1)
lines(colMeans(biv)[1], colMeans(biv)[2], pch = 3, cex = .8, type = "p",
lwd = 1)
Created on 2021-03-15 by the reprex package (v0.3.0)

How to dd conditions to the bivariate distribution( x-mu =>0) and (x-mu <0) in plotting contour plot?

bivariate.ESEP <- function(x, mu, Sigma ,eps)
{
sqrt(det(k))/(2*(gamma(1+2/alp))*sqrt(det(Sigma)))*
exp(-(((t(x-mu)%*% solve((I-eps)^2)%*% solve(Sigma) %*%k%*%(x-mu))^alp/2)*ifelse(x-mu>=0,1,0) +
(((t(mu-x)%*% solve((I+eps)^2)%*% solve(Sigma)%*%k%*%(mu-x)))^alp/2)))*ifelse(x-mu<0,1,0)
}
mu <- c(179.81,22.2)
Sigma <- matrix(c(0.6,0.35,0.35,2), nrow=2)
Sigma<-Sigma1
eps<-matrix(c(-0.186,0,0,0.6), nrow=2)
k<-matrix(c(1,0,0,1), nrow=2)
I<-matrix(c(1,0,0,1), nrow=2)
alp =1
x1 <- seq(150, 220)
x2 <- seq(0, 40)
z <- outer(x1, x2, FUN=function(x1, x2, ... ){
apply(cbind(x1,x2), 1, bivariate.ESEP, ...)
}, mu=mu, Sigma=Sigma ,eps = eps)
plot(dat1, xlab="Ht", ylab="BMI", pch=19, cex=.7)
contour(x1, x2, z, col="blue", drawlabels=FALSE, nlevels= 20,
xlab=expression(x[1]), ylab=expression(x[2]), lwd=1, add = TRUE)
x is a bivariate distribution. So if (x>=mu) then we take the first part of bivariatte.ESEP and putting the other part 0 and similarly if (x

Reproduce Fisher linear discriminant figure

Many books illustrate the idea of Fisher linear discriminant analysis using the following figure (this particular is from Pattern Recognition and Machine Learning, p. 188)
I wonder how to reproduce this figure in R (or in any other language). Pasted below is my initial effort in R. I simulate two groups of data and draw linear discriminant using abline() function. Any suggestions are welcome.
set.seed(2014)
library(MASS)
library(DiscriMiner) # For scatter matrices
# Simulate bivariate normal distribution with 2 classes
mu1 <- c(2, -4)
mu2 <- c(2, 6)
rho <- 0.8
s1 <- 1
s2 <- 3
Sigma <- matrix(c(s1^2, rho * s1 * s2, rho * s1 * s2, s2^2), byrow = TRUE, nrow = 2)
n <- 50
X1 <- mvrnorm(n, mu = mu1, Sigma = Sigma)
X2 <- mvrnorm(n, mu = mu2, Sigma = Sigma)
y <- rep(c(0, 1), each = n)
X <- rbind(x1 = X1, x2 = X2)
X <- scale(X)
# Scatter matrices
B <- betweenCov(variables = X, group = y)
W <- withinCov(variables = X, group = y)
# Eigenvectors
ev <- eigen(solve(W) %*% B)$vectors
slope <- - ev[1,1] / ev[2,1]
intercept <- ev[2,1]
par(pty = "s")
plot(X, col = y + 1, pch = 16)
abline(a = slope, b = intercept, lwd = 2, lty = 2)
MY (UNFINISHED) WORK
I pasted my current solution below. The main question is how to rotate (and move) the density plot according to decision boundary. Any suggestions are still welcome.
require(ggplot2)
library(grid)
library(MASS)
# Simulation parameters
mu1 <- c(5, -9)
mu2 <- c(4, 9)
rho <- 0.5
s1 <- 1
s2 <- 3
Sigma <- matrix(c(s1^2, rho * s1 * s2, rho * s1 * s2, s2^2), byrow = TRUE, nrow = 2)
n <- 50
# Multivariate normal sampling
X1 <- mvrnorm(n, mu = mu1, Sigma = Sigma)
X2 <- mvrnorm(n, mu = mu2, Sigma = Sigma)
# Combine into data frame
y <- rep(c(0, 1), each = n)
X <- rbind(x1 = X1, x2 = X2)
X <- scale(X)
X <- data.frame(X, class = y)
# Apply lda()
m1 <- lda(class ~ X1 + X2, data = X)
m1.pred <- predict(m1)
# Compute intercept and slope for abline
gmean <- m1$prior %*% m1$means
const <- as.numeric(gmean %*% m1$scaling)
z <- as.matrix(X[, 1:2]) %*% m1$scaling - const
slope <- - m1$scaling[1] / m1$scaling[2]
intercept <- const / m1$scaling[2]
# Projected values
LD <- data.frame(predict(m1)$x, class = y)
# Scatterplot
p1 <- ggplot(X, aes(X1, X2, color=as.factor(class))) +
geom_point() +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(limits=c(-5, 5)) +
scale_y_continuous(limits=c(-5, 5)) +
geom_abline(intecept = intercept, slope = slope)
# Density plot
p2 <- ggplot(LD, aes(x = LD1)) +
geom_density(aes(fill = as.factor(class), y = ..scaled..)) +
theme_bw() +
theme(legend.position = "none")
grid.newpage()
print(p1)
vp <- viewport(width = .7, height = 0.6, x = 0.5, y = 0.3, just = c("centre"))
pushViewport(vp)
print(p2, vp = vp)
Basically you need to project the data along the direction of the classifier, plot a histogram for each class, and then rotate the histogram so its x axis is parallel to the classifier. Some trial-and-error with scaling the histogram is needed in order to get a nice result. Here's an example of how to do it in Matlab, for the naive classifier (difference of class' means). For the Fisher classifier it is of course similar, you just use a different classifier w. I changed the parameters from your code so the plot is more similar to the one you gave.
rng('default')
n = 1000;
mu1 = [1,3]';
mu2 = [4,1]';
rho = 0.3;
s1 = .8;
s2 = .5;
Sigma = [s1^2,rho*s1*s1;rho*s1*s1, s2^2];
X1 = mvnrnd(mu1,Sigma,n);
X2 = mvnrnd(mu2,Sigma,n);
X = [X1; X2];
Y = [zeros(n,1);ones(n,1)];
scatter(X1(:,1), X1(:,2), [], 'b' );
hold on
scatter(X2(:,1), X2(:,2), [], 'r' );
axis equal
m1 = mean(X(1:n,:))';
m2 = mean(X(n+1:end,:))';
plot(m1(1),m1(2),'bx','markersize',18)
plot(m2(1),m2(2),'rx','markersize',18)
plot([m1(1),m2(1)], [m1(2),m2(2)],'g')
%% classifier taking only means into account
w = m2 - m1;
w = w / norm(w);
% project data onto w
X1_projected = X1 * w;
X2_projected = X2 * w;
% plot histogram and rotate it
angle = 180/pi * atan(w(2)/w(1));
[hy1, hx1] = hist(X1_projected);
[hy2, hx2] = hist(X2_projected);
hy1 = hy1 / sum(hy1); % normalize
hy2 = hy2 / sum(hy2); % normalize
scale = 4; % set manually
h1 = bar(hx1, scale*hy1,'b');
h2 = bar(hx2, scale*hy2,'r');
set([h1, h2],'ShowBaseLine','off')
% rotate around the origin
rotate(get(h1,'children'),[0,0,1], angle, [0,0,0])
rotate(get(h2,'children'),[0,0,1], angle, [0,0,0])

Fine tuning stat_ellipse() in ggplot2

I want to create a scatter plot of bivariate normal distribution with 95% "exact" confidence ellipse.
library(mvtnorm)
library(ggplot2)
set.seed(1)
n <- 1e3
c95 <- qchisq(.95, df=2)
rho <- 0.8 #correlation
Sigma <- matrix(c(1, rho, rho, 1), 2, 2) # Covariance matrix
I generated 1000 observations from bivariate normal with mean zero and variance =Sigma
x <- rmvnorm(n, mean=c(0, 0), Sigma)
z <- p95 <- rep(NA, n)
for(i in 1:n){
z[i] <- x[i, ] %*% solve(Sigma, x[i, ])
p95[i] <- (z[i] < c95)
}
We can draw the 95% confidence ellipse on the top of scatterplot of the generated data with ease using stat_ellipse. Resulting figure is completely satisfactory until you note that the several of the red points lie inside the confidence ellipse. I guess that this discrepancy comes from the estimation of some parameters, and disappears as the sample size gets larger.
data <- data.frame(x, z, p95)
p <- ggplot(data, aes(X1, X2)) + geom_point(aes(colour = p95))
p + stat_ellipse(type = "norm")
Is there any way to fine tune stat_ellipse() so that it depicts the "exact" confidence ellipse as shown in the figure below which was created using "hand-made" ellips function?
ellips <- function(center = c(0,0), c=c95, rho=-0.8, npoints = 100){
t <- seq(0, 2*pi, len=npoints)
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
a <- sqrt(c*eigen(Sigma)$values[2])
b <- sqrt(c*eigen(Sigma)$values[1])
x <- center[1] + a*cos(t)
y <- center[2] + b*sin(t)
X <- cbind(x, y)
R <- eigen(Sigma)$vectors
data.frame(X%*%R)
}
dat <- ellips(center=c(0, 0), c=c95, rho, npoints=100)
p + geom_path(data=dat, aes(x=X1, y=X2), colour='blue')
This is not a real answer, but it might help.
By exploring stat_ellipse with the following commands,
stat_ellipse
ls(ggplot2:::StatEllipse)
ggplot2:::StatEllipse$calculate
ggplot2:::calculate_ellipse
?cov.wt
it seems that cov.wt is estimating the covariance matrix from the simulated data:
cov.wt(data[, c(1, 2)])$cov
# X1 X2
# X1 1.1120267 0.8593946
# X2 0.8593946 1.0372208
# True covariance matrix:
Sigma
# [,1] [,2]
# [1,] 1.0 0.8
# [2,] 0.8 1.0
You may consider calculating your p95 values using the estimated covariance matrix. Or just stick with your own well-executed ellipse drawing code.
The ellipse code proposed in the original question is wrong. It works when the X1 and X2 variables have a mean of 0 and a standard deviation of 1, but not in the general case.
Here is an alternative implementation, adapted from the stat_ellipse source code. It takes as argument the vector of means, the covariance matrix, the radius (computed with the confidence level for instance) and the number of segments for the shape.
calculate_ellipse <- function(center, shape, radius, segments){
# Adapted from https://github.com/tidyverse/ggplot2/blob/master/R/stat-ellipse.R
chol_decomp <- chol(shape)
angles <- (0:segments) * 2 * pi/segments
unit.circle <- cbind(cos(angles), sin(angles))
ellipse <- t(center + radius * t(unit.circle %*% chol_decomp))
colnames(ellipse) <- c("X1","X2")
as.data.frame(ellipse)
}
Let's compare both implementations:
library(ggplot2)
library(MASS) # mvrnorm function, to sample multivariate normal variables
set.seed(42)
mu = c(10, 20) # vector of means
rho = -0.7 # correlation coefficient
correlation = matrix(c(1, rho, rho, 1), 2) # correlation matrix
std = c(1, 10) # vector of standard deviations
sigma = diag(std) %*% correlation %*% diag(std) # covariance matrix
N = 1000 # number of points
confidence = 0.95 # confidence level for the ellipse
df = data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma))
radius = sqrt(2 * stats::qf(confidence, 2, Inf)) # radius of the ellipse
ellips <- function(center = c(0,0), c=c95, rho=-0.8, npoints = 100){
# Original proposal
t <- seq(0, 2*pi, len=npoints)
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
a <- sqrt(c*eigen(Sigma)$values[2])
b <- sqrt(c*eigen(Sigma)$values[1])
x <- center[1] + a*cos(t)
y <- center[2] + b*sin(t)
X <- cbind(x, y)
R <- eigen(Sigma)$vectors
data.frame(X%*%R)
}
calculate_ellipse <- function(center, shape, radius, segments){
# Adapted from https://github.com/tidyverse/ggplot2/blob/master/R/stat-ellipse.R
chol_decomp <- chol(shape)
angles <- (0:segments) * 2 * pi/segments
unit.circle <- cbind(cos(angles), sin(angles))
ellipse <- t(center + radius * t(unit.circle %*% chol_decomp))
colnames(ellipse) <- c("X1","X2")
as.data.frame(ellipse)
}
ggplot(df) +
aes(x=X1, y=X2) +
theme_bw() +
geom_point() +
geom_path(aes(color="new implementation"), data=calculate_ellipse(mu, sigma, radius, 100)) +
geom_path(aes(color="original implementation"), data=ellips(mu, confidence, rho, 100))

How can I superimpose an arbitrary parametric distribution over a histogram using ggplot?

How can I superimpose an arbitrary parametric distribution over a histogram using ggplot?
I have made an attempt based on a Quick-R example, but I don't understand where the scaling factor comes from. Is this method reasonable? How can I modify it to use ggplot?
An example overplot the normal and lognormal distributions using this method follows:
## Get a log-normalish data set: the number of characters per word in "Alice in Wonderland"
alice.raw <- readLines(con = "http://www.gutenberg.org/cache/epub/11/pg11.txt",
n = -1L, ok = TRUE, warn = TRUE,
encoding = "UTF-8")
alice.long <- paste(alice.raw, collapse=" ")
alice.long.noboilerplate <- strsplit(alice.long, split="\\*\\*\\*")[[1]][3]
alice.words <- strsplit(alice.long.noboilerplate, "[[:space:]]+")[[1]]
alice.nchar <- nchar(alice.words)
alice.nchar <- alice.nchar[alice.nchar > 0]
# Now we want to plot both the histogram and then log-normal probability dist
require(MASS)
h <- hist(alice.nchar, breaks=1:50, xlab="Characters in word", main="Count")
xfit <- seq(1, 50, 0.1)
# Plot a normal curve
yfit<-dnorm(xfit,mean=mean(alice.nchar),sd=sd(alice.nchar))
yfit <- yfit * diff(h$mids[1:2]) * length(alice.nchar)
lines(xfit, yfit, col="blue", lwd=2)
# Now plot a log-normal curve
params <- fitdistr(alice.nchar, densfun="lognormal")
yfit <- dlnorm(xfit, meanlog=params$estimate[1], sdlog=params$estimate[1])
yfit <- yfit * diff(h$mids[1:2]) * length(alice.nchar)
lines(xfit, yfit, col="red", lwd=2)
This produces the following plot:
To clarify, I would like to have counts on the y-axis, rather than a density estimate.
Have a look at stat_function()
alice.raw <- readLines(con = "http://www.gutenberg.org/cache/epub/11/pg11.txt",
n = -1L, ok = TRUE, warn = TRUE,
encoding = "UTF-8")
alice.long <- paste(alice.raw, collapse=" ")
alice.long.noboilerplate <- strsplit(alice.long, split="\\*\\*\\*")[[1]][3]
alice.words <- strsplit(alice.long.noboilerplate, "[[:space:]]+")[[1]]
alice.nchar <- nchar(alice.words)
alice.nchar <- alice.nchar[alice.nchar > 0]
dataset <- data.frame(alice.nchar = alice.nchar)
library(ggplot2)
ggplot(dataset, aes(x = alice.nchar)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm,
args = c(
mean = mean(dataset$alice.nchar),
sd = sd(dataset$alice.nchar)),
colour = "red")
If you want to have counts on the y-axis as in the example, then you'll need a function that converts the density to counts:
dnorm.count <- function(x, mean = 0, sd = 1, log = FALSE, n = 1, binwidth = 1){
n * binwidth * dnorm(x = x, mean = mean, sd = sd, log = log)
}
ggplot(dataset, aes(x = alice.nchar)) + geom_histogram(binwidth=1.6) +
stat_function(fun = dnorm.count,
args = c(
mean = mean(dataset$alice.nchar),
sd = sd(dataset$alice.nchar),
n = nrow(dataset), binwidth=1.6),
colour = "red")

Resources