Plotting the normal and binomial distribution in same plot - r

As the title indicates I am trying to plot the normal distribution and the binomial distribution in the same plot using R. My attempt can be seen below, is there any reason why my normal distribution looks so off? I have double checked the mean and standard deviation and everything looks fine.
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
barplot(y,
col = 'lightblue',
names.arg = x,
main = 'Binomial distribution, n=151, p=.803')
range <- seq(mu - dev*sigma, mu + dev*sigma, 0.01)
height <- dnorm(range, mean = mu, sd = sigma)
lines(range, height, col = 'red', lwd = 3)

barplot is just the wrong function for your case. Or if you really want to use it, you'd have to rejigger the x-axes between barplot and lines
The default for barplot is to put each height value at
head(c(barplot(y, plot = FALSE)))
# [1] 0.7 1.9 3.1 4.3 5.5 6.7
This can be changed by your choices of space and width or a combination of both
head(c(barplot(y, plot = FALSE, space = 0)))
# [1] 0.5 1.5 2.5 3.5 4.5 5.5
head(c(barplot(y, plot = FALSE, space = 0, width = 3)))
# [1] 1.5 4.5 7.5 10.5 13.5 16.5
You can just use plot to avoid dealing with those things
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
plot(x, y, type = 'h', lwd = 10, lend = 3, col = 'lightblue',
ann = FALSE, las = 1, bty = 'l', yaxs = 'i', ylim = c(0, 0.08))
title(main = sprintf('Binomial distribution, n=%s, p=%.3f', n, p))
lines(x, dnorm(x, mean = mu, sd = sigma), col = 'red', lwd = 7)
xx <- seq(min(x), max(x), length.out = 1000)
lines(xx, dnorm(xx, mean = mu, sd = sigma), col = 'white')
The "bars" in this figure depend on your choice of lwd and your device dimensions, but if you need finer control over that, you can use rect which takes a little more work.
w <- 0.75
plot(x, y, type = 'n', ann = FALSE, las = 1, bty = 'l', yaxs = 'i', ylim = c(0, 0.08))
rect(x - w / 2, 0, x + w / 2, y, col = 'lightblue')
lines(xx, dnorm(xx, mean = mu, sd = sigma), col = 'red', lwd = 3)
title(main = sprintf('Binomial distribution, n=%s, p=%.3f', n, p))

You can use the ggplot2 package
library(ggplot2)
n <- 151
p <- 0.2409
mean <- n*p
sd <- sqrt(n*p*(1-p))
binwidth <- 0.005
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
df <- cbind.data.frame(x, y)
ggplot(df, aes(x = x, y = y)) +
geom_bar(stat="identity", fill = 'dodgerblue3')+
labs(title = "Binomial distribution, n=151, p=.803",
x = "",
y = "") +
theme_minimal()+
# Create normal curve, akousting for number of observations and binwidth
stat_function(
fun = function(x, mean, sd, n, bw){
dnorm(x = x, mean = mean, sd = sd)
}, col = "red", size=I(1.4),
args = c(mean = mean, sd = sd, n = n, bw = binwidth))

You could do it using the ggplot2 package (I was surprised by the normal distribution but replacing geom_line by geom_point convinced me that is has this form (is the variance too high ?)) :
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
z <- dnorm(x = qnorm(p = seq(0,1, length.out = length(x)), mean = mu, sd = sigma), mean = mu, sd = sigma)
library(magrittr)
library(ggplot2)
data.frame(x, y, z) %>%
ggplot(aes(x = x)) +
geom_col(aes(y = y)) +
geom_line(aes(x = x, y = z, colour = "red"),
show.legend = FALSE)

Related

How to plot dot plot in r with a point representing the mean and error bars

I have a data set with a continuous variable (column named PI) and 3 categories (column named Size). I want to plot a dot plot for all PI values for each category and another point representing the mean and a SE error bar. Something which looks a bit like this from this Ross et al paper:
This is what I have so far (I have all of my points and my mean point plotted) but it isn't looking like the picture from the paper
p<-ggplot(Index,aes(x=Size,y=PI)) + geom_dotplot(binaxis='y',stackdir='center',dotsize=0.5)
p
p + stat_summary(fun.y=mean,geom="point",shape=18,size=3,colour=c("red","blue","green"))
#to add SE bars
p + stat summary(fun.data=mean_sdl,fun.args=list(multi=1),geom="pointrange",colour="red")
p + stat_summary(fun.data=mean_sdl, fun.args = list(mult=1),
geom="pointrange", color="red")
library(ggplot2)
`library(Hmisc)`
library(gplots)
library(dplyr)
plot_data<-Index %>%
group_by(Size) %>%
summarise(mean=mean(PI),sd=sd(PI))
m<-plot_data$mean
s<-plot_data$sd
plot(plot_data$Size,plot_data$mean,ylim=range(0:5),border="white",xlab="Body Size",ylab="Performance Index", ylim(c(1,3)))
points(jitter(as.numeric(Index$Size)),Index$PI,col=as.integer(Index$Size)+1,pch=19)
segments(x0=1:3,x1=1:3,y0=m-s,y1=m+s,lwd=2)
segments(x0=1:3-0.2,x1=1:3+0.2,y0=m,y1=m,lwd=2)
segments(x0=1:3-0.1,x1=1:3+0.1,y0=m+s,y1=m+s,lwd=2)
segments(x0=1:3-0.1,x1=1:3+0.1,y0=m-s,y1=m-s,lwd=2)
Since we don't have your data, we'll use the pre-canned dataset warpbreaks to illustrate this
data("warpbreaks")
Your attempt was a decent first try
ggplot(warpbreaks, aes(x = tension, y = breaks)) +
geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.5) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1),
geom = "pointrange", colour = "red")
However, I think we can actually end up with a better outcome from "manually" doing it using base graphics
library(dplyr)
plot_data <- warpbreaks %>%
group_by(tension) %>%
summarise(mean = mean(breaks), sd = sd(breaks))
m <- plot_data$mean
s <- plot_data$sd
plot(plot_data$tension, plot_data$mean, ylim = range(warpbreaks$breaks),
border = "white", xlab = "Tension", ylab = "Breaks")
points(jitter(as.numeric(warpbreaks$tension)), warpbreaks$breaks,
col = as.integer(warpbreaks$tension) + 1, pch = 19)
segments(x0 = 1:3, x1 = 1:3, y0 = m - s, y1 = m + s, lwd = 2)
segments(x0 = 1:3 - 0.2, x1 = 1:3 + 0.2, y0 = m, y1 = m, lwd = 2)
segments(x0 = 1:3 - 0.1, x1 = 1:3 + 0.1, y0 = m + s, y1 = m + s, lwd = 2)
segments(x0 = 1:3 - 0.1, x1 = 1:3 + 0.1, y0 = m - s, y1 = m - s, lwd = 2)

How to draw multiple contours in the same

I am trying to get two contours in the same plot using ggplot2 in R.
Here is a reproducible example:
library(MASS)
library(ggplot2)
# first contour
m <- c(.0, -.0)
sigma <- matrix(c(1,.5,.5,1), nrow=2)
data.grid <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp <- cbind(data.grid, prob = mvtnorm::dmvnorm(data.grid, mean = m, sigma = sigma))
plot1 <- ggplot(q.samp, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'green')
# second contour
m1 <- c(1, 1)
sigma1 <- matrix(c(1,-.5,-.5,1), nrow=2)
set.seed(10)
data.grid1 <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp1 <- cbind(data.grid1, prob = mvtnorm::dmvnorm(data.grid1, mean = m1, sigma = sigma1))
plot2 <- ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red')
However, trying plot1 + plot2 also does not work. Is there a way to get the two contours on the same plot.
What about including another stat_contour with different data?
ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red') +
stat_contour(data = q.samp, aes(x = s.1, y = s.2, z = prob), color = 'green')

Generating multiple plots containing functions in ggplot2

I am trying to make a composite plot in R using the packages ggplot2 and ggpubr.
I have no problem in making the composite plots except each plot has a normal distribution curve specific to that dataset. When I generate the composite plot, both plots have the same curve, that of the last dataset.
How can I generate the composite plot with each plot having its own specific normal distribution curve?
CODE AND OUTPUT PLOTS
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
m <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
dnorm_C <- function(x){
norm_C <- dnorm(x, m, sd)
return(norm_C)
}
e = 1
dnorm_one_sd_C <- function(x){
norm_one_sd_C <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)
}
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_C, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_C, colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
m <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
dnorm_U <- function(x){
norm_U <- dnorm(x, m, sd)
return(norm_U)
}
e = 2
dnorm_one_sd_U <- function(x){
norm_one_sd_U <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)
}
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_U, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_U, colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
As you can see in the composite plot, the first one has taken the normal distribution curve of the second plot rather than its own one from my initial plot (Plot 1).
UPDATE
Variable "e" refers to the shaded area which is related to the distribution curve.
m = mean of the dataset
sd = standard deviation of the dataset
m and sd are used to generate the normal distribution curves
SOLVED
By inserting the function in full into the stat_function section of the ggplot2 code, this has worked
i.e:
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
mean <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
e = 1
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_C <- dnorm(x, mean, sd)
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
mean <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
e = 2
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_U <- dnorm(x, mean, sd)
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)

Creating multiple Bias-Variance Tradeoff plots in R

I am relatively new to R. I would like to know how to create the following graphic. I have been stuck for over two hours.
Suppose the red line - the true relationship - is y = x^2.
Suppose I want to fit 100 linear models to 100 random samples (blue lines).
How would I do this? So far, this is what I have:
# create the true relationship
f <- function(x) x^2 # true model
x <- seq(0, 1, by = 0.01)
y <- f(x)
# plot the true function
plot(x, y, type = "l", col = "red", ylim = c(-0.2, 1.2), lwd = 4)
# fit 100 models
set.seed(1)
for (i in 1:100)
{
errors <- rnorm(n, 0, sigma) # random errors, have standard deviation sigma
obs_y <- f(obs_x) + errors # observed y = true_model + error
model <- lm(obs_y ~ obs_x) # fit a linear model to the observed values
points(obs_x[i], mean(obs_y[i]), col = "green") # mean values
abline(model, col = "purple") # plot the fitted model
}
which creates this:
in which the green dots are definitely off...
and I don't have the black dots...
Thanks!
Here's your code after several adjustments:
f <- function(x) x^2
x <- seq(0, 1, by = 0.05)
n <- length(x)
sigma <- 0.05
y <- f(x)
plot(x, y, type = "l", col = "red", ylim = c(-0.2, 1.2), lwd = 2)
fitted <- ys <- matrix(0, ncol = n, nrow = 100)
set.seed(1)
for (i in 1:100)
{
errors <- rnorm(n, 0, sigma)
ys[i, ] <- obs_y <- f(x) + errors
model <- lm(obs_y ~ x)
fitted[i, ] <- fitted(model)
abline(model, col = "purple", lwd = 0.1)
}
points(x = rep(x, each = 100), y = ys, cex = 0.1)
points(x = x, y = colMeans(fitted), col = 'green', cex = 0.3)
With ggplot, e.g.
library(ggplot2)
x <- seq(0, 1, 0.1)
y <- x^2
dat <- as.data.frame(do.call(rbind, lapply(1:100, function(i){
y_err <- y + rnorm(1, 0, 0.06)
l <- lm(y_err ~ x)$coefficients
cbind(samp = i, intercept = l[1], slope = l[2], t(x * l[2] + l[1]), t(y_err))
})), row.names = 1:100)
ggplot() +
geom_abline(aes(intercept = dat$intercept, slope = dat$slope)) +
geom_point(aes(x = rep(x, each = 100), y = unlist(dat[, 15:25])), alpha = 0.5) +
geom_line(aes(x = x, y = y), color = "red", lwd = 2) +
geom_point(aes(x = x, y = colMeans(dat[, 4:14])), color = "green")

How to plot a plane from an equation in R

I've been tinkering with the RGL package to figure out how to plot a plane from an equation in R, to no avail.
For example, I would like to visualize the following plane:
1x + 0y + 0z = 2
0x + 1y + 0z = 3
0x + 0y + 1z = 4
It seems the rgl's planes3d function only adds a plane to an existing 3D plot.
Here is a simple example:
library(rgl)
# Create some dummy data
dat <- replicate(2, 1:3)
# Initialize the scene, no data plotted
plot3d(dat, type = 'n', xlim = c(-1, 1), ylim = c(-1, 1), zlim = c(-3, 3), xlab = '', ylab = '', zlab = '')
# Add planes
planes3d(1, 1, 1, 0, col = 'red', alpha = 0.6)
planes3d(1, -1, 1, 0, col = 'orange', alpha = 0.6)
planes3d(1, -1, -1, -0.8, col = 'blue', alpha = 0.6)
Which gives the following result.
As you can see, it is quite hard to understand the spatial structure from such a plot, but the interactivity of course helps. Alternatively you can plot the planes as wireframes, which will sometimes help in understanding the spatial structure:
# Evaluate planes
n <- 20
x <- y <- seq(-1, 1, length = n)
region <- expand.grid(x = x, y = y)
z1 <- matrix(-(region$x + region$y), n, n)
z2 <- matrix(-region$x + region$y, n, n)
z3 <- matrix(region$x - region$y - 0.8, n, n)
surface3d(x, y, z1, back = 'line', front = 'line', col = 'red', lwd = 1.5, alpha = 0.4)
surface3d(x, y, z2, back = 'line', front = 'line', col = 'orange', lwd = 1.5, alpha = 0.4)
surface3d(x, y, z3, back = 'line', front = 'line', col = 'blue', lwd = 1.5, alpha = 0.4)
axes3d()
If you want to plot, e.g., a plane defined by the equation 2*x+y-z-3=0, you could do this in the following way:
x <- y <- seq(-10, 10, length= 30)
f <- function(x,y){ z <- x*2 + y -3 }
z <- outer(x,y,f)
persp(x, y, z, theta = 30, phi = 30, expand = 0.5, col = "lightblue")
For more examples see ?persp.

Resources