Creating multiple Bias-Variance Tradeoff plots in R - r

I am relatively new to R. I would like to know how to create the following graphic. I have been stuck for over two hours.
Suppose the red line - the true relationship - is y = x^2.
Suppose I want to fit 100 linear models to 100 random samples (blue lines).
How would I do this? So far, this is what I have:
# create the true relationship
f <- function(x) x^2 # true model
x <- seq(0, 1, by = 0.01)
y <- f(x)
# plot the true function
plot(x, y, type = "l", col = "red", ylim = c(-0.2, 1.2), lwd = 4)
# fit 100 models
set.seed(1)
for (i in 1:100)
{
errors <- rnorm(n, 0, sigma) # random errors, have standard deviation sigma
obs_y <- f(obs_x) + errors # observed y = true_model + error
model <- lm(obs_y ~ obs_x) # fit a linear model to the observed values
points(obs_x[i], mean(obs_y[i]), col = "green") # mean values
abline(model, col = "purple") # plot the fitted model
}
which creates this:
in which the green dots are definitely off...
and I don't have the black dots...
Thanks!

Here's your code after several adjustments:
f <- function(x) x^2
x <- seq(0, 1, by = 0.05)
n <- length(x)
sigma <- 0.05
y <- f(x)
plot(x, y, type = "l", col = "red", ylim = c(-0.2, 1.2), lwd = 2)
fitted <- ys <- matrix(0, ncol = n, nrow = 100)
set.seed(1)
for (i in 1:100)
{
errors <- rnorm(n, 0, sigma)
ys[i, ] <- obs_y <- f(x) + errors
model <- lm(obs_y ~ x)
fitted[i, ] <- fitted(model)
abline(model, col = "purple", lwd = 0.1)
}
points(x = rep(x, each = 100), y = ys, cex = 0.1)
points(x = x, y = colMeans(fitted), col = 'green', cex = 0.3)

With ggplot, e.g.
library(ggplot2)
x <- seq(0, 1, 0.1)
y <- x^2
dat <- as.data.frame(do.call(rbind, lapply(1:100, function(i){
y_err <- y + rnorm(1, 0, 0.06)
l <- lm(y_err ~ x)$coefficients
cbind(samp = i, intercept = l[1], slope = l[2], t(x * l[2] + l[1]), t(y_err))
})), row.names = 1:100)
ggplot() +
geom_abline(aes(intercept = dat$intercept, slope = dat$slope)) +
geom_point(aes(x = rep(x, each = 100), y = unlist(dat[, 15:25])), alpha = 0.5) +
geom_line(aes(x = x, y = y), color = "red", lwd = 2) +
geom_point(aes(x = x, y = colMeans(dat[, 4:14])), color = "green")

Related

Plotting the normal and binomial distribution in same plot

As the title indicates I am trying to plot the normal distribution and the binomial distribution in the same plot using R. My attempt can be seen below, is there any reason why my normal distribution looks so off? I have double checked the mean and standard deviation and everything looks fine.
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
barplot(y,
col = 'lightblue',
names.arg = x,
main = 'Binomial distribution, n=151, p=.803')
range <- seq(mu - dev*sigma, mu + dev*sigma, 0.01)
height <- dnorm(range, mean = mu, sd = sigma)
lines(range, height, col = 'red', lwd = 3)
barplot is just the wrong function for your case. Or if you really want to use it, you'd have to rejigger the x-axes between barplot and lines
The default for barplot is to put each height value at
head(c(barplot(y, plot = FALSE)))
# [1] 0.7 1.9 3.1 4.3 5.5 6.7
This can be changed by your choices of space and width or a combination of both
head(c(barplot(y, plot = FALSE, space = 0)))
# [1] 0.5 1.5 2.5 3.5 4.5 5.5
head(c(barplot(y, plot = FALSE, space = 0, width = 3)))
# [1] 1.5 4.5 7.5 10.5 13.5 16.5
You can just use plot to avoid dealing with those things
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
plot(x, y, type = 'h', lwd = 10, lend = 3, col = 'lightblue',
ann = FALSE, las = 1, bty = 'l', yaxs = 'i', ylim = c(0, 0.08))
title(main = sprintf('Binomial distribution, n=%s, p=%.3f', n, p))
lines(x, dnorm(x, mean = mu, sd = sigma), col = 'red', lwd = 7)
xx <- seq(min(x), max(x), length.out = 1000)
lines(xx, dnorm(xx, mean = mu, sd = sigma), col = 'white')
The "bars" in this figure depend on your choice of lwd and your device dimensions, but if you need finer control over that, you can use rect which takes a little more work.
w <- 0.75
plot(x, y, type = 'n', ann = FALSE, las = 1, bty = 'l', yaxs = 'i', ylim = c(0, 0.08))
rect(x - w / 2, 0, x + w / 2, y, col = 'lightblue')
lines(xx, dnorm(xx, mean = mu, sd = sigma), col = 'red', lwd = 3)
title(main = sprintf('Binomial distribution, n=%s, p=%.3f', n, p))
You can use the ggplot2 package
library(ggplot2)
n <- 151
p <- 0.2409
mean <- n*p
sd <- sqrt(n*p*(1-p))
binwidth <- 0.005
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
df <- cbind.data.frame(x, y)
ggplot(df, aes(x = x, y = y)) +
geom_bar(stat="identity", fill = 'dodgerblue3')+
labs(title = "Binomial distribution, n=151, p=.803",
x = "",
y = "") +
theme_minimal()+
# Create normal curve, akousting for number of observations and binwidth
stat_function(
fun = function(x, mean, sd, n, bw){
dnorm(x = x, mean = mean, sd = sd)
}, col = "red", size=I(1.4),
args = c(mean = mean, sd = sd, n = n, bw = binwidth))
You could do it using the ggplot2 package (I was surprised by the normal distribution but replacing geom_line by geom_point convinced me that is has this form (is the variance too high ?)) :
n <- 151
p <- 0.2409
dev <- 4
mu <- n*p
sigma <- sqrt(n*p*(1 - p))
xmin <- round(max(mu - dev*sigma,0));
xmax <- round(min(mu + dev*sigma,n))
x <- seq(xmin, xmax)
y <- dbinom(x,n,p)
z <- dnorm(x = qnorm(p = seq(0,1, length.out = length(x)), mean = mu, sd = sigma), mean = mu, sd = sigma)
library(magrittr)
library(ggplot2)
data.frame(x, y, z) %>%
ggplot(aes(x = x)) +
geom_col(aes(y = y)) +
geom_line(aes(x = x, y = z, colour = "red"),
show.legend = FALSE)

Changing the color of the contour curve

I'd like to change the color of the contour curve from z variable. My MWE can be seen below.
library(ggplot2)
library(tidyverse)
rosenbrock <- function(x){
d <- length(x)
out <- 0
for(i in 1 : (d - 1)){
out <- out + 100 * ( x[i]^2 - x[i + 1] )^2 + (x[i] - 1)^2
}
out
}
set.seed(1)
coord <- matrix(runif(2000, -50, 50), byrow = TRUE, ncol = 2)
graph <- apply(coord, 1, rosenbrock)
results <- data.frame(x = coord[, 1], y = coord[, 2], z = graph) %>%
arrange(x, y)
set.seed(2020)
n <- 5
x1 <- matrix(c(round(rnorm(n, -12, 5), 2), 0, round(rnorm(n, -6, 5), 2), 0), byrow = F, ncol = 2)
y1 <- apply(x1, 1, function(x) rosenbrock(x))
test_points <- data.frame(x = x1[, 1], y = x1[, 2],
z = y1)
results %>%
ggplot(aes(x = x, y = y, z = z)) +
stat_density2d() +
geom_point(data = test_points, aes(colour = z), size = 2.0, shape = 19) +
scale_colour_gradientn(colours=rainbow(4)) +
theme_light() +
labs(colour = 'Fitness')
Something like this?
results %>%
ggplot(aes(x = x, y = y, z = z)) +
stat_density2d(aes(fill = stat(level)), geom = "polygon") +
geom_point(data = test_points, aes(colour = z), size = 2.0, shape = 19) +
scale_colour_gradientn(colours=rainbow(4)) +
theme_light() +
labs(colour = 'Fitness')
The last few examples at https://ggplot2.tidyverse.org/reference/geom_density_2d.html might be what you're looking for

How to draw multiple contours in the same

I am trying to get two contours in the same plot using ggplot2 in R.
Here is a reproducible example:
library(MASS)
library(ggplot2)
# first contour
m <- c(.0, -.0)
sigma <- matrix(c(1,.5,.5,1), nrow=2)
data.grid <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp <- cbind(data.grid, prob = mvtnorm::dmvnorm(data.grid, mean = m, sigma = sigma))
plot1 <- ggplot(q.samp, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'green')
# second contour
m1 <- c(1, 1)
sigma1 <- matrix(c(1,-.5,-.5,1), nrow=2)
set.seed(10)
data.grid1 <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp1 <- cbind(data.grid1, prob = mvtnorm::dmvnorm(data.grid1, mean = m1, sigma = sigma1))
plot2 <- ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red')
However, trying plot1 + plot2 also does not work. Is there a way to get the two contours on the same plot.
What about including another stat_contour with different data?
ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red') +
stat_contour(data = q.samp, aes(x = s.1, y = s.2, z = prob), color = 'green')

Generating multiple plots containing functions in ggplot2

I am trying to make a composite plot in R using the packages ggplot2 and ggpubr.
I have no problem in making the composite plots except each plot has a normal distribution curve specific to that dataset. When I generate the composite plot, both plots have the same curve, that of the last dataset.
How can I generate the composite plot with each plot having its own specific normal distribution curve?
CODE AND OUTPUT PLOTS
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
m <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
dnorm_C <- function(x){
norm_C <- dnorm(x, m, sd)
return(norm_C)
}
e = 1
dnorm_one_sd_C <- function(x){
norm_one_sd_C <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)
}
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_C, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_C, colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
m <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
dnorm_U <- function(x){
norm_U <- dnorm(x, m, sd)
return(norm_U)
}
e = 2
dnorm_one_sd_U <- function(x){
norm_one_sd_U <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)
}
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_U, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_U, colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
As you can see in the composite plot, the first one has taken the normal distribution curve of the second plot rather than its own one from my initial plot (Plot 1).
UPDATE
Variable "e" refers to the shaded area which is related to the distribution curve.
m = mean of the dataset
sd = standard deviation of the dataset
m and sd are used to generate the normal distribution curves
SOLVED
By inserting the function in full into the stat_function section of the ggplot2 code, this has worked
i.e:
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
mean <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
e = 1
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_C <- dnorm(x, mean, sd)
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
mean <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
e = 2
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_U <- dnorm(x, mean, sd)
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)

How to plot a plane from an equation in R

I've been tinkering with the RGL package to figure out how to plot a plane from an equation in R, to no avail.
For example, I would like to visualize the following plane:
1x + 0y + 0z = 2
0x + 1y + 0z = 3
0x + 0y + 1z = 4
It seems the rgl's planes3d function only adds a plane to an existing 3D plot.
Here is a simple example:
library(rgl)
# Create some dummy data
dat <- replicate(2, 1:3)
# Initialize the scene, no data plotted
plot3d(dat, type = 'n', xlim = c(-1, 1), ylim = c(-1, 1), zlim = c(-3, 3), xlab = '', ylab = '', zlab = '')
# Add planes
planes3d(1, 1, 1, 0, col = 'red', alpha = 0.6)
planes3d(1, -1, 1, 0, col = 'orange', alpha = 0.6)
planes3d(1, -1, -1, -0.8, col = 'blue', alpha = 0.6)
Which gives the following result.
As you can see, it is quite hard to understand the spatial structure from such a plot, but the interactivity of course helps. Alternatively you can plot the planes as wireframes, which will sometimes help in understanding the spatial structure:
# Evaluate planes
n <- 20
x <- y <- seq(-1, 1, length = n)
region <- expand.grid(x = x, y = y)
z1 <- matrix(-(region$x + region$y), n, n)
z2 <- matrix(-region$x + region$y, n, n)
z3 <- matrix(region$x - region$y - 0.8, n, n)
surface3d(x, y, z1, back = 'line', front = 'line', col = 'red', lwd = 1.5, alpha = 0.4)
surface3d(x, y, z2, back = 'line', front = 'line', col = 'orange', lwd = 1.5, alpha = 0.4)
surface3d(x, y, z3, back = 'line', front = 'line', col = 'blue', lwd = 1.5, alpha = 0.4)
axes3d()
If you want to plot, e.g., a plane defined by the equation 2*x+y-z-3=0, you could do this in the following way:
x <- y <- seq(-10, 10, length= 30)
f <- function(x,y){ z <- x*2 + y -3 }
z <- outer(x,y,f)
persp(x, y, z, theta = 30, phi = 30, expand = 0.5, col = "lightblue")
For more examples see ?persp.

Resources