Scaling stat_function in R - r

I am taking a random sample of 30 data points from the standard normal distribution and plotting the resulting histogram in R. I would like to show an overlapping normal distribution that illustrates how the sample distribution is close to the population distribution. However, I can't figure out how to scale the normal curve. Here is what I have so far in R:
library(ggplot2)
n <- 30
set.seed(42)
X <- rnorm(n, mean = 0, sd = 1)
X <- as.data.frame(X)
ggplot(X, aes(x = X)) +
geom_histogram(bins = 6) +
stat_function(fun = dnorm, args = list(
mean = 0, sd = 1
))
How do I vertically stretch the PDF of the normal distribution to account for n = 30?

A) Using frequency as the y-axis in the histogram
I have one solution in the function rcompanion::plotNormalHistogram
n <- 30
set.seed(42)
X <- rnorm(n, mean = 0, sd = 1)
library(rcompanion)
plotNormalHistogram(X)
I think you are looking for the scenario with the default prob=FALSE. There, I extract some information about the counts and density from the hist() function, and use this Factor to stretch the normal curve vertically.
I don't know how to do the equivalent in ggplot2, but I would suspect that there is a way.
You can just use library(rcompanion); plotNormalHistogram to see the code.
B) Using density as the y-axis in the histogram
library(ggplot2)
n <- 30
set.seed(42)
X <- rnorm(n, mean = 0, sd = 1)
X <- as.data.frame(X)
ggplot(X, aes(x=X)) +
geom_histogram(aes(y = ..density..), bins=6) +
stat_function(fun = dnorm, args = list(
mean = 0, sd = 1))

Related

ggplot2: Plot two different Densities in the same Plot of the same Variable before and after a Cutoff

My goal is to plot two different densities in the same plot of the same variable. I want to do this as it is common to show robustness of the forcing variable (here z) in a Regression Discontinuity Design. In the code below, I got it working however I do not want the density to be plotted before the cutoff (here 0) if it the key is "above" and vice-versa. Also, the graph should not just be hidden because of the smoothing. It should start computing the density just until (or start) the cutoff.
library(ggplot2)
x <- rnorm(1000, mean = 0)
y <- rnorm(500, mean = 2)
z <- append(x,y)
d <- tibble(value = z, key = ifelse(z <= 0, "below", "above"))
ggplot(d) +
geom_density(aes(z, group = key)) +
geom_vline(aes(xintercept = 0))
Does anybody know how to implement this? For linear regressions I got it working, but with geom_density() it plots the other side of the cutoff as well and smoothes it.
Thanks in advance for your help.
You can use trim = TRUE in geom_density to only calculate density over the range of values in the data:
library(ggplot2)
library(dplyr)
x <- rnorm(1000, mean = 0)
y <- rnorm(500, mean = 2)
z <- append(x,y)
d <- tibble(value = z, key = ifelse(z <= 0, "below", "above"))
ggplot(d) +
# added fill for easier discrimination
geom_density(aes(value, group = key, fill = key),
alpha = 0.5, trim = TRUE) +
geom_vline(aes(xintercept = 0), lty = 2, colour = 'red')

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes zscore breakpoints, calculates the fraction of data between each breakpoint, and colors each breakpoint section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between brekpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) + geom_area() +
scale_x_continuous(breaks= c(seq(-5,5,1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata(vector of how much data is in section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct data between brekpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
geom_area(aes(fill = standard_normal_sections)) +
geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
vjust = 1, size = 8 / .pt, nudge_y = -.01) +
scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))

Make ggplot with regression line and normal distribution overlay

I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# Probability density function of a normal logistic distribution
pdfDeltaFun <- function(x) {
prob = (exp(x)/(1 + exp(x))^2)
return(prob)
}
# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
geom_point() +
geom_line() +
stat_function(fun = pdfDeltaFun)+
coord_flip()
I think this comes pretty close to the first illustration you give. If this is a thing you don't need to repeat many times, it is probably best to compute the density curves prior to plotting and use a seperate dataframe to plot these.
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# For every row in `df`, compute a rotated normal density centered at `y` and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
mu <- df$y[i]
range <- mu + c(-3, 3)
seq <- seq(range[1], range[2], length.out = 100)
data.frame(
x = -1 * dnorm(seq, mean = mu) + df$x[i],
y = seq,
grp = i
)
})
# Combine above densities in one data.frame
curves <- do.call(rbind, curves)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
# The path draws the curve
geom_path(data = curves, aes(group = grp)) +
# The polygon does the shading. We can use `oob_squish()` to set a range.
geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)),group = grp))
The second illustration is pretty close to your code. I simplified your density function by the standard normal density function and added some extra paramters to stat function:
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
stat_function(fun = dnorm,
aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
xlim = range(df$y)) +
# We fill with a polygon, squishing the y-range
stat_function(fun = dnorm, geom = "polygon",
aes(x = after_stat(-y * 4 - 5),
y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
xlim = range(df$y))

Filling parts of a contour plot in R

I have made a contour plot in R with the following code:
library(mvtnorm)
# Define the parameters for the multivariate normal distribution
mu = c(0,0)
sigma = matrix(c(1,0.2,0.2,3),nrow = 2)
# Make a grid in the x-y plane centered in mu, +/- 3 standard deviations
xygrid = expand.grid(x = seq(from = mu[1]-3*sigma[1,1], to = mu[1]+3*sigma[1,1], length.out = 100),
y = seq(from = mu[2]-3*sigma[2,2], to = mu[2]+3*sigma[2,2], length.out = 100))
# Use the mvtnorm library to calculate the multivariate normal density for each point in the grid
distribution = as.matrix(dmvnorm(x = xygrid, mean = mu, sigma = sigma))
# Plot contours
df = as.data.frame(cbind(xygrid, distribution))
myPlot = ggplot() + geom_contour(data = df,geom="polygon",aes( x = x, y = y, z = distribution))
myPlot
I want to illustrate cumulative probability by shading/colouring certain parts of the plot, for instance everything in the region {x<0, y<0} (or any other self defined region).
Is there any way of achieving this in R with ggplot?
So you are able to get the coordinates used to draw the circles in the plot using ggplot_build. Subsequently you could try to use these coordinates in combination with geom_polygon to shade a particular region. My best try:
library(dplyr)
data <- ggplot_build(myPlot)$data[[1]]
xCoor <- 0
yCoor <- 0
df <- data %>% filter(group == '-1-001', x <= xCoor, y <= yCoor) %>% select(x,y)
# Insert the [0,0] coordinate in the right place
index <- which.max(abs(diff(rank(df$y))))
df <- rbind( df[1:index,], data.frame(x=xCoor, y=yCoor), df[(index+1):nrow(df),] )
myPlot + geom_polygon(data = df, aes(x=x, y=y), fill = 'red', alpha = 0.5)
As you can see it's not perfect because the [x,0] and [0,y] coordinates are not included in the data, but it's a start.

ggplot plot 2d probability density function on top of points on ggplot

I have the following example:
require(mvtnorm)
require(ggplot2)
set.seed(1234)
xx <- data.frame(rmvt(100, df = c(13, 13)))
ggplot(data = xx, aes(x = X1, y= X2)) + geom_point() + geom_density2d()
Here is what I get:
However, I would like to get the density contour from the mutlivariate t density given by the dmvt function. How do I tweak geom_density2d to do that?
This is not an easy question to answer: because the contours need to be calculated and the ellipse drawn using the ellipse package.
Done with elliptical t-densities to illustrate the plotting better.
nu <- 5 ## this is the degrees of freedom of the multivariate t.
library(mvtnorm)
library(ggplot2)
sig <- matrix(c(1, 0.5, 0.5, 1), ncol = 2) ## this is the sigma parameter for the multivariate t
xx <- data.frame( rmvt(n = 100, df = c(nu, nu), sigma = sig)) ## generating the original sample
rtsq <- rowSums(x = matrix(rt(n = 2e6, df = nu)^2, ncol = 2)) ## generating the sample for the ellipse-quantiles. Note that this is a cumbersome calculation because it is the sum of two independent t-squared random variables with the same degrees of freedom so I am using simulation to get the quantiles. This is the sample from which I will create the quantiles.
g <- ggplot( data = xx
, aes( x = X1
, y = X2
)
) + geom_point(colour = "red", size = 2) ## initial setup
library(ellipse)
for (i in seq(from = 0.01, to = 0.99, length.out = 20)) {
el.df <- data.frame(ellipse(x = sig, t = sqrt(quantile(rtsq, probs = i)))) ## create the data for the given quantile of the ellipse.
names(el.df) <- c("x", "y")
g <- g + geom_polygon(data=el.df, aes(x=x, y=y), fill = NA, linetype=1, colour = "blue") ## plot the ellipse
}
g + theme_bw()
This yields:
I still have a question: how does one reduce the size of the plotting ellispe lines?

Resources