Fill superimposed ellipses in ggplot2 scatterplots - r

This question is a follow-up of "How can a data ellipse be superimposed on a ggplot2 scatterplot?".
I want to create a 2D scatterplot using ggplot2 with filled superimposed confidence ellipses. Using the solution of Etienne Low-Décarie from the above mentioned post, I do get superimposed ellipses to work. The solution is based on stat_ellipse available from https://github.com/JoFrhwld/FAAV/blob/master/r/stat-ellipse.R
Q: How can I fill the inner area of the ellipse(s) with a certain color (more specifically I want to use the color of the ellipse border with some alpha)?
Here is the minimal working example modified from the above mentioned post:
# create data
set.seed(20130226)
n <- 200
x1 <- rnorm(n, mean = 2)
y1 <- 1.5 + 0.4 * x1 + rnorm(n)
x2 <- rnorm(n, mean = -1)
y2 <- 3.5 - 1.2 * x2 + rnorm(n)
class <- rep(c("A", "B"), each = n)
df <- data.frame(x = c(x1, x2), y = c(y1, y2), colour = class)
# get code for "stat_ellipse"
library(devtools)
library(ggplot2)
source_url("https://raw.github.com/JoFrhwld/FAAV/master/r/stat-ellipse.R")
# scatterplot with confidence ellipses (but inner ellipse areas are not filled)
qplot(data = df, x = x, y = y, colour = class) + stat_ellipse()
Output of working example:

As mentioned in the comments, polygon is needed here:
qplot(data = df, x = x, y = y, colour = class) +
stat_ellipse(geom = "polygon", alpha = 1/2, aes(fill = class))

Related

Make ggplot with regression line and normal distribution overlay

I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# Probability density function of a normal logistic distribution
pdfDeltaFun <- function(x) {
prob = (exp(x)/(1 + exp(x))^2)
return(prob)
}
# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
geom_point() +
geom_line() +
stat_function(fun = pdfDeltaFun)+
coord_flip()
I think this comes pretty close to the first illustration you give. If this is a thing you don't need to repeat many times, it is probably best to compute the density curves prior to plotting and use a seperate dataframe to plot these.
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# For every row in `df`, compute a rotated normal density centered at `y` and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
mu <- df$y[i]
range <- mu + c(-3, 3)
seq <- seq(range[1], range[2], length.out = 100)
data.frame(
x = -1 * dnorm(seq, mean = mu) + df$x[i],
y = seq,
grp = i
)
})
# Combine above densities in one data.frame
curves <- do.call(rbind, curves)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
# The path draws the curve
geom_path(data = curves, aes(group = grp)) +
# The polygon does the shading. We can use `oob_squish()` to set a range.
geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)),group = grp))
The second illustration is pretty close to your code. I simplified your density function by the standard normal density function and added some extra paramters to stat function:
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
stat_function(fun = dnorm,
aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
xlim = range(df$y)) +
# We fill with a polygon, squishing the y-range
stat_function(fun = dnorm, geom = "polygon",
aes(x = after_stat(-y * 4 - 5),
y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
xlim = range(df$y))

Filling parts of a contour plot in R

I have made a contour plot in R with the following code:
library(mvtnorm)
# Define the parameters for the multivariate normal distribution
mu = c(0,0)
sigma = matrix(c(1,0.2,0.2,3),nrow = 2)
# Make a grid in the x-y plane centered in mu, +/- 3 standard deviations
xygrid = expand.grid(x = seq(from = mu[1]-3*sigma[1,1], to = mu[1]+3*sigma[1,1], length.out = 100),
y = seq(from = mu[2]-3*sigma[2,2], to = mu[2]+3*sigma[2,2], length.out = 100))
# Use the mvtnorm library to calculate the multivariate normal density for each point in the grid
distribution = as.matrix(dmvnorm(x = xygrid, mean = mu, sigma = sigma))
# Plot contours
df = as.data.frame(cbind(xygrid, distribution))
myPlot = ggplot() + geom_contour(data = df,geom="polygon",aes( x = x, y = y, z = distribution))
myPlot
I want to illustrate cumulative probability by shading/colouring certain parts of the plot, for instance everything in the region {x<0, y<0} (or any other self defined region).
Is there any way of achieving this in R with ggplot?
So you are able to get the coordinates used to draw the circles in the plot using ggplot_build. Subsequently you could try to use these coordinates in combination with geom_polygon to shade a particular region. My best try:
library(dplyr)
data <- ggplot_build(myPlot)$data[[1]]
xCoor <- 0
yCoor <- 0
df <- data %>% filter(group == '-1-001', x <= xCoor, y <= yCoor) %>% select(x,y)
# Insert the [0,0] coordinate in the right place
index <- which.max(abs(diff(rank(df$y))))
df <- rbind( df[1:index,], data.frame(x=xCoor, y=yCoor), df[(index+1):nrow(df),] )
myPlot + geom_polygon(data = df, aes(x=x, y=y), fill = 'red', alpha = 0.5)
As you can see it's not perfect because the [x,0] and [0,y] coordinates are not included in the data, but it's a start.

Simulated sphere shape data based on normal distribution

I'm clueless on below question. Any help is appreciated please.
"Simulate data with n=1000 observations and p=3 covariates -- all random variables from standard normal distribution. Create two category class variable assigning all observations within a sphere with radius of 1.5 centered at 3D zero to one class category and all others -- to the second".
Here's a 2D example to get you going...
library(ggplot2)
library(grid)
Sample x & y coords from normal distribution (default mean = 0, sd = 1)
df <- data.frame(x = rnorm(100), y = rnorm(100))
Calculate distance from centre (0,0)
df$r = sqrt(df$x^2 + df$y^2)
Assign to category
df$category <- ifelse(df$r < 1, "in", "out")
Plot
ggplot(df, aes(x = x, y = y, color = category)) +
geom_point() +
coord_equal() +
annotation_custom(grob=circleGrob(r=unit(1,"npc"), gp = gpar(fill = NA)), xmin=-0.5, xmax=0.5, ymin=-0.5, ymax=0.5)

A ggplot2 equivalent of the lines() function in basic plot

For reasons I won't go into I need to plot a vertical normal curve on a blank ggplot2 graph. The following code gets it done as a series of points with x,y coordinates
dfBlank <- data.frame()
g <- ggplot(dfBlank) + xlim(0.58,1) + ylim(-0.2,113.2)
hdiLo <- 31.88
hdiHi <- 73.43
yComb <- seq(hdiLo, hdiHi, length = 75)
xVals <- 0.79 - (0.06*dnorm(yComb, 52.65, 10.67))/0.05
dfVertCurve <- data.frame(x = xVals, y = yComb)
g + geom_point(data = dfVertCurve, aes(x = x, y = y), size = 0.01)
The curve is clearly discernible but is a series of points. The lines() function in basic plot would turn these points into a smooth line.
Is there a ggplot2 equivalent?
I see two different ways to do it.
geom_segment
The first uses geom_segment to 'link' each point with its next one.
hdiLo <- 31.88
hdiHi <- 73.43
yComb <- seq(hdiLo, hdiHi, length = 75)
xVals <- 0.79 - (0.06*dnorm(yComb, 52.65, 10.67))/0.05
dfVertCurve <- data.frame(x = xVals, y = yComb)
library(ggplot2)
ggplot() +
xlim(0.58, 1) +
ylim(-0.2, 113.2) +
geom_segment(data = dfVertCurve, aes(x = x, xend = dplyr::lead(x), y = y, yend = dplyr::lead(y)), size = 0.01)
#> Warning: Removed 1 rows containing missing values (geom_segment).
As you can see it just link the points you created. The last point does not have a next one, so the last segment is removed (See the warning)
stat_function
The second one, which I think is better and more ggplotish, utilize stat_function().
library(ggplot2)
f = function(x) .79 - (.06 * dnorm(x, 52.65, 10.67)) / .05
hdiLo <- 31.88
hdiHi <- 73.43
yComb <- seq(hdiLo, hdiHi, length = 75)
ggplot() +
xlim(-0.2, 113.2) +
ylim(0.58, 1) +
stat_function(data = data.frame(yComb), fun = f) +
coord_flip()
This build a proper function (y = f(x)), plot it. Note that it is build on the X axis and then flipped. Because of this the xlim and ylim are inverted.

R - add centroids to scatter plot

I have a dataset two continuous variables and one factor variable (two classes). I want to create a scatterplot with two centroids (one for each class) that includes error bars in R. The centroids should be positioned at the mean values for x and y for each class.
I can easily create the scatter plot using ggplot2, but I can't figure out how to add the centroids. Is it possible to do this using ggplot / qplot?
Here is some example code:
x <- c(1,2,3,4,5,2,3,5)
y <- c(10,11,14,5,7,9,8,5)
class <- c(1,1,1,0,0,1,0,0)
df <- data.frame(class, x, y)
qplot(x,y, data=df, color=as.factor(class))
Is this what you had in mind?
centroids <- aggregate(cbind(x,y)~class,df,mean)
ggplot(df,aes(x,y,color=factor(class))) +
geom_point(size=3)+ geom_point(data=centroids,size=5)
This creates a separate data frame, centroids, with columns x, y, and class where x and y are the mean values by class. Then we add a second point geometry layer using centroid as the dataset.
This is a slightly more interesting version, useful in cluster analysis.
gg <- merge(df,aggregate(cbind(mean.x=x,mean.y=y)~class,df,mean),by="class")
ggplot(gg, aes(x,y,color=factor(class)))+geom_point(size=3)+
geom_point(aes(x=mean.x,y=mean.y),size=5)+
geom_segment(aes(x=mean.x, y=mean.y, xend=x, yend=y))
EDIT Response to OP's comment.
Vertical and horizontal error bars can be added using geom_errorbar(...) and geom_errorbarh(...).
centroids <- aggregate(cbind(x,y)~class,df,mean)
f <- function(z)sd(z)/sqrt(length(z)) # function to calculate std.err
se <- aggregate(cbind(se.x=x,se.y=y)~class,df,f)
centroids <- merge(centroids,se, by="class") # add std.err column to centroids
ggplot(gg, aes(x,y,color=factor(class)))+
geom_point(size=3)+
geom_point(data=centroids, size=5)+
geom_errorbar(data=centroids,aes(ymin=y-se.y,ymax=y+se.y),width=0.1)+
geom_errorbarh(data=centroids,aes(xmin=x-se.x,xmax=x+se.x),height=0.1)
If you want to calculate, say, 95% confidence instead of std. error, replace
f <- function(z)sd(z)/sqrt(length(z)) # function to calculate std.err
with
f <- function(z) qt(0.025,df=length(z)-1, lower.tail=F)* sd(z)/sqrt(length(z))
I could not get the exact code by #jlhoward to work for me (specifically with the error bars), so I made minor changes to remove errors and even remove warnings. So, you should be able to run the code from start to finish, and if #jlhoward wants to incorporate this into the existing answer, that's great.
centroids <- aggregate(cbind(mean.x = x, mean.y = y) ~ class, df, mean)
gg <- merge(df, centroids, by = "class")
f <- function(z) sd(z) / sqrt(length(z)) # function to calculate std.err
se <- aggregate(cbind(se.x = x ,se.y = y) ~ class, df, f)
centroids <- merge(centroids, se, by = "class") # add std.err column to centroids
ggplot(gg, aes(x = x, y = y, color = factor(class))) +
geom_point(size = 3) +
geom_point(data = centroids, aes(x = mean.x, y = mean.y), size = 5) +
geom_errorbar(data = centroids,
aes(x = mean.x, y = mean.y, ymin = mean.y - se.y, ymax = mean.y + se.y),
width = 0.1) +
geom_errorbarh(data = centroids, inherit.aes=FALSE, # keeps ggplot from using first aes
aes(xmin = (mean.x - se.x), xmax = (mean.x + se.x), y = mean.y,
height = 0.1, color = factor(class))) +
labs(x = "Label for x-axis", y = "Label for y-axis") +
theme(legend.title = element_blank()) # remove legend title

Resources