ggplot mixture model R - r

I have a dataset with numeric values and a categorical variable. The distribution of the numeric variable differs for each category. I want to plot "density plots" for each categorical variable so that they are visually below the entire density plot.
This is similiar to components of a mixture model without calculating the mixture model (as I already know the categorical variable which splits the data).
If I take ggplot to group according to the categorical variable, each of the four densities are real densities and integrate to one.
library(ggplot2)
ggplot(iris, aes(x = Sepal.Width)) + geom_density() + geom_density(aes(x = Sepal.Width, group = Species, colour = 'Species'))
What I want is to have the densities of each category as a sub-density (not integrating to 1). Similiar to the following code (which I only implemented for two of the three iris species)
myIris <- as.data.table(iris)
# calculate density for entire dataset
dens_entire <- density(myIris[, Sepal.Width], cut = 0)
dens_e <- data.table(x = dens_entire[[1]], y = dens_entire[[2]])
# calculate density for dataset with setosa
dens_setosa <- density(myIris[Species == 'setosa', Sepal.Width], cut = 0)
dens_sa <- data.table(x = dens_setosa[[1]], y = dens_setosa[[2]])
# calculate density for dataset with versicolor
dens_versicolor <- density(myIris[Species == 'versicolor', Sepal.Width], cut = 0)
dens_v <- data.table(x = dens_versicolor[[1]], y = dens_versicolor[[2]])
# plot densities as mixture model
ggplot(dens_e, aes(x=x, y=y)) + geom_line() + geom_line(data = dens_sa, aes(x = x, y = y/2.5, colour = 'setosa')) +
geom_line(data = dens_v, aes(x = x, y = y/1.65, colour = 'versicolor'))
resulting in
Above I hard-coded the number to reduce the y values. Is there any way to do it with ggplot? Or to calculate it?
Thanks for your ideas.

Do you mean something like this? You need to change the scale though.
ggplot(iris, aes(x = Sepal.Width)) +
geom_density(aes(y = ..count..)) +
geom_density(aes(x = Sepal.Width, y = ..count..,
group = Species, colour = Species))
Another option may be
ggplot(iris, aes(x = Sepal.Width)) +
geom_density(aes(y = ..density..)) +
geom_density(aes(x = Sepal.Width, y = ..density../3,
group = Species, colour = Species))

Related

Plot the convex hull in FDA plot - R

I am trying to add a convex hull for each group in this plot using ggpubr package? Why it does not work?
Code:
library(dplyr)
library(MASS)
library(ggplot2)
library(scales)
library(ggpubr)
library(data.table)
irisfda <- fda(Species ~ ., data = iris, method = mars)
df1 <- cbind(data.frame(irisfda$fit$fitted.values), species = iris[,"Species"])
ggplot(df1) +
geom_point(aes(X1, X2, color = species, shape = species), size = 2.5) +
labs(x = "FDA1",y = "FDA1") +
stat_chull(aes(color = species, fill = species), geom = "polygon", alpha = 0.1)
You haven't told stat_chull where the x and y points are. You told geom_point where they were, but geoms and stats don't inherit from each other when you add them to a plot. You can either just add the x and y co-ordinates to stat_chull or, better yet, add them to the ggplot call. Then stat_chull can inherit them, and you can save on some typing.
Incidentally, you used library calls for dplyr, MASS, scales and data.table, which aren't needed for this example, but you forgot to put the library call for mda, which is needed:
library(ggplot2)
library(ggpubr)
library(mda)
irisfda <- fda(Species ~ ., data = iris, method = mars)
df1 <- cbind(data.frame(irisfda$fit$fitted.values), species = iris[,"Species"])
ggplot(df1, aes(x = X1, y = X2, color = species, shape = species)) +
geom_point(size = 2.5) +
labs(x = "FDA1",y = "FDA1") +
stat_chull(geom = "polygon", alpha = 0.1)

draw line on geom_density_ridges

I am trying to draw a line through the density plots from ggridges
library(ggplot2)
library(ggridges)
ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(rel_min_height = 0.01)
Indicating the highest point and label the value of x at that point. Something like this below. Any suggestions on accomplishing this is much appreciated
One neat approach is to interrogate the ggplot object itself and use it to construct additional features:
# This is the OP chart
library(ggplot2)
library(ggridges)
gr <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(rel_min_height = 0.01)
Edit: This next part has been shortened, using purrr::pluck to extract the whole data part of the list, instead of manually specifying the columns we'd need later.
# Extract the data ggplot used to prepare the figure.
# purrr::pluck is grabbing the "data" list from the list that
# ggplot_build creates, and then extracting the first element of that list.
ingredients <- ggplot_build(gr) %>% purrr::pluck("data", 1)
# Pick the highest point. Could easily add quantiles or other features here.
density_lines <- ingredients %>%
group_by(group) %>% filter(density == max(density)) %>% ungroup()
# Use the highest point to add more geoms
ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(rel_min_height = 0.01) +
geom_segment(data = density_lines,
aes(x = x, y = ymin, xend = x,
yend = ymin+density*scale*iscale)) +
geom_text(data = density_lines,
aes(x = x, y = ymin + 0.5 *(density*scale*iscale),
label = round(x, 2)),
hjust = -0.2)

Adding multiple points to a ggplot ecdf plot

I'm trying to generate a ggplot only C.D.F. plot for some of my data. I am also looking to be able to plot an arbitrary number of percentiles as points on top. I have a solution that works for adding a single point to my curve but fails for multiple values.
This works for plotting one percentile value
TestDf <- as.data.frame(rnorm(1000))
names(TestDf) <- c("Values")
percentiles <- c(0.5)
ggplot(data = TestDf, aes(x = Values)) +
stat_ecdf() +
geom_point(aes(x = quantile(TestDf$Values, percentiles),
y = percentiles))
However this fails
TestDf <- as.data.frame(rnorm(1000))
names(TestDf) <- c("Values")
percentiles <- c(0.25,0.5,0.75)
ggplot(data = TestDf, aes(x = Values)) +
stat_ecdf() +
geom_point(aes(x = quantile(TestDf$Values, percentiles),
y = percentiles))
With error
Error: Aesthetics must be either length 1 or the same as the data (1000): x, y
How can I add an arbitrary number of points to a stat_ecdf() plot?
You need to define a new dataset, outside of the aesthetics. aes refers to the original dataframe that you used for making the CDF (in the original ggplot argument).
ggplot(data = TestDf, aes(x = Values)) +
stat_ecdf() +
geom_point(data = data.frame(x=quantile(TestDf$Values, percentiles),
y=percentiles), aes(x=x, y=y))

How to scale density plots (for several variables) in ggplot having melted data

I have a melted data set which also includes data generated from normal distribution. I want to plot empirical density function of my data against normal distribution but the scales of the two produced density plots are different. I could find this post for two separate data sets:
Normalising the x scales of overlaying density plots in ggplot
but I couldn't figure out how to apply it to melted data. Suppose I have a data frame like this:
df<-data.frame(type=rep(c('A','B'),each=100),x=rnorm(200,1,2)/10,y=rnorm(200))
df.m<-melt(df)
using the code below:
qplot(value,data=df.m,col=variable,geom='density',facets=~type)
produces this graph:
How can I make the two densities comparable given the fact that normal distribution is the reference plot? (I prefer to use qplot instead of ggplot)
UPDATE:
I want to produce something like this (i.e. in terms of plot-comparison) but with ggplot2:
plot(density(rnorm(200,1,2)/10),col='red',main=NA) #my data
par(new=T)
plot(density(rnorm(200)),axes=F,main=NA,xlab=NA,ylab=NA) # reference data
which generates this:
Is this what you had in mind?
There's a built-in variable, ..scaled.. that does this automatically.
set.seed(1)
df<-data.frame(type=rep(c('A','B'),each=100),x=rnorm(200,1,2)/10,y=rnorm(200))
df.m<-melt(df)
ggplot(df.m) +
stat_density(aes(x=value, y=..scaled..,color=variable), position="dodge", geom="line")
df<-data.frame(type=rep(c('A','B'),each=100),x = rnorm(200,1,2)/10, y = rnorm(200))
df.m<-melt(df)
require(data.table)
DT <- data.table(df.m)
Insert a new column with the scaled value into DT. Then plot.
This is the image code:
DT <- DT[, scaled := scale(value), by = "variable"]
str(DT)
ggplot(DT) +
geom_density(aes(x = scaled, color = variable)) +
facet_grid(. ~ type)
qplot(data = DT, x = scaled, color = variable,
facets = ~ type, geom = "density")
# Using fill (inside aes) and alpha outside(so you don't get a legend for it)
ggplot(DT) +
geom_density(aes(x = scaled, fill = variable), alpha = 0.2) +
facet_grid(. ~ type)
qplot(data = DT, x = scaled, fill = variable, geom = "density", alpha = 0.2, facets = ~type)
# Histogram
ggplot(DT, aes(x = scaled, fill = variable)) +
geom_histogram(binwidth=.2, alpha=.5, position="identity") +
facet_grid(. ~ type, scales = "free")
qplot(data = DT, x = scaled, fill = variable, alpha = 0.2, facets = ~type)

Most succinct way to label/annotate extreme values with ggplot?

I'd like to annotate all y-values greater than a y-threshold using ggplot2.
When you plot(lm(y~x)), using the base package, the second graph that pops up automatically is Residuals vs Fitted, the third is qqplot, and the fourth is Scale-location. Each of these automatically label your extreme Y values by listing their corresponding X value as an adjacent annotation. I'm looking for something like this.
What's the best way to achieve this base-default behavior using ggplot2?
Updated scale_size_area() in place of scale_area()
You might be able to take something from this to suit your needs.
library(ggplot2)
#Some data
df <- data.frame(x = round(runif(100), 2), y = round(runif(100), 2))
m1 <- lm(y ~ x, data = df)
df.fortified = fortify(m1)
names(df.fortified) # Names for the variables containing residuals and derived qquantities
# Select extreme values
df.fortified$extreme = ifelse(abs(df.fortified$`.stdresid`) > 1.5, 1, 0)
# Based on examples on page 173 in Wickham's ggplot2 book
plot = ggplot(data = df.fortified, aes(x = x, y = .stdresid)) +
geom_point() +
geom_text(data = df.fortified[df.fortified$extreme == 1, ],
aes(label = x, x = x, y = .stdresid), size = 3, hjust = -.3)
plot
plot1 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid)) +
geom_point() + geom_smooth(se = F)
plot2 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid, size = .cooksd)) +
geom_point() + scale_size_area("Cook's distance") + geom_smooth(se = FALSE, show_guide = FALSE)
library(gridExtra)
grid.arrange(plot1, plot2)

Resources