Cumulative Density Plots with ggplot and plotly - r

When we take the following example from ggplot2 docs
df <- data.frame(x = c(rnorm(100, 0, 3), rnorm(100, 0, 10)),
g = gl(2, 100))
library(ggplot2)
p <- ggplot(df, aes(x, colour = g)) +
stat_ecdf(geom = "step", na.rm = T) + # interchange point and step
theme_bw()
p
We can create a standard cdf plot. Now if we want to play with the plot in plotly, I obtain a very confusing image when I use the step command. See below. However, when I use the point command plotly behaves like it should. What is happening with the step command? Why can't I recreate the image from using ggplot only?
library(plotly)
ggplotly(p)

I found the solution here https://community.plotly.com/t/bug-with-ggplot2-stat-ecdf-function/1187/3.
You should reorder the dataframe along x.
df <- dplyr::arrange(df, x)
library(ggplot2)
p <- ggplot(df, aes(x, colour = g)) +
stat_ecdf(geom = "step", na.rm = T) +
theme_bw()
p
library(plotly)
ggplotly(p)

This can be solved using ecdf() function.
## ecdf function to get y and 1-y
rcdf <- function (x) {
cdf <- ecdf(x)
y1 <- cdf(x)
y <- unique(y1)
# xrcdf <- 1-y ## to get reverse cdf
xrcdf <- y ## to get cdf
}
ug <- unique(df$g)
ng <- length(ug)
xll <- min(df$x)
xul <- max(df$x)
adr <- data.frame(myxx=c(), myyy=c(), mygg=c())
lapply(1:ng, function(i){
ad2r <- subset(df, g==ug[i])
myx1 <- unique(ad2r$x)
myxx <- c(xll,myx1,xul) ## add lowest value - dummy to assign 100%
myy1 <- rcdf(ad2r$x)
# myyy <- c(1.0,myy1,0.0) ## add 100% to get reverse cdf
myyy <- c(0.0,myy1,1.0) ## add 0% to get cdf
mygg <- ug[i]
ad2rf <- data.frame(myxx,myyy,mygg)
adr <<- rbind(adr,ad2rf)
})
adf <- adr[order(adr$myxx),]
pp <- ggplot(data=adf,
aes_(x=adf$myxx, y=100*adf$myyy, col=adf$mygg, group=adf$mygg)) +
geom_step() +
labs(title="CDF", y = "Y", x = "X", col=NULL)
ppp <- ggplotly(pp, tooltip=c("x","y"))
ppp
This gives the following output:
CDF

Related

Use a gradient fill under a facet wrap of density curves in ggplot in R?

Similar questions have been asked before in other forms. Some can be found here and here. However, I cant seem to adapt them when using a facet wrap displaying multiple density plots.
I tried adapting the other examples, but failed... I also tried using the ggpattern package, but when there is a large amount of data, it takes several minutes on my machine to create a plot.
I am trying to create a gradient under the density curve... but with the gradient pointing down. Something like in the example image below:
Some example data to work with:
library(ggplot2)
set.seed(321)
# create data
varNames <- c("x1", "x2", "x3")
df <- data.frame(
var = sample(varNames, 100, replace = T),
val = runif(100)
)
# create plot
ggplot(df, aes(x = val)) +
geom_density(aes(colour = var, fill = var)) +
facet_wrap(~var) +
theme_bw() +
theme(legend.position = "none")
You can use teunbrand's function, but you will need to apply it to each facet. Here simply looping over it with lapply
library(tidyverse)
library(polyclip)
#> polyclip 1.10-0 built from Clipper C++ version 6.4.0
## This is teunbrands function copied without any change!!
## from https://stackoverflow.com/a/64695516/7941188
fade_polygon <- function(x, y, n = 100) {
poly <- data.frame(x = x, y = y)
# Create bounding-box edges
yseq <- seq(min(poly$y), max(poly$y), length.out = n)
xlim <- range(poly$x) + c(-1, 1)
# Pair y-edges
grad <- cbind(head(yseq, -1), tail(yseq, -1))
# Add vertical ID
grad <- cbind(grad, seq_len(nrow(grad)))
# Slice up the polygon
grad <- apply(grad, 1, function(range) {
# Create bounding box
bbox <- data.frame(x = c(xlim, rev(xlim)),
y = c(range[1], range[1:2], range[2]))
# Do actual slicing
slice <- polyclip::polyclip(poly, bbox)
# Format as data.frame
for (i in seq_along(slice)) {
slice[[i]] <- data.frame(
x = slice[[i]]$x,
y = slice[[i]]$y,
value = range[3],
id = c(1, rep(0, length(slice[[i]]$x) - 1))
)
}
slice <- do.call(rbind, slice)
})
# Combine slices
grad <- do.call(rbind, grad)
# Create IDs
grad$id <- cumsum(grad$id)
return(grad)
}
## now here starts the change, loop over your variables. I'm creating the data frame directly instead of keeping the density object
dens <- lapply(split(df, df$var), function(x) {
dens <- density(x$val)
data.frame(x = dens$x, y = dens$y)
}
)
## we need this one for the plot, but still need the list
dens_df <- bind_rows(dens, .id = "var")
grad <- bind_rows(lapply(dens, function(x) fade_polygon(x$x, x$y)), .id = "var")
ggplot(grad, aes(x, y)) +
geom_line(data = dens_df) +
geom_polygon(aes(alpha = value, group = id),
fill = "blue") +
facet_wrap(~var) +
scale_alpha_continuous(range = c(0, 1))
Created on 2021-12-05 by the reprex package (v2.0.1)

Countor plot of bivariate normal functions

Could someone explain me why such function doesn't produce a countor plot as I expected.
I've a bivariate normal function whit:
means = c(5,1)
var_cov = matrix(c(2,1,1,1),2)
I'd like to plot its contour plot; I'm able to reach the result but I'd like to ask why in one case I don't get expected result.
Working Example:
library(MASS)
library(ggplot2)
N <- 100
set.seed(123)
var_cov_matrix <- matrix(c(2,1,1,1),2)
mean <- c(5,1)
bivariate_points <- expand.grid(s.1 = seq(-25, 25, length.out=N), s.2 = seq(-25, 25, length.out=N))
z <- mvtnorm::dmvnorm(bivariate_points, mean = mean, sigma = var_cov_matrix)
data <- cbind(bivariate_points,z)
colnames(data) <- c("X1","X2","Z")
data.df <- as.data.frame(data)
ggplot() +
geom_contour(data=data.df,aes(x=X1,y=X2,z=Z))
Non Working Example:
library(MASS)
library(ggplot2)
N <- 100
set.seed(123)
var_cov_matrix <- matrix(c(2,1,1,1),2)
mean <- c(5,1)
bivariate_points <- mvrnorm(N, mu = mean, Sigma = var_cov_matrix ) # <---- EDITED
z <- mvtnorm::dmvnorm(bivariate_points, mean = mean, sigma = var_cov_matrix)
data <- cbind(bivariate_points,z)
colnames(data) <- c("X1","X2","Z")
data.df <- as.data.frame(data)
ggplot() +
geom_contour(data=data.df,aes(x=X1,y=X2,z=Z))
In your non-working example, since you don't have regular grid for contour plot, you can use stat_density2d instead, i.e.,
ggplot(data.df, aes(x = X1, y = X2, z = Z)) +
geom_point(aes(colour = z)) +
stat_density2d()

Fill negative value area below geom_line [duplicate]

I frequently use kernel density plots to illustrate distributions. These are easy and fast to create in R like so:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
#or in one line like this: plot(density(rnorm(100)^2))
Which gives me this nice little PDF:
I'd like to shade the area under the PDF from the 75th to 95th percentiles. It's easy to calculate the points using the quantile function:
q75 <- quantile(draws, .75)
q95 <- quantile(draws, .95)
But how do I shade the the area between q75 and q95?
With the polygon() function, see its help page and I believe we had similar questions here too.
You need to find the index of the quantile values to get the actual (x,y) pairs.
Edit: Here you go:
x1 <- min(which(dens$x >= q75))
x2 <- max(which(dens$x < q95))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
Output (added by JDL)
Another solution:
dd <- with(dens,data.frame(x,y))
library(ggplot2)
qplot(x,y,data=dd,geom="line")+
geom_ribbon(data=subset(dd,x>q75 & x<q95),aes(ymax=y),ymin=0,
fill="red",colour=NA,alpha=0.5)
Result:
An expanded solution:
If you wanted to shade both tails (copy & paste of Dirk's code) and use known x values:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
q2 <- 2
q65 <- 6.5
qn08 <- -0.8
qn02 <- -0.2
x1 <- min(which(dens$x >= q2))
x2 <- max(which(dens$x < q65))
x3 <- min(which(dens$x >= qn08))
x4 <- max(which(dens$x < qn02))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
with(dens, polygon(x=c(x[c(x3,x3:x4,x4)]), y= c(0, y[x3:x4], 0), col="gray"))
Result:
This question needs a lattice answer. Here's a very basic one, simply adapting the method employed by Dirk and others:
#Set up the data
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
#Put in a simple data frame
d <- data.frame(x = dens$x, y = dens$y)
#Define a custom panel function;
# Options like color don't need to be hard coded
shadePanel <- function(x,y,shadeLims){
panel.lines(x,y)
m1 <- min(which(x >= shadeLims[1]))
m2 <- max(which(x <= shadeLims[2]))
tmp <- data.frame(x1 = x[c(m1,m1:m2,m2)], y1 = c(0,y[m1:m2],0))
panel.polygon(tmp$x1,tmp$y1,col = "blue")
}
#Plot
xyplot(y~x,data = d, panel = shadePanel, shadeLims = c(1,3))
Here's another ggplot2 variant based on a function that approximates the kernel density at the original data values:
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
Using the original data (rather than producing a new data frame with the density estimate's x and y values) has the benefit of also working in faceted plots where the quantile values depend on the variable by which the data is being grouped:
Code used
library(tidyverse)
library(RColorBrewer)
# dummy data
set.seed(1)
n <- 1e2
dt <- tibble(value = rnorm(n)^2)
# function that approximates the density at the provided values
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
probs <- c(0.75, 0.95)
dt <- dt %>%
mutate(dy = approxdens(value), # calculate density
p = percent_rank(value), # percentile rank
pcat = as.factor(cut(p, breaks = probs, # percentile category based on probs
include.lowest = TRUE)))
ggplot(dt, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
scale_fill_brewer(guide = "none") +
theme_bw()
# dummy data with 2 groups
dt2 <- tibble(category = c(rep("A", n), rep("B", n)),
value = c(rnorm(n)^2, rnorm(n, mean = 2)))
dt2 <- dt2 %>%
group_by(category) %>%
mutate(dy = approxdens(value),
p = percent_rank(value),
pcat = as.factor(cut(p, breaks = probs,
include.lowest = TRUE)))
# faceted plot
ggplot(dt2, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
facet_wrap(~ category, nrow = 2, scales = "fixed") +
scale_fill_brewer(guide = "none") +
theme_bw()
Created on 2018-07-13 by the reprex package (v0.2.0).

Partially fill density plot for area of interest [duplicate]

I frequently use kernel density plots to illustrate distributions. These are easy and fast to create in R like so:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
#or in one line like this: plot(density(rnorm(100)^2))
Which gives me this nice little PDF:
I'd like to shade the area under the PDF from the 75th to 95th percentiles. It's easy to calculate the points using the quantile function:
q75 <- quantile(draws, .75)
q95 <- quantile(draws, .95)
But how do I shade the the area between q75 and q95?
With the polygon() function, see its help page and I believe we had similar questions here too.
You need to find the index of the quantile values to get the actual (x,y) pairs.
Edit: Here you go:
x1 <- min(which(dens$x >= q75))
x2 <- max(which(dens$x < q95))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
Output (added by JDL)
Another solution:
dd <- with(dens,data.frame(x,y))
library(ggplot2)
qplot(x,y,data=dd,geom="line")+
geom_ribbon(data=subset(dd,x>q75 & x<q95),aes(ymax=y),ymin=0,
fill="red",colour=NA,alpha=0.5)
Result:
An expanded solution:
If you wanted to shade both tails (copy & paste of Dirk's code) and use known x values:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
q2 <- 2
q65 <- 6.5
qn08 <- -0.8
qn02 <- -0.2
x1 <- min(which(dens$x >= q2))
x2 <- max(which(dens$x < q65))
x3 <- min(which(dens$x >= qn08))
x4 <- max(which(dens$x < qn02))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
with(dens, polygon(x=c(x[c(x3,x3:x4,x4)]), y= c(0, y[x3:x4], 0), col="gray"))
Result:
This question needs a lattice answer. Here's a very basic one, simply adapting the method employed by Dirk and others:
#Set up the data
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
#Put in a simple data frame
d <- data.frame(x = dens$x, y = dens$y)
#Define a custom panel function;
# Options like color don't need to be hard coded
shadePanel <- function(x,y,shadeLims){
panel.lines(x,y)
m1 <- min(which(x >= shadeLims[1]))
m2 <- max(which(x <= shadeLims[2]))
tmp <- data.frame(x1 = x[c(m1,m1:m2,m2)], y1 = c(0,y[m1:m2],0))
panel.polygon(tmp$x1,tmp$y1,col = "blue")
}
#Plot
xyplot(y~x,data = d, panel = shadePanel, shadeLims = c(1,3))
Here's another ggplot2 variant based on a function that approximates the kernel density at the original data values:
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
Using the original data (rather than producing a new data frame with the density estimate's x and y values) has the benefit of also working in faceted plots where the quantile values depend on the variable by which the data is being grouped:
Code used
library(tidyverse)
library(RColorBrewer)
# dummy data
set.seed(1)
n <- 1e2
dt <- tibble(value = rnorm(n)^2)
# function that approximates the density at the provided values
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
probs <- c(0.75, 0.95)
dt <- dt %>%
mutate(dy = approxdens(value), # calculate density
p = percent_rank(value), # percentile rank
pcat = as.factor(cut(p, breaks = probs, # percentile category based on probs
include.lowest = TRUE)))
ggplot(dt, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
scale_fill_brewer(guide = "none") +
theme_bw()
# dummy data with 2 groups
dt2 <- tibble(category = c(rep("A", n), rep("B", n)),
value = c(rnorm(n)^2, rnorm(n, mean = 2)))
dt2 <- dt2 %>%
group_by(category) %>%
mutate(dy = approxdens(value),
p = percent_rank(value),
pcat = as.factor(cut(p, breaks = probs,
include.lowest = TRUE)))
# faceted plot
ggplot(dt2, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
facet_wrap(~ category, nrow = 2, scales = "fixed") +
scale_fill_brewer(guide = "none") +
theme_bw()
Created on 2018-07-13 by the reprex package (v0.2.0).

Shading a kernel density plot between two points.

I frequently use kernel density plots to illustrate distributions. These are easy and fast to create in R like so:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
#or in one line like this: plot(density(rnorm(100)^2))
Which gives me this nice little PDF:
I'd like to shade the area under the PDF from the 75th to 95th percentiles. It's easy to calculate the points using the quantile function:
q75 <- quantile(draws, .75)
q95 <- quantile(draws, .95)
But how do I shade the the area between q75 and q95?
With the polygon() function, see its help page and I believe we had similar questions here too.
You need to find the index of the quantile values to get the actual (x,y) pairs.
Edit: Here you go:
x1 <- min(which(dens$x >= q75))
x2 <- max(which(dens$x < q95))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
Output (added by JDL)
Another solution:
dd <- with(dens,data.frame(x,y))
library(ggplot2)
qplot(x,y,data=dd,geom="line")+
geom_ribbon(data=subset(dd,x>q75 & x<q95),aes(ymax=y),ymin=0,
fill="red",colour=NA,alpha=0.5)
Result:
An expanded solution:
If you wanted to shade both tails (copy & paste of Dirk's code) and use known x values:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
q2 <- 2
q65 <- 6.5
qn08 <- -0.8
qn02 <- -0.2
x1 <- min(which(dens$x >= q2))
x2 <- max(which(dens$x < q65))
x3 <- min(which(dens$x >= qn08))
x4 <- max(which(dens$x < qn02))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
with(dens, polygon(x=c(x[c(x3,x3:x4,x4)]), y= c(0, y[x3:x4], 0), col="gray"))
Result:
This question needs a lattice answer. Here's a very basic one, simply adapting the method employed by Dirk and others:
#Set up the data
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
#Put in a simple data frame
d <- data.frame(x = dens$x, y = dens$y)
#Define a custom panel function;
# Options like color don't need to be hard coded
shadePanel <- function(x,y,shadeLims){
panel.lines(x,y)
m1 <- min(which(x >= shadeLims[1]))
m2 <- max(which(x <= shadeLims[2]))
tmp <- data.frame(x1 = x[c(m1,m1:m2,m2)], y1 = c(0,y[m1:m2],0))
panel.polygon(tmp$x1,tmp$y1,col = "blue")
}
#Plot
xyplot(y~x,data = d, panel = shadePanel, shadeLims = c(1,3))
Here's another ggplot2 variant based on a function that approximates the kernel density at the original data values:
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
Using the original data (rather than producing a new data frame with the density estimate's x and y values) has the benefit of also working in faceted plots where the quantile values depend on the variable by which the data is being grouped:
Code used
library(tidyverse)
library(RColorBrewer)
# dummy data
set.seed(1)
n <- 1e2
dt <- tibble(value = rnorm(n)^2)
# function that approximates the density at the provided values
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
probs <- c(0.75, 0.95)
dt <- dt %>%
mutate(dy = approxdens(value), # calculate density
p = percent_rank(value), # percentile rank
pcat = as.factor(cut(p, breaks = probs, # percentile category based on probs
include.lowest = TRUE)))
ggplot(dt, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
scale_fill_brewer(guide = "none") +
theme_bw()
# dummy data with 2 groups
dt2 <- tibble(category = c(rep("A", n), rep("B", n)),
value = c(rnorm(n)^2, rnorm(n, mean = 2)))
dt2 <- dt2 %>%
group_by(category) %>%
mutate(dy = approxdens(value),
p = percent_rank(value),
pcat = as.factor(cut(p, breaks = probs,
include.lowest = TRUE)))
# faceted plot
ggplot(dt2, aes(value, dy)) +
geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
geom_line() +
facet_wrap(~ category, nrow = 2, scales = "fixed") +
scale_fill_brewer(guide = "none") +
theme_bw()
Created on 2018-07-13 by the reprex package (v0.2.0).

Resources