Pairwise graphical comparison of several distributions - r

This is an edited version of a previous question.
We are given an m by n table of n observations (samples) over m variables (genes, etc), and we are looking to study behavior of the variables between each pair of observations - For instance the two observations having the highest positive or negative correlation. For this purpose I have seen a great chart in Stadler et.al. Nature paper (2011):
Here it could be a sample dataset to be used.
m <- 1000
samples <- data.frame(unif1 = runif(m), unif2 = runif(m, 1, 2), norm1 = rnorm(m),
norm2 = rnorm(m, 1), norm3 = rnorm(m, 0, 5))
I have already tested gpairs(samples) of package gpairs that produces this one. It's a good start, but has no option to put correlation coefficients on the upper-right section, nor the density plots on the lower corner:
Next I used ggpairs(samples, lower=list(continuous="density")) of package GGally (Thanks #LucianoSelzer for a comment below). Now we have correlations on the upper corner and the densities on the lower corner, but we are missing the diagonal barplots, and the density plots are not heatmap shaped.
Any ideas to make the more closer to the desired picture (the first one)?

You could try to combine several different plotting methods and combine the results. Here's an example, which could be tweaked accordingly:
cors<-round(cor(samples),2) #correlations
# make layout for plot layout
laymat<-diag(1:5) #histograms
laymat[upper.tri(laymat)]<-6:15 #correlations
laymat[lower.tri(laymat)]<-16:25 #heatmaps
layout(laymat) #define layout using laymat
par(mar=c(2,2,2,2)) #define marginals etc.
# Draw histograms, tweak arguments of hist to make nicer figures
for(i in 1:5)
hist(samples[,i],main=names(samples)[i])
# Write correlations to upper diagonal part of the graph
# Again, tweak accordingly
for(i in 1:4)
for(j in (i+1):5){
plot(-1:1,-1:1, type = "n",xlab="",ylab="",xaxt="n",yaxt="n")
text(x=0,y=0,labels=paste(cors[i,j]),cex=2)
}
# Plot heatmaps, here I use kde2d function for density estimation
# image function for generating heatmaps
library(MASS)
for(i in 2:5)
for(j in 1:(i-1)){
k <- kde2d(samples[,i],samples[,j])
image(k,col=heat.colors(1000))
}
edit: Corrected indexing on the last loop.

You can do something like this using three different packages and two different functions as below:
cor_fun is for the upper triangle correlative calculation.
my_fn is for the lower triangle plotting
You also need ggpairs.
library(GGally)
library(ggplot2)
library(RColorBrewer)
m <- 1000
samples <- data.frame(unif1 = runif(m), unif2 = runif(m, 1, 2), norm1 = rnorm(m),
norm2 = rnorm(m, 1), norm3 = rnorm(m, 0, 5))
cor_fun <- function(data, mapping, method="pearson", ndp=2, sz=5, stars=TRUE){ #ndp is to adjust the number of decimals
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
corr <- cor.test(x, y, method=method)
est <- corr$estimate
lb.size <- sz
if(stars){
stars <- c("***", "**", "*", "")[findInterval(corr$p.value, c(0, 0.001, 0.01, 0.05, 1))]
lbl <- paste0(round(est, ndp), stars)
}else{
lbl <- round(est, ndp)
}
ggplot(data=data, mapping=mapping) +
annotate("text", x=mean(x, na.rm=TRUE), y=mean(y, na.rm=TRUE), label=lbl, size=lb.size)+
theme(panel.grid = element_blank(), panel.background=element_rect(fill="snow1"))
}
colfunc<-colorRampPalette(c("darkblue","cyan","yellow","red"))
my_fn <- function(data, mapping){
p <- ggplot(data = data, mapping = mapping) +
stat_density2d(aes(fill=..density..), geom="tile", contour = FALSE) +
scale_fill_gradientn(colours = colfunc(100)) + theme_classic()
}
ggpairs(samples, columns = c(1,2,3,4,5),
lower=list(continuous=my_fn),
diag=list(continuous=wrap("densityDiag", fill="gray92")), #densityDiag is a function
upper=list(continuous=cor_fun)) + theme(panel.background=element_rect(fill="white")) +
theme(axis.text.x = element_text(angle = 0, vjust = 1, color = "black")) +
theme(axis.text.y = element_text(angle = 0, vjust = 1 , color = "black"))

Related

Tweaking ggpairs() or a better solution to a correlation matrix

I am trying to create a correlation matrix between my X and Y variables and display this information in a nice figure. I am currently using ggpairs() from the GGally package, but if there's a better way to do this then I am happy to try something new. The figure should:
-Fit linear regression models (using lm) between X and Y variables
-Display scatterplots with a regression line
-Display the Coefficient of the determination (R2)
-Map the colour of points/lines/R2 values by group
I have been able to do most of this, but ggpairs only displays the correlation coefficient (r) and not the coefficient of determination (R2). I was able to use the suggestion from this post, but unfortunately the solution does not display R2 values by group.
So far:
library(GGally)
library(ggplot2)
cars <- mtcars
cars$group <- factor(c(rep("A", 16), rep("B", 16))) #adding grouping variable
#function to return R2 (coefficient of determination) and not just r (Coefficient of correlation) in the top portion of the figure
upper_fn <- function(data, mapping, ndp=2, ...){
# Extract the relevant columns as data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
# Calculate the r^2 & format output
m <- summary(lm(y ~ x))
lbl <- paste("r^2: ", formatC(m$r.squared, digits=ndp, format="f"))
# Write out label which is centered at x&y position
ggplot(data=data, mapping=mapping) +
annotate("text", x=mean(x, na.rm=TRUE), y=mean(y, na.rm=TRUE), label=lbl, parse=TRUE, ...)+
theme(panel.grid = element_blank())
}
#lower function basically fits a linear model and displays points
lower_fn <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.7) +
geom_smooth(method=lm, fill="blue", se = F, ...)
p
}
#The actual figure
ggpairs(cars,
columns = c(1:11),
mapping = ggplot2::aes(color = group),
upper = list(continuous = "cor", size = 15),
diag = list(continuous = "densityDiag", alpha=0.5),
lower = list(continuous = lower_fn))
Based on Is it possible to split correlation box to show correlation values for two different treatments in pairplot?, below is a little code to get you started.
The idea is that you need to 1. split the data over the aesthetic variable (which is assumed to be colour), 2. run a regression over each data subset and extract the r^2, 3. quick calculation of where to place the r^2 labels, 4. plot. Some features are left to do.
upper_fn <- function(data, mapping, ndp=2, ...){
# Extract the relevant columns as data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
col <- eval_data_col(data, mapping$colour)
# if no colour mapping run over full data
if(is.null(col)) {
## add something here
}
# if colour aesthetic, split data and run `lm` over each group
if(!is.null(col)) {
idx <- split(seq_len(nrow(data)), col)
r2 <- unlist(lapply(idx, function(i) summary(lm(y[i] ~ x[i]))$r.squared))
lvs <- if(is.character(col)) sort(unique(col)) else levels(col)
cuts <- seq(min(y, na.rm=TRUE), max(y, na.rm=TRUE), length=length(idx)+1L)
pos <- (head(cuts, -1) + tail(cuts, -1))/2
p <- ggplot(data=data, mapping=mapping, ...) +
geom_blank() +
theme_void() +
# you could map colours to each level
annotate("text", x=mean(x), y=pos, label=paste(lvs, ": ", formatC(r2, digits=ndp, format="f")))
}
return(p)
}

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes zscore breakpoints, calculates the fraction of data between each breakpoint, and colors each breakpoint section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between brekpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) + geom_area() +
scale_x_continuous(breaks= c(seq(-5,5,1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata(vector of how much data is in section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct data between brekpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
geom_area(aes(fill = standard_normal_sections)) +
geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
vjust = 1, size = 8 / .pt, nudge_y = -.01) +
scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))

How to get ggplot2 to draw multiple simulated trajectories in same plot?

I want to draw multiple simulated paths from any distribution (lognormal in the present case) on the same plot using ggplot2?
Using print(ggplot()) inside a for- loop does not show the paths all together.
library(ggplot2)
t <- 1000 # length of a simulation
time <- seq(0,t-1,by = 1) # make vector of time points
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1)) # simulate trajectory of lognormal variable
df <- data.frame(cbind(time,s)) # make dataframe
colnames(df) <- c("t","s") # colnames
ggplot(df, aes(t,s )) + geom_line() # Get one trajectory
Now i want (say) 100 such paths in the same plot;
nsim <- 100 # number of paths
for (i in seq(1,nsim, by =1)) {
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1))
df <- data.frame(cbind(time,s))
colnames(df) <- c("t","s")
print(ggplot(df, aes(t,s, color = i)) + geom_line())
}
The above loop obviously cannot do the job.
Any way to visualize such simulations using simple R with ggplot?
Instead of adding each line iteratively, you could iteratively simulate in a loop, collect all results in a data.frame, and plot all lines at once.
library(ggplot2)
nsim <- 100
npoints <- 1000
sims <- lapply(seq_len(nsim), function(i) {
data.frame(x = seq_len(npoints),
y = cumsum(rlnorm(npoints, meanlog = 0, sdlog = 1)),
iteration = i)
})
sims <- do.call(rbind, sims)
ggplot(sims, aes(x, y, colour = iteration, group = iteration)) +
geom_line()
Created on 2019-08-13 by the reprex package (v0.3.0)
In ggplot one method to achieve such methods is to add extra layers to the plot at each iteration. Doing so, a simple change of the latter code should be sufficient.
library(ggplot2)
nsim <- 100 # number of paths
dat <- vector("list", nsim)
p <- ggplot()
t <- 1000 # length of a simulation
time <- seq(0, t-1, by = 1)
for (i in seq(nsim)) {
s <- cumsum(rlnorm(t, meanlog = 0, sdlog = 1))
dat[[i]] <- data.frame(t = time, s = s)
p <- p + geom_line(data = dat[[i]], mapping = aes(x = t, y = s), col = i)
}
p #or print(p)
Note how I initiate the plot, similarly to how I initiate a list to contain the data frames prior to the loop. The loop then builds the plot step by step, but it is not visualized before i print the plot after the for loop. At which point every layer is evaluated (thus it can take a bit longer than standard R plots.)
Additionally as I want to specify the colour for each specific line, the col argument has to be moved outside the aes.

Plotting posterior parameter estimates from multiple models with bayesplot

I am using the great plotting library bayesplot to visualize posterior probability intervals from models I am estimating with rstanarm. I want to graphically compare draws from different models by getting the posterior intervals for coefficients onto the same plot.
Imagine, for instance, that I have 1000 draws from the posterior for three parameters beta1, beta2, beta3 for two different models:
# load the plotting library
library(bayesplot)
#> This is bayesplot version 1.6.0
#> - Online documentation and vignettes at mc-stan.org/bayesplot
#> - bayesplot theme set to bayesplot::theme_default()
#> * Does _not_ affect other ggplot2 plots
#> * See ?bayesplot_theme_set for details on theme setting
library(ggplot2)
# generate fake posterior draws from model1
fdata <- matrix(rnorm(1000 * 3), ncol = 3)
colnames(fdata) <- c('beta1', 'beta2', 'beta3')
# fake posterior draws from model 2
fdata2 <- matrix(rnorm(1000 * 3, 1, 2), ncol = 3)
colnames(fdata2) <- c('beta1', 'beta2', 'beta3')
Bayesplot makes fantastic visualizations for individual model draws, and it is ggplot2 'under the hood' so I can customize as I please:
# a nice plot of 1
color_scheme_set("orange")
mcmc_intervals(fdata) + theme_minimal() + ggtitle("Model 1")
# a nice plot of 2
color_scheme_set("blue")
mcmc_intervals(fdata2) + ggtitle("Model 2")
But what I would like to achieve is to plot these two models together on the same plot, such that for each coefficient I have two intervals and can distinguish which interval is which by mapping color to the model. However I can't figure out how to do this. Some things that don't work:
# doesnt work
mcmc_intervals(fdata) + mcmc_intervals(fdata2)
#> Error: Don't know how to add mcmc_intervals(fdata2) to a plot
# appears to pool
mcmc_intervals(list(fdata, fdata2))
Any ideas on how I could do this? Or how to do it manually given the matrices of posterior draws?
Created on 2018-10-18 by the reprex package (v0.2.1)
Just so the answer is also posted here, I have expanded on the code at the link from #Manny T (https://github.com/stan-dev/bayesplot/issues/232)
# simulate having posteriors for two different models each with parameters beta[1],..., beta[4]
posterior_1 <- matrix(rnorm(4000), 1000, 4)
posterior_2 <- matrix(rnorm(4000), 1000, 4)
colnames(posterior_1) <- colnames(posterior_2) <- paste0("beta[", 1:4, "]")
# use bayesplot::mcmc_intervals_data() function to get intervals data in format easy to pass to ggplot
library(bayesplot)
combined <- rbind(mcmc_intervals_data(posterior_1), mcmc_intervals_data(posterior_2))
combined$model <- rep(c("Model 1", "Model 2"), each = ncol(posterior_1))
# make the plot using ggplot
library(ggplot2)
theme_set(bayesplot::theme_default())
pos <- position_nudge(y = ifelse(combined$model == "Model 2", 0, 0.1))
ggplot(combined, aes(x = m, y = parameter, color = model)) +
geom_linerange(aes(xmin = l, xmax = h), position = pos, size=2)+
geom_linerange(aes(xmin = ll, xmax = hh), position = pos)+
geom_point(position = pos, color="black")
If you are like me, you will want 80% and 90% credible intervals (instead of 50% being the inner ones) and might want the coordinates flipped, and let's add a dashed line at 0 (model estimates no change). You can do that like this:
# use bayesplot::mcmc_intervals_data() function to get intervals data in format easy to pass to ggplot
library(bayesplot)
combined <- rbind(mcmc_intervals_data(posterior_1,prob=0.8,prob_outer = 0.9), mcmc_intervals_data(posterior_2,prob=0.8,prob_outer = 0.9))
combined$model <- rep(c("Model 1", "Model 2"), each = ncol(posterior_1))
# make the plot using ggplot
library(ggplot2)
theme_set(bayesplot::theme_default())
pos <- position_nudge(y = ifelse(combined$model == "Model 2", 0, 0.1))
ggplot(combined, aes(x = m, y = parameter, color = model)) +
geom_linerange(aes(xmin = l, xmax = h), position = pos, size=2)+
geom_linerange(aes(xmin = ll, xmax = hh), position = pos)+
geom_point(position = pos, color="black")+
coord_flip()+
geom_vline(xintercept=0,linetype="dashed")
A few things to note on this last one. I added prob_outer = 0.9 even though that is the default, just to show how you might change the outer credible intervals. The dashed line is created with geom_vline and xintercept = here instead of geom_hline and yintercept = because of the coord_flip (everything is reversed). So if you don't flip axes, you will need to do the opposite.
I asked this question on the bayesplot page on GitHub and got a response (Issue #232).
I blew more time than I'd like to admit writing this, so might as well post it here. Here's a function that incorporates the suggestions from above that (for the moment) works for rstanarm and brms model objects.
compare_posteriors <- function(..., dodge_width = 0.5) {
dots <- rlang::dots_list(..., .named = TRUE)
draws <- lapply(dots, function(x) {
if (class(x)[1] == "stanreg") {
posterior::subset_draws(posterior::as_draws(x$stanfit),
variable = names(fixef(x))
)
} else if (class(x)[1] == "brmsfit") {
brm_draws <- posterior::subset_draws(posterior::as_draws(x$fit),
variable = paste0("b_", rownames(fixef(x)))
)
posterior::variables(brm_draws) <- stringr::str_split(posterior::variables(brm_draws), "_", simplify = T)[, 2]
posterior::rename_variables(brm_draws, `(Intercept)` = Intercept)
} else {
stop(paste0(class(x)[1], " objects not supported."))
}
})
intervals <- lapply(draws, bayesplot::mcmc_intervals_data)
combined <- dplyr::bind_rows(intervals, .id = "model")
ggplot(combined, aes(x = m, y = parameter, color = model, group = model)) +
geom_linerange(aes(xmin = l, xmax = h), size = 2, position = position_dodge(dodge_width)) +
geom_linerange(aes(xmin = ll, xmax = hh), position = position_dodge(dodge_width)) +
geom_point(color = "black", position = position_dodge(dodge_width)) +
geom_vline(xintercept = 0, linetype = "dashed")
}
Usage:
compare_posteriors(mod1, mod2, mod3)

ggplot plot 2d probability density function on top of points on ggplot

I have the following example:
require(mvtnorm)
require(ggplot2)
set.seed(1234)
xx <- data.frame(rmvt(100, df = c(13, 13)))
ggplot(data = xx, aes(x = X1, y= X2)) + geom_point() + geom_density2d()
Here is what I get:
However, I would like to get the density contour from the mutlivariate t density given by the dmvt function. How do I tweak geom_density2d to do that?
This is not an easy question to answer: because the contours need to be calculated and the ellipse drawn using the ellipse package.
Done with elliptical t-densities to illustrate the plotting better.
nu <- 5 ## this is the degrees of freedom of the multivariate t.
library(mvtnorm)
library(ggplot2)
sig <- matrix(c(1, 0.5, 0.5, 1), ncol = 2) ## this is the sigma parameter for the multivariate t
xx <- data.frame( rmvt(n = 100, df = c(nu, nu), sigma = sig)) ## generating the original sample
rtsq <- rowSums(x = matrix(rt(n = 2e6, df = nu)^2, ncol = 2)) ## generating the sample for the ellipse-quantiles. Note that this is a cumbersome calculation because it is the sum of two independent t-squared random variables with the same degrees of freedom so I am using simulation to get the quantiles. This is the sample from which I will create the quantiles.
g <- ggplot( data = xx
, aes( x = X1
, y = X2
)
) + geom_point(colour = "red", size = 2) ## initial setup
library(ellipse)
for (i in seq(from = 0.01, to = 0.99, length.out = 20)) {
el.df <- data.frame(ellipse(x = sig, t = sqrt(quantile(rtsq, probs = i)))) ## create the data for the given quantile of the ellipse.
names(el.df) <- c("x", "y")
g <- g + geom_polygon(data=el.df, aes(x=x, y=y), fill = NA, linetype=1, colour = "blue") ## plot the ellipse
}
g + theme_bw()
This yields:
I still have a question: how does one reduce the size of the plotting ellispe lines?

Resources