ggplot2 adding label to geom_area - r

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes z-score breakpoints, calculates the fraction of data between each pair of breakpoints, and colors each section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between breakpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
  pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
  interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) +
  geom_area() +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata (a vector of how much data is in each section of p1) as the labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!

There is nothing special about geom_area. If you want to add labels, you can do so with geom_text, passing your pctdata_df to the data argument. As you gave no information on where you want the labels, I have put them beneath the area chart.
Note: there is no need for a for loop. You can simply pass vectors to pnorm and paste0.
library(scales)
library(ggplot2)
# find pct data between breakpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
  geom_area(aes(fill = standard_normal_sections)) +
  geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
            vjust = 1, size = 8 / .pt, nudge_y = -.01) +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))
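If you would rather have the labels inside the shaded sections, one option (a sketch of mine, not part of the answer above) is to place each label at half the height of the density curve at x_label:
# Hypothetical variant: y_label puts each label halfway up the curve at x_label
pctdata_df$y_label <- dnorm(pctdata_df$x_label) / 2
ggplot(sim_dat, aes(z, y)) +
  geom_area(aes(fill = standard_normal_sections)) +
  geom_text(data = pctdata_df,
            aes(x = x_label, y = y_label, label = scales::number(pctdata, .01)),
            size = 8 / .pt) +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))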

Related

ggplot2: Plot two different Densities in the same Plot of the same Variable before and after a Cutoff

My goal is to plot two different densities of the same variable in the same plot. I want to do this because it is common to show robustness of the forcing variable (here z) in a Regression Discontinuity Design. In the code below I got it working, but I do not want the density to be plotted on the left of the cutoff (here 0) if the key is "above", and vice versa. Also, the curve should not merely be hidden on the far side because of the smoothing: the density should be computed only up to (or starting from) the cutoff.
library(ggplot2)
library(dplyr) # provides tibble()
x <- rnorm(1000, mean = 0)
y <- rnorm(500, mean = 2)
z <- append(x,y)
d <- tibble(value = z, key = ifelse(z <= 0, "below", "above"))
ggplot(d) +
  geom_density(aes(value, group = key)) +
  geom_vline(aes(xintercept = 0))
Does anybody know how to implement this? For linear regressions I got it working, but with geom_density() it plots the other side of the cutoff as well and smooths it.
Thanks in advance for your help.
You can use trim = TRUE in geom_density to only calculate density over the range of values in the data:
library(ggplot2)
library(dplyr)
x <- rnorm(1000, mean = 0)
y <- rnorm(500, mean = 2)
z <- append(x,y)
d <- tibble(value = z, key = ifelse(z <= 0, "below", "above"))
ggplot(d) +
  # added fill for easier discrimination
  geom_density(aes(value, group = key, fill = key),
               alpha = 0.5, trim = TRUE) +
  geom_vline(aes(xintercept = 0), lty = 2, colour = 'red')
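A possible alternative sketch (assuming ggplot2 >= 3.4.0, which added a bounds argument to geom_density): estimate each density with boundary correction at the cutoff, instead of only trimming the displayed range.
# Sketch: per-group densities with boundary correction at the cutoff
# (the bounds argument requires ggplot2 >= 3.4.0)
ggplot(d) +
  geom_density(data = ~ subset(.x, key == "below"), aes(value, fill = key),
               alpha = 0.5, bounds = c(-Inf, 0)) +
  geom_density(data = ~ subset(.x, key == "above"), aes(value, fill = key),
               alpha = 0.5, bounds = c(0, Inf)) +
  geom_vline(aes(xintercept = 0), lty = 2, colour = 'red')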

ggplot2 - how to adjust stat_bin and stat to use calculation of a different variable

The goal is to generate a "histogram" of x where the bar heights are sum(y)/count(x), i.e. the mean of y within each bin of x, where y is another variable describing the data. The point is to use ggplot2's binning to do the grouping part; I do not want to calculate the binning myself and then perform the calculation.
example:
library(ggplot2)
library(data.table)
k <- runif(1000)
k <- k[order(k)]
y <- c(rbinom(n = 500, size = 1, prob = .05), rbinom(n = 500, size = 1, prob = .95))
w <- data.table(k, y)
so a plot(w$k, w$y) gives this:
So, theoretically, what I am looking for is:
ggplot(w, aes(k)) + geom_histogram(aes(y = stat(sum(y)/count)))
but it generates this:
Not sure if this is what you want, but sum(y) is going to be the same for all bars.
library(ggplot2)
library(data.table)
set.seed(13434)
k <- runif(1000)
k <- k[order(k)]
y <- c(rbinom(n = 500, size = 1, prob = .05), rbinom(n = 500, size = 1, prob = .95))
w <- data.table(k, y)
constant_value <- sum(w$y)
ggplot(w, aes(k)) + geom_histogram(aes(y = stat(constant_value/count)))
gives exactly the same plot as
ggplot(w, aes(k)) + geom_histogram(aes(y = stat(sum(w$y)/count)))
Edit:
Not sure if this helps you; here I cut the (sorted) data into groups of 30 observations each, in the spirit of ggplot2's default of 30 bins:
library(tidyverse)
w %>%
  arrange(k) %>%
  mutate(bin = cut_interval(1:length(k), length = 30, labels = FALSE)) %>%
  group_by(bin) %>%
  summarise(mean_y = mean(y),
            mean_k = mean(k),
            width = max(k) - min(k)) %>%
  ggplot(aes(mean_k, mean_y, width = width)) +
  geom_bar(stat = "identity") +
  labs(x = "k", y = "mean y")
which makes this figure:
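Not from the original answers: a possible sketch using ggplot2's stat_summary_bin(), which bins k and computes a per-bin summary of y directly, so mean(y) = sum(y)/count comes out without any manual binning. bins = 30 mirrors geom_histogram's default.
# Sketch: bin k, draw a bar at the mean of y within each bin
library(ggplot2)
ggplot(w, aes(k, y)) +
  stat_summary_bin(fun = mean, geom = "bar", bins = 30) +
  labs(y = "mean y")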

Plot for 100 random surveys

I am attempting to reproduce the following graph in R:
This is meant to represent the confidence intervals of 100 random polls, with a true mean of 42.9. I made some progress with qplot; however, there are some things I still couldn't get.
library(ggplot2)
polls <- replicate(100, rnorm(100, mean = 30, sd = 3))
# Calculate 90% confidence intervals for each poll (each column of the matrix).
tint <- matrix(NA, nrow = dim(polls)[2], ncol = 2)
for (i in 1:dim(polls)[2]) {
  temp <- t.test(polls[, i], conf.level = 0.9)
  tint[i, ] <- temp$conf.int
}
colnames(tint) <- c("lcl", "ucl")
# The width of each confidence interval:
width <- apply(tint, 1, diff)
tint <- cbind(tint, width)
tint <- data.frame(tint)
And with the command:
qplot(tint$width, y = 30, geom = "pointrange", ymin = tint$lcl, ymax = tint$ucl) +
  coord_flip() +
  theme_bw()
I get:
Questions:
How do I change y to represent each poll?
How to draw the line at the intended mean (30, in this case)?
Not a very elegant solution but it works:
ggplot(data = tint, aes(x = ucl - width / 2, y = seq_len(100))) +
  geom_point() +
  geom_errorbarh(aes(xmin = lcl, xmax = ucl)) +
  geom_vline(xintercept = 30, color = "red")
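An equivalent sketch of mine (not from the answer above) that keeps the question's pointrange idea, with the poll index mapped to y and the interval midpoint to x; it assumes ggplot2 >= 3.3.0 for the horizontal orientation of geom_pointrange:
# Add an explicit poll index, then draw one horizontal interval per poll
tint$poll <- seq_len(nrow(tint))
ggplot(tint, aes(x = (lcl + ucl) / 2, y = poll)) +
  geom_pointrange(aes(xmin = lcl, xmax = ucl)) +
  geom_vline(xintercept = 30, color = "red") +
  theme_bw()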

Add segments of circles to ggplot based on product of x & y

I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed, in a rather hacky way, to draw the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker <- function(x, by){
  # Original
  # data.frame(x = c(seq(0, x, by), 0),
  #            y = c(seq(x, 0, -by), 0))
  zero <- data.frame(x = 0, y = 0)
  rs <- seq(0, pi, by)
  xc <- x * cos(rs)
  yc <- x * sin(rs)
  gr <- data.frame(x = xc, y = yc)
  gr <- rbind(gr[gr$x >= 0, ], zero)
  return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x, y), aes(x, y, colour = x * y)) +
  geom_point() +
  geom_polygon(data = firstSeg, fill = "blue", alpha = .25) +
  geom_polygon(data = secondSeg, fill = "blue", alpha = .25) +
  geom_polygon(data = thirdSeg, fill = "blue", alpha = .25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin = c(-100/x, 0.34/x, 0.66/x),
                    ymax = c(0.34/x, 0.66/x, 100/x),
                    g = rep(c("Bad","OK","Good"), each = length(x)))
bounds$g = factor(bounds$g, levels = c("Bad","OK","Good"))
ggplot() +
  coord_cartesian(ylim = 0:1, xlim = 0:1) +
  geom_ribbon(data = bounds, aes(x, ymin = ymin, ymax = ymax, fill = g), colour = "grey50", lwd = 0.2) +
  geom_point(data = df, aes(x, y), colour = "grey20") +
  scale_fill_manual(values = hcl(c(15, 40, 240), 100, 80)) +
  # scale_fill_manual(values = hcl(c(15, 40, 240), 100, 80, alpha = 0.25)) + # if you want transparent fills
  labs(fill = "") +
  guides(fill = guide_legend(reverse = TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for a circle of radius r at a given x
cy = function(r, x) { sqrt(r^2 - x^2) }
n = 200
x = unlist(lapply(c(1/3, 2/3, 1), function(to) seq(0, to, len = n)))
bounds = data.frame(x,
                    ymin = c(rep(0, n),
                             cy(1/3, seq(0, 1/3, len = n/2)), rep(0, n/2),
                             cy(2/3, seq(0, 2/3, len = 2*n/3)), rep(0, n/3)),
                    ymax = c(cy(1/3, seq(0, 1/3, len = n)),
                             cy(2/3, seq(0, 2/3, len = n)),
                             rep(1, n)),
                    g = rep(c("Bad","OK","Good"), each = n))
bounds$g = factor(bounds$g, levels = c("Bad","OK","Good"))
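Then the same ggplot() call as above, run against this new bounds data frame, produces the circular version (repeated here for convenience):
ggplot() +
  coord_cartesian(ylim = 0:1, xlim = 0:1) +
  geom_ribbon(data = bounds, aes(x, ymin = ymin, ymax = ymax, fill = g), colour = "grey50", lwd = 0.2) +
  geom_point(data = df, aes(x, y), colour = "grey20") +
  scale_fill_manual(values = hcl(c(15, 40, 240), 100, 80)) +
  labs(fill = "") +
  guides(fill = guide_legend(reverse = TRUE))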
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
  x0 = 0,
  y0 = 0,
  start = 0,
  end = pi / 2,
  r0 = c(0, 1/3, 2/3),
  r = c(1/3, 2/3, 1),
  fill = c("bad", "ok", "good")
)
ggplot() +
  geom_arc_bar(data = arcs,
               aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
                   fill = fill), alpha = 0.6) +
  geom_point(data = data.frame(x = x, y = y),
             aes(x = x, y = y))
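One small addition of mine, not from the original answer: the arcs only render as true circle segments when both axes use the same scale, which coord_fixed() guarantees.
# Append equal axis scaling to the plot just drawn
last_plot() + coord_fixed()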
Based on @eipi10's great answer, to shade by the product x*y (which basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon <- CJ(pts, pts)
myRibbon[, prod := V1 * V2]
myRibbon[, cat := ifelse(prod <= 1/3, "bad",
                  ifelse(prod <= 2/3, "ok", "good"))]
myRibbon <- myRibbon[, .(ymin = min(V2), ymax = max(V2)), .(cat, V1)]
ggplot() +
  geom_ribbon(data = myRibbon,
              aes(x = V1, ymin = ymin, ymax = ymax,
                  group = cat, fill = cat),
              colour = "grey90", lwd = 0.2, alpha = .5) +
  geom_point(data = df, aes(x, y), colour = "grey20") +
  theme_minimal()
This doesn't do anything fancy, but it works out, for each value of x, the smallest and largest y values that could give rise to each band.
If I had just wanted arcs, the ggforce approach (@GregF) would be really great; it tucks away all the complexity.

Pairwise graphical comparison of several distributions

This is an edited version of a previous question.
We are given an m by n table of n observations (samples) over m variables (genes, etc.), and we are looking to study the behavior of the variables between each pair of observations, for instance the two observations with the highest positive or negative correlation. For this purpose I have seen a great chart in Stadler et al.'s Nature paper (2011):
Here is a sample dataset:
m <- 1000
samples <- data.frame(unif1 = runif(m), unif2 = runif(m, 1, 2), norm1 = rnorm(m),
                      norm2 = rnorm(m, 1), norm3 = rnorm(m, 0, 5))
I have already tested gpairs(samples) from the gpairs package, which produces this one. It's a good start, but it has no option to put correlation coefficients in the upper-right section, nor density plots in the lower corner:
Next I used ggpairs(samples, lower=list(continuous="density")) from the GGally package (thanks @LucianoSelzer for a comment below). Now we have correlations in the upper corner and densities in the lower corner, but we are missing the diagonal barplots, and the density plots are not heatmap-shaped.
Any ideas to make this closer to the desired picture (the first one)?
You could try to combine several different plotting methods and combine the results. Here's an example, which could be tweaked accordingly:
cors <- round(cor(samples), 2) # correlations
# make a matrix for the plot layout
laymat <- diag(1:5)                 # histograms on the diagonal
laymat[upper.tri(laymat)] <- 6:15   # correlations
laymat[lower.tri(laymat)] <- 16:25  # heatmaps
layout(laymat) # define the layout using laymat
par(mar = c(2, 2, 2, 2)) # define margins etc.
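# For reference (my addition), laymat assigns plot numbers to cells like this,
# so plots 1-5 are the diagonal histograms, 6-15 the upper-triangle
# correlations, and 16-25 the lower-triangle heatmaps:
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    6    7    9   12
# [2,]   16    2    8   10   13
# [3,]   17   20    3   11   14
# [4,]   18   21   23    4   15
# [5,]   19   22   24   25    5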
# Draw histograms; tweak the arguments of hist to make nicer figures
for(i in 1:5)
  hist(samples[, i], main = names(samples)[i])
# Write correlations to the upper-triangle part of the graph
# Again, tweak accordingly
for(i in 1:4)
  for(j in (i + 1):5){
    plot(-1:1, -1:1, type = "n", xlab = "", ylab = "", xaxt = "n", yaxt = "n")
    text(x = 0, y = 0, labels = paste(cors[i, j]), cex = 2)
  }
# Plot heatmaps, using the kde2d function for density estimation
# and the image function for generating the heatmaps
library(MASS)
for(i in 2:5)
  for(j in 1:(i - 1)){
    k <- kde2d(samples[, i], samples[, j])
    image(k, col = heat.colors(1000))
  }
edit: Corrected indexing on the last loop.
You can do something like this using three different packages and two different functions, as below:
cor_fun computes and draws the correlation labels for the upper triangle.
my_fn draws the lower-triangle heatmaps.
You also need ggpairs from GGally.
library(GGally)
library(ggplot2)
library(RColorBrewer)
m <- 1000
samples <- data.frame(unif1 = runif(m), unif2 = runif(m, 1, 2), norm1 = rnorm(m),
norm2 = rnorm(m, 1), norm3 = rnorm(m, 0, 5))
cor_fun <- function(data, mapping, method = "pearson", ndp = 2, sz = 5, stars = TRUE){
  # ndp adjusts the number of decimals shown
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  corr <- cor.test(x, y, method = method)
  est <- corr$estimate
  lb.size <- sz
  if(stars){
    stars <- c("***", "**", "*", "")[findInterval(corr$p.value, c(0, 0.001, 0.01, 0.05, 1))]
    lbl <- paste0(round(est, ndp), stars)
  }else{
    lbl <- round(est, ndp)
  }
  ggplot(data = data, mapping = mapping) +
    annotate("text", x = mean(x, na.rm = TRUE), y = mean(y, na.rm = TRUE), label = lbl, size = lb.size) +
    theme(panel.grid = element_blank(), panel.background = element_rect(fill = "snow1"))
}
colfunc <- colorRampPalette(c("darkblue", "cyan", "yellow", "red"))
my_fn <- function(data, mapping){
  ggplot(data = data, mapping = mapping) +
    stat_density2d(aes(fill = ..density..), geom = "tile", contour = FALSE) +
    scale_fill_gradientn(colours = colfunc(100)) +
    theme_classic()
}
ggpairs(samples, columns = c(1, 2, 3, 4, 5),
        lower = list(continuous = my_fn),
        diag = list(continuous = wrap("densityDiag", fill = "gray92")), # densityDiag is a GGally function
        upper = list(continuous = cor_fun)) +
  theme(panel.background = element_rect(fill = "white")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, color = "black")) +
  theme(axis.text.y = element_text(angle = 0, vjust = 1, color = "black"))
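A side note of mine, not part of the original answer: newer ggplot2 releases (3.4.0 and later) deprecate the ..density.. dot-dot notation in favour of after_stat(), so the fill mapping inside my_fn would now be spelled:
# Modern spelling of the computed-variable mapping
stat_density2d(aes(fill = after_stat(density)), geom = "tile", contour = FALSE)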
