Plot for 100 random surveys - r

I am attempting to reproduce the following graph in R:
This is meant to represent 100 random polls confidence intervals, with a mean of 42.9. I made some progress with qplot, however there are some things that I still couldn't get.
library(ggplot2)
polls <- replicate(100, rnorm(100, mean = 30, sd=3))
# Calculate 90% confidence intervals for each row.
tint <- matrix(NA, nrow = dim(polls)[2], ncol = 2)
for (i in 1:dim(polls)[2]) {
temp <- t.test(polls[, i], conf.level = 0.9)
tint[i, ] <- temp$conf.int
}
colnames(tint) <- c("lcl", "ucl")
# The width of each confidence interval:
width <- apply(tint, 1, diff)
tint <- cbind(tint, width)
tint <- data.frame(tint)
And with the command:
qplot(tint$width, y=30, geom="pointrange",ymin = tint$lcl, ymax = tint$ucl) + coord_flip() +
theme_bw()
I get:
Questions:
How do I change y to represent each poll?
How to draw the line at the intended mean (30, in this case)?

Not a very elegant solution but it works:
ggplot(data = tint, aes(x = ucl - width/2, y = seq(1:100))) +
geom_point() +
geom_errorbarh(aes(xmin = lcl, xmax = ucl)) +
geom_vline(xintercept = 30, color = "red")

Related

Add OLS squares to ggplot2 graph

I'm trying to plot a graph with points, OLS line and I want to add a square to every point to explain how OLS work. I am doing so using for loop which is not the best in the first place, but it is what it is. My code is as follows:
library(ggplot2)
set.seed(0)
x = sort(rnorm(20, 5, 5))
y = sort(rnorm(20, 5, 10))
lmod = lm(y ~ x)
subor <- data.frame(prva = x, druha = y)
a = ggplot(subor, aes_string(x = x, y = y)) +
geom_point() +
geom_abline(slope = lmod$coefficients[2], intercept = lmod$coefficients[1]) +
coord_fixed()
for(i in 1:20) {
a = a + geom_rect(aes(xmin = ifelse(y[i] > lmod$fitted.values[i], x[i] - (y[i] - lmod$fitted.values[i]), x[i]),
ymin = min(y[i], lmod$fitted.values[i]), ymax = max(y[i], lmod$fitted.values[i]),
xmax = ifelse(y[i] > lmod$fitted.values[i], x[i], x[i] + (lmod$fitted.values[i] - y[i]))))
}
a
But instead of getting all the squares I only get the last one.
You don't really need a loop here. Use the vectorized pmin and pmax functions to work out the minimum and maximum edges of your squares from the fitted values and residuals:
a + geom_rect(alpha = 0.2, col = 'gray60',
aes(ymin = pmin(druha, fitted(lmod)),
ymax = pmax(druha, fitted(lmod)),
xmin = pmin(prva, prva - resid(lmod)),
xmax = pmax(prva, prva - resid(lmod))))

Illustrate standard deviation in histogram

Consider the following simple example:
# E. Musk in Grunheide
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n = 1000, mean = 10, sd = 10)
# empirical sd
sd(randomNumbers)
#> [1] 10.34369
# histogram
hist(randomNumbers, probability = TRUE, main = "", breaks = 50)
# just for illusatration purpose
###
# empirical density
lines(density(randomNumbers), col = 'black', lwd = 2)
# theortical density
curve(dnorm(x, mean = 10, sd = 10), col = "blue", lwd = 2, add = TRUE)
###
Created on 2022-03-22 by the reprex package (v2.0.1)
Question:
Is there a nice way to illustrate the empirical standard deviation (sd) in the histogram by colour?
E.g. representing the inner bars by a different color, or indicating the range of the sd by an interval, i.e., [mean +/- sd], on the x-axis?
Note, if ggplot2 provides an easy solution, suggesting this would be also much appreciated.
This is similar ggplot solution to Benson's answer, except we precompute the histogram and use geom_col, so that we don't get any of the unwelcome stacking at the sd boundary:
# E. Musk in Grunheide
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n=1000, mean=10, sd=10)
h <- hist(randomNumbers, breaks = 50, plot = FALSE)
lower <- mean(randomNumbers) - sd(randomNumbers)
upper <- mean(randomNumbers) + sd(randomNumbers)
df <- data.frame(x = h$mids, y = h$density,
fill = h$mids > lower & h$mids < upper)
library(ggplot2)
ggplot(df) +
geom_col(aes(x, y, fill = fill), width = 1, color = 'black') +
geom_density(data = data.frame(x = randomNumbers),
aes(x = x, color = 'Actual density'),
key_glyph = 'path') +
geom_function(fun = function(x) {
dnorm(x, mean = mean(randomNumbers), sd = sd(randomNumbers)) },
aes(color = 'theoretical density')) +
scale_fill_manual(values = c(`TRUE` = '#FF374A', 'FALSE' = 'gray'),
name = 'within 1 SD') +
scale_color_manual(values = c('black', 'blue'), name = 'Density lines') +
labs(x = 'Value of random number', y = 'Density') +
theme_minimal()
Here is a ggplot solution. First calculate mean and sd, and save the values in different vectors. Then use an ifelse statement to categorise the values into "Within range" and "Outside range", fill them with different colours.
Blue line represents the normal distribution stated in your question, and black line represents the density graph of the histogram we're plotting.
library(ggplot2)
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n=1000, mean=10, sd=10)
randomNumbers_mean <- mean(randomNumbers)
randomNumbers_sd <- sd(randomNumbers)
ggplot(data.frame(randomNumbers = randomNumbers), aes(randomNumbers)) +
geom_histogram(aes(
fill = ifelse(
randomNumbers > randomNumbers_mean + randomNumbers_sd |
randomNumbers < randomNumbers_mean - randomNumbers_sd,
"Outside range",
"Within range"
)
),
binwidth = 1, col = "gray") +
geom_density(aes(y = ..count..)) +
stat_function(fun = function(x) dnorm(x, mean = 10, sd = 10) * 1000,
color = "blue") +
labs(fill = "Data")
Created on 2022-03-22 by the reprex package (v2.0.1)
data.frame(rand = randomNumbers,
cut = {
sd <- sd(randomNumbers)
mn <- mean(randomNumbers)
cut(randomNumbers, c(-Inf, mn -sd, mn +sd, Inf))
}) |>
ggplot(aes(x = rand, fill = cut ) ) +
geom_histogram()

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes zscore breakpoints, calculates the fraction of data between each breakpoint, and colors each breakpoint section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between brekpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) + geom_area() +
scale_x_continuous(breaks= c(seq(-5,5,1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata(vector of how much data is in section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct data between brekpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
geom_area(aes(fill = standard_normal_sections)) +
geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
vjust = 1, size = 8 / .pt, nudge_y = -.01) +
scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))

Add segments of circles to ggplot based on product of x & y

I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed to do in a rather hacky way the drawing of the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker<-function(x,by){
# Original
# data.frame(x=c(seq(0,x,by),0)
# ,y=c(seq(x,0,-by),0)
# )
zero <- data.frame(x = 0, y = 0)
rs <- seq(0, pi, by)
xc <- x * cos(rs)
yc <- x * sin(rs)
gr <- data.frame(x = xc, y = yc)
gr <- rbind(gr[gr$x >= 0, ], zero)
return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x,y),aes(x,y, colour=x*y))+
geom_point() +
geom_polygon(data=firstSeg, fill="blue", alpha=.25)+
geom_polygon(data=secondSeg, fill="blue", alpha=.25)+
geom_polygon(data=thirdSeg, fill="blue", alpha=.25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin=c(-100/x, 0.34/x, 0.66/x),
ymax=c(0.34/x, 0.66/x, 100/x),
g=rep(c("Bad","OK","Good"), each=length(x)))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
ggplot() +
coord_cartesian(ylim=0:1, xlim=0:1) +
geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
geom_point(data=df, aes(x,y), colour="grey20") +
scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
#scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80, alpha=0.25)) + # If you want the fill colors to be transparent
labs(fill="") +
guides(fill=guide_legend(reverse=TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for circle, given r and x
cy = function(r, x) {sqrt(r^2 - x^2)}
n = 200
x = unlist(lapply(c(1/3,2/3,1), function(to) seq(0, to, len=n)))
bounds = data.frame(x, ymin = c(rep(0, n),
cy(1/3, seq(0, 1/3, len=n/2)), rep(0, n/2),
cy(2/3, seq(0, 2/3, len=2*n/3)), rep(0, n/3)),
ymax = c(cy(1/3, seq(0,1/3,len=n)),
cy(2/3, seq(0,2/3,len=n)),
rep(1,n)),
g=rep(c("Bad","OK","Good"), each=n))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
x0 = 0,
y0 = 0,
start = 0,
end = pi / 2,
r0 = c(0, 1/3, 2/3),
r = c(1/3, 2/3, 1),
fill = c("bad", "ok", "good")
)
ggplot() +
geom_arc_bar(data = arcs,
aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
fill = fill), alpha = 0.6) +
geom_point(data = data.frame(x = x, y = y),
aes(x = x, y = y))
Based on #eipi10's great answer, to do the product component (basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon<-CJ(pts,pts)
myRibbon[,prod:=V1 * V2]
myRibbon[,cat:=ifelse(prod<=1/3,"bad",
ifelse(prod<=2/3,"ok","good"))]
myRibbon<-myRibbon[
,.(ymin=min(V2),ymax=max(V2))
,.(cat,V1)]
ggplot() +
geom_ribbon(data=myRibbon
, aes(x=V1, ymin=ymin,ymax=ymax
, group=cat, fill=cat),
colour="grey90", lwd=0.2, alpha=.5)+
geom_point(data=df, aes(x,y), colour="grey20") +
theme_minimal()
This doesn't do anything fancy but works out for each value of x, what the smallest and largest values were that could give rise to a specific banding.
If I had just wanted arcs, the use of ggforce (#GregF) would be really great- it tucks away all the complexity.

How to visualise the difference between probability distribution functions? [closed]

Closed. This question is opinion-based. It is not currently accepting answers.
Want to improve this question? Update the question so it can be answered with facts and citations by editing this post.
Closed 7 years ago.
Improve this question
I try to visualise the difference between two histograms of distribution functions such as the difference in following two curves :
When the difference is big, you could just plot two curves on top of each other and fill the difference as denoted above, though when the difference becomes very small, this is cumbersome. Another way to plot this, is plotting the difference itself as follows :
However, this seems very hard to read for everyone seeing such a graph for the first time, so i was wondering: is there any other way you can visualise the difference between two distribution functions ?
I thought that maybe it might be an option to simply combine your two propositions, while scaling up the differences to make them visible.
What follows is an attempt to do this with ggplot2. Actually it was quite a bit more involved to do this than I initially thought, and I'm definitely not a hundred percent satisfied with the result; but maybe it helps nevertheless. Comments and improvements very welcome.
library(ggplot2)
library(dplyr)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_line(aes(x = x, y = 5 * abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = 5 * abs(ydiff), fill = side, alpha = 0.4))
g3 <- g2 +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, colour = FALSE) +
ylab("Curves: density\n Shaded area: 5 * difference of densities") +
scale_fill_manual(name = "samples", labels = 1:2, values = gg_color_hue(2)) +
scale_colour_manual(limits = list(1, 2, FALSE, TRUE), values = rep(gg_color_hue(2), 2))
print(g3)
Sources: SO answer 1, SO answer 2
As suggested by #Gregor in the comments, here's a version that does two separate plots below eachother but sharing the same x axis scaling. At least the legends should obviously be tweaked.
library(ggplot2)
library(dplyr)
library(grid)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, fill = FALSE)
g3 <- ggplot(df2) +
geom_line(aes(x = x, y = abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = abs(ydiff), fill = side, alpha = 0.4)) +
guides(alpha = FALSE, fill = FALSE)
## See [3]
grid.draw(rbind(ggplotGrob(g2), ggplotGrob(g3), size="last"))
... or with abs(ydiff) replaced by ydiff in the construction of the second plot:
Source: SO answer 3

Resources