Wrong area on normal curve plot - r

I'm trying to learn R from scratch and I just delivered a college assignment for hypothesis testing a binomial distribution (proportion test for one sample) that I used R to solve and plot. But I ran into some problems.
My sample size is 130, success cases are 68.
H0: π = 50%
H1: π > 50
The is the code I used (plenty of copy-paste and trial/error)
library(ggplot2)
library(ggthemes)
library(scales)
#data
n = 130
p = 1/2
stdev = sqrt(n*p*(1-p))
mean_binon = n*p
cases = 68
ztest = (cases-mean_binon)/stdev
pvalor = pnorm(-abs(ztest))
zcrit = qnorm(0.975)
#normal curve
xvalues <- data.frame(x = c(-4, 4))
#first plots and lines
p1 <- ggplot(xvalues, aes(x = xvalues))
p2 <- p1 + stat_function(fun = dnorm) + xlim(c(-4, 4)) +
geom_vline(xintercept = ztest, linetype="solid", color="blue",
size=1) +
geom_vline(xintercept = zcrit, linetype="solid", color="red",
size=1)
#z area function
area_z <- function(x){
norm_z <- dnorm(x)
norm_z[x < ztest] <- NA
return(norm_z)
}
#critical z area function
area_zc <- function(x){
norm_zc <- dnorm(x)
norm_zc[x < zcrit] <- NA
return(norm_zc)
}
#area value
valor_area_z <- round(pnorm(4) - pnorm(ztest), 3)
valor_area_zc <- round(pnorm(4) - pnorm(zcrit), 3)
#final plot
p3 <- p2 + stat_function(fun = dnorm) +
stat_function(fun = area_z, geom = "area", fill = "blue", alpha = 0.3) +
geom_text(x = 1.13, y = 0.1, size = 5, fontface = "bold",
label = paste0(valor_area_z * 100, "%")) +
stat_function(fun = area_zc, geom = "area", fill = "red", alpha = 0.5) +
geom_text(x = 2.27, y = 0.015, size = 3, fontface = "bold",
label = paste0(valor_area_zc * 100, "%")) +
scale_x_continuous(breaks = c(-3:3)) +
labs(x = "\n z", y = "f(z) \n", title = "Distribuição Normal \n") +
theme_fivethirtyeight()
p3
Here's the plot
There is a gap between my geom_vline's and the shaded area. I'm not sure if I'm doing the wrong steps with my statistics or this is an R related problem. Maybe both? Sorry if this is elementary. I'm not good at both but I'm trying to improve.

A solution is to use the option xlim inside stat_function which defines the range of the function. You can also replace area_z and area_zc with dnorm.
p3 <- p2 + stat_function(fun = dnorm) +
stat_function(fun = dnorm, geom = "area", fill = "blue", alpha = 0.3,
xlim = c(ztest,zcrit)) +
geom_text(x = 1.13, y = 0.1, size = 5, fontface = "bold",
label = paste0(valor_area_z * 100, "%")) +
stat_function(fun = dnorm, geom = "area", fill = "red", alpha = 0.5,
xlim = c(zcrit,xvalues$x[2])) +
geom_text(x = 2.27, y = 0.015, size = 3, fontface = "bold",
label = paste0(valor_area_zc * 100, "%")) +
scale_x_continuous(breaks = c(-3:3)) +
labs(x = "\n z", y = "f(z) \n", title = "Distribuição Normal \n") +
theme_fivethirtyeight()
p3

Related

facet_wrap not showing the second facet with geom_density_ridges

I am trying to subset parts of a data you can download the csv from here and read it as datplot. This is what its head looks like:
> head(datplot)
# A tibble: 6 × 3
bta electrode value
<chr> <chr> <dbl>
1 b0 Fz 3.03
2 b0 Cz 1.78
3 b0 Pz -1.05
4 b0 Fz 3.78
5 b0 Cz 2.82
6 b0 Pz -0.242
As you can see, the data has a variable named bta with levels "b0" and "b1" and values from a particular distribution. What I am trying to do, is wrap the two facets but I can't manage to do it.
This is the ggplot code I am using at the moment:
time <- "225"
col <- c("#004d8d", "#cc2701", "#e5b400")
p1 <- datplot %>% ggplot(mapping = aes(x=value, y=factor(electrode, level = c('Fz','Cz','Pz')), group = electrode, color = electrode)) +
scale_y_discrete() +
geom_rect(data=data.frame(), inherit.aes = FALSE, mapping = aes(
ymin = 0, ymax = Inf, xmin = -0.1 * min(stdev), xmax = 0.1 * max(stdev)), fill = "black", alpha = 0.1) +
geom_density_ridges(data= subset(datplot, bta='b0'), scale = -0.5, alpha=0.2, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.5, 0.975),
vline_color = alpha("white", 0.3), aes(fill = electrode)) +
#plotb1
geom_density_ridges(data= subset(datplot, bta='b1'),scale = -0.5, alpha=0.5, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.5, 0.975),
vline_color = alpha("white", 0.6), aes(fill = electrode)) +
facet_wrap(~bta) +
scale_color_manual(values = col, breaks = c("Fz", "Cz", "Pz")) +
scale_fill_manual(values = col, breaks = c("Fz", "Cz", "Pz")) +
labs(title=sprintf('%s ms.',time)) +
ylab("Electrode") +
xlab(TeX(r'(Signal (µV) Posteriors $\beta_1\cdot x\; (x=1)$)')) +
theme_light() +
theme(axis.text = element_text(size = 14)) +
theme(axis.title = element_text(size = 16)) +
theme(plot.title = element_text(size = 20)) +
coord_flip(xlim = c(-8, 8), ylim = c(0.4,3.05), expand = FALSE, clip = "on")
p1
This code results only with the plot corresponding to "b0" but the other facet seems to be missing, as you can see in the following image:
And for some reason, the data that corresponds to "b1" does not appear. I am probably failing in something with the code that I am unsuccesful identifying. In case it helps, the missing plot, which I plotted alone, would look something like this:
So clearly something is failing when using both geom_density_ridges together in the previous code.
[UPDATE]
Following a comment by #Paul, I tried to not subset the data with the following code:
p1 <- datplot %>% ggplot(mapping = aes(x=value, y=factor(electrode, level = c('Fz','Cz','Pz')), group = electrode, color = electrode)) +
scale_y_discrete() +
geom_rect(data=data.frame(), inherit.aes = FALSE, mapping = aes(
ymin = 0, ymax = Inf, xmin = -0.1 * min(stdev), xmax = 0.1 * max(stdev)), fill = "black", alpha = 0.1) +
geom_density_ridges(data= datplot, scale = -0.5, alpha=0.5, show.legend = FALSE,
quantile_lines = TRUE, quantiles = c(0.025, 0.5, 0.975),
vline_color = alpha("white", 0.3), aes(fill = electrode)) +
facet_wrap(~bta) +
scale_color_manual(values = col, breaks = c("Fz", "Cz", "Pz")) +
scale_fill_manual(values = col, breaks = c("Fz", "Cz", "Pz")) +
labs(title=sprintf('%s ms.',time)) +
ylab("Electrode") +
xlab(TeX(r'(Signal (µV) Posteriors $\beta_1\cdot x\; (x=1)$)')) +
theme_light() +
theme(axis.text = element_text(size = 14)) +
theme(axis.title = element_text(size = 16)) +
theme(plot.title = element_text(size = 20)) +
coord_flip(xlim = c(-8, 8), ylim = c(0.4,3.05), expand = FALSE, clip = "on")
p1
With the same output:
Thanks for the help!
Attempted a minimal reproducible example:
library(ggplot2)
library(ggridges)
library(magrittr)
# data for testing
datplot <- data.frame(
bta = rep(c("b0", "b1"), each = 75),
electrode = rep(c("Fz","Cz","Pz"), times = 50),
value = runif(150,-2,5)
)
time <- "225"
col <- c("#004d8d", "#cc2701", "#e5b400")
p1 <- datplot %>%
ggplot(mapping = aes(x=value, y=factor(electrode, levels = c('Fz','Cz','Pz')), group = electrode, color = electrode)) +
scale_y_discrete() +
geom_density_ridges(scale = -0.5, alpha=0.2,
quantile_lines = TRUE,
quantiles = c(0.025, 0.5, 0.975),
vline_color = alpha("white", 0.3),
aes(fill = electrode)) +
facet_wrap(~bta) +
coord_flip(xlim = c(-8, 8), ylim = c(0.4,3.05), expand = FALSE, clip = "on")
p1
This produces a faceted plot which I think meets the meat of the request - there is much to be done to match desired aesthetics but this should provide a start point from which you can tweak the appearance.

How to add a vertical blank space between straight and inverted geom_density() with ggplot2

I am trying to reproduce this kind of Figure, with two densities, a first one pointing upwards and a second one pointing downwards. I would also like to have some blank space between the two densities.
Here is the code I am currently using.
library(hrbrthemes)
library(tidyverse)
library(RWiener)
# generating data
df <- rwiener(n = 1e2, alpha = 2, tau = 0.3, beta = 0.5, delta = 0.5)
df %>%
ggplot(aes(x = q) ) +
geom_density(
data = . %>% filter(resp == "upper"),
aes(y = ..density..),
colour = "steelblue", fill = "steelblue",
outline.type = "upper", alpha = 0.8, adjust = 1, trim = TRUE
) +
geom_density(
data = . %>% filter(resp == "lower"),
aes(y = -..density..), colour = "orangered", fill = "orangered",
outline.type = "upper", alpha = 0.8, adjust = 1, trim = TRUE
) +
# stimulus onset
geom_vline(xintercept = 0, lty = 1, col = "grey") +
annotate(
geom = "text",
x = 0, y = 0,
# hjust = 0,
vjust = -1,
size = 3, angle = 90,
label = "stimulus onset"
) +
# aesthetics
theme_ipsum_rc(base_size = 12) +
theme(axis.text.y = element_blank() ) +
labs(x = "Reaction time (in seconds)", y = "") +
xlim(0, NA)
Which results in something like...
How could I add some vertical space between the two densities to reproduce the above Figure?
If you want to try without faceting, you're probably best to just plot the densities as polygons with adjusted y values according to your desired spacing:
s <- 0.25 # set to change size of the space
ud <- density(df$q[df$resp == "upper"])
ld <- density(df$q[df$resp == "lower"])
x <- c(ud$x[1], ud$x, ud$x[length(ud$x)],
ld$x[1], ld$x, ld$x[length(ld$x)])
y <- c(s, ud$y + s, s, -s, -ld$y - s, -s)
df2 <- data.frame(x = x, y = y,
resp = rep(c("upper", "lower"), each = length(ud$x) + 2))
df2 %>%
ggplot(aes(x = x, y = y, fill = resp, color = resp) ) +
geom_polygon(alpha = 0.8) +
scale_fill_manual(values = c("steelblue", "orangered")) +
scale_color_manual(values = c("steelblue", "orangered"), guide = guide_none()) +
geom_vline(xintercept = 0, lty = 1, col = "grey") +
annotate(
geom = "text",
x = 0, y = 0,
# hjust = 0,
vjust = -1,
size = 3, angle = 90,
label = "stimulus onset"
) +
# aesthetics
theme_ipsum_rc(base_size = 12) +
theme(axis.text.y = element_blank() ) +
labs(x = "Reaction time (in seconds)", y = "")
you can try facetting
set.seed(123)
q=rbeta(100, 0.25, 1)
df_dens =data.frame(gr=1,
x=density(df$q)$x,
y=density(df$q)$y)
df_dens <- rbind(df_dens,
data.frame(gr=2,
x=density(df$q)$x,
y=-density(df$q)$y))
ggplot(df_dens, aes(x, y, fill = factor(gr))) +
scale_x_continuous(limits = c(0,1)) +
geom_area(show.legend = F) +
facet_wrap(~gr, nrow = 2, scales = "free_y") +
theme_minimal() +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank())
The space between both plots can be increased using panel.spacing = unit(20, "mm"). Instead of facet_grid you can also try facet_grid(gr~., scales = "free_y")

Warning message 'mapping' is not used by stat_function() in R

While completing a project for understanding central limit theorem for exponential distribution, I ran into an annoying error message when plotting simulated vs theoretical distributions. When I run the code below, I get an error: 'mapping' is not used by stat_function().
By mapping I assume the error is referring to the aes parameter, which I later map to color red using scale_color_manual in order to show it in a legend.
My question is two-fold: why is this error happening? and is there a more efficient way to create a legend without using scale_color_manual?
Thank you!
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
It's not an error, it's a warning:
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
#> Warning: `mapping` is not used by stat_function()
Created on 2020-05-01 by the reprex package (v0.3.0)
You can suppress the warning by calling geom_line(stat = "function") rather than stat_function():
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
geom_line(stat = "function", fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
Created on 2020-05-01 by the reprex package (v0.3.0)
In my opinion, the warning is erroneous, and an issue has been filed about this problem: https://github.com/tidyverse/ggplot2/issues/3611
However, it's not that easy to solve, and therefore as of now the warning is there.
I'm unable to recreate your issue -- when I run your code a plot is generated (below), which suggests the issue is likely to do you with your environment. A general 'solution' is to clear your workspace using the menu dropdown or similar: Session -> Clear workspace..., then re-run your code.
For refactoring the color issue, you can simplify scale_color_manual to
scale_color_manual("Legend", values = c('blue','red')), but how it is now, is a bit better in my view. Anything beyond that has more to do with changing the data structure and mapping.
Apologies, I don't have the rep to make a comment.

Removing the border of legend symbol

I was trying to plot some predicted vs. actual data, something that resembles the following:
# Some random data
x <- seq(1: 10)
y_pred <- runif(10, min = -10, max = 10)
y_obs <- y_pred + rnorm(10)
# Faking a CI
Lo.95 <- y_pred - 1.96
Hi.95 <- y_pred + 1.96
my_df <- data.frame(x, y_pred, y_obs, Lo.95, Hi.95)
ggplot(my_df, aes(x = x, y = y_pred)) +
geom_line(aes(colour = "Forecasted Data"), size = 1.2) +
geom_point(aes(x = x, y = y_obs, colour = "Actual Data"), size = 3) +
geom_ribbon(aes(ymin=Lo.95, ymax=Hi.95, x=x, linetype = NA, colour = "Confidence Interval"), alpha=0.2) +
theme_grey() +
scale_colour_manual(
values = c("gray30", "blue", "red"),
guide = guide_legend(override.aes = list(
border=c(NA, NA, NA),
fill=c("gray30", "white", "white"),
linetype = c("blank", "blank", "solid"),
shape = c(NA, 19, NA))))
The plot looks like this:
The only issue I have with this plot is the red border surrounding the legend item symbol for the line (i.e. the forecasted data). Is there any way I can remove it without breaking the rest of my plot?
I think geom_ribbon was the problem. If we take its color & fill out of aes, everything looks fine
library(ggplot2)
# Some random data
x <- seq(1: 10)
y_pred <- runif(10, min = -10, max = 10)
y_obs <- y_pred + rnorm(10)
# Faking a CI
Lo.95 <- y_pred - 1.96
Hi.95 <- y_pred + 1.96
my_df <- data.frame(x, y_pred, y_obs, Lo.95, Hi.95)
m1 <- ggplot(my_df, aes(x = x, y = y_pred)) +
geom_point(aes(x = x, y = y_obs, colour = "Actual"), size = 3) +
geom_line(aes(colour = "Forecasted"), size = 1.2) +
geom_ribbon(aes(x = x, ymin = Lo.95, ymax = Hi.95),
fill = "grey30", alpha = 0.2) +
scale_color_manual("Legend",
values = c("blue", "red"),
labels = c("Actual", "Forecasted")) +
guides( color = guide_legend(
order = 1,
override.aes = list(
color = c("blue", "red"),
fill = c("white", "white"),
linetype = c("blank", "solid"),
shape = c(19, NA)))) +
theme_bw() +
# remove legend key border color & background
theme(legend.key = element_rect(colour = NA, fill = NA),
legend.box.background = element_blank())
m1
As we leave Confidence Interval out of aes, we no longer have its legend. One workaround is to create an invisible point and take one unused geom to manually create a legend key. Here we can use size/shape (credit to this answer)
m2 <- m1 +
geom_point(aes(x = x, y = y_obs, size = "Confidence Interval", shape = NA)) +
guides(size = guide_legend(NULL,
order = 2,
override.aes = list(shape = 15,
color = "lightgrey",
size = 6))) +
# Move legends closer to each other
theme(legend.title = element_blank(),
legend.justification = "center",
legend.spacing.y = unit(0.05, "cm"),
legend.margin = margin(0, 0, 0, 0),
legend.box.margin = margin(0, 0, 0, 0))
m2
Created on 2018-03-19 by the reprex package (v0.2.0).
A better way to address this question would be to specify show.legend = F option in the geom_ribbon(). This will eliminate the need for the second step for adding and merging the legend key for the confidence interval. Here is the code with slight modifications.
ggplot(my_dff, aes(x = x, y = y_pred)) +
geom_line(aes(colour = "Forecasted Data"), size = 1) +
geom_point(aes(x = x, y = y_obs, colour = "Actual Data"), size = 1) +
geom_ribbon(aes(ymin=Lo.95, ymax=Hi.95, x=x, linetype = NA, colour = "Confidence Interval"), alpha=0.2, show.legend = F) +
theme_grey() +
scale_colour_manual(
values = c("blue", "gray30", "red"))+
guides(color = guide_legend(
override.aes = list(linetype = c(1, 1, 0)),
shape = c(1, NA, NA),
reverse = T))
My plot
Credit to https://stackoverflow.com/users/4282026/marblo
for their answer to similar question.

Applying log scale to y-axis for visualizing proportions with ggplot2

I am attempting to recreate some plots from a research article in R and am running into an issue with applying a log scale to y axis. The visualization I'm attempting to recreate is this:
reference plot with y log scale
I currently have a working version without the logarithmic scale applied to the y-axis:
Proportion_Mean_Plot <- ggplot(proportions, aes(days2,
proportion_mean, group = observation)) +
geom_point(aes(shape = observation)) +
geom_line() +
scale_x_continuous(breaks = seq(0,335,20)) +
scale_y_continuous(breaks = seq(0,6,.5)) +
theme_tufte() +
geom_rangeframe() +
theme(legend.position="none") +
theme(axis.line.x = element_line(colour = "black", size = 0.5, linetype = 1),
axis.line.y = element_line(colour = "black", size = 0.5, linetype = 1)) +
labs(title = "Proportion of Baseline Mean",
subtitle = "Daily steps within each intervention phase",
x = "DAYS",
y = "PROPORTION OF BASELINE \n(MEAN)") +
geom_vline(xintercept = 164.5) +
geom_hline(yintercept = 1) +
annotate("text", x = c(82, 246), y = 5,
label = c("Intervention 1", "Intervention 2")) +
geom_segment(aes(x = 0, y = mean, xend = end, yend = mean),
data = proportion_intervention1_data) +
geom_segment(aes(x = start, y = mean, xend = end, yend = mean),
data = proportion_intervention2_data, linetype = 4)
This produces a decent representation of the original:
normally scaled y-axis plot
I would like to try to apply that logarithmic scaling to more closely match it. Any help is appreciated.
As per Richard's suggestion, here is a quick example how you can use scale_y_log10:
suppressPackageStartupMessages(library(tidyverse))
set.seed(123)
# generate some data
proportions <- tibble(interv_1 = pmax(0.4, rnorm(160, mean = 1.3, sd = 0.2)),
interv_2 = pmax(0.01, rnorm(160, mean = 1.6, sd = 0.5)))
proportions <- proportions %>%
gather(key = observation, value = proportion_mean) %>%
mutate(days2 = 1:320)
# create the plot
ggplot(proportions, aes(days2, proportion_mean, group = observation)) +
geom_point(aes(shape = observation)) +
geom_line() +
scale_x_continuous(breaks = seq(0,335,20), expand = c(0, 0)) +
scale_y_log10(breaks = c( 0.1, 0.5, 1, 2, 3, 4, 5), limits = c(0.1, 5)) +
# theme_tufte() +
# geom_rangeframe() +
theme(legend.position="none") +
theme(axis.line.x = element_line(colour = "black", size = 0.5, linetype = 1),
axis.line.y = element_line(colour = "black", size = 0.5, linetype = 1)) +
labs(title = "Proportion of Baseline Mean",
subtitle = "Daily steps within each intervention phase",
x = "DAYS",
y = "PROPORTION OF BASELINE \n(MEAN)") +
geom_vline(xintercept = 164.5) +
geom_hline(yintercept = 1) +
annotate("text", x = c(82, 246), y = 5,
label = c("Intervention 1", "Intervention 2")) +
# plugged the values for the means of the two distributions
geom_segment(aes(x = 0, y = 1.3, xend = 164.5, yend = 1.3)) +
geom_segment(aes(x = 164.5, y = 1.6, xend = 320, yend = 1.6), linetype = 4)

Resources