Adding to ggplot mean and SD plot - r

I would like to ask for the help.
I am trying to plot data and their mean and SD values on one graph. But I am getting this error
Error in eval(substitute(list(...)), _data, parent.frame()) :
object 'x' not found
First I am dividing data into intervals, and calculate mean and SD values of the intervals using summary. Than I am trying to plot data points (that part works) and add mean and SD value graph to the previous one (here I fail).
Please help me to resolve this issue.
UPD: Ok, I think I should have used stat_summary on the ss data set. Just do not know how to do that at the moment. Any suggestions would be appreciated.
Here is my code:
#Data
s <- data.frame(L5=rnorm(1686, mean=0.3, sd=1.5),
GLDAS=rnorm(1686, mean=0.25, sd=0.8))
#1 )
#Divide data into 0.02 intervals
breaks = seq(from = 0, to = max(s$GLDAS)+0.02, by = 0.02) #intervals
s$group <- cut(s$GLDAS,
breaks = breaks,
labels = seq(from = 1, to = length(breaks)-1, by = 1),
#create label
right = FALSE)
#Assign labels to a value equal to the middle of the interval
pos <- seq(from = breaks[1]+0.02/2, to = max(breaks)-0.02/2, by = 0.02)
group <- seq(from = 1, to = length(breaks)-1, by = 1)
poss <- cbind.data.frame(pos,group)
ss <- merge(s, poss, by = "group")
#Calculate summary
Summary <- ss %>% #
group_by(pos) %>% # the grouping variable
summarise(mean = mean(L5), # calculates the mean of each group
sd = sd(L5), # calculates the standard deviation of each group
n = n(), # calculates the sample size per group
SE = sd(L5)/sqrt(n())) # calculates the standard error of each group
2) #Plot data points
p2 <- ggplot()+
geom_point(data = s, aes(x = GLDAS, y = L5)) +
#geom_smooth(method = "lm", se=FALSE, color="black",
# formula = my.formula) +
stat_poly_eq(formula = my.formula, size = 4,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) + geom_point()+
geom_abline(intercept=0, slope=1)+
xlim (0,0.6) + ylim(0,0.6) + labs(x="GLDAS [mm/hr]", y="L5 [mm/hr]" ) +
theme(text = element_text(size=16))
3) #plot mean and SD values
p2 + geom_line(data = Summary, aes(x=pos, y=mean), color='blue') +
geom_point(data = Summary, aes(x=pos, y=mean), color='blue')+
geom_errorbar(data = Summary, aes(ymin=mean-sd, ymax=mean+sd), width=.01,
position=position_dodge(0.005), color='blue')

I think I've got it, I did not need to use Summary, there is built in function.
p2 <- ggplot()+
geom_point(data = ss, aes(x = GLDAS, y = L5)) +
stat_poly_eq(formula = my.formula, size = 4,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) + geom_point()+
geom_abline(intercept=0, slope=1)+
xlim (0,0.5) + ylim(0,0.5) + labs(x="GLDAS [mm/hr]", y="L5 [mm/hr]" ) +
theme(text = element_text(size=16))
p <- p2 + stat_summary(data = ss, aes(x = pos, y = L5),
fun.y = 'mean', fun.ymin = function(x) 0, geom = 'point',
position = 'dodge') +
stat_summary(data = ss, aes(x = pos, y = L5),
fun.y = mean,
fun.ymin = function(y) mean(y) - sd(y),
fun.ymax = function(y) mean(y) + sd(y),
color = "red",
geom ="pointrange",show.legend = FALSE)
p

Related

Fit grouped curves by label in ggplot2

While making a nomogram of Remotion related to Depth and Time of sedimentation, I need to fit curves (as paraboles) to remotion labels if they are lower than its upper ten (7 ceils to 10, and 18 to 20). This is very close to what I need.
data.frame(
depth=rep(seq(0.5, 3.5, 0.5), each=8),
time=rep(seq(0, 280, 40), times=7),
ss = c(
820,369,238,164,107,66,41,33,
820,224,369,279,213,164,115,90,
820,631,476,361,287,230,180,148,
820,672,558,426,353,287,238,187,
820,713,590,492,402,344,262,230,
820,722,615,533,460,394,320,262,
820,738,656,574,492,418,360,303)
) %>%
transmute(
depth = depth,
time = time,
R = 100*(1- ss/820)
) %>%
mutate(G=factor(round(R, digits=-1))) %>%
ggplot(aes(x=time, y=depth, colour=time))+
geom_label(aes(label=round(R)))+
scale_y_continuous(trans = "reverse")+
geom_path(aes(group=G))
But it is not getting parabolical curves. How can I smooth them under the tens condition?
I'm not sure if this is what you're looking for. I separated the data and the plot and applied stat_smooth for each group. Unfortunately, the smoothed lines do not follow the color scheme. You will also see several warnings do to the method in which this creates the splines.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
plt <<- plt +
stat_smooth(data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2)))
})
You can extend this further with additional parameters. I'm just not sure exactly what you're expecting.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
# u <- df1 %>% {nrow(unique(.[,c(1:2)]))}
plt <<- plt +
stat_smooth(
data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2),
degree = ifelse(nrow(df2) <= 4,
3, nrow(df2) - 2)))
})

How to adjust the position of regression equation on ggplot?

I would like to add the regression line and R^2 to my ggplot. I am fitting the regression line to different categories and for each category I am getting a unique equation. I'd like to set the position of equations for each category manually. i.e. Finding the max expression of y for each group and printing the equation at ymax + 1.
Here is my code:
library(ggpmisc)
df <- data.frame(x = c(1:100))
df$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
df$group <- factor(rep(c("A", "B"), 50))
df <- df %>% group_by(group) %>% mutate(ymax = max(y))
my.formula <- y ~ x
df %>%
group_by(group) %>%
do(tidy(lm(y ~ x, data = .)))
p <- ggplot(data = df, aes(x = x, y = y, colour = group)) +
geom_smooth(method = "lm", se=FALSE, formula = my.formula) +
stat_poly_eq(formula = my.formula,
aes(x = x , y = ymax + 1, label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
geom_point()
p
Any suggestion how to do this?
Also is there any way I can only print the slope of the equation. (remove the intercept from plot)?
Thanks,
I'm pretty sure that setting adjusting stat_poly_eq() with the geom argument will get what you want. Doing so will center the equations, leaving the left half of each clipped, so we use hjust = 0 to left-adjust the equations. Finally, depending on your specific data, the equations may be overlapping each other, so we use the position argument to have ggplot attempt to separate them.
This adjusted call should get you started, I hope:
p <- ggplot(data = df, aes(x = x, y = y, colour = group)) +
geom_smooth(method = "lm", se=FALSE, formula = my.formula) +
stat_poly_eq(
formula = my.formula,
geom = "text", # or 'label'
hjust = 0, # left-adjust equations
position = position_dodge(), # in case equations now overlap
aes(x = x , y = ymax + 1, label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
geom_point()
p

Add ribbon showing mean and interquartile range to ggplot2

I use the following example data and code
# Example
x1<- as.Date("2013-12-31")
adddate1 <- 1:60
dts <- x1 + adddate1
df <- data.frame(a=runif(100),b=runif(100),c=runif(100) ,d=rnorm(2700), dates=dts)
df$Metric <- ifelse(df$a > 0.5,"a", "b")
df$Methodology <- ifelse(df$a > 0.5,"One", "Two")
df$Methodology <- factor(df$Methodology)
pl<-df %>%
group_by(Methodology) %>%
do(
plots = ggplot(data=., aes(x = dates, y = b)) +
geom_point() +
stat_smooth(method="auto",size=1.5) +
stat_summary(fun.data=median_hilow, fun.args=(conf.int=1)) + # Show IQR
scale_x_date(date_breaks = "1 week", date_labels = "%d-%b-%y") +
facet_wrap(~Metric, scales="free") +
ggtitle(unique(.$Methodology))
)
pl[[1,2]]
The output I see is:
However, I would like to see IQR, as calculated by stat_summary or some such routine, shown as a ribbon plot, as well as a line showing the median value.
I suspect I will have to write a user defined function and play with that.
Appreciate any hints or tips.
You can use stat_summary with geom_smooth:
library(ggplot2)
set.seed(47)
df <- data.frame(a = runif(100),
b = runif(100),
c = runif(100),
d = rnorm(2700),
dates = as.Date("2013-12-31") + 1:60)
df$Metric <- ifelse(df$a > 0.5, "a", "b")
df$Methodology <- factor(ifelse(df$a > 0.5, "One", "Two"))
ggplot(df, aes(x = dates, y = b)) +
geom_point() +
stat_smooth(size = 1.5) +
geom_smooth(stat = 'summary', alpha = 0.2, fill = 'red', color = 'red',
fun.data = median_hilow, fun.args = list(conf.int = 1)) +
scale_x_date(date_breaks = "1 week", date_labels = "%d-%b-%y") +
facet_wrap(~ Methodology + Metric, ncol = 1)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Since conf.int = 1, this plots a ribbon between the minimum and maximum at each x value, with the median as the line. If you actually want to plot 25th and 75th percentiles, set conf.int = 0.5. On this data, there aren't enough observations at each x value for that to look very different, though, so on some new sample data,
library(ggplot2)
set.seed(47)
ggplot(tibble::tibble(x = rep(seq(0, 4*pi, length.out = 50), 50),
y = rnorm(2500) * sin(x) + sin(x)),
aes(x, y)) +
geom_point(alpha = 0.1) +
geom_smooth(fill = 'darkblue') +
geom_smooth(stat = 'summary', color = 'red', fill = 'red', alpha = 0.2,
fun.data = median_hilow, fun.args = list(conf.int = 0.5))
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
median_hilow (really Hmisc::smedian.hilow) doesn't allow you to set the type of quantile, though, so for more precise control, rewrite the function (returning a similarly structured data frame) or pass separate functions for each statistic to the fun.y, fun.ymin and fun.ymax parameters.
Something's changed in either R or ggplot 2, but stat_summary() no longer works with the geom = 'smooth' option. It needs to be geom = 'ribbon'. This works as advertised in R 3.6.0 and ggplot 3.1.1
library(ggplot2)
set.seed(47)
ggplot(tibble::data_frame(x = rep(seq(0, 4*pi, length.out = 50), 50),
y = rnorm(2500) * sin(x) + sin(x)),
aes(x, y)) +
geom_point(alpha = 0.1) +
geom_smooth(fill = 'darkblue') +
stat_summary(fun.data = median_hilow, fun.args = list(conf.int = 0.5),
geom = 'ribbon', color = 'red', fill = 'red', alpha = 0.2)

geom_errorbar() overlap and positioning issues

I am having trouble with geom_errorbars particularly in utilizing position_dodge() effectively in this script.
library(ggplot2)
library(plyr)
Dose <- rep(c(3,10,30,100), each = 6)
Visit <- rep(c(1,28), each = 3, times = 4)
Animal <- rep(1:3, times = 8)
Estimate <- runif(24)
Dose <- factor(Dose)
Visit <- factor(Visit)
df <- data.frame(Animal, Dose, Visit, Estimate)
e <- ddply(df, .(Dose, Visit), summarise, mean = mean(Estimate), sd = sd(Estimate), n = length(Estimate))
e$se = e$sd/sqrt(e$n)
trace.out <- ggplot(data = e, aes(x = Visit, y = mean, colour = Dose))
trace.out <- trace.out +
geom_point(data = e, aes(y = mean), size = 3, postion = position_dodge(width = 0.2)) +
geom_line(data = e, aes(y = mean, group = Dose), position = position_dodge(width = 0.2)) +
geom_errorbar(aes(ymin= mean - se, ymax = mean + se), postion = position_dodge(0.2), colour='black', width= 0.3) +
labs(y = 'Estimate') +
theme_bw()
print(trace.out)
The output for me looks like:
I would like for the points, lines and error bars to line up and to have the errorbars not overlap. Is there some way to do that? Additionally I get an error of:
ymax not defined: adjusting position using y instead
Would this have anything to do with it? Thanks in Advance!
Maybe facets are an option:
trace.out <- ggplot(data = e, aes(x = Visit, y = mean, colour = Dose, ymin= mean - se, ymax = mean + se, group = Dose))
trace.out <- trace.out +
geom_point(size = 3, postion = position_dodge(width = 0.2)) +
geom_line(position = position_dodge(width = 0.2), ) +
geom_errorbar(postion = position_dodge(0.2), colour='black', width= 0.3) +
labs(y = 'Estimate') +
theme_bw()
print(trace.out + facet_grid(~Dose) )

ggplot2: histogram with normal curve

I've been trying to superimpose a normal curve over my histogram with ggplot 2.
My formula:
data <- read.csv (path...)
ggplot(data, aes(V2)) +
geom_histogram(alpha=0.3, fill='white', colour='black', binwidth=.04)
I tried several things:
+ stat_function(fun=dnorm)
....didn't change anything
+ stat_density(geom = "line", colour = "red")
...gave me a straight red line on the x-axis.
+ geom_density()
doesn't work for me because I want to keep my frequency values on the y-axis, and want no density values.
Any suggestions?
Solution found!
+geom_density(aes(y=0.045*..count..), colour="black", adjust=4)
Think I got it:
library(ggplot2)
set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) +
geom_histogram(aes(y =..density..),
breaks = seq(-50, 50, by = 10),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))
This has been answered here and partially here.
The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.
If you want the y-axis to have frequency counts, there are a number of options:
First simulate some data.
library(ggplot2)
set.seed(1)
dat_hist <- data.frame(
group = c(rep("A", 200), rep("B",150)),
value = c(rnorm(200, 20, 5), rnorm(150,25,10)))
# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))
Option 1: Plot both histogram and density curve as density and then rescale the y axis
This is perhaps the easiest approach for a single histogram.
Using the approach suggested by Carlos, plot both histogram and density curve as density
g <- ggplot(dat_hist, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))
And then rescale the y axis.
ybreaks = seq(0,50,5)
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)
## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))
Option 2: Rescale the density curve using stat_function
With code tidied as per PatrickT's answer.
ggplot(dat_hist, aes(value)) +
geom_histogram(colour = "black", binwidth = bw) +
stat_function(fun = function(x)
dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)
Option 3: Create an external dataset and plot using geom_line.
Unlike the above options, this one works with facets. (EDITED to provide dplyr rather than plyr based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.
library(tidyverse)
dat_hist %>%
group_by(group) %>%
nest(data = c(value)) %>%
mutate(y = map(data, ~ dnorm(
.$value, mean = mean(.$value), sd = sd(.$value)
) * bw * sum(!is.na(.$value)))) %>%
unnest(c(data,y)) %>%
ggplot(aes(x = value)) +
geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
geom_line(aes(y = y)) +
facet_wrap(~ group)
Option 4: Create external functions to edit the data on the fly
A bit over the top perhaps, but might be useful for someone?
## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
.x <- na.omit(data[,x])
if(is.null(xlim))
xlim = c(min(.x), max(.x))
x_range = seq(xlim[1], xlim[2], length.out = 101)
setNames(
data.frame(
x = x_range,
y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
c(x, "y"))
}
## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
dat_hists <- lapply(
split(data, data[, group]), dnorm_scaled,
x = x, binwidth = binwidth, xlim = xlim)
for(g in names(dat_hists))
dat_hists[[g]][, "group"] <- g
setNames(do.call(rbind, dat_hists), c(x, "y", group))
}
## Single histogram
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = bw, colour = "black") +
geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw),
aes(y = y))
## With a single faceting variable
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = 2, colour = "black") +
geom_line(data = ~ dnorm_scaled_group(
., x = "value", group = "group", binwidth = 2, xlim = c(0,50)),
aes(y = y)) +
facet_wrap(~ group)
This is an extended comment on JWilliman's answer. I found J's answer very useful. While playing around I discovered a way to simplify the code. I'm not saying it is a better way, but I thought I would mention it.
Note that JWilliman's answer provides the count on the y-axis and a "hack" to scale the corresponding density normal approximation (which otherwise would cover a total area of 1 and have therefore a much lower peak).
Main point of this comment: simpler syntax inside stat_function, by passing the needed parameters to the aesthetics function, e.g.
aes(x = x, mean = 0, sd = 1, binwidth = 0.3, n = 1000)
This avoids having to pass args = to stat_function and is therefore more user-friendly. Okay, it's not very different, but hopefully someone will find it interesting.
# parameters that will be passed to ``stat_function``
n = 1000
mean = 0
sd = 1
binwidth = 0.3 # passed to geom_histogram and stat_function
set.seed(1)
df <- data.frame(x = rnorm(n, mean, sd))
ggplot(df, aes(x = x, mean = mean, sd = sd, binwidth = binwidth, n = n)) +
theme_bw() +
geom_histogram(binwidth = binwidth,
colour = "white", fill = "cornflowerblue", size = 0.1) +
stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n * binwidth,
color = "darkred", size = 1)
This code should do it:
set.seed(1)
z <- rnorm(1000)
qplot(z, geom = "blank") +
geom_histogram(aes(y = ..density..)) +
stat_density(geom = "line", aes(colour = "bla")) +
stat_function(fun = dnorm, aes(x = z, colour = "blabla")) +
scale_colour_manual(name = "", values = c("red", "green"),
breaks = c("bla", "blabla"),
labels = c("kernel_est", "norm_curv")) +
theme(legend.position = "bottom", legend.direction = "horizontal")
Note: I used qplot but you can use the more versatile ggplot.
Here's a tidyverse informed version:
Setup
library(tidyverse)
Some data
d <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/speed_gender_height.csv")
Preparing data
We'll use a "total" histogram for the whole sample, to that end, we'll need to remove the grouping information from the data.
d2 <-
d |>
select(-gender)
Here's a data set with summary data:
d_summary <-
d %>%
group_by(gender) %>%
summarise(height_m = mean(height, na.rm = T),
height_sd = sd(height, na.rm = T))
d_summary
Plot it
d %>%
ggplot() +
aes() +
geom_histogram(aes(y = ..density.., x = height, fill = gender)) +
facet_wrap(~ gender) +
geom_histogram(data = d2, aes(y = ..density.., x = height),
alpha = .5) +
stat_function(data = d_summary %>% filter(gender == "female"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "female")$height_m,
sd = filter(d_summary,
gender == "female")$height_sd)) +
stat_function(data = d_summary %>% filter(gender == "male"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "male")$height_m,
sd = filter(d_summary,
gender == "male")$height_sd)) +
theme(legend.position = "none",
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(title = "Facetted histograms with overlaid normal curves",
caption = "The grey histograms shows the whole distribution (over) both groups, i.e. females and men") +
scale_fill_brewer(type = "qual", palette = "Set1")

Resources