Multiple Polynomial fits in ggplot using facetwrap - r

So I would like to use multiple polynomial curves to fit 2 dimensional data,
I am able to plot one polynomial function but I would like to use for example 4 and then plot all of them at the same time using facet_wrap.
Now I am using simple 2 order polynomial:
library(ggplot2)
df <- mtcars
df <- data.frame("x"=df$mpg, "y"=df$hp)
my.formula <- y ~ x + I(x^2)
p <- ggplot(df, aes(x, y)) +
geom_point(shape=21, fill="blue", colour="black", size=2, alpha = 0.7) +
geom_smooth(method = "lm", se = F,
formula = my.formula,
colour = "red")
m <- lm(my.formula, df)
my.eq <- as.character(signif(as.polynomial(coef(m)), 3))
label.text <- paste(gsub("x", "~italic(x)", my.eq, fixed = TRUE),
paste("italic(R)^2",
format(summary(m)$r.squared, digits = 2),
sep = "~`=`~"),
sep = "~~~~")
p + annotate(geom = "text", label = label.text,
family = "serif", hjust = 0, parse = TRUE, size = 4)
lets say we would like to use another formulas such as:
my.formula2 <- y ~ x + I(x^2) + I(x^3)
my.formula4 <- y ~ x + I(x^2) + I(x^3) + I(x^4)
my.formula5 <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5)
And plot it in the base plot above using facet_wrap so we would have 4 seperate plots and each has to have its own label text and anotation.

Here is an answer that first fits polynomial regression and gets the predicted values, then plots them all with geom_line, not geom_smooth.
library(ggplot2)
df <- mtcars
df <- data.frame("x"=df$mpg, "y"=df$hp)
tmp <- sapply(2:5, function(d){
predict(lm(y ~ poly(x, d), df))
})
df2 <- df
df2 <- cbind(df2, tmp)
rm(tmp)
names(df2)[-(1:2)] <- paste0("degree", 2:5)
long <- reshape2::melt(df2, id.vars = c("x", "y"))
ggplot(long, aes(x, y)) +
geom_point(shape=21, fill="blue", colour="black", size=2, alpha = 0.7) +
geom_line(aes(y = value), colour = "red") +
facet_wrap(~ variable)
Edit.
Another way, without fitting the models previously, is the following, inspired in a RStudio community post.
library(tidyverse)
cbind(df, tmp) %>%
gather(degree, value, -x, -y) %>%
{
reduce2(.init = ggplot(., aes(x = x, y = y)),
.x = .$degree,
.y = .$value,
function(prev, .x, .y) {
force(.y) # The formula below won't evaluate .y by itself
prev + geom_smooth(
data = . %>% filter(degree == .x),
method = "lm",
se = FALSE,
formula = y ~ poly(x, .y))
})
} +
geom_point(fill = "blue", colour = "black",size = 2, alpha = 0.7) +
facet_wrap(~ degree)

It's pretty simple with the stat_function function
I know you said you wanted to use facet_wrap but I would suggest using ggarrange in the ggpubr library
mylm1 <- lm(hp ~ mpg + I(mpg^2), data = df)
mylm2 <- lm(hp ~ mpg + I(mpg^2) + I(mpg^3), data = df)
mylm3 <- lm(hp ~ mpg + I(mpg^2) + I(mpg^3) + I(mpg^4), data = df)
mylm4 <- lm(hp ~ mpg + I(mpg^2) + I(mpg^3) + I(mpg^4) + I(mpg^5), data = df)
b1 <- coef(mylm1)
b2 <- coef(mylm2)
b3 <- coef(mylm3)
b4 <- coef(mylm4)
p1 <- df %>%
ggplot() +
geom_point(aes(x = mpg, y = hp)) +
stat_function(fun = function(x) b1[1] + b1[2]*x + b1[3]*x^2)
p2 <- df %>%
ggplot() +
geom_point(aes(x = mpg, y = hp)) +
stat_function(fun = function(x) b2[1] + b2[2]*x + b2[3]*x^2 + b2[4]*x^3)
p3 <- df %>%
ggplot() +
geom_point(aes(x = mpg, y = hp)) +
stat_function(fun = function(x) b3[1] + b3[2]*x + b3[3]*x^2 + b3[4]*x^3 + b3[5]*x^4)
p4 <- df %>%
ggplot() +
geom_point(aes(x = mpg, y = hp)) +
stat_function(fun = function(x) b4[1] + b4[2]*x + b4[3]*x^2 + b4[4]*x^3 + b4[5]*x^4 + b4[6]*x^5)
library(ggpubr)
ggarrange(p1,p2,p3,p4)

Related

Is there a way in R to find the regression line for this plot and then also display the equation using ggplot2? Current method seems incorrect [duplicate]

I wonder how to add regression line equation and R^2 on the ggplot. My code is:
library(ggplot2)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p
Any help will be highly appreciated.
Here is one solution
# GET EQUATION AND R-SQUARED AS STRING
# SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)
EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups
Statistic stat_poly_eq() in my package ggpmisc makes it possible add text labels based on a linear model fit.
This answer has been updated for 'ggpmisc' (>= 0.4.0) and 'ggplot2' (>= 3.3.0) on 2022-06-02.
In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.
library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)
# using default formula, label and methods
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq() +
geom_point()
# assembling a single label with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# adding separate labels with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = after_stat(eq.label))) +
stat_poly_eq(label.y = 0.9) +
geom_point()
# regression through the origin
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line(formula = y ~ x + 0) +
stat_poly_eq(formula = y ~ x + 0, aes(label = after_stat(eq.label))) +
geom_point()
# fitting a polynomial
ggplot(data = df, aes(x = x, y = yy)) +
stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE),
aes(label = after_stat(eq.label))) +
geom_point()
# adding a hat as asked by #MYaseen208 and #elarry
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# variable substitution as asked by #shabbychef
# same labels in equation and axes
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(h)~`=`~",
eq.x.rhs = "~italic(z)",
aes(label = after_stat(eq.label))) +
labs(x = expression(italic(z)), y = expression(italic(h))) +
geom_point()
# grouping as asked by #helen.h
dfg <- data.frame(x = c(1:100))
dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))
ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
after_stat(eq.label), "*\", \"*",
after_stat(rr.label), sep = ""))) +
geom_point()
# a single fit with grouped data as asked by #Herman
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point(aes(colour = group))
# facets
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point() +
facet_wrap(~group)
Created on 2022-06-02 by the reprex package (v2.0.1)
I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!
library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_smooth(method="lm",se=FALSE) +
geom_point() + facet_wrap(~class)
I used the code in #Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.
https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.
I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
Usage would change to:
p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)
Here's the most simplest code for everyone
Note: Showing Pearson's Rho and not R^2.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100)
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()+
stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown
p
Using ggpubr:
library(ggpubr)
# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300) +
stat_regline_equation(label.y = 280)
# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300,
aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
stat_regline_equation(label.y = 280)
## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85
really love #Ramnath solution. To allow use to customize the regression formula (instead of fixed as y and x as literal variable names), and added the p-value into the printout as well (as #Jerry T commented), here is the mod:
lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data=df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
list(target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1)
)
)
as.character(as.expression(eq));
}
geom_point() +
ggrepel::geom_text_repel(label=rownames(mtcars)) +
geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) +
geom_smooth(method='lm')
Unfortunately, this doesn't work with facet_wrap or facet_grid.
Another option would be to create a custom function generating the equation using dplyr and broom libraries:
get_formula <- function(model) {
broom::tidy(model)[, 1:2] %>%
mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs
mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting
mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>%
summarise(formula = paste(a, collapse = '')) %>%
as.character
}
lm(y ~ x, data = df) -> model
get_formula(model)
#"y ~ 6.22 + 3.16 * x"
scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared
Now we need to add the text to the plot:
p +
geom_text(x = 20, y = 300,
label = get_formula(model),
color = 'red') +
geom_text(x = 20, y = 285,
label = r_squared,
color = 'blue')
Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:
print_equation= function(model, latex= FALSE, ...){
dots <- list(...)
cc= model$coefficients
var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.)
var_sign[var_sign==""]= ' + '
f_args_abs= f_args= dots
f_args$x= cc
f_args_abs$x= abs(cc)
cc_= do.call(format, args= f_args)
cc_abs= do.call(format, args= f_args_abs)
pred_vars=
cc_abs%>%
paste(., x_vars, sep= star)%>%
paste(var_sign,.)%>%paste(., collapse= "")
if(latex){
star= " \\cdot "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>%
paste0("\\hat{",.,"_{i}}")
x_vars= names(cc_)[-1]%>%paste0(.,"_{i}")
}else{
star= " * "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]
x_vars= names(cc_)[-1]
}
equ= paste(y_var,"=",cc_[1],pred_vars)
if(latex){
equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$")
}
cat(equ)
}
The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.
I also added an option to output it as latex so you can use this function in a rmarkdown like this:
```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```
Now using it:
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
lm_mod= lm(y~x+z, data = df)
print_equation(model = lm_mod, latex = FALSE)
This code yields:
y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z
And if we ask for a latex equation, rounding the parameters to 3 digits:
print_equation(model = lm_mod, latex = TRUE, digits= 3)
This yields:
Similar to #zx8754 and #kdauria answers except using ggplot2 and ggpubr. I prefer using ggpubr because it does not require custom functions such as the top answer to this question.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value
r.accuracy = 0.01,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Could also add p-value to the figure above
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Also works well with facet_wrap() when you have multiple groups
df$group <- rep(1:2,50)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..),
label.x = 0, label.y = 400, size = 4) +
theme_bw() +
facet_wrap(~group)

Multiple linear regression in one graph [duplicate]

I wonder how to add regression line equation and R^2 on the ggplot. My code is:
library(ggplot2)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p
Any help will be highly appreciated.
Here is one solution
# GET EQUATION AND R-SQUARED AS STRING
# SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)
EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups
Statistic stat_poly_eq() in my package ggpmisc makes it possible add text labels based on a linear model fit.
This answer has been updated for 'ggpmisc' (>= 0.4.0) and 'ggplot2' (>= 3.3.0) on 2022-06-02.
In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.
library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)
# using default formula, label and methods
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq() +
geom_point()
# assembling a single label with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# adding separate labels with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = after_stat(eq.label))) +
stat_poly_eq(label.y = 0.9) +
geom_point()
# regression through the origin
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line(formula = y ~ x + 0) +
stat_poly_eq(formula = y ~ x + 0, aes(label = after_stat(eq.label))) +
geom_point()
# fitting a polynomial
ggplot(data = df, aes(x = x, y = yy)) +
stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE),
aes(label = after_stat(eq.label))) +
geom_point()
# adding a hat as asked by #MYaseen208 and #elarry
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# variable substitution as asked by #shabbychef
# same labels in equation and axes
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(h)~`=`~",
eq.x.rhs = "~italic(z)",
aes(label = after_stat(eq.label))) +
labs(x = expression(italic(z)), y = expression(italic(h))) +
geom_point()
# grouping as asked by #helen.h
dfg <- data.frame(x = c(1:100))
dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))
ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
after_stat(eq.label), "*\", \"*",
after_stat(rr.label), sep = ""))) +
geom_point()
# a single fit with grouped data as asked by #Herman
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point(aes(colour = group))
# facets
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point() +
facet_wrap(~group)
Created on 2022-06-02 by the reprex package (v2.0.1)
I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!
library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_smooth(method="lm",se=FALSE) +
geom_point() + facet_wrap(~class)
I used the code in #Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.
https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.
I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
Usage would change to:
p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)
Here's the most simplest code for everyone
Note: Showing Pearson's Rho and not R^2.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100)
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()+
stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown
p
Using ggpubr:
library(ggpubr)
# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300) +
stat_regline_equation(label.y = 280)
# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300,
aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
stat_regline_equation(label.y = 280)
## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85
really love #Ramnath solution. To allow use to customize the regression formula (instead of fixed as y and x as literal variable names), and added the p-value into the printout as well (as #Jerry T commented), here is the mod:
lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data=df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
list(target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1)
)
)
as.character(as.expression(eq));
}
geom_point() +
ggrepel::geom_text_repel(label=rownames(mtcars)) +
geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) +
geom_smooth(method='lm')
Unfortunately, this doesn't work with facet_wrap or facet_grid.
Another option would be to create a custom function generating the equation using dplyr and broom libraries:
get_formula <- function(model) {
broom::tidy(model)[, 1:2] %>%
mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs
mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting
mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>%
summarise(formula = paste(a, collapse = '')) %>%
as.character
}
lm(y ~ x, data = df) -> model
get_formula(model)
#"y ~ 6.22 + 3.16 * x"
scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared
Now we need to add the text to the plot:
p +
geom_text(x = 20, y = 300,
label = get_formula(model),
color = 'red') +
geom_text(x = 20, y = 285,
label = r_squared,
color = 'blue')
Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:
print_equation= function(model, latex= FALSE, ...){
dots <- list(...)
cc= model$coefficients
var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.)
var_sign[var_sign==""]= ' + '
f_args_abs= f_args= dots
f_args$x= cc
f_args_abs$x= abs(cc)
cc_= do.call(format, args= f_args)
cc_abs= do.call(format, args= f_args_abs)
pred_vars=
cc_abs%>%
paste(., x_vars, sep= star)%>%
paste(var_sign,.)%>%paste(., collapse= "")
if(latex){
star= " \\cdot "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>%
paste0("\\hat{",.,"_{i}}")
x_vars= names(cc_)[-1]%>%paste0(.,"_{i}")
}else{
star= " * "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]
x_vars= names(cc_)[-1]
}
equ= paste(y_var,"=",cc_[1],pred_vars)
if(latex){
equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$")
}
cat(equ)
}
The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.
I also added an option to output it as latex so you can use this function in a rmarkdown like this:
```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```
Now using it:
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
lm_mod= lm(y~x+z, data = df)
print_equation(model = lm_mod, latex = FALSE)
This code yields:
y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z
And if we ask for a latex equation, rounding the parameters to 3 digits:
print_equation(model = lm_mod, latex = TRUE, digits= 3)
This yields:
Similar to #zx8754 and #kdauria answers except using ggplot2 and ggpubr. I prefer using ggpubr because it does not require custom functions such as the top answer to this question.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value
r.accuracy = 0.01,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Could also add p-value to the figure above
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Also works well with facet_wrap() when you have multiple groups
df$group <- rep(1:2,50)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..),
label.x = 0, label.y = 400, size = 4) +
theme_bw() +
facet_wrap(~group)

how to add regression equation and R2 on plot [duplicate]

I wonder how to add regression line equation and R^2 on the ggplot. My code is:
library(ggplot2)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p
Any help will be highly appreciated.
Here is one solution
# GET EQUATION AND R-SQUARED AS STRING
# SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)
EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups
Statistic stat_poly_eq() in my package ggpmisc makes it possible add text labels based on a linear model fit.
This answer has been updated for 'ggpmisc' (>= 0.4.0) and 'ggplot2' (>= 3.3.0) on 2022-06-02.
In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.
library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)
# using default formula, label and methods
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq() +
geom_point()
# assembling a single label with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# adding separate labels with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = after_stat(eq.label))) +
stat_poly_eq(label.y = 0.9) +
geom_point()
# regression through the origin
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line(formula = y ~ x + 0) +
stat_poly_eq(formula = y ~ x + 0, aes(label = after_stat(eq.label))) +
geom_point()
# fitting a polynomial
ggplot(data = df, aes(x = x, y = yy)) +
stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE),
aes(label = after_stat(eq.label))) +
geom_point()
# adding a hat as asked by #MYaseen208 and #elarry
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# variable substitution as asked by #shabbychef
# same labels in equation and axes
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(h)~`=`~",
eq.x.rhs = "~italic(z)",
aes(label = after_stat(eq.label))) +
labs(x = expression(italic(z)), y = expression(italic(h))) +
geom_point()
# grouping as asked by #helen.h
dfg <- data.frame(x = c(1:100))
dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))
ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
after_stat(eq.label), "*\", \"*",
after_stat(rr.label), sep = ""))) +
geom_point()
# a single fit with grouped data as asked by #Herman
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point(aes(colour = group))
# facets
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point() +
facet_wrap(~group)
Created on 2022-06-02 by the reprex package (v2.0.1)
I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!
library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_smooth(method="lm",se=FALSE) +
geom_point() + facet_wrap(~class)
I used the code in #Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.
https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.
I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
Usage would change to:
p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)
Here's the most simplest code for everyone
Note: Showing Pearson's Rho and not R^2.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100)
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()+
stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown
p
Using ggpubr:
library(ggpubr)
# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300) +
stat_regline_equation(label.y = 280)
# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300,
aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
stat_regline_equation(label.y = 280)
## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85
really love #Ramnath solution. To allow use to customize the regression formula (instead of fixed as y and x as literal variable names), and added the p-value into the printout as well (as #Jerry T commented), here is the mod:
lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data=df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
list(target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1)
)
)
as.character(as.expression(eq));
}
geom_point() +
ggrepel::geom_text_repel(label=rownames(mtcars)) +
geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) +
geom_smooth(method='lm')
Unfortunately, this doesn't work with facet_wrap or facet_grid.
Another option would be to create a custom function generating the equation using dplyr and broom libraries:
get_formula <- function(model) {
broom::tidy(model)[, 1:2] %>%
mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs
mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting
mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>%
summarise(formula = paste(a, collapse = '')) %>%
as.character
}
lm(y ~ x, data = df) -> model
get_formula(model)
#"y ~ 6.22 + 3.16 * x"
scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared
Now we need to add the text to the plot:
p +
geom_text(x = 20, y = 300,
label = get_formula(model),
color = 'red') +
geom_text(x = 20, y = 285,
label = r_squared,
color = 'blue')
Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:
print_equation= function(model, latex= FALSE, ...){
dots <- list(...)
cc= model$coefficients
var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.)
var_sign[var_sign==""]= ' + '
f_args_abs= f_args= dots
f_args$x= cc
f_args_abs$x= abs(cc)
cc_= do.call(format, args= f_args)
cc_abs= do.call(format, args= f_args_abs)
pred_vars=
cc_abs%>%
paste(., x_vars, sep= star)%>%
paste(var_sign,.)%>%paste(., collapse= "")
if(latex){
star= " \\cdot "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>%
paste0("\\hat{",.,"_{i}}")
x_vars= names(cc_)[-1]%>%paste0(.,"_{i}")
}else{
star= " * "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]
x_vars= names(cc_)[-1]
}
equ= paste(y_var,"=",cc_[1],pred_vars)
if(latex){
equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$")
}
cat(equ)
}
The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.
I also added an option to output it as latex so you can use this function in a rmarkdown like this:
```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```
Now using it:
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
lm_mod= lm(y~x+z, data = df)
print_equation(model = lm_mod, latex = FALSE)
This code yields:
y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z
And if we ask for a latex equation, rounding the parameters to 3 digits:
print_equation(model = lm_mod, latex = TRUE, digits= 3)
This yields:
Similar to #zx8754 and #kdauria answers except using ggplot2 and ggpubr. I prefer using ggpubr because it does not require custom functions such as the top answer to this question.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value
r.accuracy = 0.01,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Could also add p-value to the figure above
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Also works well with facet_wrap() when you have multiple groups
df$group <- rep(1:2,50)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..),
label.x = 0, label.y = 400, size = 4) +
theme_bw() +
facet_wrap(~group)

How can I add p-value from a correlation test and R² from a lm to a scatter plot? [duplicate]

I wonder how to add regression line equation and R^2 on the ggplot. My code is:
library(ggplot2)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p
Any help will be highly appreciated.
Here is one solution
# GET EQUATION AND R-SQUARED AS STRING
# SOURCE: https://groups.google.com/forum/#!topic/ggplot2/1TgH-kG5XMA
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)
EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups
Statistic stat_poly_eq() in my package ggpmisc makes it possible add text labels based on a linear model fit.
This answer has been updated for 'ggpmisc' (>= 0.4.0) and 'ggplot2' (>= 3.3.0) on 2022-06-02.
In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.
library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)
# using default formula, label and methods
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq() +
geom_point()
# assembling a single label with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# adding separate labels with equation and R2
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = after_stat(eq.label))) +
stat_poly_eq(label.y = 0.9) +
geom_point()
# regression through the origin
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line(formula = y ~ x + 0) +
stat_poly_eq(formula = y ~ x + 0, aes(label = after_stat(eq.label))) +
geom_point()
# fitting a polynomial
ggplot(data = df, aes(x = x, y = yy)) +
stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
stat_poly_eq(formula = y ~ poly(x, 2, raw = TRUE),
aes(label = after_stat(eq.label))) +
geom_point()
# adding a hat as asked by #MYaseen208 and #elarry
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
# variable substitution as asked by #shabbychef
# same labels in equation and axes
ggplot(data = df, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(eq.with.lhs = "italic(h)~`=`~",
eq.x.rhs = "~italic(z)",
aes(label = after_stat(eq.label))) +
labs(x = expression(italic(z)), y = expression(italic(h))) +
geom_point()
# grouping as asked by #helen.h
dfg <- data.frame(x = c(1:100))
dfg$y <- 20 * c(0, 1) + 3 * df$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))
ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point()
ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
after_stat(eq.label), "*\", \"*",
after_stat(rr.label), sep = ""))) +
geom_point()
# a single fit with grouped data as asked by #Herman
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point(aes(colour = group))
# facets
ggplot(data = dfg, aes(x = x, y = y)) +
stat_poly_line() +
stat_poly_eq(aes(label = paste(after_stat(eq.label),
after_stat(rr.label), sep = "*\", \"*"))) +
geom_point() +
facet_wrap(~group)
Created on 2022-06-02 by the reprex package (v2.0.1)
I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!
library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
geom_smooth(method="lm",se=FALSE) +
geom_point() + facet_wrap(~class)
I used the code in #Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.
https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.
I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
Usage would change to:
p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)
Here's the most simplest code for everyone
Note: Showing Pearson's Rho and not R^2.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100)
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()+
stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown
p
Using ggpubr:
library(ggpubr)
# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300) +
stat_regline_equation(label.y = 280)
# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
stat_cor(label.y = 300,
aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
stat_regline_equation(label.y = 280)
## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85
really love #Ramnath solution. To allow use to customize the regression formula (instead of fixed as y and x as literal variable names), and added the p-value into the printout as well (as #Jerry T commented), here is the mod:
lm_eqn <- function(df, y, x){
formula = as.formula(sprintf('%s ~ %s', y, x))
m <- lm(formula, data=df);
# formating the values into a summary string to print out
# ~ give some space, but equal size and comma need to be quoted
eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
list(target = y,
input = x,
a = format(as.vector(coef(m)[1]), digits = 2),
b = format(as.vector(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
# getting the pvalue is painful
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=1)
)
)
as.character(as.expression(eq));
}
geom_point() +
ggrepel::geom_text_repel(label=rownames(mtcars)) +
geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=T) +
geom_smooth(method='lm')
Unfortunately, this doesn't work with facet_wrap or facet_grid.
Another option would be to create a custom function generating the equation using dplyr and broom libraries:
get_formula <- function(model) {
broom::tidy(model)[, 1:2] %>%
mutate(sign = ifelse(sign(estimate) == 1, ' + ', ' - ')) %>% #coeff signs
mutate_if(is.numeric, ~ abs(round(., 2))) %>% #for improving formatting
mutate(a = ifelse(term == '(Intercept)', paste0('y ~ ', estimate), paste0(sign, estimate, ' * ', term))) %>%
summarise(formula = paste(a, collapse = '')) %>%
as.character
}
lm(y ~ x, data = df) -> model
get_formula(model)
#"y ~ 6.22 + 3.16 * x"
scales::percent(summary(model)$r.squared, accuracy = 0.01) -> r_squared
Now we need to add the text to the plot:
p +
geom_text(x = 20, y = 300,
label = get_formula(model),
color = 'red') +
geom_text(x = 20, y = 285,
label = r_squared,
color = 'blue')
Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:
print_equation= function(model, latex= FALSE, ...){
dots <- list(...)
cc= model$coefficients
var_sign= as.character(sign(cc[-1]))%>%gsub("1","",.)%>%gsub("-"," - ",.)
var_sign[var_sign==""]= ' + '
f_args_abs= f_args= dots
f_args$x= cc
f_args_abs$x= abs(cc)
cc_= do.call(format, args= f_args)
cc_abs= do.call(format, args= f_args_abs)
pred_vars=
cc_abs%>%
paste(., x_vars, sep= star)%>%
paste(var_sign,.)%>%paste(., collapse= "")
if(latex){
star= " \\cdot "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]%>%
paste0("\\hat{",.,"_{i}}")
x_vars= names(cc_)[-1]%>%paste0(.,"_{i}")
}else{
star= " * "
y_var= strsplit(as.character(model$call$formula), "~")[[2]]
x_vars= names(cc_)[-1]
}
equ= paste(y_var,"=",cc_[1],pred_vars)
if(latex){
equ= paste0(equ," + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
summary(MetamodelKdifEryth)$sigma,")")%>%paste0("$",.,"$")
}
cat(equ)
}
The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.
I also added an option to output it as latex so you can use this function in a rmarkdown like this:
```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```
Now using it:
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
lm_mod= lm(y~x+z, data = df)
print_equation(model = lm_mod, latex = FALSE)
This code yields:
y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z
And if we ask for a latex equation, rounding the parameters to 3 digits:
print_equation(model = lm_mod, latex = TRUE, digits= 3)
This yields:
Similar to #zx8754 and #kdauria answers except using ggplot2 and ggpubr. I prefer using ggpubr because it does not require custom functions such as the top answer to this question.
library(ggplot2)
library(ggpubr)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label..)), # adds R^2 value
r.accuracy = 0.01,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Could also add p-value to the figure above
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")), # adds R^2 and p-value
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..), # adds equation to linear regression
label.x = 0, label.y = 400, size = 4)
Also works well with facet_wrap() when you have multiple groups
df$group <- rep(1:2,50)
ggplot(data = df, aes(x = x, y = y)) +
stat_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point() +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
r.accuracy = 0.01,
p.accuracy = 0.001,
label.x = 0, label.y = 375, size = 4) +
stat_regline_equation(aes(label = ..eq.label..),
label.x = 0, label.y = 400, size = 4) +
theme_bw() +
facet_wrap(~group)

Can I mimick facet_wrap() with 5 completely separate ggplots?

I like the neatness of using facet_wrap() or facet_grid() with ggplot since the plots are all made to be the same size and are fitted row and column wise automatically.
I have a data frame and I am experimenting with various transformations and their impact on fit as measured by R2
dm1 <- lm(price ~ x, data = diamonds)
dm1R2 <- summary(dm1)$r.squared #0.78
dm2 <- lm(log(price) ~ x, data = diamonds)
dm2R2 <- summary(dm2)$r.squared # 0.9177831
dm3 <- lm(log(price) ~ x^2, data = diamonds)
dm3R2 <- summary(dm3)$r.squared # also 0.9177831. Aside, why?
ggplot(diamonds, aes(x = x, y = price)) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3.5, y = 10000, label = paste0('R-Squared: ', round(dm1R2, 3)))
ggplot(diamonds, aes(x = x, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 9, label = paste0('R-Squared: ', round(dm2R2, 3)))
ggplot(diamonds, aes(x = x^2, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 20, label = paste0('R-Squared: ', round(dm3R2, 3)))
This produces 3 completely separate plots. Within Rmd file they will appear one after the other.
Is there a way to add them to a grid like when using facet_wrap?
You can use ggplot2's built-in faceting if you generate a "long" data frame from the regression model objects. The model object returned by lm includes the data used to fit the model, so we can extract the data and the r-squared for each model, stack them into a single data frame, and generate a faceted plot.
The disadvantage of this approach is that you lose the ability to easily set separate x-axis and y-axis titles for each panel, which is important, because the x and y values have different transformations in different panels. In an effort to mitigate that problem, I've used the model formulas as the facet labels.
Also, the reason you got the same r-squared for the models specified by log(price) ~ x and log(price) ~ x^2 is that R treats them as the same model. To tell R that you literally mean x^2 in a model formula, you need to wrap it in the I() function, making the formula log(price) ~ I(x^2). You could also do log(price) ~ poly(x, 2, raw=TRUE).
library(tidyverse)
theme_set(theme_bw(base_size=14))
# Generate a small subset of the diamonds data frame
set.seed(2)
dsub = diamonds[sample(1:nrow(diamonds), 2000), ]
dm1 <- lm(price ~ x, data = dsub)
dm2 <- lm(log(price) ~ x, data = dsub)
dm3 <- lm(log(price) ~ I(x^2), data = dsub)
# Create long data frame from the three model objects
dat = list(dm1, dm2, dm3) %>%
map_df(function(m) {
tibble(r2=summary(m)$r.squared,
form=as_label(formula(m))) %>%
cbind(m[["model"]] %>% set_names(c("price","x")))
}, .id="Model") %>%
mutate(form=factor(form, levels=unique(form)))
# Create data subset for geom_text
text.dat = dat %>% group_by(form) %>%
summarise(x = quantile(x, 1),
price = quantile(price, 0.05),
r2=r2[1])
dat %>%
ggplot(aes(x, price)) +
geom_point(alpha=0.3, colour="red") +
geom_smooth(method="lm") +
geom_text(data=text.dat, parse=TRUE,
aes(label=paste0("r^2 ==", round(r2, 2))),
hjust=1, size=3.5, colour="grey30") +
facet_wrap(~ form, scales="free")
ggarrange from the ggpubr package can do this:
p1 = ggplot(diamonds, aes(x = x, y = price)) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3.5, y = 10000, label = paste0('R-Squared: ', round(dm1R2, 3)))
p2 = ggplot(diamonds, aes(x = x, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 9, label = paste0('R-Squared: ', round(dm2R2, 3)))
p3 = ggplot(diamonds, aes(x = x^2, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 20, label = paste0('R-Squared: ', round(dm3R2, 3)))
ggpubr::ggarrange(p1, p2, p3, ncol = 2, nrow = 2, align = "hv")
Other packages that have been suggested in the comments like cowplot and patchwork also offer good options for this.

Resources