ggplot in R: add regression equation in a plot - r

I saw this answer from Jayden a while ago about adding regression equation to a plot, which I found very useful. But I don't want to display R^2, so I changed the code a bit to this:
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x))
} else {
eq <- substitute(italic(y) == a - b %.% italic(x))
}
as.character(as.expression(eq));
}
This managed to plot "a+bx" or "a-bx" to the plot, but without actual coefficients replacing a and b. Does anyone know how to fix the problem? Thanks very much!
Jayden's answer:
lm_eqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}

It looks like you are missing the l in substitute(). That is, use substitute(yourFormula, l). Here's a MWE without the r^2 that parallels the one you're looking at (which I think is at Adding Regression Line Equation and R2 on graph).
library(ggplot2)
# Function to generate correlated data.
GenCorrData = function(mu, Sig, n = 1000) {
U <- chol(Sig)
Z <- matrix(rnorm(n*length(mu)), nrow = length(mu))
Y <- crossprod(U,Z) + mu
Y <- as.data.frame(t(Y))
names(Y) <- c("x", "y")
return(Y)
}
# Function to add text
LinEqn = function(m) {
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x),l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x),l)
}
as.character(as.expression(eq));
}
# Example
set.seed(700)
n1 <- 1000
mu1 <- c(4, 5)
Sig1 <- matrix(c(1, .8, .8, 1), nrow = length(mu1))
df1 <- GenCorrData(mu1, Sig1, n1)
scatter1 <- ggplot(data = df1, aes(x, y)) +
geom_point(shape = 21, color = "blue", size = 3.5) +
scale_x_continuous(expand = c(0, 0), limits = c(0, 8)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 8))
scatter.line1 <- scatter1 +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
color="black", size = 1) +
annotate("text", x = 2, y = 7, color = "black", size = 5,
label = LinEqn(lm(y ~ x, df1)), parse = TRUE)
scatter.line1

Related

Add a Passing-Bablok regression line

I have to perform many comparisons between different measurement methods and I have to use the Passing-Bablok regression approach.
I would like to take advantage of ggplot2 and faceting, but I don't know how to add a geom_smooth layer based on the Passing-Bablok regression.
I was thinking about something like: https://stackoverflow.com/a/59173260/2096356
Furthermore, I would also need to show the regression line equation, with confidence interval for intercept and slope parameters, in each plot.
Edit with partial solution
I've found a partial solution combining the code provided in this post and in this answer.
## Regression algorithm
passing_bablok.fit <- function(x, y) {
x_name <- deparse(substitute(x))
lx <- length(x)
l <- lx*(lx - 1)/2
k <- 0
S <- rep(NA, lx)
for (i in 1:(lx - 1)) {
for (j in (i + 1):lx) {
k <- k + 1
S[k] <- (y[i] - y[j])/(x[i] - x[j])
}
}
S.sort <- sort(S)
N <- length(S.sort)
neg <- length(subset(S.sort,S.sort < 0))
K <- floor(neg/2)
if (N %% 2 == 1) {
b <- S.sort[(N+1)/2+K]
} else {
b <- sqrt(S.sort[N / 2 + K]*S.sort[N / 2 + K + 1])
}
a <- median(y - b * x)
res <- as.vector(c(a,b))
names(res) <- c("(Intercept)", x_name)
class(res) <- "Passing_Bablok"
res
}
## Computing confidence intervals
passing_bablok <- function(formula, data, R = 100, weights = NULL){
ret <- boot::boot(
data = model.frame(formula, data),
statistic = function(data, ind) {
data <- data[ind, ]
args <- rlang::parse_exprs(colnames(data))
names(args) <- c("y", "x")
rlang::eval_tidy(rlang::expr(passing_bablok.fit(!!!args)), data, env = rlang::current_env())
},
R=R
)
class(ret) <- c("Passing_Bablok", class(ret))
ret
}
## Plotting confidence bands
predictdf.Passing_Bablok <- function(model, xseq, se, level) {
pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
if(se) {
preds <- tcrossprod(model$t, cbind(1, xseq))
data.frame(
x = xseq,
y = pred,
ymin = apply(preds, 2, function(x) quantile(x, probs = (1-level)/2)),
ymax = apply(preds, 2, function(x) quantile(x, probs = 1-((1-level)/2)))
)
} else {
return(data.frame(x = xseq, y = pred))
}
}
An example of usage:
z <- data.frame(x = rnorm(100, mean = 100, sd = 5),
y = rnorm(100, mean = 110, sd = 8))
ggplot(z, aes(x, y)) +
geom_point() +
geom_smooth(method = passing_bablok) +
geom_abline(slope = 1, intercept = 0)
So far, I haven't been able to show the regression line equation, with confidence interval for intercept and slope parameters (as +- or in parentheses).
You've arguably done with difficult part with the PaBa regression.
Here's a basic solution using your passing_bablok.fit function:
z <- data.frame(x = 101:200+rnorm(100,sd=10),
y = 101:200+rnorm(100,sd=8))
mycoefs <- as.numeric(passing_bablok.fit(x = z$x, y=z$y))
paba_eqn <- function(thecoefs) {
l <- list(m = format(thecoefs[2], digits = 2),
b = format(abs(thecoefs[1]), digits = 2))
if(thecoefs[1] >= 0){
eq <- substitute(italic(y) == m %.% italic(x) + b,l)
} else {
eq <- substitute(italic(y) == m %.% italic(x) - b,l)
}
as.character(as.expression(eq))
}
library(ggplot2)
ggplot(z, aes(x, y)) +
geom_point() +
geom_smooth(method = passing_bablok) +
geom_abline(slope = 1, intercept = 0) +
annotate("text",x = 110, y = 220, label = paba_eqn(mycoefs), parse = TRUE)
Note the equation will vary because of rnorm in the data creation..
The solution could definitely be made more slick and robust, but it works for both positive and negative intercepts.
Equation concept sourced from: https://stackoverflow.com/a/13451587/2651663

Why the lm_eqn way of adding Regression line function did not work in my case?

I ran into a problem when I was writing a function in R.
I want to compare two variables in this function and I want to draw the regression line of the comparison. I would also want to add the information of the regression line, including the equation and the R^2. The lm_eqn way I have already tried and it did not work on my case, here is my code when I try it. I do not know why, please help!
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
compareFunction <- function(my_dataset, var1, var2) {
ggplot(data = my_dataset,
aes(x = my_dataset[[var1]],
y = my_dataset[[var2]])) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ x') +
geom_text(x = 100, y = 100, label = lm_eqn(my_dataset), parse = TRUE)
}
Ok it becomes a bit tedious in the comment.
So first I recommend adding some useful sample data:
x = runif(100)
y = runif(100)+x
df = data.frame(x,y)
Then update your lm_eqn function as follows - I removed the as.character from your return value.
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.expression(eq);
}
The compareFunction I would change to use ggtitle:
compareFunction <- function(my_dataset, var1, var2) {
ggplot(data = my_dataset,
aes(x = my_dataset[[var1]],
y = my_dataset[[var2]])) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ x') +
ggtitle(lm_eqn(my_dataset))
}
Then compareFunction(df,"x","y") yields:

Adding stats to a plot (ggplot2)

Given a data frame like :
df1= data.frame(x = c(1:50))
df1$val=df1$x*(-0.35)
I used the ggplot2 and added a regression line with the command
t=ggplot(df1, aes(x=val, y=x))+geom_smooth(method=lm) + geom_point()
In order to add the equation and the r value I tried the code from this question Adding Regression Line Equation and R2 on graph
but I am getting the error
Error in terms.formula(formula, data = data) :
'data' argument is of the wrong type
Any ideas on how to fix this?
EDIT
The code I used
my_sts <- function(df1){
m <- lm(df1$x ~ df1$val, df1);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
tgen = t + geom_text(x = -10, y = 50, label = eq(df1), parse = TRUE)
This is copied from a console session. I corrected two things that I thought were errors: 1) as mention in my comment you should not use df1$ in a formula when you have a data argument, and 2) I think you mean to use my_sts(df1)
> df1= data.frame(x = c(1:50))
> df1$val=df1$x*(-0.35)
> my_sts <- function(df1){
+ m <- lm(x ~ val, df1);
+ eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
+ list(a = format(coef(m)[1], digits = 2),
+ b = format(coef(m)[2], digits = 2),
+ r2 = format(summary(m)$r.squared, digits = 3)))
+ as.character(as.expression(eq));
+ }
> t=ggplot(df1, aes(x=val, y=x))+geom_smooth(method=lm) + geom_point()
> tgen = t + geom_text(x = -10, y = 50, label = eq(df1), parse = TRUE)
Error in layer(data = data, mapping = mapping, stat = stat, geom = GeomText, :
could not find function "eq"
> tgen = t + geom_text(x = -10, y = 50, label = my_sts(df1), parse = TRUE)
Warning message:
In summary.lm(m) : essentially perfect fit: summary may be unreliable
> print(tgen)
Seems to print well: Note that x an y roles are reversed, hence the coefficient being the inverse of the modeled factor.

R, ggplot, loop, for multiple panels

Hey I am trying to do ggplot looping through 4 different cases, I have a data with 5 variables, I would like to plot out all points, regression lines, and add y=ax+b, and r^2 in the plots. All are plotted in a 4 panels plots. I use print(do.call(grid.arrange,plot) in the end, it will generate 4 panels in same plot
If I comment out this line, it generates:
As you can see, the scatter plots are different. I wonder why this happens. Also, the y=ax+b and r^2 look ugly, how to clean it up?
Thank you for your help
lm_eqn1 <- function(df,x,y){
m <- lm(y ~ x, df);
# eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
# list(a = format(coef(m)[1], digits = 2),
# b = format(coef(m)[2], digits = 2),
# r2 = format(summary(m)$r.squared, digits = 3)))
eq<- substitute(italic(y) == b %.% italic(x)+ a,list(a = format(coef(m)[1], digits = 2),b = format(coef(m)[2], digits = 2)))
as.character(as.expression(eq));
}
lm_eqn2 <- function(df,x,y){
m <- lm(y ~ x, df);
eq2<- substitute(italic(r)^2~"="~r2,
list(r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq2));
}
plot = list()
df <- data.frame(as.vector(data[[1]]),as.vector(data[[2]]),as.vector(data[[3]]),as.vector(data[[4]]),as.vector(data[[5]]))
colnames(df) <- case
p = 1
for (k in 1:1){
for (j in 2:5){
# for (p in 1:4){
x_lab <- paste(case[k]," [ug/m3]",sep=" ")
y_lab <- paste(case[j]," [ug/m3]",sep=" ")
x=df[,case[k]]
y=df[,case[j]]
print(case[k])
print(case[j])
plot[[p]] = ggplot(df,aes(x=x,y=y))+
geom_point(size=2,alpha = 0.3,color="red")+
theme_bw(base_size=12, base_family = "Helvetica")+
xlab(x_lab)+
ylab(y_lab)+
theme(aspect.ratio=1)+
ggtitle(case[j]) +
geom_smooth(method='lm',se = FALSE, color="black",formula = y ~ x)+
geom_text(x=-0.0005,y=0.05,label=lm_eqn1(df,x,y),parse = TRUE)+
geom_text(x=-0.0005,y=0.04,label=lm_eqn2(df,x,y),parse = TRUE)
# }
print(plot[[p]])
p = p + 1
}
}
#print(do.call(grid.arrange,plot))
Huang. Your script was very helpful to me. For your question, if you use 'aes_string()' instead of 'aes()' in the first line in ggplot(), it will works.
ggplot(df, aes_string(x=x, y=y))

Put text in same coords with ggplot

I'm trying to make a lot of graphs using ggplot2 script, and add some text (Lm equation and r2 value, using this function) for each graph.
The issue is that my x and y coordinates will be different between each graph.
With 'plot' function, you can convert 'plot' coords to 'figure' coords using cnvr.coord function, but in ggplot2 (grid base package), isn't functionally.
below and example (where "p" is a preexistent ggplot2 object) :
p <- p + geom_text(aes(X, Y, label = lm_eqn(lm(as.numeric(a$value) ~ as.numeric(a$date), a))))
I agree with shujaa. You can simply calculate where the function goes based on the range of your data. Using your link above, I've created an example:
library(ggplot2)
df1 <- data.frame(x = c(1:100))
df1$y <- 2 + 3 * df1$x + rnorm(100, sd = 40)
df1$grp <- rep("Group 1",100)
df2 <- data.frame(x = c(1:100))
df2$y <- 10 -.5 * df2$x + rnorm(100, sd = 100)
df2$grp <- rep("Group 2",100)
df3 <- data.frame(x = c(1:100))
df3$y <- -5 + .2 * df3$x + rnorm(100, sd = 10)
df3$grp <- rep("Group 3",100)
df4 <- data.frame(x = c(1:100))
df4$y <- 2 - 3 * df4$x + rnorm(100, sd = 40)
df4$grp <- rep("Group 4",100)
df <- list(df1,df2,df3,df4)
lm_eqn = function(df) {
m = lm(y ~ x, df);
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
pdf("I:/test.pdf")
for (i in 1:4) {
text.x <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,min(df[[i]]$x),max(df[[i]]$x))
text.y <- max(df[[i]]$y)
text.hjust <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,0,1)
p <- ggplot(data = df[[i]], aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p1 = p + geom_text(aes(x = text.x, y = text.y, label = lm_eqn(df[[i]])), parse = TRUE,hjust=text.hjust)
print(p1)
}
dev.off()

Resources