ggplot of lm() with equation [duplicate] - r

I have read many postings on this topic using expression(), paste(), and bquote(), or some combination. I think I am close to solving my problem, but I just can't get there. The following script generates a plot labelled with "y = 1 + 2(x); r^2= 0.9". How can I italicize "y" and "x", and italicize the "r" and superscript the 2 of "r^2"? If I have overlooked a relevant earlier post, sorry, but please direct me to it.
df <- data.frame(x=c(1:5), y=c(1:5))
a <- 1
b <- 2
r2 <- 0.9
eq <- paste("y = ", a, " + ", b, "(x); r^2=", r2)
ggplot(data=df, aes(x=x, y=y))+
geom_point(color="black")+
geom_text(x=2, y=4,label=eq, parse=FALSE)

You could use annotate() which allows you to paste directly into the plot.
library(ggplot2)
ggplot(data=df, aes(x=x, y=y)) +
geom_point(color="black") +
annotate('text', 2.5, 4,
label=paste("italic(y)==", a, "+", b,
"~italic(x)~';'~italic(r)^2==", r2),
parse=TRUE,
hjust=1, size=5)
Yields:
Data:
df <- data.frame(x=c(1:5), y=c(1:5))
a <- 1
b <- 2
r2 <- 0.9

You can use a combination of substitute and plotmath (https://www.rdocumentation.org/packages/grDevices/versions/3.5.1/topics/plotmath) to italicize the text-
# setup
set.seed(123)
library(ggplot2)
# dataframe
df <- data.frame(x = c(1:5), y = c(1:5))
# label
eq <- substitute(
expr =
paste(
italic("y"),
" = ",
a,
" + ",
b,
"(",
italic("x"),
"); ",
italic("r") ^ 2,
" = ",
r2
),
env = base::list(a = 1,
b = 2,
r2 = 0.9)
)
# plot
ggplot(data = df, aes(x = x, y = y)) +
geom_point(color = "black") +
labs(subtitle = eq)
Created on 2018-12-04 by the reprex package (v0.2.1)

In addition to the answer by Indrajit Patil & jay-sf, I would like to add that there is an automated way to fit regression lines (I believe there are many), using a package called ggpmisc. The letters that you want in italic, are already formatted in such a way. The code that needs to be used is:
> install.packages('ggpmisc'); library(ggpmisc); formula <- y ~ x
> df <- data.frame(x=c(1:5), y=c(1:5))
> ggplot(data = df, aes(x, y)) + geom_point(color="black") +
geom_smooth(method = "lm", formula = formula) +
stat_poly_eq(aes(label = paste(..eq.label.., ..adj.rr.label.., sep = "~~~~")),
formula = formula, parse = TRUE)
It shows the fitted lines also, which I hope is not an impediment to the main goal.
EDIT: The line can be removed using linetype = 0, compatible with
most of the aesthetics in ggplot2.
... + geom_smooth(method = "lm", formula = formula, linetype = 0) + ...

Related

How to format ggplot `geom_text` with formula, getting unwanted "c(...)"

In my ggplot2 code below, I want to show the formula for a linear-regression fit on my plot with geom_text, but I get unwanted c before the values of a and b, how do I prevent this?
p <- ggplot(data=Algae, aes(x=a254, y=DOC))+
geom_point(color="blue",stat="identity") +
geom_smooth(method="lm",se=FALSE,color="red",formula=y~x)
model.lm <- lm(DOC~a254, data=Algae)
l <- list(a=format(coef(model.lm)[1], digits=4),
b=format(coef(model.lm)[2], digits=4),
r2=format(summary(model.lm)$r.squared, digits=4),
p=format(summary(model.lm)$coefficients[2,4], digits=4))
eq <- substitute(italic(DOC) == a - b %*% italic(a254)~","~italic(R)^2~"="~r2~",
"~italic(P)~"="~p, l)
p1 <- p + geom_text(aes(x =6, y = 0, label = as.character(as.expression(eq))), parse = TRUE)
p1
The reason for this is that you first format() your data into character format and then try to calculate with strings. You could solve the problem this way:
First, it is more convenient to transform your list into a data.frame, using:
d <- as.data.frame(l)
The values should be converted back to numeric, since you yet want to do arithmetics inside the formula:
d[] <- lapply(d, function(x) as.numeric(as.character(x)))
Then it should work fine:
eq <- substitute(italic(Sepal.Length) == a - b %*% italic(Petal.Length)~","~italic(R)^2~"="~r2~",
"~italic(P)~"="~p, d)
p + geom_text(aes(x =5, y = 0, label = as.character(as.expression(eq))), parse = TRUE)
You could also use annotate() to add the formula to the plot, which might look a little nicer:
p + annotate('text', 7, 4,
label=paste("italic(Sepal.Length)==", d$a, "~-~", d$b, "~x~",
"~italic(Petal.Length)~';'~italic(r)^2==", d$r2,
"~italic(P)==", d$p),
parse=TRUE,
hjust=1, size=3.5)
Yielding:
Data:
library(ggplot2)
p <- ggplot(data=iris, aes(x=Petal.Length, y=Sepal.Length)) +
geom_point(color="blue", stat="identity") +
geom_smooth(method="lm", se=FALSE, color="red", formula=y~x)
model.lm <- lm(Sepal.Length ~ Petal.Length, data=iris)
l <- list(a=format(coef(model.lm)[1], digits=4),
b=format(coef(model.lm)[2], digits=4),
r2=format(summary(model.lm)$r.squared, digits=4),
p=format(summary(model.lm)$coefficients[2, 4], digits=4))

Getting a variable to pass into function in R (ggplot2)

I'm trying to plot a graph between two columns of data from the data frame called "final". I want the p value and r^2 value to show up on the graph.
I'm using this function and code, but it gives me the error "cannot find y value"
library(ggplot2)
lm_eqn <- function(final, x, y){
m <- lm(final[,y] ~ final[,x])
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
output_plot <- lm_eqn(final, x, y)
p1 <- ggplot(final, aes(x=ENSG00000153563, y= ENSG00000163599)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "CD8A", y = "CTLA-4") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))
How do I get both columns of data x and y to flow through the function and for the graph to plot with the p value and residual value printed on the graph?
Thanks in advance.
When you call function for output_plot generation you have to use the same ENS... variables as in your plot. After simplifying slightly function, should work now
library(stats)
library(ggplot2)
lm_eqn <- function(x, y){
m <- lm(y ~ x)
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
x <-c(1,2,5,2,3,6,7,0)
y <-c(2,3,5,9,8,3,3,1)
final <- data_frame(x,y)
output_plot <- lm_eqn(x, y)
p1 <- ggplot(final, aes(x=x, y= y)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "x", y = "y") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))

R, R², p-value and regression equation

This code gives me a plot with the regression equation and R2: (but i need to mention in which x and y the equation will be (manually)
CORRELATIONP3 <-CORRELATIONP2[product=='a',]
x<-CORRELATIONP3$b
y<-CORRELATIONP3$p
df <- data.frame(x = x)
m <- lm(y ~ x, data = df)
p <- ggplot(data = df, aes(x = x, y = y)) +
scale_x_continuous("b (%)") +
scale_y_continuous("p (%)")+
geom_smooth(method = "lm", formula = y ~ x) +
geom_point()
p
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list( a = format(coef(m)[1], digits = 4),
b = format(coef(m)[2], digits = 4),
r2 = format(summary(m)$r.squared, digits = 3)))
dftext <- data.frame(x = 3, y = 0.2, eq = as.character(as.expression(eq)))
p + geom_text(aes(label = eq), data = dftext, parse = TRUE)
But, with this code I have R and p-value: And here the information about R and p values fits automatically in the plot, why? I want this in the first one as well.
CORRELATIONP3 <-CORRELATIONP2[product=='a',]
x<-CORRELATIONP3$b
y<-CORRELATIONP3$p
df <- data.frame(x = x)
m <- lm(y ~ x, data = df)
p <- ggplot(data = df, aes(x = x, y = y)) +
scale_x_continuous("b (%)") +
scale_y_continuous("p (%)")+
geom_smooth(method = "lm", formula = y ~ x) +
geom_point()
p
eq <- substitute(italic(r)~"="~rvalue*","~italic(p)~"="~pvalue, list(rvalue = sprintf("%.2f",sign(coef(m)[2])*sqrt(summary(m)$r.squared)), pvalue = format(summary(m)$coefficients[2,4], digits = 3)))
dftext <- data.frame(x = 30, y = 0.4, eq = as.character(as.expression(eq)))
p + geom_text(aes(label = eq), data = dftext, parse = TRUE)
Can you tell me how can I join all the 4 informations in one sigle plot? (R, R2, equation and p-value)
Besides that, i would like that these informations could be fitted automatically in the plot, not manually.
Ok, I am not sure if this works as you have not given a reproducible example of your data but I guess you just have to rename one of your variables e.g.:
eq2 <- substitute(italic(r)~"="~rvalue*","~italic(p)~"="~pvalue,
list(rvalue = sprintf("%.2f",sign(coef(m)[2])*sqrt(summary(m)$r.squared)),
pvalue = format(summary(m)$coefficients[2,4], digits = 3)))
and then you change the points you put it on in your plot just a bit below your other block in the first plot. x and y here refer to the position of the text lable so play around with these until your text looks ok.
dftext2 <- data.frame(x = 30, y = 0.12, eq2 = as.character(as.expression(eq2)))
p + geom_text(aes(label = eq2), data = dftext2, parse = TRUE)
please let me know if this works and if this is what you meant.

ggplot2: How to parse a character variable (e.g. x <- ".35") as character, not number, in geom_text label

I am working on a figure for publication and wish to annotate it with some beta and p values; the style guidelines of my area dictate that these numbers be formatted without leading zeros (e.g., ".003", not "0.003"). I have run into what seems like a Catch-22; I have extracted beta and p values from my models and done some preprocessing to correctly format them so that they are now characters rather than numeric:
fake.beta.vals <- c(".53", ".29", ".14")
fake.p.vals <- c(".034", ".001", ".050")
But, when I try to use these values in my figure, parse = TRUE turns them back into numeric values, losing the formatting I need.
fake.beta.vals <- c(".53", ".29", ".14")
fake.p.vals <- c(".034", ".001", ".050")
p <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width))
p <- p +
geom_smooth(method = "lm") +
geom_point() +
facet_wrap( ~ Species)
p
len <-length(levels(iris$Species))
vars <- data.frame(expand.grid(levels(iris$Species)))
colnames(vars) <- c("Species")
betalabs <- as.data.frame(fake.beta.vals)
plabs <- as.data.frame(fake.p.vals)
dat <- data.frame(
x = rep(7, len),
y = rep(4, len),
vars,
betalabs,
plabs)
dat$fake.beta.vals <- as.factor(dat$fake.beta.vals)
dat$fake.p.vals <- as.factor(dat$fake.p.vals)
p <- p +
geom_text(
aes(x = x,
y = y,
label = paste("list(beta ==",
fake.beta.vals,
", italic(p) ==",
fake.p.vals,
")"),
group = NULL),
size = 5,
data = dat,
parse = TRUE)
p
I have been banging my head against this problem for a while now but adding as.character():
label = paste("list(beta ==",
as.character(fake.beta.vals),
", italic(p) ==",
as.character(fake.p.vals),
")"),
Is obviously also cancelled out by parse = TRUE
And adding the function I had previously used to format my values:
statformat <- function(val,z){
sub("^(-?)0.", "\\1.", sprintf(paste("%.",z,"f", sep = ""), val))
}
Is even worse:
label = paste("list(beta ==",
statformat(fake.beta.vals, 2),
", italic(p) ==",
statformat(fake.p.vals, 3),
")"),
And just ends up with a mess.
Help?
Use bquote to create the labels, then coerce to a character representation using deparse
For example
# create a list of labels using bquotw
labs <- Map(.beta = fake.beta.vals,
.p = fake.p.vals,
f = function(.beta,.p) bquote(list(beta == .(.beta), italic(p) == .(.p))))
# coerce to a character representation for parse=TRUE to work within
# geom_text
dat <- data.frame(
x = rep(7, len),
y = rep(4, len),
vars,
labels = sapply(labs,deparse))
p <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_smooth(method = "lm") +
geom_point() +
facet_wrap( ~ Species) +
geom_text(data = dat, aes(x=x,y=y,label=labels), parse=TRUE)
p
After getting back to my computer and re-reading your question, I found that I misinterpreted the question. Trying out the I function, I found that it doesn't seem to work with parse.
I found a way to get it to work, and this is by encasing your fake.beta.vals and fake.p.vals with the ` character or the ' character in your call to parse.
p <- p +
geom_text(
aes(x = x,
y = y,
label = paste("list(beta ==",
"`", fake.beta.vals, "`",
", italic(p) ==",
"`", fake.p.vals, "`",
")",
sep=""),
group = NULL),
size = 5,
data = dat,
parse = TRUE)
That should work.

How to create faceted linear regression plot using GGPLOT

I have a data frame created the following way.
library(ggplot2)
x <- data.frame(letters[1:10],abs(rnorm(10)),abs(rnorm(10)),type="x")
y <- data.frame(letters[1:10],abs(rnorm(10)),abs(rnorm(10)),type="y")
# in reality the number of row could be larger than 10 for each x and y
all <- rbind(x,y)
colnames(all) <- c("name","val1","val2","type")
What I want to do is to create a faceted ggplot that looks roughly like this:
Hence each facet above is the correlation plot of the following:
# Top left facet
subset(all,type=="x")$val1
subset(all,type=="y")$val1
# Top right facet
subset(all,type=="x")$val1
subset(all,type=="y")$val2
# ...etc..
But I'm stuck with the following code:
p <- ggplot(all, aes(val1, val2))+ geom_smooth(method = "lm") + geom_point() +
facet_grid(type ~ )
# Calculate correlation for each group
cors <- ddply(all, c(type ~ ), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=0.5, y=0.5)
What's the right way to do it?
Some of your code was incorrect. This works for me:
p <- ggplot(all, aes(val1, val2))+ geom_smooth(method = "lm") + geom_point() +
facet_grid(~type)
# Calculate correlation for each group
cors <- ddply(all, .(type), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=1, y=-0.25)
Edit: Following OP's comment and edit. The idea is to re-create the data with all four combinations and then facet.
# I consider the type in your previous data to be xx and yy
dat <- data.frame(val1 = c(rep(all$val1[all$type == "x"], 2),
rep(all$val1[all$type == "y"], 2)),
val2 = rep(all$val2, 2),
grp1 = rep(c("x", "x", "y", "y"), each=10),
grp2 = rep(c("x", "y", "x", "y"), each=10))
p <- ggplot(dat, aes(val1, val2)) + geom_point() + geom_smooth(method = "lm") +
facet_grid(grp1 ~ grp2)
cors <- ddply(dat, .(grp1, grp2), summarise, cor = round(cor(val1, val2), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=1, y=-0.25)
Since your data is not in the appropriate format, some reshaping is necessary before it can be plotted.
Firstly, reshape the data to the long format:
library(reshape2)
allM <- melt(all[-1], id.vars = "type")
Split the values along type and val1 vs. val2:
allList <- split(allM$value, interaction(allM$type, allM$variable))
Create a list of all combinations:
allComb <- unlist(lapply(c(1, 3),
function(x)
lapply(c(2 ,4),
function(y)
do.call(cbind, allList[c(x, y)]))),
recursive = FALSE)
Create a new dataset:
allNew <- do.call(rbind,
lapply(allComb, function(x) {
tmp <- as.data.frame(x)
tmp <- (within(tmp, {xval <- names(tmp)[1];
yval <- names(tmp)[2]}))
names(tmp)[1:2] <- c("x", "y")
tmp}))
Plot:
library(ggplot2)
p <- ggplot(allNew, aes(x = x, y = y)) +
geom_smooth(method = "lm") +
geom_point() +
facet_grid(yval ~ xval)
# Calculate correlation for each group
library(plyr)
cors <- ddply(allNew, .(yval, xval), summarise, cor = round(cor(x, y), 2))
p + geom_text(data=cors, aes(label=paste("r=", cor, sep="")), x=0.5, y=0.5)
There is an additional package ggpubr available now addressing exactly this issue with the stat_cor() function.
library(tidyverse)
library(ggpubr)
ggplot(all, aes(val1, val2))+
geom_smooth(method = "lm") +
geom_point() +
facet_grid(~type) +
stat_cor()

Resources