Remove unwanted symbols from expression function - R - r

I am using the below function (found here) to generate linear model equations.
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
lm_eqn(df)
[1] "italic(y) == \"14\" + \"3\" %.% italic(x) * \",\" ~ ~italic(r)^2 ~ \"=\" ~ \"0.806\""
However, this function was built for use in ggplot2, meaning it includes specific expression symbols that ggplot2 recognises and acts upon. I am using this function for something else. How can I alter the code so that I just end up with "y = 14 + 3x, r2=0.806"? Thank you.

If this is about dynamically generating/formatting an output string, you can also use stringr::str_interp:
# Sample data
set.seed(2017);
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
# Fit
m <- lm(y ~ x, df);
# Extract coefficients and generate string
a <- coef(m)[1];
b <- coef(m)[2];
r2 <- summary(m)$r.squared;
stringr::str_interp("y = $[2.0f]{a} + $[2.0f]{b} x, R2 = $[4.3f]{r2}")
#[1] "y = 9 + 3 x, R2 = 0.793"
Or use sprintf:
sprintf("y = %2.0f + %2.0f x, R2 = %4.3f", a, b, r2);
#[1] "y = 9 + 3 x, R2 = 0.793"

We can use glue
as.character(glue::glue("y = {round(a)} + {round(b)} x, R2 = {round(r2, 3)}"))
#[1] "y = 9 + 3 x, R2 = 0.793"
NOTE: Data based on #MauritsEvers post

Ah, found it.
g<-as.character("y = a + b x, R2= r2 ")
library(magrittr)
g %<>%
gsub("a", format(coef(m)[1], digits = 2), .) %>%
gsub("b", format(coef(m)[2], digits = 2), .) %>%
gsub("r2", format(summary(m)$r.squared, digits = 3), .)
g
[1] "y = 14 + 3 x, R2= 0.806 "

Related

Why the lm_eqn way of adding Regression line function did not work in my case?

I ran into a problem when I was writing a function in R.
I want to compare two variables in this function and I want to draw the regression line of the comparison. I would also want to add the information of the regression line, including the equation and the R^2. The lm_eqn way I have already tried and it did not work on my case, here is my code when I try it. I do not know why, please help!
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
compareFunction <- function(my_dataset, var1, var2) {
ggplot(data = my_dataset,
aes(x = my_dataset[[var1]],
y = my_dataset[[var2]])) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ x') +
geom_text(x = 100, y = 100, label = lm_eqn(my_dataset), parse = TRUE)
}
Ok it becomes a bit tedious in the comment.
So first I recommend adding some useful sample data:
x = runif(100)
y = runif(100)+x
df = data.frame(x,y)
Then update your lm_eqn function as follows - I removed the as.character from your return value.
lm_eqn <- function(df){
m <- lm(y ~ x, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.expression(eq);
}
The compareFunction I would change to use ggtitle:
compareFunction <- function(my_dataset, var1, var2) {
ggplot(data = my_dataset,
aes(x = my_dataset[[var1]],
y = my_dataset[[var2]])) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ x') +
ggtitle(lm_eqn(my_dataset))
}
Then compareFunction(df,"x","y") yields:

Adding stats to a plot (ggplot2)

Given a data frame like :
df1= data.frame(x = c(1:50))
df1$val=df1$x*(-0.35)
I used the ggplot2 and added a regression line with the command
t=ggplot(df1, aes(x=val, y=x))+geom_smooth(method=lm) + geom_point()
In order to add the equation and the r value I tried the code from this question Adding Regression Line Equation and R2 on graph
but I am getting the error
Error in terms.formula(formula, data = data) :
'data' argument is of the wrong type
Any ideas on how to fix this?
EDIT
The code I used
my_sts <- function(df1){
m <- lm(df1$x ~ df1$val, df1);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
tgen = t + geom_text(x = -10, y = 50, label = eq(df1), parse = TRUE)
This is copied from a console session. I corrected two things that I thought were errors: 1) as mention in my comment you should not use df1$ in a formula when you have a data argument, and 2) I think you mean to use my_sts(df1)
> df1= data.frame(x = c(1:50))
> df1$val=df1$x*(-0.35)
> my_sts <- function(df1){
+ m <- lm(x ~ val, df1);
+ eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
+ list(a = format(coef(m)[1], digits = 2),
+ b = format(coef(m)[2], digits = 2),
+ r2 = format(summary(m)$r.squared, digits = 3)))
+ as.character(as.expression(eq));
+ }
> t=ggplot(df1, aes(x=val, y=x))+geom_smooth(method=lm) + geom_point()
> tgen = t + geom_text(x = -10, y = 50, label = eq(df1), parse = TRUE)
Error in layer(data = data, mapping = mapping, stat = stat, geom = GeomText, :
could not find function "eq"
> tgen = t + geom_text(x = -10, y = 50, label = my_sts(df1), parse = TRUE)
Warning message:
In summary.lm(m) : essentially perfect fit: summary may be unreliable
> print(tgen)
Seems to print well: Note that x an y roles are reversed, hence the coefficient being the inverse of the modeled factor.

R, ggplot, loop, for multiple panels

Hey I am trying to do ggplot looping through 4 different cases, I have a data with 5 variables, I would like to plot out all points, regression lines, and add y=ax+b, and r^2 in the plots. All are plotted in a 4 panels plots. I use print(do.call(grid.arrange,plot) in the end, it will generate 4 panels in same plot
If I comment out this line, it generates:
As you can see, the scatter plots are different. I wonder why this happens. Also, the y=ax+b and r^2 look ugly, how to clean it up?
Thank you for your help
lm_eqn1 <- function(df,x,y){
m <- lm(y ~ x, df);
# eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
# list(a = format(coef(m)[1], digits = 2),
# b = format(coef(m)[2], digits = 2),
# r2 = format(summary(m)$r.squared, digits = 3)))
eq<- substitute(italic(y) == b %.% italic(x)+ a,list(a = format(coef(m)[1], digits = 2),b = format(coef(m)[2], digits = 2)))
as.character(as.expression(eq));
}
lm_eqn2 <- function(df,x,y){
m <- lm(y ~ x, df);
eq2<- substitute(italic(r)^2~"="~r2,
list(r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq2));
}
plot = list()
df <- data.frame(as.vector(data[[1]]),as.vector(data[[2]]),as.vector(data[[3]]),as.vector(data[[4]]),as.vector(data[[5]]))
colnames(df) <- case
p = 1
for (k in 1:1){
for (j in 2:5){
# for (p in 1:4){
x_lab <- paste(case[k]," [ug/m3]",sep=" ")
y_lab <- paste(case[j]," [ug/m3]",sep=" ")
x=df[,case[k]]
y=df[,case[j]]
print(case[k])
print(case[j])
plot[[p]] = ggplot(df,aes(x=x,y=y))+
geom_point(size=2,alpha = 0.3,color="red")+
theme_bw(base_size=12, base_family = "Helvetica")+
xlab(x_lab)+
ylab(y_lab)+
theme(aspect.ratio=1)+
ggtitle(case[j]) +
geom_smooth(method='lm',se = FALSE, color="black",formula = y ~ x)+
geom_text(x=-0.0005,y=0.05,label=lm_eqn1(df,x,y),parse = TRUE)+
geom_text(x=-0.0005,y=0.04,label=lm_eqn2(df,x,y),parse = TRUE)
# }
print(plot[[p]])
p = p + 1
}
}
#print(do.call(grid.arrange,plot))
Huang. Your script was very helpful to me. For your question, if you use 'aes_string()' instead of 'aes()' in the first line in ggplot(), it will works.
ggplot(df, aes_string(x=x, y=y))

How to automate linear regression of multiple rows in a loop and plot using R

I am working with 2 data frames and trying to automate the way I currently do.
ID <- c("ID101","ID102","ID103","ID104","ID105","ID106","ID107","ID108","ID109","ID110")
A <- c(420,440,490,413,446,466,454,433,401,414)
B <- c(230,240,295,253,266,286,254,233,201,214)
C <- c(20,40,90,13,46,66,54,33,61,14)
D <- c(120,140,190,113,146,166,154,133,101,114)
E <- c(38,34,33,56,87,31,12,44,68,91)
F <- c(938,934,973,956,987,931,962,944,918,921)
df1 <- data.frame(ID,A,B,C,D,E,F)
Upstream <- c("A","C","E")
Downstream <- c("B","D","F")
df2 <- data.frame(Upstream,Downstream)
I am currently running a simple linear regression between upstream and downstream data and plot its residuals along with it. The way I do it manually is
fit <- lm(A ~ B, data=df)
lm_eqn <- function(df){
m <- lm(A ~ B, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
rmse = round(sqrt(mean(resid(m)^2,na.rm=TRUE)), 3)))
as.character(as.expression(eq));
}
library(ggplot2)
library(grid)
library(gridExtra)
p1 <- ggplot(df, aes(x=A, y=B)) + geom_point(colour="red",size = 3) + geom_smooth(method=lm) + geom_text(aes(size=10),x = -Inf, hjust = -1, y = Inf, vjust = 1, label = lm_eqn(df), parse = TRUE,show.legend = F)
p2 <- ggplot(df, aes(x=B, y=resid(fit))) + ylab("Residuals") + geom_point(shape=1,colour="red",size = 3) + geom_smooth(method = "lm")
grid.arrange(p1, p2, ncol=2,top=textGrob("Regression data",
gp=gpar(cex=1.5, fontface="bold")))
I get this plot
I redo this manually for the next row in df2 which is C & D and then manually change the parameters again for the next row which is E & F.
How can I use functions or automate this logic so that I run only one time and get the 3 plots, one for each (A&B), (C&D), (E&F).
Please let me know if I am not clear on what I want. Ideally I am looking a way to code up so that I don't manually need to enter the values (A,B,C,D,E,F) at the respective places every time I run. Kindly please provide some directions on how to solve this.
You can use apply() on each df2s row, using as.formula() and aes_string():
apply(df2, 1, function(d)
{
fit <- lm(as.formula(paste(d["Upstream"], " ~ ", d["Downstream"])), data=df1)
lm_eqn <- function(df){
m <- lm(as.formula(paste(d["Upstream"], " ~ ", d["Downstream"])), df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
rmse = round(sqrt(mean(resid(m)^2,na.rm=TRUE)), 3)))
as.character(as.expression(eq));
}
library(ggplot2)
library(grid)
library(gridExtra)
p1 <- ggplot(df1, aes_string(x=d["Upstream"], y=d["Downstream"])) + geom_point(colour="red",size = 3) + geom_smooth(method=lm) + geom_text(aes(size=10),x = -Inf, hjust = -1, y = Inf, vjust = 1, label = lm_eqn(df1), parse = TRUE,show.legend = FALSE)
p2 <- ggplot(df1, aes_string(x=d["Downstream"], y=resid(fit))) + ylab("Residuals") + geom_point(shape=1,colour="red",size = 3) + geom_smooth(method = "lm")
grid.arrange(p1, p2, ncol=2,top=textGrob("Regression data",
gp=gpar(cex=1.5, fontface="bold")))
})
Another answer using reshape2 to organize your data and plyr to run the regression and plots on the data subsets:
library(reshape2)
df3 <- cbind(
#melt(data, id.vars, measure.vars, variable.name, value.name)
melt(df1, c("ID"), df2$Upstream, "up", "Independent.var"),
melt(df1, c("ID"), df2$Downstream, "down", "Dependent.var")
)
#df3 #Results of the above cbind(melt, melt)
# ID up value ID down Dependent.var
#1 ID101 A 420 ID101 B 230
#2 ID102 A 440 ID102 B 240
#3 ID103 A 490 ID103 B 295
# . . . . . .
#28 ID108 E 44 ID108 F 944
#29 ID109 E 68 ID109 F 918
#30 ID110 E 91 ID110 F 921
#Small edit to the labeling function:
lm_eqn <- function(df){
m <- lm(Dependent.var ~ Independent.var, df); #This is the only change
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
rmse = round(sqrt(mean(resid(m)^2,na.rm=TRUE)), 3)))
as.character(as.expression(eq));
}
# Put your plot code into a function.
plotter <- function(zz) {
zz$resid <- resid(lm(Dependent.var ~ Independent.var, zz))
p1 <- ggplot(zz, aes(x= Independent.var, y= Dependent.var)) +
geom_point(colour="red",size = 3) + geom_smooth(method=lm) +
geom_text( aes(size=10),x = -Inf, hjust = -2, y = Inf, vjust = 1, label = lm_eqn(zz), parse = TRUE,show.legend = FALSE)
p2 <- ggplot(zz, aes(x= Dependent.var, y=resid )) + ylab("Residuals") +
geom_point(shape=1,colour="red",size = 3) + geom_smooth(method = "lm")
p3 <- grid.arrange(p1, p2, ncol=2,top=textGrob("Regression data",
gp=gpar(cex=1.5, fontface="bold")))
#Choose what you want to output here. You can output a list: ex. list(fit, graph, etc)
p3 #Only the last plot is returned in this case
}
library(plyr)
#Run on every subset of data:
#dlply = take input (d)ataframe and output a (l)ist using (ply)r
dlply(df3, .variables = c("up"), .fun = plotter)
#p3 is output in list format

Put text in same coords with ggplot

I'm trying to make a lot of graphs using ggplot2 script, and add some text (Lm equation and r2 value, using this function) for each graph.
The issue is that my x and y coordinates will be different between each graph.
With 'plot' function, you can convert 'plot' coords to 'figure' coords using cnvr.coord function, but in ggplot2 (grid base package), isn't functionally.
below and example (where "p" is a preexistent ggplot2 object) :
p <- p + geom_text(aes(X, Y, label = lm_eqn(lm(as.numeric(a$value) ~ as.numeric(a$date), a))))
I agree with shujaa. You can simply calculate where the function goes based on the range of your data. Using your link above, I've created an example:
library(ggplot2)
df1 <- data.frame(x = c(1:100))
df1$y <- 2 + 3 * df1$x + rnorm(100, sd = 40)
df1$grp <- rep("Group 1",100)
df2 <- data.frame(x = c(1:100))
df2$y <- 10 -.5 * df2$x + rnorm(100, sd = 100)
df2$grp <- rep("Group 2",100)
df3 <- data.frame(x = c(1:100))
df3$y <- -5 + .2 * df3$x + rnorm(100, sd = 10)
df3$grp <- rep("Group 3",100)
df4 <- data.frame(x = c(1:100))
df4$y <- 2 - 3 * df4$x + rnorm(100, sd = 40)
df4$grp <- rep("Group 4",100)
df <- list(df1,df2,df3,df4)
lm_eqn = function(df) {
m = lm(y ~ x, df);
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
pdf("I:/test.pdf")
for (i in 1:4) {
text.x <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,min(df[[i]]$x),max(df[[i]]$x))
text.y <- max(df[[i]]$y)
text.hjust <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,0,1)
p <- ggplot(data = df[[i]], aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p1 = p + geom_text(aes(x = text.x, y = text.y, label = lm_eqn(df[[i]])), parse = TRUE,hjust=text.hjust)
print(p1)
}
dev.off()

Resources