2 polynomial regressions in a ggplot() graph - r

This is my Dataset:
As you can see, there are two quantitative variables (X, Y) and 1 categorical variable (molar, with two factors: M1, M2).
I would like to represent in one single graph two polynomial regressions and their respective prediction intervals: one for the M1 factor and one for the M2 factor. Each polynomial regression has its own degree (M1 is a 4 degree polynomial regression, and M2 is a 6 degree).
I want to use ggplot() function (which is in package ggplot2 in R). I have actually performed this figure but with all data merged (I mean, with no distinction between factors). This is the code I used:
# Fit a linear model
m <- lm(Y ~ X+I(X^2)+I(X^3)+I(X^4), data = Dataset)
# cbind the predictions to Dataset
mpi <- cbind(Dataset, predict(m, interval = "prediction"))
ggplot(mpi, aes(x = X)) +
geom_ribbon(aes(ymin = lwr, ymax = upr),
fill = "blue", alpha = 0.2) +
geom_point(aes(y = Y)) +
geom_line(aes(y = fit), colour = "blue", size = 1)
With this result:
So, I would like to have two different-grade polynomial regressions (one for the M1 and one for the M2), taking into account their respective predictions intervals. Which would be the exact code?
UPDATE - New code! I run this code with no success:
M1=subset(Dataset,Dataset$molar=="M1",select=X:Y)
M2=subset(Dataset,Dataset$molar=="M2",select=X:Y)
M1.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M1",select=X:Y))
M2.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M2",select=X:Y))
newdf <- data.frame(x = seq(0, 1, c(408,663)))
M1.P <- cbind(data=subset(Dataset,Dataset$molar=="M1",select=X:Y), predict(M1.R, interval = "prediction"))
M2.P <- cbind(data=subset(Dataset,Dataset$molar=="M2",select=X:Y), predict(M2.R, interval = "prediction"))
p = cbind(as.data.frame(rbind(M1.P, M2.P)), f = factor(rep(1:2, c(408,663)), x = rep(newdf$x, 2))
mdf = with(Dataset, data.frame(x = rep(x, 2), y = c(subset(Dataset,Dataset$molar=="M1",select=Y), subset(Dataset,Dataset$molar=="M2",select=Y),
f = factor(rep(1:2, c(408,663))))
ggplot(mdf, aes(x = x, y = y, colour = f)) + geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
These are the messages I get now:
[98] WARNING: Warning in if (n < 0L) stop("wrong sign in 'by' argument") :
the condition has length > 1 and only the first element will be used
Warning in if (n > .Machine$integer.max) stop("'by' argument is much too small") :
the condition has length > 1 and only the first element will be used
Warning in 0L:n :
numerical expression has 2 elements: only the first used
Warning in if (by > 0) pmin(x, to) else pmax(x, to) :
the condition has length > 1 and only the first element will be used
[99] WARNING: Warning in predict.lm(M1.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[100] WARNING: Warning in predict.lm(M2.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[101] ERROR: <text>
I think I am closer but still can't see it. Help!

Here is one way. If you have more than two models/levels in the factor you should look into code that will work over the levels of the factor and fit the models that way.
Anyway, first some dummy data:
set.seed(100)
x <- runif(100)
y1 <- 2 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) + rnorm(100)
y2 <- -1 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) +
(-0.3 * x^5) + (2.4 * x^6) + rnorm(100)
df <- data.frame(x, y1, y2)
Fit our two models:
m1 <- lm(y1 ~ poly(x, 4), data = df)
m2 <- lm(y2 ~ poly(x, 6), data = df)
Now precict at some new locations x and stick it together with x and f, a factor indexing the model, into a tidy format:
newdf <- data.frame(x = seq(0, 1, length = 100))
p1 <- predict(m1, newdata = newdf, interval = "prediction")
p2 <- predict(m2, newdata = newdf, interval = "prediction")
p <- cbind(as.data.frame(rbind(p1, p2)), f = factor(rep(1:2, each = 100)),
x = rep(newdf$x, 2))
Melt the original data into tidy form
mdf <- with(df, data.frame(x = rep(x, 2), y = c(y1, y2),
f = factor(rep(1:2, each = 100))))
Draw the plot, using colour to distinguish the models/data
ggplot(mdf, aes(x = x, y = y, colour = f)) +
geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
This gets us

Related

Adding orthogonal regression line in ggplot

I have plotted a scatter graph in R, comparing expected to observed values,using the following script:
library(ggplot2)
library(dplyr)
r<-read_csv("Uni/MSci/Project/DATA/new data sheets/comparisons/for comarison
graphs/R Regression/GAcAs.csv")
x<-r[1]
y<-r[2]
ggplot()+geom_point(aes(x=x,y=y))+
scale_size_area() +
xlab("Expected") +
ylab("Observed") +
ggtitle("G - As x Ac")+ xlim(0, 40)+ylim(0, 40)
My plot is as follows:
I then want to add an orthogonal regression line (as there could be errors in both the expected and observed values). I have calculated the beta value using the following:
v <- prcomp(cbind(x,y))$rotation
beta <- v[2,1]/v[1,1]
Is there a way to add an orthogonal regression line to my plot?
Borrowed from this blog post & this answer. Basically, you will need Deming function from MethComp or prcomp from stats packages together with a custom function perp.segment.coord. Below is an example taken from above mentioned blog post.
library(ggplot2)
library(MethComp)
data(airquality)
airquality <- na.exclude(airquality)
# Orthogonal, total least squares or Deming regression
deming <- Deming(y=airquality$Wind, x=airquality$Temp)[1:2]
deming
#> Intercept Slope
#> 24.8083259 -0.1906826
# Check with prcomp {stats}
r <- prcomp( ~ airquality$Temp + airquality$Wind )
slope <- r$rotation[2,1] / r$rotation[1,1]
slope
#> [1] -0.1906826
intercept <- r$center[2] - slope*r$center[1]
intercept
#> airquality$Wind
#> 24.80833
# https://stackoverflow.com/a/30399576/786542
perp.segment.coord <- function(x0, y0, ortho){
# finds endpoint for a perpendicular segment from the point (x0,y0) to the line
# defined by ortho as y = a + b*x
a <- ortho[1] # intercept
b <- ortho[2] # slope
x1 <- (x0 + b*y0 - a*b)/(1 + b^2)
y1 <- a + b*x1
list(x0=x0, y0=y0, x1=x1, y1=y1)
}
perp.segment <- perp.segment.coord(airquality$Temp, airquality$Wind, deming)
perp.segment <- as.data.frame(perp.segment)
# plot
plot.y <- ggplot(data = airquality, aes(x = Temp, y = Wind)) +
geom_point() +
geom_abline(intercept = deming[1],
slope = deming[2]) +
geom_segment(data = perp.segment,
aes(x = x0, y = y0, xend = x1, yend = y1),
colour = "blue") +
theme_bw()
Created on 2018-03-19 by the reprex package (v0.2.0).
The MethComp package seems to be no longer maintained (was removed from CRAN).
Russel88/COEF allows to use stat_/geom_summary with method="tls" to add an orthogonal regression line.
Based on this and wikipedia:Deming_regression I created the following functions, which allow to use noise ratios other than 1:
deming.fit <- function(x, y, noise_ratio = sd(y)/sd(x)) {
if(missing(noise_ratio) || is.null(noise_ratio)) noise_ratio <- eval(formals(sys.function(0))$noise_ratio) # this is just a complicated way to write `sd(y)/sd(x)`
delta <- noise_ratio^2
x_name <- deparse(substitute(x))
s_yy <- var(y)
s_xx <- var(x)
s_xy <- cov(x, y)
beta1 <- (s_yy - delta*s_xx + sqrt((s_yy - delta*s_xx)^2 + 4*delta*s_xy^2)) / (2*s_xy)
beta0 <- mean(y) - beta1 * mean(x)
res <- c(beta0 = beta0, beta1 = beta1)
names(res) <- c("(Intercept)", x_name)
class(res) <- "Deming"
res
}
deming <- function(formula, data, R = 100, noise_ratio = NULL, ...){
ret <- boot::boot(
data = model.frame(formula, data),
statistic = function(data, ind) {
data <- data[ind, ]
args <- rlang::parse_exprs(colnames(data))
names(args) <- c("y", "x")
rlang::eval_tidy(rlang::expr(deming.fit(!!!args, noise_ratio = noise_ratio)), data, env = rlang::current_env())
},
R=R
)
class(ret) <- c("Deming", class(ret))
ret
}
predictdf.Deming <- function(model, xseq, se, level) {
pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
if(se) {
preds <- tcrossprod(model$t, cbind(1, xseq))
data.frame(
x = xseq,
y = pred,
ymin = apply(preds, 2, function(x) quantile(x, probs = (1-level)/2)),
ymax = apply(preds, 2, function(x) quantile(x, probs = 1-((1-level)/2)))
)
} else {
return(data.frame(x = xseq, y = pred))
}
}
# unrelated hlper function to create a nicer plot:
fix_plot_limits <- function(p) p + coord_cartesian(xlim=ggplot_build(p)$layout$panel_params[[1]]$x.range, ylim=ggplot_build(p)$layout$panel_params[[1]]$y.range)
Demonstration:
library(ggplot2)
#devtools::install_github("Russel88/COEF")
library(COEF)
fix_plot_limits(
ggplot(data.frame(x = (1:5) + rnorm(100), y = (1:5) + rnorm(100)*2), mapping = aes(x=x, y=y)) +
geom_point()
) +
geom_smooth(method=deming, aes(color="deming"), method.args = list(noise_ratio=2)) +
geom_smooth(method=lm, aes(color="lm")) +
geom_smooth(method = COEF::tls, aes(color="tls"))
Created on 2019-12-04 by the reprex package (v0.3.0)
I'm not sure I completely understand the question, but if you want line segments to show errors along both x and y axis, you can do this using geom_segment.
Something like this:
library(ggplot2)
df <- data.frame(x = rnorm(10), y = rnorm(10), w = rnorm(10, sd=.1))
ggplot(df, aes(x = x, y = y, xend = x, yend = y)) +
geom_point() +
geom_segment(aes(x = x - w, xend = x + w)) +
geom_segment(aes(y = y - w, yend = y + w))

How do I propagate the error of a linear regression when projecting from Y to X?

I'm trying to figure out how to propagate errors in the following case
I am calibrating a machine with a couple of standards (a, b, c) with
accepted values x. My machine measures y for these standards, with a
certain error (standard deviation of 1 in this example).
Then I measure replicates of a sample, yielding ynew. Now I want to
convert these values to the accepted measurement scale (the x-axis).
To do this, I can of course do some linear algebra and convert the slope and
intercept that I got from my standard measurements to a reversed slope and
intercept as follows
This works nicely to convert the input values, but how do I get proper estimates of the errors?
In R, I've tried the following:
library(broom) # for tidy lm
library(ggplot2) # for plotting
library(dplyr) # to allow piping
# find confidence value
cv <- function(x, level = 95) {
qt(1 - ((100 - level) / 100) / 2, df = length(x) - 1) * sd(x) / sqrt(length(x))
}
# find confidence interval
ci <- function(x, level = 95) {
xbar <- mean(x)
xci <- cv(x, level = level)
c(fit = xbar, lwr = xbar - xci, upr = xbar + xci)
}
set.seed(1337)
# create fake data
dat <- data.frame(id = rep(letters[1:3], 20),
x = rep(c(1, 7, 10), 20)) %>%
mutate(y = rnorm(n(), -20 + 1.25 * x, 1))
# generate linear model
mod <- lm(y ~ x, dat)
# tidy
mod_aug <- augment(mod)
# these are the new samples that my machine measures
ynew <- rnorm(10, max(dat$y) + 3)
# predict new x-value based on y-value that is outside of range
## predict(mod, newdata = data.frame(y = ynew), interval = "predict")
# Error in eval(predvars, data, env) : object 'x' not found
# or tidy
## augment(mod, newdata = data.frame(y = ynew))
# 50 row df that doesn't make sense
# found this function that should do the job, but it doesn't extrapolate
## approx(x = mod$fitted.values, y = dat$x, xout = ynew)$y
# [1] NA NA NA NA NA NA NA NA NA NA
# this one from Hmisc does allow for extrapolation
with_approx <- Hmisc::approxExtrap(x = mod_aug$.fitted, y = mod_aug$x, xout = ynew)$y
# but in case of lm, isn't using the slope and intercept of a model okay too?
with_itc_slp <- (- coef(mod)[1] / coef(mod)[2]) + (1 / coef(mod)[2] * ynew)
# this would be the 95% prediction interval of the model at the average
# sample position. Could also use "confidence" but this is more correct?
avg_prediction <- predict(mod,
newdata = data.frame(x = mean(with_itc_slp)),
interval = "prediction")
# plot it
ggplot(dat, aes(x = x, y = y, col = id)) +
geom_point() +
geom_hline(yintercept = ynew, col = "gray") +
geom_smooth(aes(group = 1), method = "lm", se = F, fullrange = T,
col = "lightblue") +
geom_smooth(aes(group = 1), method = "lm") +
# 95% CI of the new sample
annotate("pointrange", x = 1, y = mean(ynew),
ymin = ci(ynew)[2], ymax = ci(ynew)[3], col = "green") +
# 95% prediction interval of the linear model at the average transformed
# x-position
annotate("pointrange", x = mean(with_approx), y = mean(ynew),
ymin = avg_prediction[2], ymax = avg_prediction[3], col = "green") +
# transformed using approx
annotate("point", x = with_approx, y = ynew, size = 3, col = "blue",
shape = 1) +
# transformed using intercept and slope
annotate("point", x = with_itc_slp, y = ynew, size = 3, col = "red",
shape = 2) +
# it's pretty
coord_fixed()
resulting in this plot:
Now how do I go from these 95% CIs in the y-direction to transformed sample
x-values with a confidence interval in the x-direction?

R, R², p-value and regression equation

This code gives me a plot with the regression equation and R2: (but i need to mention in which x and y the equation will be (manually)
CORRELATIONP3 <-CORRELATIONP2[product=='a',]
x<-CORRELATIONP3$b
y<-CORRELATIONP3$p
df <- data.frame(x = x)
m <- lm(y ~ x, data = df)
p <- ggplot(data = df, aes(x = x, y = y)) +
scale_x_continuous("b (%)") +
scale_y_continuous("p (%)")+
geom_smooth(method = "lm", formula = y ~ x) +
geom_point()
p
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list( a = format(coef(m)[1], digits = 4),
b = format(coef(m)[2], digits = 4),
r2 = format(summary(m)$r.squared, digits = 3)))
dftext <- data.frame(x = 3, y = 0.2, eq = as.character(as.expression(eq)))
p + geom_text(aes(label = eq), data = dftext, parse = TRUE)
But, with this code I have R and p-value: And here the information about R and p values fits automatically in the plot, why? I want this in the first one as well.
CORRELATIONP3 <-CORRELATIONP2[product=='a',]
x<-CORRELATIONP3$b
y<-CORRELATIONP3$p
df <- data.frame(x = x)
m <- lm(y ~ x, data = df)
p <- ggplot(data = df, aes(x = x, y = y)) +
scale_x_continuous("b (%)") +
scale_y_continuous("p (%)")+
geom_smooth(method = "lm", formula = y ~ x) +
geom_point()
p
eq <- substitute(italic(r)~"="~rvalue*","~italic(p)~"="~pvalue, list(rvalue = sprintf("%.2f",sign(coef(m)[2])*sqrt(summary(m)$r.squared)), pvalue = format(summary(m)$coefficients[2,4], digits = 3)))
dftext <- data.frame(x = 30, y = 0.4, eq = as.character(as.expression(eq)))
p + geom_text(aes(label = eq), data = dftext, parse = TRUE)
Can you tell me how can I join all the 4 informations in one sigle plot? (R, R2, equation and p-value)
Besides that, i would like that these informations could be fitted automatically in the plot, not manually.
Ok, I am not sure if this works as you have not given a reproducible example of your data but I guess you just have to rename one of your variables e.g.:
eq2 <- substitute(italic(r)~"="~rvalue*","~italic(p)~"="~pvalue,
list(rvalue = sprintf("%.2f",sign(coef(m)[2])*sqrt(summary(m)$r.squared)),
pvalue = format(summary(m)$coefficients[2,4], digits = 3)))
and then you change the points you put it on in your plot just a bit below your other block in the first plot. x and y here refer to the position of the text lable so play around with these until your text looks ok.
dftext2 <- data.frame(x = 30, y = 0.12, eq2 = as.character(as.expression(eq2)))
p + geom_text(aes(label = eq2), data = dftext2, parse = TRUE)
please let me know if this works and if this is what you meant.

How to display different levels in a multilevel analysis with different colors

I am a beginner at multilevel analysis and try to understand how I can do graphs with the plot functions from base-R. I understand the output of fit below but I am struggeling with the visualization. df is just some simple test data:
t <- seq(0, 10, 1)
df <- data.frame(t = t,
y = 1.5+0.5*(-1)^t + (1.5+0.5*(-1)^t) * t,
p1 = as.factor(rep(c("p1", "p2"), 10)[1:11]))
fit <- lm(y ~ t * p1, data = df)
# I am looking for an automated version of that:
plot(df$t, df$y)
lines(df$t[df$p1 == "p1"],
fit$coefficients[1] + fit$coefficients[2] * df$t[df$p1 == "p1"], col = "blue")
lines(df$t[df$p1 == "p2"],
fit$coefficients[1] + fit$coefficients[2] * df$t[df$p1 == "p2"] +
+ fit$coefficients[3] + fit$coefficients[4] * df$t[df$p1 == "p2"], col = "red")
It should know that it has to include p1 and that there are two lines.
The result should look like this:
Edit: Predict est <- predict(fit, newx = t) gives the same result as fit but still I don't know "how to cluster".
Edit 2 #Keith: The formula y ~ t * p1 reads y = (a + c * p1) + (b + d * p1) * t. For the "first blue line" c, d are both zero.
This is how I would do it. I'm including a ggplot2 version of plot as well because I find it better fitted for the way I think about plots.
This version will account for the number of levels in p1. If you want to compensate for the number of model parameters, you will just have to adjust the way you construct xy to include all the relevant variables. I should point out that if you omit the newdata argument, fitting will be done on the dataset provided to lm.
t <- seq(0, 10, 1)
df <- data.frame(t = t,
y = 1.5+0.5*(-1)^t + (1.5+0.5*(-1)^t) * t,
p1 = as.factor(rep(c("p1", "p2"), 10)[1:11]))
fit <- lm(y ~ t * p1, data = df)
xy <- data.frame(t = t, p1 = rep(levels(df$p1), each = length(t)))
xy$fitted <- predict(fit, newdata = xy)
library(RColorBrewer) # for colors, you can define your own
cols <- brewer.pal(n = length(levels(df$p1)), name = "Set1") # feel free to ignore the warning
plot(x = df$t, y = df$y)
for (i in 1:length(levels(xy$p1))) {
tmp <- xy[xy$p1 == levels(xy$p1)[i], ]
lines(x = tmp$t, y = tmp$fitted, col = cols[i])
}
library(ggplot2)
ggplot(xy, aes(x = t, y = fitted, color = p1)) +
theme_bw() +
geom_point(data = df, aes(x = t, y = y)) +
geom_line()

graphing confidence intervals nls r

I'm in the process of putting some incidence data together for a proposal. I know that the data takes on a sigmoid shape overall so I fit it using NLS in R. I was trying to get some confidence intervals to plot as well so I used bootstrapping for the parameters, made three lines and here's where I'm having my problem. The bootstrapped CIs give me three sets of values, but because of equation the lines they are crossing.
Picture of Current Plot with "Ideal" Lines in Black
NLS is not my strong suit so perhaps I'm not going about this the right way. I've used mainly a self start function to this point just to get something down on the plot. The second NLS equation will give the same output, but I've put it down now so that I can alter later if needed.
Here is my code thus far:
data <- readRDS(file = "Incidence.RDS")
inc <- nls(y ~ SSlogis(x, beta1, beta2, beta3),
data = data,
control = list(maxiter = 100))
b1 <- summary(inc)$coefficients[1,1]
b2 <- summary(inc)$coefficients[2,1]
b3 <- summary(inc)$coefficients[3,1]
inc2 <- nls(y ~ phi1 / (1 + exp(-(x - phi2) / phi3)),
data = data,
start = list(phi1 = b1, phi2 = b2, phi3 = b3),
control = list(maxiter = 100))
inc2.boot <- nlsBoot(inc2, niter = 1000)
phi1 <- summary(inc2)$coefficients[1,1]
phi2 <- summary(inc2)$coefficients[2,1]
phi3 <- summary(inc2)$coefficients[3,1]
phi1_L <- inc2.boot$bootCI[1,2]
phi2_L <- inc2.boot$bootCI[2,2]
phi3_L <- inc2.boot$bootCI[3,2]
phi1_U <- inc2.boot$bootCI[1,3]
phi2_U <- inc2.boot$bootCI[2,3]
phi3_U <- inc2.boot$bootCI[3,3]
#plot lines
age <- c(20:95)
mean_incidence <- phi1 / (1 + exp(-(age - phi2) / phi3))
lower_incidence <- phi1_L / (1 + exp(-(age - phi2_L) / phi3_L))
upper_incidence <- phi1_U / (1 + exp(-(age - phi2_U) / phi3_U))
inc_line <- data.frame(age, mean_incidence, lower_incidence, upper_incidence)
p <- ggplot()
p <- (p
+ geom_point(data = data, aes(x = x, y = y), color = "darkgreen")
+ geom_line(data = inc_line,
aes(x = age, y = mean_incidence),
color = "blue",
linetype = "solid")
+ geom_line(data = inc_line,
aes(x = age, y = lower_incidence),
color = "blue",
linetype = "dashed")
+ geom_line(data = inc_line,
aes(x = age, y = upper_incidence),
color = "blue",
linetype = "dashed")
+ geom_ribbon(data = inc_line,
aes(x = age, ymin = lower_incidence, ymax = upper_incidence),
fill = "blue", alpha = 0.20)
+ labs(x = "\nAge", y = "Incidence (per 1,000 person years)\n")
)
print(p)
Here's a link to the data.
Any help on what to do next or if this is even possible given my current set up would be appreciated.
Thanks
Try plot.drc in the drc package.
library(drc)
fm <- drm(y ~ x, data = data, fct = LL.3())
plot(fm, type = "bars")
P.S. Please include the library calls in your questions so that the code is self contained and complete. In the case of the question here: library(ggplot2); library(nlstools) .

Resources