Different colours for values above / below a linear trend line - r

I'm using ggplot to plot a time series with a linear regression line. I would like to have different colours for my time series depending on whether it is above or below the trend line.
Here is a code example to plot the series and the corresponding trend line with different colours for the series and the line:
x <- seq(as.Date("2000/1/1"), as.Date("2010/1/1"), "years")
y <- rnorm(length(x),0,10)
df <- data.frame(x,y)
ggplot(df, aes(x, y)) +
stat_smooth(method = 'lm', aes(colour = 'Trend'), se = FALSE) +
geom_line(aes(colour = 'Observation') ) +
theme_bw() +
xlab("x") +
ylab("y") +
scale_colour_manual(values = c("blue","red"))
Have a nice day!

I got rid of the dates, since they were driving me nuts. Perhaps someone can add a solution for that. Otherwise it seems quite doable, with some basic high school maths.
df <- data.frame(x = 2000:2010,
y = rnorm(11, 0, 10))
fm <- lm(y ~ x, data = df)
co <- coef(fm)
df$under_over <- sign(fm$residuals)
for (i in 1:(nrow(df) - 1)) {
# Get slope and intercept for line segment
slope <- (df$y[i + 1] - df$y[i]) / (df$x[i + 1] - df$x[i])
int <- df$y[i] - slope * df$x[i]
# find where they would cross
x <- (co[1] - int) / (slope - co[2])
y <- slope * x + int
# if that is in the range of the segment it is a crossing, add to the data
if (x > df$x[i] & x < df$x[i + 1])
df <- rbind(df, c(x = x, y = y, under_over = NA))
}
#order by x
df <- df[order(df$x), ]
# find color for intersections
for (i in 1:nrow(df))
if (is.na(df$under_over[i]))
df$under_over[i] <- df$under_over[i + 1]
ggplot(df) +
geom_abline(intercept = co[1], slope = co[2]) +
geom_path(aes(x, y, col = as.factor(under_over), group = 1)) +
theme_bw()

Related

draw vertical lines in ggplot with faceting

I have line plots y vs x. y is sigmoid and varies from 0 to 1.
determine the value of x where y = 0.5 or very close by interpolation.
draw vertical line at x where y = 0.5
library(tidyverse)
# continuous variables
x <- seq(-5, 5, 0.1)
# compute y1
error_term <- runif(1, min = -2, max = 2)
y1 <- 1/(1 + exp(-x + error_term))
# compute y2
error_term <- runif(1, min = -2, max = 2)
y2 <- 1/(1 + exp(-x + error_term))
# merge y
y <- c(y1, y2)
x <- c(x, x)
# categorical variable
a <- c(rep(0, 101), rep(1, 101))
tbl <- tibble(x, a, y)
# TASK
# 1. determine values of x at which y = 0.5 for all categories and store them in variable x0
# 2. Use x0 to draw vertical lines in plots at x where y is 0.5
# ggplot
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
facet_grid(a ~ .)
This really isn't something built in to ggplot so you'll need to summarize the data yourself prior to plotting. You can write a helper function and then create the data you need for the lines
find_intersect <- function(x,y, target=0.5) {
optimize(function(z) (approxfun(x,y)(z)-target)^2, x)$minimum
}
line_data <- tbl %>%
group_by(a) %>%
summarize(xint=find_intersect(x,y))
Then plot with
ggplot(data = tbl,
aes(x = x,
y = y)) +
geom_line() +
theme_bw() +
geom_vline(aes(xintercept=xint), data=line_data) +
facet_grid(a ~ .)

Make ggplot with regression line and normal distribution overlay

I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# Probability density function of a normal logistic distribution
pdfDeltaFun <- function(x) {
prob = (exp(x)/(1 + exp(x))^2)
return(prob)
}
# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
geom_point() +
geom_line() +
stat_function(fun = pdfDeltaFun)+
coord_flip()
I think this comes pretty close to the first illustration you give. If this is a thing you don't need to repeat many times, it is probably best to compute the density curves prior to plotting and use a seperate dataframe to plot these.
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# For every row in `df`, compute a rotated normal density centered at `y` and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
mu <- df$y[i]
range <- mu + c(-3, 3)
seq <- seq(range[1], range[2], length.out = 100)
data.frame(
x = -1 * dnorm(seq, mean = mu) + df$x[i],
y = seq,
grp = i
)
})
# Combine above densities in one data.frame
curves <- do.call(rbind, curves)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
# The path draws the curve
geom_path(data = curves, aes(group = grp)) +
# The polygon does the shading. We can use `oob_squish()` to set a range.
geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)),group = grp))
The second illustration is pretty close to your code. I simplified your density function by the standard normal density function and added some extra paramters to stat function:
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
stat_function(fun = dnorm,
aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
xlim = range(df$y)) +
# We fill with a polygon, squishing the y-range
stat_function(fun = dnorm, geom = "polygon",
aes(x = after_stat(-y * 4 - 5),
y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
xlim = range(df$y))

How to calculate 95% prediction interval from nls

Borrowing the example data from this question, if I have the following data and I fit the following non linear model to it, how can I calculate the 95% prediction interval for my curve?
library(broom)
library(tidyverse)
x <- seq(0, 4, 0.1)
y1 <- (x * 2 / (0.2 + x))
y <- y1 + rnorm(length(y1), 0, 0.2)
d <- data.frame(x, y)
mymodel <- nls(y ~ v * x / (k + x),
start = list(v = 1.9, k = 0.19),
data = d)
mymodel_aug <- augment(mymodel)
ggplot(mymodel_aug, aes(x, y)) +
geom_point() +
geom_line(aes(y = .fitted), color = "red") +
theme_minimal()
As an example, I can easily calculate the prediction interval from a linear model like this:
## linear example
d2 <- d %>%
filter(x > 1)
mylinear <- lm(y ~ x, data = d2)
mypredictions <-
predict(mylinear, interval = "prediction", level = 0.95) %>%
as_tibble()
d3 <- bind_cols(d2, mypredictions)
ggplot(d3, aes(x, y)) +
geom_point() +
geom_line(aes(y = fit)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = .15) +
theme_minimal()
Based on the linked question, it looks like the investr::predFit function will do what you want.
investr::predFit(mymodel,interval="prediction")
?predFit doesn't explain how the intervals are computed, but ?plotFit says:
Confidence/prediction bands for nonlinear regression (i.e.,
objects of class ‘nls’) are based on a linear approximation as
described in Bates & Watts (2007). This fun[c]tion was in[s]pired by the
‘plotfit’ function from the ‘nlstools’ package.
also known as the Delta method (e.g. see emdbook::deltavar).

Adding orthogonal regression line in ggplot

I have plotted a scatter graph in R, comparing expected to observed values,using the following script:
library(ggplot2)
library(dplyr)
r<-read_csv("Uni/MSci/Project/DATA/new data sheets/comparisons/for comarison
graphs/R Regression/GAcAs.csv")
x<-r[1]
y<-r[2]
ggplot()+geom_point(aes(x=x,y=y))+
scale_size_area() +
xlab("Expected") +
ylab("Observed") +
ggtitle("G - As x Ac")+ xlim(0, 40)+ylim(0, 40)
My plot is as follows:
I then want to add an orthogonal regression line (as there could be errors in both the expected and observed values). I have calculated the beta value using the following:
v <- prcomp(cbind(x,y))$rotation
beta <- v[2,1]/v[1,1]
Is there a way to add an orthogonal regression line to my plot?
Borrowed from this blog post & this answer. Basically, you will need Deming function from MethComp or prcomp from stats packages together with a custom function perp.segment.coord. Below is an example taken from above mentioned blog post.
library(ggplot2)
library(MethComp)
data(airquality)
airquality <- na.exclude(airquality)
# Orthogonal, total least squares or Deming regression
deming <- Deming(y=airquality$Wind, x=airquality$Temp)[1:2]
deming
#> Intercept Slope
#> 24.8083259 -0.1906826
# Check with prcomp {stats}
r <- prcomp( ~ airquality$Temp + airquality$Wind )
slope <- r$rotation[2,1] / r$rotation[1,1]
slope
#> [1] -0.1906826
intercept <- r$center[2] - slope*r$center[1]
intercept
#> airquality$Wind
#> 24.80833
# https://stackoverflow.com/a/30399576/786542
perp.segment.coord <- function(x0, y0, ortho){
# finds endpoint for a perpendicular segment from the point (x0,y0) to the line
# defined by ortho as y = a + b*x
a <- ortho[1] # intercept
b <- ortho[2] # slope
x1 <- (x0 + b*y0 - a*b)/(1 + b^2)
y1 <- a + b*x1
list(x0=x0, y0=y0, x1=x1, y1=y1)
}
perp.segment <- perp.segment.coord(airquality$Temp, airquality$Wind, deming)
perp.segment <- as.data.frame(perp.segment)
# plot
plot.y <- ggplot(data = airquality, aes(x = Temp, y = Wind)) +
geom_point() +
geom_abline(intercept = deming[1],
slope = deming[2]) +
geom_segment(data = perp.segment,
aes(x = x0, y = y0, xend = x1, yend = y1),
colour = "blue") +
theme_bw()
Created on 2018-03-19 by the reprex package (v0.2.0).
The MethComp package seems to be no longer maintained (was removed from CRAN).
Russel88/COEF allows to use stat_/geom_summary with method="tls" to add an orthogonal regression line.
Based on this and wikipedia:Deming_regression I created the following functions, which allow to use noise ratios other than 1:
deming.fit <- function(x, y, noise_ratio = sd(y)/sd(x)) {
if(missing(noise_ratio) || is.null(noise_ratio)) noise_ratio <- eval(formals(sys.function(0))$noise_ratio) # this is just a complicated way to write `sd(y)/sd(x)`
delta <- noise_ratio^2
x_name <- deparse(substitute(x))
s_yy <- var(y)
s_xx <- var(x)
s_xy <- cov(x, y)
beta1 <- (s_yy - delta*s_xx + sqrt((s_yy - delta*s_xx)^2 + 4*delta*s_xy^2)) / (2*s_xy)
beta0 <- mean(y) - beta1 * mean(x)
res <- c(beta0 = beta0, beta1 = beta1)
names(res) <- c("(Intercept)", x_name)
class(res) <- "Deming"
res
}
deming <- function(formula, data, R = 100, noise_ratio = NULL, ...){
ret <- boot::boot(
data = model.frame(formula, data),
statistic = function(data, ind) {
data <- data[ind, ]
args <- rlang::parse_exprs(colnames(data))
names(args) <- c("y", "x")
rlang::eval_tidy(rlang::expr(deming.fit(!!!args, noise_ratio = noise_ratio)), data, env = rlang::current_env())
},
R=R
)
class(ret) <- c("Deming", class(ret))
ret
}
predictdf.Deming <- function(model, xseq, se, level) {
pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
if(se) {
preds <- tcrossprod(model$t, cbind(1, xseq))
data.frame(
x = xseq,
y = pred,
ymin = apply(preds, 2, function(x) quantile(x, probs = (1-level)/2)),
ymax = apply(preds, 2, function(x) quantile(x, probs = 1-((1-level)/2)))
)
} else {
return(data.frame(x = xseq, y = pred))
}
}
# unrelated hlper function to create a nicer plot:
fix_plot_limits <- function(p) p + coord_cartesian(xlim=ggplot_build(p)$layout$panel_params[[1]]$x.range, ylim=ggplot_build(p)$layout$panel_params[[1]]$y.range)
Demonstration:
library(ggplot2)
#devtools::install_github("Russel88/COEF")
library(COEF)
fix_plot_limits(
ggplot(data.frame(x = (1:5) + rnorm(100), y = (1:5) + rnorm(100)*2), mapping = aes(x=x, y=y)) +
geom_point()
) +
geom_smooth(method=deming, aes(color="deming"), method.args = list(noise_ratio=2)) +
geom_smooth(method=lm, aes(color="lm")) +
geom_smooth(method = COEF::tls, aes(color="tls"))
Created on 2019-12-04 by the reprex package (v0.3.0)
I'm not sure I completely understand the question, but if you want line segments to show errors along both x and y axis, you can do this using geom_segment.
Something like this:
library(ggplot2)
df <- data.frame(x = rnorm(10), y = rnorm(10), w = rnorm(10, sd=.1))
ggplot(df, aes(x = x, y = y, xend = x, yend = y)) +
geom_point() +
geom_segment(aes(x = x - w, xend = x + w)) +
geom_segment(aes(y = y - w, yend = y + w))

2 polynomial regressions in a ggplot() graph

This is my Dataset:
As you can see, there are two quantitative variables (X, Y) and 1 categorical variable (molar, with two factors: M1, M2).
I would like to represent in one single graph two polynomial regressions and their respective prediction intervals: one for the M1 factor and one for the M2 factor. Each polynomial regression has its own degree (M1 is a 4 degree polynomial regression, and M2 is a 6 degree).
I want to use ggplot() function (which is in package ggplot2 in R). I have actually performed this figure but with all data merged (I mean, with no distinction between factors). This is the code I used:
# Fit a linear model
m <- lm(Y ~ X+I(X^2)+I(X^3)+I(X^4), data = Dataset)
# cbind the predictions to Dataset
mpi <- cbind(Dataset, predict(m, interval = "prediction"))
ggplot(mpi, aes(x = X)) +
geom_ribbon(aes(ymin = lwr, ymax = upr),
fill = "blue", alpha = 0.2) +
geom_point(aes(y = Y)) +
geom_line(aes(y = fit), colour = "blue", size = 1)
With this result:
So, I would like to have two different-grade polynomial regressions (one for the M1 and one for the M2), taking into account their respective predictions intervals. Which would be the exact code?
UPDATE - New code! I run this code with no success:
M1=subset(Dataset,Dataset$molar=="M1",select=X:Y)
M2=subset(Dataset,Dataset$molar=="M2",select=X:Y)
M1.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M1",select=X:Y))
M2.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M2",select=X:Y))
newdf <- data.frame(x = seq(0, 1, c(408,663)))
M1.P <- cbind(data=subset(Dataset,Dataset$molar=="M1",select=X:Y), predict(M1.R, interval = "prediction"))
M2.P <- cbind(data=subset(Dataset,Dataset$molar=="M2",select=X:Y), predict(M2.R, interval = "prediction"))
p = cbind(as.data.frame(rbind(M1.P, M2.P)), f = factor(rep(1:2, c(408,663)), x = rep(newdf$x, 2))
mdf = with(Dataset, data.frame(x = rep(x, 2), y = c(subset(Dataset,Dataset$molar=="M1",select=Y), subset(Dataset,Dataset$molar=="M2",select=Y),
f = factor(rep(1:2, c(408,663))))
ggplot(mdf, aes(x = x, y = y, colour = f)) + geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
These are the messages I get now:
[98] WARNING: Warning in if (n < 0L) stop("wrong sign in 'by' argument") :
the condition has length > 1 and only the first element will be used
Warning in if (n > .Machine$integer.max) stop("'by' argument is much too small") :
the condition has length > 1 and only the first element will be used
Warning in 0L:n :
numerical expression has 2 elements: only the first used
Warning in if (by > 0) pmin(x, to) else pmax(x, to) :
the condition has length > 1 and only the first element will be used
[99] WARNING: Warning in predict.lm(M1.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[100] WARNING: Warning in predict.lm(M2.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[101] ERROR: <text>
I think I am closer but still can't see it. Help!
Here is one way. If you have more than two models/levels in the factor you should look into code that will work over the levels of the factor and fit the models that way.
Anyway, first some dummy data:
set.seed(100)
x <- runif(100)
y1 <- 2 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) + rnorm(100)
y2 <- -1 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) +
(-0.3 * x^5) + (2.4 * x^6) + rnorm(100)
df <- data.frame(x, y1, y2)
Fit our two models:
m1 <- lm(y1 ~ poly(x, 4), data = df)
m2 <- lm(y2 ~ poly(x, 6), data = df)
Now precict at some new locations x and stick it together with x and f, a factor indexing the model, into a tidy format:
newdf <- data.frame(x = seq(0, 1, length = 100))
p1 <- predict(m1, newdata = newdf, interval = "prediction")
p2 <- predict(m2, newdata = newdf, interval = "prediction")
p <- cbind(as.data.frame(rbind(p1, p2)), f = factor(rep(1:2, each = 100)),
x = rep(newdf$x, 2))
Melt the original data into tidy form
mdf <- with(df, data.frame(x = rep(x, 2), y = c(y1, y2),
f = factor(rep(1:2, each = 100))))
Draw the plot, using colour to distinguish the models/data
ggplot(mdf, aes(x = x, y = y, colour = f)) +
geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
This gets us

Resources