Having several fits in one plot (in R) - r

I was wondering how I can modify the following code to have a plot something like
data(airquality)
library(quantreg)
library(ggplot2)
library(data.table)
library(devtools)
# source Quantile LOESS
source("https://www.r-statistics.com/wp-content/uploads/2010/04/Quantile.loess_.r.txt")
airquality2 <- na.omit(airquality[ , c(1, 4)])
#'' quantreg::rq
rq_fit <- rq(Ozone ~ Temp, 0.95, airquality2)
rq_fit_df <- data.table(t(coef(rq_fit)))
names(rq_fit_df) <- c("intercept", "slope")
#'' quantreg::lprq
lprq_fit <- lapply(1:3, function(bw){
fit <- lprq(airquality2$Temp, airquality2$Ozone, h = bw, tau = 0.95)
return(data.table(x = fit$xx, y = fit$fv, bw = paste0("bw=", bw), fit = "quantreg::lprq"))
})
#'' Quantile LOESS
ql_fit <- Quantile.loess(airquality2$Ozone, jitter(airquality2$Temp), window.size = 10,
the.quant = .95, window.alignment = c("center"))
ql_fit_df <- data.table(x = ql_fit$x, y = ql_fit$y.loess, bw = "bw=1", fit = "Quantile LOESS")
I want to have all these fits in a plot.

geom_quantile can calculate quantiles using the rq method internally, so we don't need to create the rq_fit_df separately. However, the lprq and Quantile LOESS methods aren't available within geom_quantile, so I've used the data frames you provided and plotted them using geom_line.
In addition, to include the rq line in the color and linetype mappings and in the legend we add aes(colour="rq", linetype="rq") as a sort of "artificial" mapping inside geom_quantile.
library(dplyr) # For bind_rows()
ggplot(airquality2, aes(Temp, Ozone)) +
geom_point() +
geom_quantile(quantiles=0.95, formula=y ~ x, aes(colour="rq", linetype="rq")) +
geom_line(data=bind_rows(lprq_fit, ql_fit_df),
aes(x, y, colour=paste0(gsub("q.*:","",fit),": ", bw),
linetype=paste0(gsub("q.*:","",fit),": ", bw))) +
theme_bw() +
scale_linetype_manual(values=c(2,4,5,1,1)) +
labs(colour="Method", linetype="Method",
title="Different methods of estimating the 95th percentile by quantile regression")

Related

Tweaking ggpairs() or a better solution to a correlation matrix

I am trying to create a correlation matrix between my X and Y variables and display this information in a nice figure. I am currently using ggpairs() from the GGally package, but if there's a better way to do this then I am happy to try something new. The figure should:
-Fit linear regression models (using lm) between X and Y variables
-Display scatterplots with a regression line
-Display the Coefficient of the determination (R2)
-Map the colour of points/lines/R2 values by group
I have been able to do most of this, but ggpairs only displays the correlation coefficient (r) and not the coefficient of determination (R2). I was able to use the suggestion from this post, but unfortunately the solution does not display R2 values by group.
So far:
library(GGally)
library(ggplot2)
cars <- mtcars
cars$group <- factor(c(rep("A", 16), rep("B", 16))) #adding grouping variable
#function to return R2 (coefficient of determination) and not just r (Coefficient of correlation) in the top portion of the figure
upper_fn <- function(data, mapping, ndp=2, ...){
# Extract the relevant columns as data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
# Calculate the r^2 & format output
m <- summary(lm(y ~ x))
lbl <- paste("r^2: ", formatC(m$r.squared, digits=ndp, format="f"))
# Write out label which is centered at x&y position
ggplot(data=data, mapping=mapping) +
annotate("text", x=mean(x, na.rm=TRUE), y=mean(y, na.rm=TRUE), label=lbl, parse=TRUE, ...)+
theme(panel.grid = element_blank())
}
#lower function basically fits a linear model and displays points
lower_fn <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.7) +
geom_smooth(method=lm, fill="blue", se = F, ...)
p
}
#The actual figure
ggpairs(cars,
columns = c(1:11),
mapping = ggplot2::aes(color = group),
upper = list(continuous = "cor", size = 15),
diag = list(continuous = "densityDiag", alpha=0.5),
lower = list(continuous = lower_fn))
Based on Is it possible to split correlation box to show correlation values for two different treatments in pairplot?, below is a little code to get you started.
The idea is that you need to 1. split the data over the aesthetic variable (which is assumed to be colour), 2. run a regression over each data subset and extract the r^2, 3. quick calculation of where to place the r^2 labels, 4. plot. Some features are left to do.
upper_fn <- function(data, mapping, ndp=2, ...){
# Extract the relevant columns as data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
col <- eval_data_col(data, mapping$colour)
# if no colour mapping run over full data
if(is.null(col)) {
## add something here
}
# if colour aesthetic, split data and run `lm` over each group
if(!is.null(col)) {
idx <- split(seq_len(nrow(data)), col)
r2 <- unlist(lapply(idx, function(i) summary(lm(y[i] ~ x[i]))$r.squared))
lvs <- if(is.character(col)) sort(unique(col)) else levels(col)
cuts <- seq(min(y, na.rm=TRUE), max(y, na.rm=TRUE), length=length(idx)+1L)
pos <- (head(cuts, -1) + tail(cuts, -1))/2
p <- ggplot(data=data, mapping=mapping, ...) +
geom_blank() +
theme_void() +
# you could map colours to each level
annotate("text", x=mean(x), y=pos, label=paste(lvs, ": ", formatC(r2, digits=ndp, format="f")))
}
return(p)
}

How can I add confidence intervals to a scatterplot for a regression on two variables?

I need to create an insightful graphic with a regression line, data points, and confidence intervals. I am not looking for smoothed lines. I have tried multiple codes, but I just can't get it right.
I am looking for something like this:
Some codes I have tried:
p <- scatterplot(df.regsoft$w ~ df.regsoft$b,
data = df.regsoft,
boxplots = FALSE,
regLine = list(method=lm, col="red"),
pch = 16,
cex = 0.7,
xlab = "Fitted Values",
ylab = "Residuals",
legend = TRUE,
smooth = FALSE)
abline(coef = confint.lm(result.rs))
But this doesn't create what I want to create, however it is closest to what I intended. Notice that I took out "smooth" since this is not really what I am looking for.
How can I make this plot interactive?
If you don't mind switch to ggplot and the tidyverse, then this is simply a geom_smooth(method = "lm"):
library(tidyverse)
d <- tibble( #random stuff
x = rnorm(100, 0, 1),
y = 0.25 * x + rnorm(100, 0, 0.25)
)
m <- lm(y ~ x, data = d) #linear model
d %>%
ggplot() +
aes(x, y) + #what to plot
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
without method = "lm" it draws a smoothed line.
As for the Conf. interval (Obs 95%) lines, it seems to me that's simply a quantile regression. In that case, you can use the quantreg package.
If you want to make it interactive, you can use the plotly package:
library(plotly)
p <- d %>%
ggplot() +
aes(x, y) +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
ggplotly(p)
================================================
P.S.
I am not completely sure this is what the figure you posted is showing (I guess so), but to add the quantile lines, I would just perform two quantile regressions (upper and lower) and then calculate the values of the quantile lines for your data:
library(tidyverse)
library(quantreg)
d <- tibble( #random stuff
x = rnorm(100, 0, 1),
y = 0.25 * x + rnorm(100, 0, 0.25)
)
m <- lm(y ~ x, data = d) #linear model
# 95% quantile, two tailed
rq_low <- rq(y ~ x, data = d, tau = 0.025) #lower quantile
rq_high <- rq(y ~ x, data = d, tau = 0.975) #upper quantile
d %>%
mutate(low = rq_low$coefficients[1] + x * rq_low$coefficients[2],
high = rq_high$coefficients[1] + x * rq_high$coefficients[2]) %>%
ggplot() +
geom_point(aes(x, y)) +
geom_smooth(aes(x, y), method = "lm") +
geom_line(aes(x, low), linetype = "dashed") +
geom_line(aes(x, high), linetype = "dashed") +
theme_bw()

Replicating lattice graph for a mixed model

I am trying to replicate a lattice graph using ggplot2 for a mixed model. My ggplot graph looks very similar but I am not sure about about loess line model fitted.
My goal is to add a loess line from the mixed model using ggplot2. Below is an example of my commands :
library(nlme)
library(ggplot2)
library(lattice)
library(lme4)
data(MathAchieve)
attach(MathAchieve)
mses <- tapply(SES, School, mean)
mses[as.character(MathAchSchool$School[1:10])]
Bryk <- as.data.frame(MathAchieve[, c("School", "SES", "MathAch")])
names(Bryk) <- c("school", "ses", "mathach")
sample20 <- sort(sample(7185, 20)) # 20 randomly sampled students
Bryk$meanses <- mses[as.character(Bryk$school)]
Bryk$cses <- Bryk$ses - Bryk$meanses
sector <- MathAchSchool$Sector
names(sector) <- row.names(MathAchSchool)
Bryk$sector <- sector[as.character(Bryk$school)]
attach(Bryk)
cat <- sample(unique(school[sector=="Catholic"]), 20)
Cat.20 <- groupedData(mathach ~ ses | school, data=Bryk[is.element(school, cat),])
Graph with Lattice:
trellis.device(color=T)
xyplot(mathach ~ ses | school, data=Cat.20, main="Catholic",
panel=function(x, y) {
panel.loess(x, y, span=1)
panel.xyplot(x, y)
panel.lmline(x, y, lty=2)
})
Graph with ggplot:
ggplot(Cat.20, aes(x = ses, y =mathach )) +
geom_point(size=1, shape=1) +
stat_smooth(method="lm",se=F)+
stat_smooth(, colour="Red",se=F)+
facet_wrap(school~., scale = "free_y")
Please any advice will be appreciated.
Preamble
Before going into the explanation, allow me to refer you to this question: Why is it not advisable to use attach() in R, and what should I use instead?
While it's recommendable that you made your question reproducible, the code you used can do with some clean-up. For example:
Don't include packages that aren't used in the code (I didn't see a need for the lme4 package);
There's no need to use data(...) to load MathAchieve. See the "Good Practices" section from ?data for more details.
As mentioned above, don't use attach().
For complete reproducibility, use set.seed() before any random sampling.
For a minimal example, don't plot 20 schools when a smaller number would do.
Since you are using one of the tidyverse packages for plotting, I recommend another from its collection for data manipulation:
library(nlme)
library(ggplot2)
library(lattice)
library(dplyr)
Bryk <- MathAchieve %>%
select(School, SES, MathAch) %>%
group_by(School) %>%
mutate(meanses = mean(SES),
cses = SES - meanses) %>%
ungroup() %>%
left_join(MathAchSchool %>% select(School, Sector),
by = "School")
colnames(Bryk) <- tolower(colnames(Bryk))
set.seed(123)
cat <- sample(unique(Bryk$school[Bryk$sector == "Catholic"]), 2)
Cat.2 <- groupedData(mathach ~ ses | school,
data = Bryk %>% filter(school %in% cat))
Explanation
Now that that's out of the way, let's look at the relevant functions for loess:
from ?panel.loess:
panel.loess(x, y, span = 2/3, degree = 1,
family = c("symmetric", "gaussian"),
... # omitted for space
)
from ?stat_smooth:
stat_smooth(mapping = NULL, data = NULL, geom = "smooth",
method = "auto", formula = y ~ x, span = 0.75, method.args = list(),
... # omitted for space
)
where method = "auto" defaults to loess from the stats package for <1000 observations.
from ?loess:
loess(formula, data, span = 0.75, degree = 2,
family = c("gaussian", "symmetric"),
... #omitted for space
)
In short, a loess plot's default parameters are span = 2/3, degree = 1, family = "symmetric" for the lattice package, and span = 0.75, degree = 2, family = "gaussian" for the ggplot2 package. You have to specify matching parameters if you want the resulting plots to match:
xyplot(mathach ~ ses | school, data = Cat.2, main = "Catholic",
panel=function(x, y) {
panel.loess(x, y, span=1, col = "red") # match ggplot's colours
panel.xyplot(x, y, col = "black") # to facilitate comparison
panel.lmline(x, y, lty=2, col = "blue")
})
ggplot(Cat.2, aes(x = ses, y = mathach)) +
geom_point(size = 2, shape = 1) +
stat_smooth(method = "lm", se = F)+
stat_smooth(span = 1,
method.args = list(degree = 1, family = "symmetric"),
colour = "red", se = F)+
facet_wrap(school ~ .) +
theme_classic() # less cluttered background to facilitate comparison

geom_abline for logistic regression (ggplot2)

I am sorry if this question is very simple, however, I could not find any solution to my problem. I want to plot logistic regressions lines with ggplot2. The problem is that I cannot use geom_abline because I dont have the original model, just the slope and intercept for each regression line. I have use this approach for linear regressions, and this works fine with geom_abline, because you can just give multiple slopes and intercepts to the function.
geom_abline(data = estimates, aes(intercept = inter, slope = slo)
where inter and slo are vectors with more then one value.
If I try the same approach with coefficients from a logistic regression, I will get the wrong regression lines (linear). I am trying to use geom_line, however, I cannot use the function predict to generate the predicted values because I dont have the a original model objetc.
Any suggestion?
Thanks in advance,
Gustavo
If the model had a logit link then you could plot the prediction using only the intercept (coefs[1]) and slope (coefs[2]) as:
library(ggplot2)
n <- 100L
x <- rnorm(n, 2.0, 0.5)
y <- factor(rbinom(n, 1L, plogis(-0.6 + 1.0 * x)))
mod <- glm(y ~ x, binomial("logit"))
coefs <- coef(mod)
x_plot <- seq(-5.0, 5.0, by = 0.1)
y_plot <- plogis(coefs[1] + coefs[2] * x_plot)
plot_data <- data.frame(x_plot, y_plot)
ggplot(plot_data) + geom_line(aes(x_plot, y_plot), col = "red") +
xlab("x") + ylab("p(y | x)") +
scale_y_continuous(limits = c(0, 1)) + theme_bw()
Edit
Here one way of plotting k predicted probability lines on the same graph following from the previous code:
library(reshape2)
k <- 5L
intercepts <- rnorm(k, coefs[1], 0.5)
slopes <- rnorm(k, coefs[2], 0.5)
x_plot <- seq(-5.0, 5.0, by = 0.1)
model_predictions <- sapply(1:k, function(idx) {
plogis(intercepts[idx] + slopes[idx] * x_plot)
})
colnames(model_predictions) <- 1:k
plot_data <- as.data.frame(cbind(x_plot, model_predictions))
plot_data_melted <- melt(plot_data, id.vars = "x_plot", variable.name = "model",
value.name = "y_plot")
ggplot(plot_data_melted) + geom_line(aes(x_plot, y_plot, col = model)) +
xlab("x") + ylab("p(y | x)") +
scale_y_continuous(limits = c(0, 1)) + theme_bw()

R Language - Sorting data into ranges; averaging; ignore outliers

I am analyzing data from a wind turbine, normally this is the sort of thing I would do in excel but the quantity of data requires something heavy-duty. I have never used R before and so I am just looking for some pointers.
The data consists of 2 columns WindSpeed and Power, so far I have arrived at importing the data from a CSV file and scatter-plotted the two against each other.
What I would like to do next is to sort the data into ranges; for example all data where WindSpeed is between x and y and then find the average of power generated for each range and graph the curve formed.
From this average I want recalculate the average based on data which falls within one of two standard deviations of the average (basically ignoring outliers).
Any pointers are appreciated.
For those who are interested I am trying to create a graph similar to this. Its a pretty standard type of graph but like I said the shear quantity of data requires something heavier than excel.
Since you're no longer in Excel, why not use a modern statistical methodology that doesn't require crude binning of the data and ad hoc methods to remove outliers: locally smooth regression, as implemented by loess.
Using a slight modification of csgillespie's sample data:
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
plot(w_sp, power)
x_grid <- seq(0, 100, length = 100)
lines(x_grid, predict(loess(power ~ w_sp), x_grid), col = "red", lwd = 3)
Throw this version, similar in motivation as #hadley's, into the mix using an additive model with an adaptive smoother using package mgcv:
Dummy data first, as used by #hadley
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
df <- data.frame(power = power, w_sp = w_sp)
Fit the additive model using gam(), using an adaptive smoother and smoothness selection via REML
require(mgcv)
mod <- gam(power ~ s(w_sp, bs = "ad", k = 20), data = df, method = "REML")
summary(mod)
Predict from our model and get standard errors of fit, use latter to generate an approximate 95% confidence interval
x_grid <- with(df, data.frame(w_sp = seq(min(w_sp), max(w_sp), length = 100)))
pred <- predict(mod, x_grid, se.fit = TRUE)
x_grid <- within(x_grid, fit <- pred$fit)
x_grid <- within(x_grid, upr <- fit + 2 * pred$se.fit)
x_grid <- within(x_grid, lwr <- fit - 2 * pred$se.fit)
Plot everything and the Loess fit for comparison
plot(power ~ w_sp, data = df, col = "grey")
lines(fit ~ w_sp, data = x_grid, col = "red", lwd = 3)
## upper and lower confidence intervals ~95%
lines(upr ~ w_sp, data = x_grid, col = "red", lwd = 2, lty = "dashed")
lines(lwr ~ w_sp, data = x_grid, col = "red", lwd = 2, lty = "dashed")
## add loess fit from #hadley's answer
lines(x_grid$w_sp, predict(loess(power ~ w_sp, data = df), x_grid), col = "blue",
lwd = 3)
First we will create some example data to make the problem concrete:
w_sp = sample(seq(0, 100, 0.01), 1000)
power = 1/(1+exp(-(rnorm(1000, mean=w_sp, sd=5) -40)/5))
Suppose we want to bin the power values between [0,5), [5,10), etc. Then
bin_incr = 5
bins = seq(0, 95, bin_incr)
y_mean = sapply(bins, function(x) mean(power[w_sp >= x & w_sp < (x+bin_incr)]))
We have now created the mean values between the ranges of interest. Note, if you wanted the median values, just change mean to median. All that's left to do, is to plot them:
plot(w_sp, power)
points(seq(2.5, 97.5, 5), y_mean, col=3, pch=16)
To get the average based on data that falls within two standard deviations of the average, we need to create a slightly more complicated function:
noOutliers = function(x, power, w_sp, bin_incr) {
d = power[w_sp >= x & w_sp < (x + bin_incr)]
m_d = mean(d)
d_trim = mean(d[d > (m_d - 2*sd(d)) & (d < m_d + 2*sd(d))])
return(mean(d_trim))
}
y_no_outliers = sapply(bins, noOutliers, power, w_sp, bin_incr)
Here are some examples of fitted curves (weibull analysis) for commercial turbines:
http://www.inl.gov/wind/software/
http://www.irec.cmerp.net/papers/WOE/Paper%20ID%20161.pdf
http://www.icaen.uiowa.edu/~ie_155/Lecture/Power_Curve.pdf
I'd recommend also playing around with Hadley's own ggplot2. His website is a great resource: http://had.co.nz/ggplot2/ .
# If you haven't already installed ggplot2:
install.pacakges("ggplot2", dependencies = T)
# Load the ggplot2 package
require(ggplot2)
# csgillespie's example data
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
# Bind the two variables into a data frame, which ggplot prefers
wind <- data.frame(w_sp = w_sp, power = power)
# Take a look at how the first few rows look, just for fun
head(wind)
# Create a simple plot
ggplot(data = wind, aes(x = w_sp, y = power)) + geom_point() + geom_smooth()
# Create a slightly more complicated plot as an example of how to fine tune
# plots in ggplot
p1 <- ggplot(data = wind, aes(x = w_sp, y = power))
p2 <- p1 + geom_point(colour = "darkblue", size = 1, shape = "dot")
p3 <- p2 + geom_smooth(method = "loess", se = TRUE, colour = "purple")
p3 + scale_x_continuous(name = "mph") +
scale_y_continuous(name = "power") +
opts(title = "Wind speed and power")

Resources