Create a matrix of residual plots using purrr and ggplot - r

Suppose I have the following dataframe:
library(tidyverse)
fit <- lm(speed ~ dist, data = cars)
select(broom::augment(fit), .fitted:.std.resid) -> dt
names(dt) <- substring(names(dt), 2)
I would like to create a grid of residuals plots using purrr. For example, I have the formulas for 2 diagnostic plots so far:
residual <- function(model) {ggplot(model, aes(fitted, resid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE)}
stdResidual <- function(model) {ggplot(model, aes(fitted, std.resid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE)}
And I am storing the formulas in a list that I plan to run against the fortified dataset dt.
formulas <- tibble(charts = list(residual, stdResidual))
# A tibble: 2 x 1
charts
<list>
1 <fun>
2 <fun>
Now I need to pass dt to each element of the column chart in formulas. I am actually also trying to combine both using gridExtra, but for now I would be satisfied if I could at least render both of them. I think I should run something like
pwalk(list(dt, formulas), ???)
But I have no idea what function I should use in ??? to render the plots.

Set up functions to plot each one, just like you did above:
diagplot_resid <- function(df) {
ggplot(df, aes(.fitted, .resid)) +
geom_hline(yintercept = 0) +
geom_point() +
geom_smooth(se = F) +
labs(x = "Fitted", y = "Residuals")
}
diagplot_stdres <- function(df) {
ggplot(df, aes(.fitted, sqrt(.std.resid))) +
geom_hline(yintercept = 0) +
geom_point() +
geom_smooth(se = F) +
labs(x = "Fitted", y = expression(sqrt("Standardized residuals")))
}
diagplot_qq <- function(df) {
ggplot(df, aes(sample = .std.resid)) +
geom_abline(slope = 1, intercept = 0, color = "black") +
stat_qq() +
labs(x = "Theoretical quantiles", y = "Standardized residuals")
}
Then call each in a list, with the dataframe as your second argument. Here you're invokeing a list of functions, and parallel-ly applying them to a list of function arguments. Since there's only one element to the second list, invoke_map loops over them.
fit <- lm(mpg~wt, mtcars)
df_aug <- augment(fit)
purrr::invoke_map(.f = list(diagplot_resid, diagplot_stdres, diagplot_qq),
.x = list(list(df_aug))) %>%
gridExtra::grid.arrange(grobs = ., ncol = 2,
top = paste("Diagnostic plots for",
as.expression(fit$call)))

Related

R: ggplot - Different Title for each object in a plot list

currently I am trying to making a plot list with ggplot from a list of data frames (94 time series). Then I want to export the plots to a PDF. This, so far, was successful using following code:
plot.list = lapply(HR_clean, function(x) {
y = length(x)
z = data.frame("HR" = x, "Time" = rep(1:y, 1))
ggplot(z, aes(x = Time, y = HR)) +
theme_bw() +
geom_line(linetype = "solid") +
ggtitle("Plot Title")
})
ggsave(
filename = "plots2.pdf",
plot = marrangeGrob(plot.list, nrow=1, ncol=1),
width = 15, height = 9
)
However, I also want that the main title of each plot is equal name of the corresponding list object. Perhaps anyone knows a smart solution for this problem.
Best,
Johnson
I couldn't test this (since you are not providing example data) but this should work using purrr's imap():
plot.list <- purrr::imap(HR_clean, function(x, name) {
y <- length(x)
z <- data.frame("HR" = x, "Time" = rep(1:y, 1))
ggplot(z, aes(x = Time, y = HR)) +
theme_bw() +
geom_line(linetype = "solid") +
ggtitle(name)
})

annotate r squared to ggplot by using facet_wrap

I just joined the community and looking forward to get some help for the data analysis for my master thesis.
At the moment I have the following problem:
I plotted 42 varieties with ggplot by using facet_wrap:
`ggplot(sumfvvar,aes(x=TemperaturCmean,y=Fv.Fm,col=treatment))+
geom_point(shape=1,size=1)+
geom_smooth(method=lm)+
scale_color_brewer(palette = "Set1")+
facet_wrap(.~Variety)`
That works very well, but I would like to annotate the r squared values for the regression lines. I have two treatments and 42 varieties, therefore 84 regression lines.
Are there any possibilties to calculate all r squared values and integrate them into the ggplot? I found allready the function
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
but that works just for one variety and one treatment. Could be a loop for the lm() function an option?
Here is an example with the ggpmisc package:
library(ggpmisc)
set.seed(4321)
x <- 1:100
y <- (x + x^2 + x^3) + rnorm(length(x), mean = 0, sd = mean(x^3) / 4)
my.data <- data.frame(x = x,
y = y,
group = c("A", "B"))
formula <- y ~ poly(x, 1, raw = TRUE)
ggplot(my.data, aes(x, y)) +
facet_wrap(~ group) +
geom_point() +
geom_smooth(method = "lm", formula = formula) +
stat_poly_eq(formula = formula, parse = TRUE,
mapping = aes(label = stat(rr.label)))
You can't apply different labels to different facet, unless you add another r^2 column to your data.. One way is to use geom_text, but you need to calculate the stats you need first. Below I show an example with iris, and for your case, just change Species for Variety, and so on
library(tidyverse)
# simulate data for 2 treatments
# d2 is just shifted up from d1
d1 <- data.frame(iris,Treatment="A")
d2 <- data.frame(iris,Treatment="B") %>%
mutate(Sepal.Length=Sepal.Length+rnorm(nrow(iris),1,0.5))
# combine datasets
DF <- rbind(d1,d2) %>% rename(Variety = Species)
# plot like you did
# note I use "free" scales, if scales very different between Species
# your facet plots will be squished
g <- ggplot(DF,aes(x=Sepal.Width,y=Sepal.Length,col=Treatment))+
geom_point(shape=1,size=1)+
geom_smooth(method=lm)+
scale_color_brewer(palette = "Set1")+
facet_wrap(.~Variety,scales="free")
# rsq function
RSQ = function(y,x){signif(summary(lm(y ~ x))$adj.r.squared, 3)}
#calculate rsq for variety + treatment
STATS <- DF %>%
group_by(Variety,Treatment) %>%
summarise(Rsq=RSQ(Sepal.Length,Sepal.Width)) %>%
# make a label
# one other option is to use stringr::str_wrap in geom_text
mutate(Label=paste("Treat",Treatment,", Rsq=",Rsq))
# set vertical position of rsq
VJUST = ifelse(STATS$Treatment=="A",1.5,3)
# finally the plot function
g + geom_text(data=STATS,aes(x=-Inf,y=+Inf,label=Label),
hjust = -0.1, vjust = VJUST,size=3)
For the last geom_text() call, I allowed the y coordinates of the text to be different by multiplying the Treatment.. You might need to adjust that depending on your plot..

R: ggplot2, how to annotate summary statistics on each panel of a panel plot

How would I add a text annotation (eg. sd = sd_value) of the standard deviation in each panel of the following plot using ggplot2 in R?
library(datasets)
data(mtcars)
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
theme_bw()
I'd post an image of the plot, but I don't have enough rep.
I think "geom_text" or "annotate" might be useful but I'm not sure quite sure how.
If you want to vary the text label in each facet, you will want to use geom_text. If you want the same text to appear in each facet, you can use annotate.
p <- ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl)
mylabels <- data.frame(cyl = c(4, 6, 8),
label = c("first label", "seond label different", "and another"))
p + geom_text(x = 200, y = 0.75, aes(label = label), data = my labels)
### compare that to this way with annotate
p + annotate("text", x = 200, y = 0.75, label = "same label everywhere")
Now, if you really want standard deviation by cyl in this example, I'd probably use dplyr to do the calculation first and then complete this with geom_text like so:
library(ggplot2)
library(dplyr)
df.sd.hp <- mtcars %>%
group_by(cyl) %>%
summarise(hp.sd = round(sd(hp), 2))
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
geom_text(x = 200, y = 0.75,
aes(label = paste0("SD: ", hp.sd)),
data = df.sd.hp)
I prefer the appearance of the graph when the statistic appears within the facet label itself. I made the following script, which allows the choice of displaying the standard deviation, mean or count. Essentially it calculates the summary statistic then merges this with the name so that you have the format CATEGORY (SUMMARY STAT = VALUE).
#' Function will update the name with the statistic of your choice
AddNameStat <- function(df, category, count_col, stat = c("sd","mean","count"), dp= 0){
# Create temporary data frame for analysis
temp <- data.frame(ref = df[[category]], comp = df[[count_col]])
# Aggregate the variables and calculate statistics
agg_stats <- plyr::ddply(temp, .(ref), summarize,
sd = sd(comp),
mean = mean(comp),
count = length(comp))
# Dictionary used to replace stat name with correct symbol for plot
labelName <- mapvalues(stat, from=c("sd","mean","count"), to=c("\u03C3", "x", "n"))
# Updates the name based on the selected variable
agg_stats$join <- paste0(agg_stats$ref, " \n (", labelName," = ",
round(agg_stats[[stat]], dp), ")")
# Map the names
name_map <- setNames(agg_stats$join, as.factor(agg_stats$ref))
return(name_map[as.character(df[[category]])])
}
Using this script with your original question:
library(datasets)
data(mtcars)
# Update the variable name
mtcars$cyl <- AddNameStat(mtcars, "cyl", "hp", stat = "sd")
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
theme_bw()
The script should be easy to alter to include other summary statistics. I am also sure it could be rewritten in parts to make it a bit cleaner!

Is it possible to plot the smooth components of a gam fit with ggplot2?

I am fitting a model using gam from the mgcv package and store the result in model and so far I have been looking at the smooth components using plot(model). I have recently started using ggplot2 and like its output. So I am wondering, is it possible to plot these graphs using ggplot2?
Here is an example:
x1 = rnorm(1000)
x2 = rnorm(1000)
n = rpois(1000, exp(x1) + x2^2)
model = gam(n ~ s(x1, k=10) + s(x2, k=20), family="poisson")
plot(model, rug=FALSE, select=1)
plot(model, rug=FALSE, select=2)
And I am interest in s(x1, k=10) and s(x2, k=20) not in the fit.
Partial answer:
I dug deeper into plot.gam and mgcv:::plot.mgcv.smooth and built my own function which extracts the predicted effects and standard errors from the smooth components. It doesn't handle all options and cases of plot.gam so I only consider it a partial solution, but it works well for me.
EvaluateSmooths = function(model, select=NULL, x=NULL, n=100) {
if (is.null(select)) {
select = 1:length(model$smooth)
}
do.call(rbind, lapply(select, function(i) {
smooth = model$smooth[[i]]
data = model$model
if (is.null(x)) {
min = min(data[smooth$term])
max = max(data[smooth$term])
x = seq(min, max, length=n)
}
if (smooth$by == "NA") {
by.level = "NA"
} else {
by.level = smooth$by.level
}
range = data.frame(x=x, by=by.level)
names(range) = c(smooth$term, smooth$by)
mat = PredictMat(smooth, range)
par = smooth$first.para:smooth$last.para
y = mat %*% model$coefficients[par]
se = sqrt(rowSums(
(mat %*% model$Vp[par, par, drop = FALSE]) * mat
))
return(data.frame(
label=smooth$label
, x.var=smooth$term
, x.val=x
, by.var=smooth$by
, by.val=by.level
, value = y
, se = se
))
}))
}
This returns a "molten" data frame with the smooth components, so it is now possible to use ggplot with the example above :
smooths = EvaluateSmooths(model)
ggplot(smooths, aes(x.val, value)) +
geom_line() +
geom_line(aes(y=value + 2*se), linetype="dashed") +
geom_line(aes(y=value - 2*se), linetype="dashed") +
facet_grid(. ~ x.var)
If anyone knows a package which allows this in the general case I would be very grateful.
You can use the visreg package combined with the plyr package. visreg basically plots any model that you can use predict() on.
library(mgcv)
library(visreg)
library(plyr)
library(ggplot2)
# Estimating gam model:
x1 = rnorm(1000)
x2 = rnorm(1000)
n = rpois(1000, exp(x1) + x2^2)
model = gam(n ~ s(x1, k=10) + s(x2, k=20), family="poisson")
# use plot = FALSE to get plot data from visreg without plotting
plotdata <- visreg(model, type = "contrast", plot = FALSE)
# The output from visreg is a list of the same length as the number of 'x' variables,
# so we use ldply to pick the objects we want from the each list part and make a dataframe:
smooths <- ldply(plotdata, function(part)
data.frame(Variable = part$meta$x,
x=part$fit[[part$meta$x]],
smooth=part$fit$visregFit,
lower=part$fit$visregLwr,
upper=part$fit$visregUpr))
# The ggplot:
ggplot(smooths, aes(x, smooth)) + geom_line() +
geom_line(aes(y=lower), linetype="dashed") +
geom_line(aes(y=upper), linetype="dashed") +
facet_grid(. ~ Variable, scales = "free_x")
We can put the whole thing into a function, and add an option to show the residuals from the model (res = TRUE):
ggplot.model <- function(model, type="conditional", res=FALSE,
col.line="#7fc97f", col.point="#beaed4", size.line=1, size.point=1) {
require(visreg)
require(plyr)
plotdata <- visreg(model, type = type, plot = FALSE)
smooths <- ldply(plotdata, function(part)
data.frame(Variable = part$meta$x,
x=part$fit[[part$meta$x]],
smooth=part$fit$visregFit,
lower=part$fit$visregLwr,
upper=part$fit$visregUpr))
residuals <- ldply(plotdata, function(part)
data.frame(Variable = part$meta$x,
x=part$res[[part$meta$x]],
y=part$res$visregRes))
if (res)
ggplot(smooths, aes(x, smooth)) + geom_line(col=col.line, size=size.line) +
geom_line(aes(y=lower), linetype="dashed", col=col.line, size=size.line) +
geom_line(aes(y=upper), linetype="dashed", col=col.line, size=size.line) +
geom_point(data = residuals, aes(x, y), col=col.point, size=size.point) +
facet_grid(. ~ Variable, scales = "free_x")
else
ggplot(smooths, aes(x, smooth)) + geom_line(col=col.line, size=size.line) +
geom_line(aes(y=lower), linetype="dashed", col=col.line, size=size.line) +
geom_line(aes(y=upper), linetype="dashed", col=col.line, size=size.line) +
facet_grid(. ~ Variable, scales = "free_x")
}
ggplot.model(model)
ggplot.model(model, res=TRUE)
Colors are picked from http://colorbrewer2.org/.
FYI, visreg can directly output a gg object:
visreg(model, "x1", gg=TRUE)
There is now also the gratia package by #GavinSimpson and available on CRAN: https://cran.r-project.org/web/packages/gratia/index.html
Information is also on Gavin's Github site and a getting started vignette can be found here.
Updated to allow user to choose which variables are plotted.
Changed 'residuals' term to 'res_data' to avoid conflict with residuals function.
ggplot.model <- function(model, type="conditional", res=FALSE,
col.line="#7fc97f", col.point="#beaed4", size.line=1, size.point=1, no_col = NULL,
what = "all") {
require(visreg)
require(plyr)
plotdata <- visreg(model, type = type, plot = FALSE)
smooths <- ldply(plotdata, function(part)
data.frame(Variable = part$meta$x,
x=part$fit[[part$meta$x]],
smooth=part$fit$visregFit,
lower=part$fit$visregLwr,
upper=part$fit$visregUpr))
res_data <- ldply(plotdata, function(part)
data.frame(Variable = part$meta$x,
x=part$res[[part$meta$x]],
y=part$res$visregRes))
if (what != "all") {
smooths <- smooths %>%
filter(lapply(Variable,as.character)%in% what)
res_data <- res_data%>%
filter(lapply(Variable,as.character)%in% what)
}
if (res)
ggplot(smooths, aes(x, smooth)) + geom_line(col=col.line, size=size.line) +
geom_line(aes(y=lower), linetype="dashed", col=col.line, size=size.line) +
geom_line(aes(y=upper), linetype="dashed", col=col.line, size=size.line) +
geom_point(data = res_data, aes(x, y), col=col.point, size=size.point) +
facet_wrap(. ~ Variable, scales = "free_x", ncol = no_col) + theme_bw()
else
ggplot(smooths, aes(x, smooth)) + geom_line(col=col.line, size=size.line) +
geom_line(aes(y=lower), linetype="dashed", col=col.line, size=size.line) +
geom_line(aes(y=upper), linetype="dashed", col=col.line, size=size.line) +
facet_wrap(. ~ Variable, scales = "free_x", ncol=no_col)
}

Most succinct way to label/annotate extreme values with ggplot?

I'd like to annotate all y-values greater than a y-threshold using ggplot2.
When you plot(lm(y~x)), using the base package, the second graph that pops up automatically is Residuals vs Fitted, the third is qqplot, and the fourth is Scale-location. Each of these automatically label your extreme Y values by listing their corresponding X value as an adjacent annotation. I'm looking for something like this.
What's the best way to achieve this base-default behavior using ggplot2?
Updated scale_size_area() in place of scale_area()
You might be able to take something from this to suit your needs.
library(ggplot2)
#Some data
df <- data.frame(x = round(runif(100), 2), y = round(runif(100), 2))
m1 <- lm(y ~ x, data = df)
df.fortified = fortify(m1)
names(df.fortified) # Names for the variables containing residuals and derived qquantities
# Select extreme values
df.fortified$extreme = ifelse(abs(df.fortified$`.stdresid`) > 1.5, 1, 0)
# Based on examples on page 173 in Wickham's ggplot2 book
plot = ggplot(data = df.fortified, aes(x = x, y = .stdresid)) +
geom_point() +
geom_text(data = df.fortified[df.fortified$extreme == 1, ],
aes(label = x, x = x, y = .stdresid), size = 3, hjust = -.3)
plot
plot1 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid)) +
geom_point() + geom_smooth(se = F)
plot2 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid, size = .cooksd)) +
geom_point() + scale_size_area("Cook's distance") + geom_smooth(se = FALSE, show_guide = FALSE)
library(gridExtra)
grid.arrange(plot1, plot2)

Resources