How can I show only parts of geom_smooth prediction? - r

I am trying to plot a graph which involved both a geom_point() function and a geom_smooth() function.
I would like to crop the geom_smooth() function at a certain x-value, BUT only after all its values have been used to calculate the smoothed curve (i.e. I do NOT want to use xlim(), which will remove the values from being used for plotting).
Reproducible example:
library(dplyr)
library(ggplot2)
set.seed(42)
test <- data.frame(replicate(2,sample(0:10,100,rep=TRUE)))
g <- ggplot() + geom_point(data = test, aes(x = X1, y = X2))
t_i <- test
t_i$group <- as.factor(as.numeric(cut(t_i$X1, 25)))
summar_t <- t_i %>%
group_by(group) %>%
summarise(y_mean=mean(X2),
y_sd=sd(X2),
c_mean =mean(X1,na.rm=T),
n =n()
)
summar_t$t_2sd <- summar_t$y_mean + summar_t$y_sd*2
g2 <- g + geom_smooth(data = summar_t, aes(x=c_mean, y = t_2sd), se=FALSE, method = lm, formula=y~poly(x,2), color = "black", linetype=3)

You can use the argument xseq – which is passed to StatSmooth$compute_group – as follows :
library(ggplot2)
ggplot(mtcars, aes(hp, mpg)) +
geom_point() +
geom_smooth(se = FALSE) +
geom_smooth(se = FALSE, xseq = 100:200, col = "red")
Result

Related

How to visualize GAM results with contour & tile plot (using ggplot2)

I would like to make a contour plot with ggplot2 by using gam results. Below is a detailed explanation of what I want:
#packages
library(mgcv)
library(ggplot2)
library(tidyr)
#prepare data
df <- data.frame(x = iris$Sepal.Width,
y = iris$Sepal.Length,
z = iris$Petal.Length)
#fit gam
gam_fit <- gam(z ~
s(x) +
s(y),
data=df,na.action = "na.fail")
To predict z values based on the gam_fit, I found a way from https://drmowinckels.io/blog/2019-11-16-plotting-gamm-interactions-with-ggplot2/
#predict z values
df_pred <- expand_grid(
x = seq(from=min(df$x),
to=max(df$x),
length.out = 100),
y = seq(from=min(df$y),
to=max(df$y),
length.out = 100)
)
df_pred <- predict(gam_fit, newdata = df_pred,
se.fit = TRUE) %>%
as_tibble() %>%
cbind(df_pred)
gg <- ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df,aes(x=x, y=y))+
scale_fill_distiller(palette = "YlGnBu")+
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white")
print(gg)
This give me a below plot
My goal is removing tile and contour at where there are no measured x-y points. For example, there is no measured points around the top-right & top-left corners of the plot.
I wonder if mgcViz can achieve this, but it requires including x & y as an interaction term as below (also I am not sure how to add measured points on the below figure):
library(mgcViz)
gamm_fit2 <- gam(z ~
s(x,y),
data=df,na.action = "na.fail") #,REML=TRUE
b <- getViz(gamm_fit2)
plot(sm(b, 1))
I think df_pred may not the best format to achieve my goal, but I am not sure how to do this. I would be grateful if you give me any solution with ggplot2.
There might be a package designed to handle this task, but if you can't find the right 'tool' for the job one option is to draw a polygon around the 'points' and colour everything outside the polygon grey, e.g.
library(tidyverse)
library(mgcv)
#prepare data
df <- data.frame(x = iris$Sepal.Width,
y = iris$Sepal.Length,
z = iris$Petal.Length)
#fit gam
gam_fit <- gam(z ~
s(x) +
s(y),
data=df,na.action = "na.fail")
df_pred <- expand_grid(
x = seq(from=min(df$x),
to=max(df$x),
length.out = 100),
y = seq(from=min(df$y),
to=max(df$y),
length.out = 100)
)
df_pred <- predict(gam_fit, newdata = df_pred,
se.fit = TRUE) %>%
as_tibble() %>%
cbind(df_pred)
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df,aes(x=x, y=y))+
scale_fill_distiller(palette = "YlGnBu")+
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
coord_cartesian(xlim = c(1.9, 4.5),
ylim = c(4, 8))
# Get the 'hull' around all of the dots
hulls <- df[chull(df$x, df$y), ]
# Get the 'edges' of the frame, starting at the first hull point
edges <- data.frame(x = c(4.1,4.5,4.5,1.9,1.9,4.5),
y = c(5.2,4,8,8,4,4),
z = NA)
# Combine
draw_poly <- rbind(hulls, edges)
# Draw the plot, and overlay the gray polygon
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_polygon(data=draw_poly, aes(x=x, y=y), fill = "grey")
# Without the points
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
# geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_polygon(data=draw_poly, aes(x=x, y=y), fill = "grey")
Created on 2022-09-16 by the reprex package (v2.0.1)
Here's another example using the concaveman package to calculate the concave hull:
library(ggforce)
#install.packages("concaveman")
library(concaveman)
border <- concaveman(as.matrix(df[,1:2]), concavity = 2)
edges <- data.frame(V1 = c(4.5,4.5,1.9,1.9,4.5),
V2 = c(4,8,8,4,4))
draw_poly <- rbind(border, edges)
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_shape(data=draw_poly, aes(x=V1, y=V2), fill = "grey",
expand = unit(-0.05, "cm"))
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
# geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_shape(data=draw_poly, aes(x=V1, y=V2), fill = "grey",
expand = unit(-0.05, "cm"))
Created on 2022-09-16 by the reprex package (v2.0.1)
To get something more akin to how mgcv::plot.gam() and mgcViz produce their plots for something like this, you need to identify pairs of covariates that lie too far from the support of your data. The reason we might prefer this over say clipping the predictions to the convex hull of the observations is that some mild extraxpolation beyond the data is probably not too much of a violation of the fact that splines have penalties that apply over the range of the data only. From a more pragmatic view, and this is something shown in the Anderson's Iris data used in the example, there are regions of the covariate space where we would have to interpolate that lie as far as, if not further from, the support of the data than point we might extrapolate to.
mgcv has a function for doing this called exclude.too.far(), so if you want total control you can do, reusing code from #jared_mamrot's excellent answer (modified a little)
library("dplyr")
library("tidyr")
library("ggplot2")
library("mgcv")
# prepare data
df <- with(iris, data.frame(x = Sepal.Width,
y = Sepal.Length,
z = Petal.Length))
#fit gam
gam_fit <- gam(z ~ s(x) + s(y), data = df, method = "REML")
df_new <- with(df, expand_grid(x = seq(from = min(x), to = max(x),
length.out = 100),
y = seq(from = min(y), to = max(y),
length.out = 100)))
df_pred <- predict(gam_fit, newdata = df_new)
df_pred <- tibble(fitted = df_pred) |>
bind_cols(df_new)
Now we can find out which of our rows in the grid we're predicting at represent covariate pairs that are too far from the support of the original data. What exclude.too.far() does is transform the pairs of covariates in the prediction grid to a unit square, with [0,0] representing the coordinate (min(x), min(y)), and [1,1] the coordinate (max(x), max(y)). It transforms than original covariate data onto this unit square also. It then computes the euclidean distance between each point in the grid (on the unit square) and each row in the observed data (projected on to the unit square).
Any observation that lies > dist from a node in the prediction grid is then identified to be excluded as lying too far from the support of the data. dist is the argument that controls what we mean by "too far". dist is specified in terms of the unit square, so the maximum any two points can be on the unit square is
r$> dist(data.frame(x = c(0,1), y = c(0,1)))
1
2 1.414214
The default in plot.gam and IIRC in mgcvViz is dist = 0.1. If we do this for our example
drop <- exclude.too.far(df_pred$x, df_pred$y, df$x, df$y, dist = 0.1)
drop is now a logical vector of length nrow(df_pred), with TRUE indicating we should exclude the observation pair.
Using drop we can set fitted to NA for the points we want to exclude:
df_pred <- df_pred |>
mutate(fitted = if_else(drop, NA_real_, fitted))
Now we can plot:
df_pred |>
ggplot(aes(x = x, y = y, fill = fitted)) +
geom_tile() +
geom_point(data = df, aes(x = x, y = y, fill = NULL)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(aes(z = fitted, fill = NULL), colour = "white")
producing
You can do this a bit more easily using my gratia package (IMHO), but the general idea is the same
# remotes::install_github("gavinsimpson/gratia") # need's dev version
library("gratia")
# prepare data
df <- with(iris, data.frame(x = Sepal.Width,
y = Sepal.Length,
z = Petal.Length))
# fit model
gam_fit <- gam(z ~ s(x) + s(y), data = df, method = "REML")
# prepare a data slice through the covariate space
ds <- data_slice(gam_fit, x = evenly(x, n = 100), y = evenly(y, n = 100))
# predict
fv <- fitted_values(gam_fit, data = ds)
# exclude points that are too far
drop <- too_far(ds$x, ds$y, df$x, df$y, dist = 0.1)
fv <- fv |>
mutate(fitted = if_else(drop, NA_real_, fitted))
# then plot
fv |>
ggplot(aes(x = x, y = y, fill = fitted)) +
geom_tile() +
geom_point(data = df, aes(x = x, y = y, fill = NULL)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(aes(z = fitted, fill = NULL), colour = "white")

Can I mimick facet_wrap() with 5 completely separate ggplots?

I like the neatness of using facet_wrap() or facet_grid() with ggplot since the plots are all made to be the same size and are fitted row and column wise automatically.
I have a data frame and I am experimenting with various transformations and their impact on fit as measured by R2
dm1 <- lm(price ~ x, data = diamonds)
dm1R2 <- summary(dm1)$r.squared #0.78
dm2 <- lm(log(price) ~ x, data = diamonds)
dm2R2 <- summary(dm2)$r.squared # 0.9177831
dm3 <- lm(log(price) ~ x^2, data = diamonds)
dm3R2 <- summary(dm3)$r.squared # also 0.9177831. Aside, why?
ggplot(diamonds, aes(x = x, y = price)) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3.5, y = 10000, label = paste0('R-Squared: ', round(dm1R2, 3)))
ggplot(diamonds, aes(x = x, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 9, label = paste0('R-Squared: ', round(dm2R2, 3)))
ggplot(diamonds, aes(x = x^2, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 20, label = paste0('R-Squared: ', round(dm3R2, 3)))
This produces 3 completely separate plots. Within Rmd file they will appear one after the other.
Is there a way to add them to a grid like when using facet_wrap?
You can use ggplot2's built-in faceting if you generate a "long" data frame from the regression model objects. The model object returned by lm includes the data used to fit the model, so we can extract the data and the r-squared for each model, stack them into a single data frame, and generate a faceted plot.
The disadvantage of this approach is that you lose the ability to easily set separate x-axis and y-axis titles for each panel, which is important, because the x and y values have different transformations in different panels. In an effort to mitigate that problem, I've used the model formulas as the facet labels.
Also, the reason you got the same r-squared for the models specified by log(price) ~ x and log(price) ~ x^2 is that R treats them as the same model. To tell R that you literally mean x^2 in a model formula, you need to wrap it in the I() function, making the formula log(price) ~ I(x^2). You could also do log(price) ~ poly(x, 2, raw=TRUE).
library(tidyverse)
theme_set(theme_bw(base_size=14))
# Generate a small subset of the diamonds data frame
set.seed(2)
dsub = diamonds[sample(1:nrow(diamonds), 2000), ]
dm1 <- lm(price ~ x, data = dsub)
dm2 <- lm(log(price) ~ x, data = dsub)
dm3 <- lm(log(price) ~ I(x^2), data = dsub)
# Create long data frame from the three model objects
dat = list(dm1, dm2, dm3) %>%
map_df(function(m) {
tibble(r2=summary(m)$r.squared,
form=as_label(formula(m))) %>%
cbind(m[["model"]] %>% set_names(c("price","x")))
}, .id="Model") %>%
mutate(form=factor(form, levels=unique(form)))
# Create data subset for geom_text
text.dat = dat %>% group_by(form) %>%
summarise(x = quantile(x, 1),
price = quantile(price, 0.05),
r2=r2[1])
dat %>%
ggplot(aes(x, price)) +
geom_point(alpha=0.3, colour="red") +
geom_smooth(method="lm") +
geom_text(data=text.dat, parse=TRUE,
aes(label=paste0("r^2 ==", round(r2, 2))),
hjust=1, size=3.5, colour="grey30") +
facet_wrap(~ form, scales="free")
ggarrange from the ggpubr package can do this:
p1 = ggplot(diamonds, aes(x = x, y = price)) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3.5, y = 10000, label = paste0('R-Squared: ', round(dm1R2, 3)))
p2 = ggplot(diamonds, aes(x = x, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 9, label = paste0('R-Squared: ', round(dm2R2, 3)))
p3 = ggplot(diamonds, aes(x = x^2, y = log(price))) +
geom_point() +
geom_smooth(method = "lm", se = F) +
geom_text(x = 3, y = 20, label = paste0('R-Squared: ', round(dm3R2, 3)))
ggpubr::ggarrange(p1, p2, p3, ncol = 2, nrow = 2, align = "hv")
Other packages that have been suggested in the comments like cowplot and patchwork also offer good options for this.

apply transparent background to divide plot area based on x values using ggplot

I would appreciate any help to apply the transparent background colours below to
divide into two parts the plot area based on x-values as illustrated in the plot below (vertical division).
Here are my sample data and code:
mtcars$cyl <- as.factor(mtcars$cyl)
ggplot(mtcars, aes(x=wt, y=mpg, color=cyl)) +
geom_point() +
theme(legend.position="none")+
geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
Here is the plot I would like to replicate, and the legend illustrates the change I want to implement:
Thank you in advance.
I think you want something like this. You'll have to designate groups and fill by that group in your geom_ribbon, and set your ymin and ymax as you like.
library(tidyverse)
mtcars$group <- ifelse(mtcars$wt <= 3.5, "<= 3.5", "> 3.5")
mtcars <- arrange(mtcars, wt)
mtcars$group2 <- rleid(mtcars$group)
mtcars_plot <- head(do.call(rbind, by(mtcars, mtcars$group2, rbind, NA)), -1)
mtcars_plot[,c("group2","group")] <- lapply(mtcars_plot[,c("group2","group")], na.locf)
mtcars_plot[] <- lapply(mtcars_plot, na.locf, fromLast = TRUE)
ggplot(mtcars_plot, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(aes(), method=lm, se=F, fullrange=TRUE) +
geom_ribbon(aes(ymin = mpg *.75, ymax = mpg * 1.25, fill = group), alpha = .25) +
labs(fill = "Weight Class")
Edit:
To map confidence intervals using geom_ribbon you'll have to calculate them beforehand using lm and predict.
mtmodel <- lm(mpg ~ wt, data = mtcars)
mtcars$Low <- predict(mtmodel, newdata = mtcars, interval = "confidence")[,2]
mtcars$High <- predict(mtmodel, newdata = mtcars, interval = "confidence")[,3]
Followed by the previous code to modify mtcars. Then plot with the calculated bounds.
ggplot(mtcars_plot, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(aes(), method=lm, se=F, fullrange=TRUE) +
geom_ribbon(aes(ymin = Low, ymax = High, fill = group), alpha = .25) +
labs(fill = "Weight Class") +
scale_fill_manual(values = c("red", "orange"), name = "fill")

Combined bar plot and points in ggplot2

I would like to plot a "combined" bar plot with points.
Consider to following dummy data:
library(ggplot2)
library(gridExtra)
library(dplyr)
se <- function(x){sd(x)/sqrt(length(x))}
p1 <- ggplot(mtcars, aes(y=disp, x=cyl, fill=cyl))
p1 <- p1 + geom_point() + theme_classic() + ylim(c(0,500))
my_dat <- summarise(group_by(mtcars, cyl), my_mean=mean(disp),my_se=se(disp))
p2 <- ggplot(my_dat, aes(y=my_mean,x=cyl,ymin=my_mean-my_se,ymax=my_mean+my_se))
p2 <- p2 + geom_bar(stat="identity",width=0.75) + geom_errorbar(stat="identity",width=0.75) + theme_classic() + ylim(c(0,500))
The final plot should look like that:
You can add layers together, but if they have different data and/or aesthetics you'll want to include the data and aes arguments in each graphical layer.
p3 <- ggplot() +
geom_bar(data=my_dat, aes(y=my_mean,x=cyl,ymin=my_mean-my_se,ymax=my_mean+my_se), stat="identity", width = 0.75) +
geom_errorbar(data=my_dat, aes(y=my_mean,x=cyl,ymin=my_mean-my_se,ymax=my_mean+my_se), width = 0.75) +
geom_point(data=mtcars, aes(y=disp, x=cyl, fill=cyl)) +
ylim(c(0,500)) +
theme_classic()
If you want to make it so that the the points are off to the side of the bars, you could subtract an offset from the cyl values to move over the points. Like #LukeA mentioned, by changing the geom_point to geom_point(data=mtcars, aes(y=disp, x=cyl-.5, fill=cyl)).
You can specify each layer individually to ggplot2. Often you are using the same data frame and options for each geom, so it makes sense to set defaults in ggplot(). In your case you should specify each geom separately:
library(ggplot2)
library(gridExtra)
library(dplyr)
se <- function(x){sd(x)/sqrt(length(x))}
my_dat <- summarise(group_by(mtcars, cyl),
my_mean = mean(disp),
my_se = se(disp))
p1 <- ggplot() +
geom_bar(data = my_dat,
aes(y = my_mean, x = cyl,
ymin = my_mean - my_se,
ymax = my_mean + my_se), stat="identity", width=0.75) +
geom_errorbar(data = my_dat,
aes(y = my_mean, x = cyl,
ymin = my_mean - my_se,
ymax = my_mean + my_se), stat="identity", width=0.75) +
geom_point(data = mtcars, aes(y = disp, x = cyl, fill = cyl)) +
theme_classic() + ylim(c(0,500))
p1

Plot mean in an R plot

I would like to add a mean of valuus to windows in a scatter plot I have. I created the scatter plot with ggplot2
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()
This will give the scatter plot but I woudl like to add add the mean of a window (say size equals 1) and plot this points of the mean as a line. Additionally I woudl like to have vertical bars at each point to indicate the variance.
Mtcars is the data set standard available in ggplot 2
This uses the new dplyr library.
library(dplyr)
forLines <- mtcars %.%
group_by(cut(wt, breaks = 6)) %.%
summarise(mean_mpg = mean(mpg), mean_wt = mean(wt))
p +
geom_point(size=5) +
geom_boxplot(aes(group = cut(wt, breaks = 6))) +
geom_line(data=forLines,aes(x=mean_wt,y=mean_mpg))
Maybe this is what you're looking for:
library(ggplot2)
s <- seq(0, ceiling(max(mtcars$wt)), 1)
ind <- as.integer(cut(mtcars$wt, s))
myfun <- function(i)
c(y = mean(i), ymin = mean(i) - var(i), ymax = mean(i) + var(i))
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
stat_summary(fun.data = myfun, aes(group = ind, x = ind - .5),
colour = "red") +
stat_summary(fun.y = mean, aes(x = ind - .5), geom = "line",
colour = "red")
Is this what you want?
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point() + geom_smooth(aes(wt, mpg, group=1), method = "lm")

Resources