ggplot2 geom_ribbon from mgcv::gamm - r

I'm trying to add a ribbon based on predictions from a gamm model. This seems a little harder than intended, as gamm is somewhat different from gam.
I first tried directly with stat_smooth, but that will not work (and it would not use my entire model anyway, which also includes several other covariates).
library(tidyverse); library(mgcv)
dt = cbind(V1=scale(sample(1000)),
           Age=rnorm(n = 1000, mean = 40, sd = 10),
           ID=rep(seq(1:500), each=2)) %>% as.data.frame()
# Works fine ----
dt %>% ggplot(aes(x=Age, y=V1)) +
stat_smooth(method="gam", formula= y~s(x,bs="cr"))
# Fails horribly :P
dt %>% ggplot(aes(x=Age, y=V1)) +
stat_smooth(method="gamm", formula= y~s(x,bs="cr"))
Maximum number of PQL iterations: 20
iteration 1
Warning message:
Computation failed in `stat_smooth()`:
no applicable method for 'predict' applied to an object of class "c('gamm', 'list')"
I've tried using the predict function on model$gam, but I'm not sure how to use the output, or how to make the CI ribbon from it.
# note: discrete=TRUE is a bam() argument, not a gamm() one, so it is dropped here
dt.model = gamm(V1 ~ s(Age, bs="cr") + s(ID, bs = 're'), data=dt, family="gaussian")
dt$pred = predict(dt.model$gam)
dt %>% ggplot(aes(x = Age, y = V1)) +
geom_line(aes(group=ID), alpha=.3) +
geom_point(alpha=.2) +
geom_smooth(aes(y=pred))
I recognise this is poor example data with a silly shape.
But I'd like to be able to add a ribbon with the CI along the line as predicted by the model fit. And I'd prefer to do this in ggplot, particularly as I want a spaghetti plot in the background.

Use se.fit=TRUE inside predict:
library(tidyverse)
library(mgcv)
dt <- cbind(V1=scale(sample(1000)),
            Age=rnorm(n = 1000, mean = 40, sd = 10),
            ID=rep(seq(1:500), each=2)) %>% as.data.frame()
dt.model <- gamm(V1 ~ s(Age, bs="cr") + s(ID, bs = "re"),
                 data=dt, family="gaussian")
pred <- predict(dt.model$gam, se.fit=T)
dt %>% ggplot(aes(x = Age, y = V1)) +
  geom_line(aes(group=ID), alpha=.3) +
  geom_point(alpha=.2) +
  geom_ribbon(aes(ymin=pred$fit-1.96*pred$se.fit,
                  ymax=pred$fit+1.96*pred$se.fit), alpha=0.2, fill="red") +
  geom_line(aes(y=pred$fit), col="blue", lwd=1)
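If you prefer to keep everything inside the data frame, here is a minimal sketch of the same idea (fit and se.fit are simply what predict() returns when se.fit=TRUE; the 1.96 multiplier gives an approximate 95% interval):
dt2 <- dt %>%
  mutate(fit = as.numeric(pred$fit),
         se  = as.numeric(pred$se.fit),
         lwr = fit - 1.96*se,
         upr = fit + 1.96*se)

dt2 %>% ggplot(aes(x = Age, y = V1)) +
  geom_line(aes(group = ID), alpha = .3) +  # spaghetti background
  geom_point(alpha = .2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, fill = "red") +
  geom_line(aes(y = fit), col = "blue", lwd = 1)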

Related

Error in FUN(X[[i]], ...) : object 'predicted' not found

I tried to search for this problem but couldn't find anything. I'm trying to use ggplot to show the relationship between two variables (the predictor and the predicted value, x and y) from a linear regression model (lm), but I get this error:
Error in FUN(X[[i]], ...) : object 'predicted' not found
and this is my code:
# Install tidymodels if you haven't done so
install.packages("rlang")
install.packages("tidymodels")
install.packages("dplyr")
# Library for modeling
library(tidymodels)
# Load tidyverse
library(tidyverse)
library(dplyr)
URL <- 'https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz'
download.file (URL, destfile='noaa-weather-sample-data.tar.gz')
untar('noaa-weather-sample-data.tar.gz',tar = 'internal')
dataset<- read.csv ('noaa-weather-sample-data/jfk_weather_sample.csv')
head(dataset)
glimpse(dataset)
subset_data <- data.frame(dataset$HOURLYRelativeHumidity,dataset$HOURLYDRYBULBTEMPF,dataset$HOURLYStationPressure,dataset$HOURLYWindSpeed,dataset$HOURLYPrecip)
subset_data<-setNames(subset_data,c('HOURLYRelativeHumidity','HOURLYDRYBULBTEMPF','HOURLYStationPressure','HOURLYWindSpeed', 'HOURLYPrecip'))
head(subset_data,10)
unique(subset_data$HOURLYPrecip)
subset_data <- subset_data %>%mutate(HOURLYPrecip = replace(HOURLYPrecip, HOURLYPrecip == "T", "0.0"))
View(subset_data)
install.packages('stringr')
library(stringr)
subset_data$HOURLYPrecip <- str_remove(subset_data$HOURLYPrecip, pattern = 's')
unique(subset_data$HOURLYPrecip)
subset_data$HOURLYPrecip <- as.numeric(subset_data$HOURLYPrecip)
subset_data$HOURLYRelativeHumidity <- as.numeric(subset_data$HOURLYRelativeHumidity)
subset_data$HOURLYDRYBULBTEMPF <- as.numeric(subset_data$HOURLYDRYBULBTEMPF)
subset_data$HOURLYWindSpeed <- as.numeric(subset_data$HOURLYWindSpeed)
subset_data1 <- setNames(subset_data,c('relative_humidity','dry_bulb_temp_f','station_pressure','wind_speed', 'precip'))
str(subset_data1)
#library(rsample)
set.seed(1234)
#split_data <- initial_split(subset_data1)
dt<- sort(sample(nrow(subset_data1), nrow(subset_data1)*.8))
train_data<- subset_data1 [dt,]
test_data <- subset_data1 [-dt,]
install.packages('ggplot2')
library(ggplot2)
hist(train_data$relative_humidity)
hist(train_data$dry_bulb_temp_f)
hist(train_data$station_pressure)
hist(train_data$wind_speed)
hist(train_data$precip)
library(stats)  # stats is a base package that ships with R; no install.packages() needed
linear_reg1 <- lm (precip ~ relative_humidity, train_data)
linear_reg2 <- lm (precip ~ dry_bulb_temp_f, train_data)
linear_reg3 <- lm (precip ~ wind_speed, train_data)
linear_reg4 <- lm (precip ~ station_pressure, train_data)
#ggplot(train_data, aes(x = train_data$relative_humidity ,y= train_data$precip)) + geom_point()+ stat_smooth(method = 'lm', col ='red')
ggplot(train_data, aes(x= relative_humidity, y= precip))+
geom_smooth(method='lm',se=FALSE ,color='red')+
geom_segment(aes(xend= relative_humidity ,yend = predicted),alpha=.2)+
geom_point()+
geom_point(aes(y = predicted),shape=1)+
theme_bw()
You simply don't have a column called predicted in your data set. Presumably you want this to be the predicted value of precipitation based on humidity. In that case, you can easily create the column in your data frame using your first linear model:
train_data$predicted <- predict(linear_reg1, train_data)
ggplot(train_data, aes(x = relative_humidity, y = precip)) +
  geom_segment(aes(xend = relative_humidity, yend = predicted), alpha = 0.2) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = 'lm', se = FALSE, color = 'red') +
  geom_point(aes(y = predicted), shape = 1, alpha = 0.1) +
  theme_bw()
As you can see, a linear model is no good here. It predicts negative precipitation at low humidity and underestimates the precipitation at high humidity.
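A quick numeric check of that claim, using the predicted column created above:
range(train_data$predicted)     # the minimum is below zero
mean(train_data$predicted < 0)  # proportion of negative precipitation predictions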

Binomial logit model with glmer doesn't yield a good fit to sigmoidal data

I am trying to fit a model to my data, which has a dependent variable that can be 0 or 1.
I tried to fit a binomial glmer to the data, but the fit is pretty bad as you can see below. This puzzles me because this is quite a sigmoid so I thought I would get a great fit with that kind of model? Am I using the wrong model?
(color is my data, black is the fit)
Here is the code I used in R:
library(lme4)
library(ggplot2)
exdata <- read.csv("https://raw.githubusercontent.com/FlorianLeprevost/dummydata/main/exdata.csv")
model=glmer(VD~ as.factor(VI2)*VI1 + (1|ID),exdata,
family=binomial(link = "logit"),
control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun=2e5)))
summary(model)
exdata$fit=predict(model, type = "response")
ggplot(exdata,aes(VI1, VD, color=as.factor(VI2),
group=as.factor(VI2))) +
stat_summary(geom="line", size=0.8) +
stat_summary(aes(y=(fit)),geom="line", size=0.8, color="black") +
theme_bw()
And I tried without the random effect to see if it would change anything, but no...
ggplot(exdata, aes(x=VI1, y=VD, color=as.factor(VI2),
group=as.factor(VI2))) +
stat_summary(fun.data=mean_se, geom="line", size=1)+
stat_smooth(method="glm", se=FALSE, method.args = list(family=binomial), color='black')
Here is the data: https://github.com/FlorianLeprevost/dummydata/blob/main/exdata.csv
tl;dr I don't think these data are as sigmoidal as you think. In particular, a logistic regression estimates a sigmoid curve that ranges from 0 to 1, whereas yours levels out (sort of) at 0.9. In much more detail:
slightly streamlined data import/model fitting
library(lme4)
library(ggplot2)
library(dplyr)
exdata <- (read.csv("https://raw.githubusercontent.com/FlorianLeprevost/dummydata/main/exdata.csv")
|> mutate(across(VI2, factor))
)
model <- glmer(VD~ VI2*VI1 + (1|ID),
exdata,
family=binomial(link = "logit"))
compute data summary and predictions
This can also be done with stat_summary(), but I like the finer control of doing it myself. In particular, I like to get Clopper-Pearson CIs on the proportions (could also do this with prop.test() to get score-test CIs). I'm also computing predictions across a wider VI1-range than the data (see why below).
ddsum <- (exdata
  |> group_by(VI1, VI2)
  |> summarise(
       num = n(),
       pos = sum(VD),
       VD = mean(VD),
       lwr = binom.test(pos, num)$conf.int[1],
       upr = binom.test(pos, num)$conf.int[2],
       .groups = "drop")
)
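For the score-test CIs mentioned above, a sketch of the same summary with prop.test() swapped in for binom.test() (otherwise identical):
ddsum_score <- (exdata
  |> group_by(VI1, VI2)
  |> summarise(
       num = n(),
       pos = sum(VD),
       VD = mean(VD),
       lwr = prop.test(pos, num)$conf.int[1],
       upr = prop.test(pos, num)$conf.int[2],
       .groups = "drop")
)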
pframe <- expand.grid(
  VI1 = seq(-50, 50, length = 101),
  VI2 = unique(exdata$VI2))
pframe$VD <- predict(model, newdata = pframe, re.form = ~0, type = "response")
plot
gg0 <- ggplot(ddsum, aes(x = VI1, y = VD, color = VI2)) +
  geom_pointrange(position = position_dodge(width = 0.3),
                  aes(ymin = lwr, ymax = upr, size = num), alpha = 0.5) +
  scale_size_area(max_size = 4) +
  theme_bw()
gg1 <- gg0 + geom_line(data = pframe)
ggsave(gg1, file = "gglogist1.png")
Conclusion: the sharp increase from x=0 to x=15 combined with the saturation below 1.0 makes it hard to fit with a logistic curve.
We could try a quadratic-logistic fit:
model2 <- update(model, . ~ poly(VI1,2)*VI2 + (1|ID))
pframe$VD2 <- predict(model2, newdata = pframe, re.form = ~0, type = "response")
gg2 <- gg1 + geom_line(data=pframe, aes(y=VD2), linetype = 2)
ggsave(gg2, file = "gglogist2.png")
This fits better (it couldn't fit worse), but might not make sense for your application.
In principle we could fit a logistic that saturated at a value <1, but it's a bit tricky ...
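For completeness, a minimal sketch of that idea with nls() (not from the original answer: it pools across VI2, ignores the random effect, uses least squares rather than a binomial likelihood, and the parameter names A, b0, b1 and the starting values are guesses that may need tuning):
# Logistic saturating at an estimated asymptote A, i.e. A/(1 + exp(-(b0 + b1*VI1)))
m_sat <- nls(VD ~ A * plogis(b0 + b1 * VI1),
             data = exdata,
             start = list(A = 0.9, b0 = -1, b1 = 0.2))
pframe$VD3 <- predict(m_sat, newdata = pframe)
gg3 <- gg1 + geom_line(data = pframe, aes(y = VD3), linetype = 3)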

How to fit non-linear function to data in ggplot2 using maximum likelihood model in R?

The data set (x.test, y.test) is an exponential fit. I'm trying to fit a custom non-linear function and attached is the code. The regular points plot just fine but I'm unable to get the fit line to work. Any suggestions?
x.test <- runif(50,2,8)
y.test <- 0.5^(x.test)
df <- data.frame(x.test, y.test)
library(ggpmisc)
my.formula <- y ~ lambda/ (1 + aii*x)
ggplot(data = df, aes(x=x.test,y=y.test)) +
geom_point(shape=21, fill="white", color="red", size=3) +
stat_smooth(method="nls",formula = y.test ~ lambda/ (1 + aii*x.test), method.args=list(start=c(lambda=1000,aii=-816.39)),se=F,color="red") +
geom_smooth(method="lm", formula = my.formula , col = "red") + stat_poly_eq(formula = my.formula, aes(label = stringr::str_wrap(paste(..eq.label.., ..rr.label.., sep = "~~~"))), parse = TRUE, size = 2.5, col = "red") + stat_function(fun=function (x.test){
y.test ~ lambda/ (1 + aii*x.test)}, color = "blue")
A few things:
you need to use y and x as the variable names in the formula argument to geom_smooth, regardless of what the names are in your data set
you need better starting values (see below)
there's a GLM trick you can use to fit this model; doesn't always work (can be numerically unstable), but it doesn't need starting values and will work more often than nls()
I don't think lm() and stat_poly_eq() are going to work as expected (or maybe at all) with a nonlinear formula ...
simulate data
(same as your code but using set.seed() - probably not important here but good practice)
set.seed(101)
x.test <- runif(50,2,8)
y.test <- 0.5^(x.test)
df <- data.frame(x.test, y.test)
attempt nls fit with your starting values
It's usually a good idea to troubleshoot by fitting any smoothing terms outside of ggplot2, so you have fewer layers to dig through to find the problems:
nls(y.test ~ lambda/(1+ aii*x.test),
start = list(lambda=1000,aii=-816.39),
data = df)
Error in nls(y.test ~ lambda/(1 + aii * x.test), start = list(lambda = 1000, :
singular gradient
OK, still doesn't work. Let's use glm() to get better starting values: we use an inverse-link GLM:
1/y = b0 + b1*x
y = 1/(b0 + b1*x)
= (1/b0)/(1 + (b1/b0)*x)
So:
g1 <- glm(y.test ~ x.test, family = gaussian(link = "inverse"))
s0 <- with(as.list(coef(g1)), list(lambda = 1/`(Intercept)`, aii = x.test/`(Intercept)`))
This gives lambda = -0.09, aii = -0.638 (with a little bit more work we could probably also figure out how to eyeball these by looking at the starting point and scale of the curve).
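As a quick sanity check outside of ggplot (following the troubleshooting advice above), re-running the earlier nls() call with these starting values should now converge:
nls(y.test ~ lambda/(1 + aii*x.test),
    start = s0,
    data = df)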
ggplot(data = df, aes(x = x.test, y = y.test)) +
  geom_point(shape = 21, fill = "white", color = "red", size = 3) +
  stat_smooth(method = "nls",
              formula = y ~ lambda/(1 + aii*x),
              method.args = list(start = s0),
              se = FALSE, color = "red") +
  stat_smooth(method = "glm",
              formula = y ~ x,
              method.args = list(family = gaussian(link = "inverse")),
              color = "blue", linetype = 2)

How do I plot a single numerical covariate using emmeans (or other package) from a model?

After variable selection I usually end up with a model containing a numerical covariate (a 2nd- or 3rd-degree polynomial). What I want to do is plot it, preferably using the emmeans package. Is there a way of doing this?
I can do it using predict:
m1 <- lm(mpg ~ poly(disp,2), data = mtcars)
df <- cbind(disp = mtcars$disp, predict.lm(m1, interval = "confidence"))
df <- as.data.frame(df)
ggplot(data = df, aes(x = disp, y = fit)) +
geom_line() +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = disp, y = fit),alpha = 0.2)
I haven't figured out a way of doing it with either emmip or emtrends.
For illustration purposes, how could I do it using mixed models via lme?
library(nlme)
m1 <- lme(mpg ~ poly(disp,2), random = ~1|factor(am), data = mtcars)
I suspect that your issue is due to the fact that, by default, covariates are reduced to their means in emmeans. You can use the at or cov.reduce arguments to specify a larger number of values. See the documentation for ref_grid and vignette("basics", "emmeans"), or the index of vignette topics.
Using sjPlot:
plot_model(m1, terms = "disp [all]", type = "pred")
gives the same graphic.
Using emmeans:
em1 <- ref_grid(m1, at = list(disp = seq(min(mtcars$disp), max(mtcars$disp), 1)))
emmip(em1, ~disp, CIs = T)
returns a graphic with a small difference in layout. An alternative is to save the result to an object and plot it the way that I want to:
d1 <- emmip(em1, ~disp, CIs = T, plotit = F)
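From there, a minimal sketch of a hand-built plot (this assumes the emmip() data frame exposes columns named xvar, yvar, LCL and UCL; inspect str(d1) first, since column names can vary across emmeans versions):
ggplot(d1, aes(x = xvar, y = yvar)) +
  geom_line() +
  geom_ribbon(aes(ymin = LCL, ymax = UCL), alpha = 0.2)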

Visualizing multiple curves in ggplot from bootstrapping, curve fitting

I have time series data that is well modeled using a sinusoidal curve. I'd like to visualize the uncertainty in the fitted model using bootstrapping.
I adapted the approach from here. I am also interested in this approach, using nlsBoot. I can get the first approach to run, but the resulting plot contains curves that are not continuous but jagged.
library(dplyr)
library(broom)
library(ggplot2)
xdata <- c(-35.98, -34.74, -33.46, -32.04, -30.86, -29.64, -28.50, -27.29, -26.00,
-24.77, -23.57, -22.21, -21.19, -20.16, -18.77, -17.57, -16.47, -15.35,
-14.40, -13.09, -11.90, -10.47, -9.95,-8.90,-7.77,-6.80, -5.99,
-5.17, -4.21, -3.06, -2.29, -1.04)
ydata <- c(-4.425, -4.134, -5.145, -5.411, -6.711, -7.725, -8.087, -9.059, -10.657,
-11.734, NA, -12.803, -12.906, -12.460, -12.128, -11.667, -10.947, -10.294,
-9.185, -8.620, -8.025, -7.493, -6.713, -6.503, -6.316, -5.662, -5.734, -4.984,
-4.723, -4.753, -4.503, -4.200)
data <- data.frame(xdata,ydata)
bootnls_aug <- data %>% bootstrap(100) %>%
  do(augment(nls(ydata ~ A*cos(2*pi*((xdata-x_0)/z))+M, data = .,
                 start = list(A=4, M=-7, x_0=-10, z=30))))
ggplot(bootnls_aug, aes(xdata, ydata)) +
geom_line(aes(y=.fitted, group=replicate), alpha=.1, color="blue") +
geom_point(size=3) +
theme_bw()
[figure: ggplot output — the fitted bootstrap curves are jagged rather than smooth]
Can anyone offer help? Why are the displayed curves not smooth? Is there a better way to implement?
broom::augment is merely returning fitted values for each of the available data points. Therefore, the resolution of x is limited to the resolution of the data. You can predict values from the model with a much higher resolution:
x_range <- seq(min(xdata), max(xdata), length.out = 1000)
fitted_boot <- data %>%
  bootstrap(100) %>%
  do({
    m <- nls(ydata ~ A*cos(2*pi*((xdata-x_0)/z))+M, data = .,
             start = list(A=4, M=-7, x_0=-10, z=30))
    f <- predict(m, newdata = list(xdata = x_range))
    data.frame(xdata = x_range, .fitted = f)
  })
ggplot(data, aes(xdata, ydata)) +
  geom_line(aes(y = .fitted, group = replicate), fitted_boot, alpha = .1, color = "blue") +
  geom_point(size = 3) +
  theme_bw()
Some more work is needed to add the mean and 95% confidence interval:
quants <- fitted_boot %>%
  group_by(xdata) %>%
  summarise(mean = mean(.fitted),
            lower = quantile(.fitted, 0.025),
            upper = quantile(.fitted, 0.975)) %>%
  tidyr::gather(stat, value, -xdata)

ggplot(mapping = aes(xdata)) +
  geom_line(aes(y = .fitted, group = replicate), fitted_boot, alpha = .05) +
  geom_line(aes(y = value, lty = stat), col = 'red', quants, size = 1) +
  geom_point(aes(y = ydata), data, size = 3) +
  scale_linetype_manual(values = c(lower = 2, mean = 1, upper = 2)) +
  theme_bw()
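If you prefer a shaded band to the dashed quantile lines, a sketch using the same bootstrap quantiles kept in wide format (i.e. skipping the gather() step):
quants_wide <- fitted_boot %>%
  group_by(xdata) %>%
  summarise(mean = mean(.fitted),
            lower = quantile(.fitted, 0.025),
            upper = quantile(.fitted, 0.975))

ggplot(quants_wide, aes(xdata)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "red") +
  geom_line(aes(y = mean), col = "red", size = 1) +
  geom_point(aes(y = ydata), data, size = 3) +
  theme_bw()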
