Add negative binomial distribution to scatterplot - r

I'm hoping to do the same as in this question but this time add a negative binomial distribution to the plot.
This is my code:
library(ggplot2); library(MASS)
year <- 1990:2009
set.seed(1)
counts <- sample(1:1000, 20)
df <- data.frame(year, counts)
my_nb_reg <- glm.nb(counts ~ year, data = df)
my_nb_reg$model$fitted <- predict(my_nb_reg, type = "response")
library(plyr)
# nb_sim <- unlist(llply(my_nb_reg$model$fitted, function(x) rnbinom(n = ?, size = ?, prob = ?, mu = x)))
df.new <- data.frame(year, nb_sim)
ggplot(my_nb_reg$model) + geom_point(aes(year, counts)) + geom_jitter(data= nb_sim, aes(year, nb_sim), color = "red")
The line that is commented out requires arguments n, size and prob. Does anyone know how to add negative binomial distributions to the plot?

I would use rnegbin from MASS.
Here is use:
n as the number of simulated points.
mu as the predicted values from the model and
theta as the estimated theta from the model.
library(ggplot2); library(MASS)
year <- 1990:2009
set.seed(1)
counts <- sample(1:1000, 20)
df <- data.frame(year, counts)
my_nb_reg <- glm.nb(counts ~ year, data = df)
my_nb_reg$model$fitted <- predict(my_nb_reg, type = "response")
nb_sim <- unlist(lapply(my_nb_reg$model$fitted, function(x) rnegbin(n = 1000, mu = x, theta = my_nb_reg$theta)))
df.new <- data.frame(year, nb_sim)
ggplot() +
geom_jitter(data = df.new, aes(year, nb_sim), color = "red", alpha = 0.2) +
geom_point(data = my_nb_reg$model, aes(year, counts)) +
geom_point(data = my_nb_reg$model, aes(year, fitted), shape = 'x', size = 4)

Related

Back-transforming predictions from glmmTMB with truncated distribution

I'm running a glmmTMB with truncated count distributions, and am interested in predicting on the link scale and back-transforming the result. This is a follow-up to this question. The answer to the linked question addressed predicting from a glmmTMB with a truncated distribution on the response scale. I'm interested in predicting on the link scale and back-transforming, since my sample size is not large and the variability is high, so predicting on response scale results in lower CIs below 0.
As shown in the toy example, a simple exp() obviously is the wrong way to back-transform, since the resulting values do not account for the truncation. Any help would be appreciated!
library(dplyr)
library(extraDistr)
library(glmmTMB)
library(ggplot2)
set.seed(1)
df <- data.frame(Group = rep(c("a", "b"), each = 20),
N = rtpois(40, 1, a = 0))
m <- glmmTMB(N ~ Group, data = df, family = "truncated_poisson")
preds <- predict(m, type = "response", se.fit = TRUE)
df$PredResponse <- preds$fit
df$PredResponseLower <- preds$fit - 1.98*preds$se.fit
df$PredResponseUpper <- preds$fit + 1.98*preds$se.fit
preds <- predict(m, type = "link", se.fit = TRUE)
df$PredLink <- exp(preds$fit)
df$PredLinkLower <- exp(preds$fit - 1.98*preds$se.fit)
df$PredLinkUpper <- exp(preds$fit + 1.98*preds$se.fit)
df %>%
group_by(Group) %>%
mutate(Mean = mean(N)) %>%
ggplot() +
geom_point(aes(x = Group, y = Mean), size = 5) +
geom_point(aes(x = Group, y = PredLink, colour = "Link")) +
geom_point(aes(x = Group, y = PredResponse, colour = "Response")) +
geom_errorbar(aes(x = Group, ymin = PredLinkLower, ymax = PredLinkUpper, colour = "Link")) +
geom_errorbar(aes(x = Group, ymin = PredResponseLower, ymax = PredResponseUpper, colour = "Response"))

How to generate a facet_grid where each facet has different interaction terms in R ggplot2?

There are four species A-D in a community influenced by temperature and moisture. Species A has an interaction term between temperature and moisture, everything else has a simple linear relationship with moisture (as assumed). I want to create a ggplot where B,C, and D have linear relationships but A shows an interaction. As oultlined here I will need to choose either mositure or temperature for the x axis, and represent the other variable with another aesthetic such as color.
Here is the code for the species
library(lme4)
library(ggplot2)
library(geomtextpath)
library(lme4)
##Creating the DataFrame
set.seed(111)
mean.temp <- rnorm(100, 20,1)
mean.moisture <- rnorm(100,30,1)
y.var <- rnorm(100, 2,1)
site <- rep(c("1","2","3","4"), times = 25)
species <- rep(c("A","B","C","D"), each = 25)
df <- data.frame(mean.temp, mean.moisture, y.var, site, species)
df$site <- as.factor(as.character(df$site))
df$species <- as.factor(as.character(df$species))
##Assume only species A has singificant interactions terms
dfA <- df[df$species == "A",]
modA <- lmer(dfA$y.var ~ dfA$mean.temp * dfA$mean.moisture + (1|dfA$site), data = dfA)
dfA.mod <- expand.grid(mean.temp = seq(min(dfA$mean.temp), max(dfA$mean.temp), length = 4),
mean.moisture = seq(min(dfA$mean.moisture), max(dfA$mean.moisture),
length = length(dfA$species)))
dfA.mod$y.var <- predict(modA, newdata = dfA.mod)
##Assume all other species have a linear relationship
dfD <- df[df$species == "D",]
modD <- lmer(y.var ~ mean.moisture + (1|site), data = dfD)
dfD$y.var <- predict(modD)
dfC <- df[df$species == "C",]
modC <- lmer(y.var ~ mean.moisture + (1|site), data = dfC)
dfC$y.var <- predict(modC)
dfB <- df[df$species == "B",]
modB <- lmer(y.var ~ mean.moisture + (1|site), data = dfB)
dfB$y.var <- predict(modB)
#Merge A,B,C and get rid of site, species etc..
df.ABC <- rbind(dfB, dfC, dfD)
df.ABC <- df.ABC[,c(1:3)]
#Merge predictions from all four species
df.pred <- rbind(dfA.mod,df.ABC)
If I plotted A alone it would look like
ggplot(dfA.mod, aes(x = mean.moisture, y = y.var, group = mean.temp)) +
geom_point(data = dfA, aes(shape = site, color = mean.temp)) +
geom_textline(aes(color = mean.temp, label = round(mean.temp, 2)), hjust = 0.95) + scale_color_gradient(low = 'navy', high = 'red4') + theme_light(base_size = 16) + facet_grid(.~species)
I want to use some such similar code for all the species with facet_wrap. Here is my attempt
#Plot the raw data coded by site, and the predictions
ggplot(df.pred, aes(x = mean.moisture, y = y.var, group = mean.temp)) +
geom_point(data = df, aes(shape = site, color = mean.temp)) +
geom_textline(aes(color = mean.temp, label = round(mean.temp, 2)), hjust = 0.95) + scale_color_gradient(low = 'navy', high = 'red4') + theme_light(base_size = 16) + facet_grid(.~species)
It's a mush. How can I change these plots to represent only a single line in B to C, but 4 lines in plot A

ggplotly: how to return dates with stat_smooth?

I am trying to plot cumulative sums of groups over time, and add corresponding linear prediction lines for each group. The plot turns out well, however, I cannot read the dates of the prediction slopes on the x-axis as these are numbers.
How can I change the code below so that stat_smooth returns dates instead of numbers?
g <- ggplot(aes(x = created_at_day, y = sum, color = groups), data= df) +
geom_line() +
stat_smooth(aes(x = as.Date(created_at_day), y = sum, color = groups), method = "lm", fullrange = TRUE, se = FALSE, size = 0.1) +
xlab('Date') +
ylab('cumulative sum of patients') +
expand_limits(x = as.Date(c("2017-11-13", "2018-04-01"))) +
ggtitle('Number of patients included per practice') +
theme_bw()
g
ggplotly(g)
EDIT: Here is some code to generate the plot above.
set.seed(101)
created_at_day= sample(seq(as.Date('2017-01-01'), as.Date('2018-01-01'), by = "day"), 300)
set.seed(101)
groups= sample(1:3,300, replace= TRUE)
df = data.frame(created_at_day, groups)
group1= seq(1, table(groups)[[1]]*1.5, 1.5)
group2= seq(0.8, table(groups)[[2]]*0.8, 0.8)
group3= seq(1, table(groups)[[3]]*1.8, 1.8)
df_g1= subset(df, groups == 1)
df_g1 = arrange(df_g1, created_at_day)
df_g1= cbind(df_g1, sum= group1)
df_g2= subset(df, groups == 2)
df_g2 = arrange(df_g2, created_at_day)
df_g2= cbind(df_g2, sum= group2)
df_g3= subset(df, groups == 3)
df_g3 = arrange(df_g3, created_at_day)
df_g3= cbind(df_g3, sum= group3)
df= rbind(df_g1, df_g2, df_g3)
df$groups= as.factor(df$groups)

Include weibull fit in ggsurvplot

I would like to fit a weibull curve to some event data and then include the fitted weibull curve in a survival plot plotted by survminer::ggsurvplot. Any ideas of how?
Here is an example to work on:
A function for simulating weibull data:
# N = sample size
# lambda = scale parameter in h0()
# rho = shape parameter in h0()
# beta = fixed effect parameter
# rateC = rate parameter of the exponential distribution of C
simulWeib <- function(N, lambda, rho, beta, rateC)
{
# covariate --> N Bernoulli trials
x <- sample(x=c(0, 1), size=N, replace=TRUE, prob=c(0.5, 0.5))
# Weibull latent event times
v <- runif(n=N)
Tlat <- (- log(v) / (lambda * exp(x * beta)))^(1 / rho)
# censoring times
C <- rexp(n=N, rate=rateC)
# follow-up times and event indicators
time <- pmin(Tlat, C)
status <- as.numeric(Tlat <= C)
# data set
data.frame(id=1:N,
time=time,
status=status,
x=x)
}
generate data
set.seed(1234)
betaHat <- rep(NA, 1e3)
for(k in 1:1e3)
{
dat <- simulWeib(N=100, lambda=0.01, rho=1, beta=-0.6, rateC=0.001)
fit <- coxph(Surv(time, status) ~ x, data=dat)
betaHat[k] <- fit$coef
}
#Estimate a survival function
survfit(Surv(as.numeric(time), x)~1, data=dat) -> out0
#plot
library(survminer)
ggsurvplot(out0, data = dat, risk.table = TRUE)
gg1 <- ggsurvplot(
out0, # survfit object with calculated statistics.
data = dat, # data used to fit survival curves.
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test.
conf.int = TRUE, # show confidence intervals for
# point estimaes of survival curves.
xlim = c(0,2000), # present narrower X axis, but not affect
# survival estimates.
break.time.by = 500, # break X axis in time intervals by 500.
ggtheme = theme_minimal(), # customize plot and risk table with a theme.
risk.table.y.text.col = T, # colour risk table text annotations.
risk.table.y.text = FALSE,
surv.median.line = "hv",
color = "darkgreen",
conf.int.fill = "lightblue",
title = "Survival probability",# show bars instead of names in text annotations
# in legend of risk table
)
gg1
As far as I see this, it is not possible do it with ggsurvplot at this moment.
I created an issue requesting this feature: https://github.com/kassambara/survminer/issues/276
You can plot survivor curves of a weibull model with ggplot2 like this:
library("survival")
wbmod <- survreg(Surv(time, status) ~ x, data = dat)
s <- seq(.01, .99, by = .01)
t_0 <- predict(wbmod, newdata = data.frame(x = 0),
type = "quantile", p = s)
t_1 <- predict(wbmod, newdata = data.frame(x = 1),
type = "quantile", p = s)
smod <- data.frame(time = c(t_0, t_1),
surv = rep(1 - s, times = 2),
strata = rep(c(0, 1), each = length(s)),
upper = NA, lower = NA)
head(surv_summary(cm))
library("ggplot2")
ggplot() +
geom_line(data = smod, aes(x = time, y = surv, color = factor(strata))) +
theme_classic()
However to my knowledge you cannot use survminer (yet):
library("survminer")
# wrong:
ggsurvplot(smod)
# does not work:
gg1$plot + geom_line(data = smod, aes(x = time, y = surv, color = factor(strata)))
The following works for me. Probably the credit goes to Heidi filling a feature request.
Hope, someone finds this useful.
library(survminer)
library(tidyr)
s <- with(lung,Surv(time,status))
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
fKM <- survfit(s ~ sex,data=lung)
pred.sex1 = predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01))
pred.sex2 = predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01))
df = data.frame(y=seq(.99,.01,by=-.01), sex1=pred.sex1, sex2=pred.sex2)
df_long = gather(df, key= "sex", value="time", -y)
p = ggsurvplot(fKM, data = lung, risk.table = T)
p$plot = p$plot + geom_line(data=df_long, aes(x=time, y=y, group=sex))

How to graph my multiple linear regression model (caret)?

I have created an multiple linear regression model and would now like to plot it. But I can't seem to figure it out. Any help would be greatly appreciated! I used baruto to find the feature attributes and then used train() to get the model. When I try to plot model_lm I get the error:
There are no tuning parameters with more than 1 value.
Here is my code at what I have attempted so far:
rt_train <- rttotal2
rt_train$year <- NULL
#rt_train$box_office <- NULL
#impute na and address multicoliniearity
preproc <- preProcess(rt_train, method = c("knnImpute","center",
"scale"))
rt_proc <- predict(preproc, rt_train)
rt_proc$box_office <- rt_train$box_office
sum(is.na(rt_proc))
titles <- rt_proc$titles
rt_proc$titles <- NULL
#rt_train$interval <- as.factor(rt_train$interval)
dmy <- dummyVars(" ~ .", data = rt_proc,fullRank = T)
rt_transform <- data.frame(predict(dmy, newdata = rt_proc))
index <- createDataPartition(rt_transform$interval, p =.75, list = FALSE)
train_m <- rt_transform[index, ]
rt_test <- rt_transform[-index, ]
str(rt_train)
y_train <- train_m$box_office
y_test <-rt_test$box_office
train_m$box_office <- NULL
rt_test$box_office <- NULL
#selected feature attributes
boruta.train <- Boruta(interval~., train_m, doTrace =1)
#graph to see most important var to interval
lz<-lapply(1:ncol(boruta.train$ImpHistory),function(i)
boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]),i])
names(lz) <- colnames(boruta.train$ImpHistory)
plot(boruta.train, xlab = "", xaxt = "n")
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)
#get most important attributes
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
getSelectedAttributes(final.boruta, withTentative = F)
boruta.rt_df <- attStats(final.boruta)
boruta.rt_df
boruta.rt_df <- setDT(boruta.rt_df, keep.rownames = TRUE)[]
predictors <- boruta.rt_df %>%
filter(., decision =="Confirmed") %>%
select(., rn)
predictors <- unlist(predictors)
control <- trainControl(method="repeatedcv",
number=10,
repeats=6)
#look at residuals
#p-value is very small so reject H0 that predictors have no effect so
#we can use rotten tomatoes to predict box_office ranges
train_m$interval <- NULL
model_lm <- train(train_m[,predictors],
y_train, method='lm',
trControl = control, tuneLength = 10)
model_lm #.568
#
plot(model_lm)
plot(model_lm)
z <- varImp(object=model_lm)
z <- setDT(z, keep.rownames = TRUE)
z$model <- NULL
z$calledFrom <- NULL
row.names(z)
plot(varImp(object=model_lm),main="Linear Model Variable Importance")
predictions<-predict.train(object=model_lm,rt_test[,predictors],type="raw")
table(predictions)
#get coeff
interc <- coef(model_lm$finalModel)
slope <- coef(model_lm$finalModel)
ggplot(data = rt_train, aes(y = box_office)) +
geom_point() +
geom_abline(slope = slope, intercept = interc, color = 'red')
This is what some of my input looks like. Thank you!!
Here is an example using the inbuilt data set cars:
data(cars, package = "datasets")
library(caret)
build the model
control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 6)
model_lm <- train(dist ~ speed, data = cars, method='lm',
trControl = control, tuneLength = 10)
I will assume you would like to plot the final model.
You can use the caret predict.train function to get the predictions from the model and plot them:
pred <- predict(model_lm, cars)
pred <- data.frame(pred = pred, speed = cars$speed)
additionally you can provide the cars data set to geom point and plot the observations:
library(ggplot2)
ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x=speed, y = dist))
if you would like to obtain the confidence or prediction interval you can use the predict.lm function on model_lm$finalModel:
Here is an example for the prediction interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "prediction")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed, lwr = pred$fit[,2], upr = pred$fit[,3])
pred_int <- ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x = speed, y = dist)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
or the confidence interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "confidence")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed, lwr = pred$fit[,2], upr = pred$fit[,3])
pred_conf <- ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x = speed, y = dist)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
plotting them side by side:
library(cowplot)
plot_grid(pred_int, pred_conf)
to plot the linear dependence on two variables you can use a 3D plot, for more than 3 it will be a problem.

Resources