How to extend a logistic regression plot? [R]

I have created a logistic model in R. The issue is that my maximum x value is 0.85, so the plot stops at that value.
Is there a way to extend the plot to x = 100% (i.e., Map = 1), with the y values calculated from my logistic model?
library(caret)
library(mlbench)
library(ggplot2)
library(tidyr)
library(caTools)
my_data2 <- read.csv('C:/Users/Magician/Desktop/R files/Fnaticfirstround.csv', header=TRUE, stringsAsFactors = FALSE)
my_data2
#converting Map names to the calculated win probability
my_data2[my_data2$Map == "Dust2", "Map"] <- 0.307692
my_data2[my_data2$Map == "Inferno", "Map"] <- 0.47619
my_data2[my_data2$Map == "Mirage", "Map"] <- 0.708333
my_data2[my_data2$Map == "Nuke", "Map"] <- 0.444444
my_data2[my_data2$Map == "Overpass", "Map"] <- 0.333333
my_data2[my_data2$Map == "Train", "Map"] <- 0.692308
my_data2[my_data2$Map == "Vertigo", "Map"] <- 0
my_data2[my_data2$Map == "Cache", "Map"] <- 0.857143
#converting W and L to 1 and 0
my_data2$WinorLoss <- ifelse(my_data2$WinorLoss == "W", 1,0)
my_data2$WinorLoss <- factor(my_data2$WinorLoss, levels = c(0,1))
#converting Map to numeric characters
my_data2$Map <- as.numeric(my_data2$Map)
#Logistic regression model
glm.fit <- glm(WinorLoss ~ Map, family=binomial, data=my_data2)
summary(glm.fit)
#make predictions on the training data
glm.probs <- predict(glm.fit, type="response")
glm.pred <- ifelse(glm.probs>0.5, 1, 0)
attach(my_data2)
table(glm.pred,WinorLoss)
mean(glm.pred==WinorLoss)
#splitting the data for trying and testing
Split <- sample.split(my_data2, SplitRatio = 0.7)
traindata <- subset(my_data2, Split == "TRUE")
testdata <- subset(my_data2, Split == "FALSE")
glm.fit <- glm(WinorLoss ~ Map,
               data = traindata,
               family = "binomial")
glm.probs <- predict(glm.fit,
                     newdata = testdata,
                     type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "1", "0")
table(glm.pred, testdata$WinorLoss)
mean(glm.pred == testdata$WinorLoss)
summary(glm.fit)
#changing the x axis to 0-100%, min map win prob - max map win prob
newdat <- data.frame(Map = seq(min(traindata$Map), max(traindata$Map), len = 100))
newdat$WinorLoss <- predict(glm.fit, newdata = newdat, type = "response")
p <- ggplot(newdat, aes(x = Map, y = WinorLoss)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  xlim(0, 1) +
  ylim(0, 1)
I have tried extending the x value to 100, but that only extended the axis; it did not calculate the corresponding y values and plot them.
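A minimal sketch of the needed change (the wider seq() range is my assumption of the intent): the prediction grid itself, not just the axis, has to span the new range, so build newdat over the full 0-1 interval and let predict() fill in the curve:
# predict on a grid spanning all possible Map win probabilities, not just the observed ones
newdat <- data.frame(Map = seq(0, 1, len = 100))
newdat$WinorLoss <- predict(glm.fit, newdata = newdat, type = "response")
ggplot(newdat, aes(x = Map, y = WinorLoss)) +
  geom_line() +
  xlim(0, 1) +
  ylim(0, 1)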

I cannot reproduce your data, so I will show how to do it using the "challenger disaster" example (see this LINK), with confidence-interval ribbons.
You should create artificial points spanning the range you want, predict on them with the fitted model, and only then plot.
Next time, try to use reprex or provide a minimal reproducible example.
Preparing data and model fitting:
library(dplyr)
fails <- c(2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)
temp <- c(53, 66, 68, 70, 75, 78, 57, 67, 69, 70, 75, 79, 58, 67, 70, 72, 76, 80, 63, 67, 70, 73, 76)
challenger <- tibble::tibble(fails, temp)
orings = 6
challenger <- challenger %>%
  dplyr::mutate(resp = fails/orings)
model_fit <- glm(resp ~ temp,
                 data = challenger,
                 weights = rep(6, nrow(challenger)),
                 family = binomial(link = "logit"))
##### ------- this is what you need: -------------------------------------------
# setting limits for x axis
x_limits <- challenger %>%
  dplyr::summarise(min = 0, max = max(temp) + 10)
# creating artificial obs for curve smoothing -- several points between the limits
x <- seq(x_limits[[1]], x_limits[[2]], by = 0.5)
# artificial points prediction
# see: https://stackoverflow.com/questions/26694931/how-to-plot-logit-and-probit-in-ggplot2
temp.data <- data.frame(temp = x) # column name must equal the variable name
# Predict the fitted values given the model and hypothetical data
predicted.data <- as.data.frame(
  predict(model_fit,
          newdata = temp.data,
          type = "link", se = TRUE)
)
# Combine the hypothetical data and predicted values
new.data <- cbind(temp.data, predicted.data)
##### --------------------------------------------------------------------------
# Compute confidence intervals on the link scale, then back-transform
std <- qnorm(0.95 / 2 + 0.5) # ~1.96, the two-sided 95% normal quantile
new.data$ymin <- model_fit$family$linkinv(new.data$fit - std * new.data$se.fit)
new.data$ymax <- model_fit$family$linkinv(new.data$fit + std * new.data$se.fit)
new.data$fit <- model_fit$family$linkinv(new.data$fit) # Rescale to 0-1 (response scale)
Plotting:
library(ggplot2)
plotly_palette <- c('#1F77B4', '#FF7F0E', '#2CA02C', '#D62728')
p <- ggplot(challenger, aes(x = temp, y = resp)) +
  geom_point(colour = plotly_palette[1]) +
  geom_ribbon(data = new.data,
              aes(y = fit, ymin = ymin, ymax = ymax),
              alpha = 0.5,
              fill = '#FFF0F5') +
  geom_line(data = new.data, aes(y = fit), colour = plotly_palette[2]) +
  labs(x = "Temperature", y = "Estimated Fail Probability") +
  ggtitle("Predicted Probabilities for fail/orings with 95% Confidence Interval") +
  theme_bw() +
  theme(panel.border = element_blank(), plot.title = element_text(hjust = 0.5))
p
# if you want something fancier:
# library(plotly)
# ggplotly(p)
Result: (plot image not reproduced here)
Interesting Fact About the Challenger Data:
NASA engineers used linear regression to estimate the likelihood of O-ring failure. Had they used a technique more appropriate for their data, such as logistic regression, they would have noticed that the probability of failure at low temperatures (such as the ~36°F at launch time) was extremely high. The plot shows that at ~36°F (a temperature extrapolated from the observed ones), the estimated probability is about 0.75; if we consider the confidence interval... well, the accident was pretty much a certainty.
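For instance, with model_fit from the code above, that extrapolated point estimate can be checked directly (a quick sketch, not part of the original answer):
# estimated fail probability at 36F, extrapolated well below the observed temperatures
predict(model_fit, newdata = data.frame(temp = 36), type = "response")
# the answer above reads this off the curve as roughly 0.75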

Related

Monte Carlo simulations for VAR models

I've been trying to estimate VAR models using Monte Carlo simulation. I have 3 endogenous variables and need some guidance.
First, I want to add an outlier as a percentage of the sample size.
Second (a second simulation for the same model), I want to use a multivariate contaminated normal distribution, 0.9·N(0, I) + 0.1·N((0,0,0)', diag(100, 100, 100)), instead of the outlier.
Could you tell me how to do these?
Thank you.
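No answer is recorded here, but for reference, one way to draw innovations from a contaminated normal such as 0.9·N(0, I) + 0.1·N(0, diag(100, 100, 100)) is to mix the two components at random. A sketch (rcontam and its defaults are my own illustration, not from the post):
# draw n columns of k-dimensional errors; each column comes from the wide
# component (sd = sqrt(sigma2)) with probability eps, else from N(0, I)
rcontam <- function(n, k = 3, eps = 0.1, sigma2 = 100) {
  contaminated <- runif(n) < eps
  sd_vec <- ifelse(contaminated, sqrt(sigma2), 1)
  matrix(rnorm(n * k, mean = 0, sd = rep(sd_vec, each = k)), nrow = k)
}
e <- rcontam(200) # k x n matrix, usable in place of the rnorm(k, 0, 1) draws below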
RR <- function(n, out){
  # n is the number of observations
  k <- 3 # Number of endogenous variables
  p <- 2 # Number of lags
  # add outlier (note: with scalar n this only enlarges the sample size;
  # it does not yet insert an outlier into the series)
  n[1] <- n[1] + out
  # Generate coefficient matrices
  B1 <- matrix(c(.1, .3, .4, .1, -.2, -.3, .03, .1, .1), k) # Coefficient matrix of lag 1
  B2 <- matrix(c(0, .2, .1, .07, -.4, -.1, .5, 0, -.1), k)  # Coefficient matrix of lag 2
  M <- cbind(B1, B2) # Companion form of the coefficient matrices
  # Generate series
  DT <- matrix(0, k, n + 2*p) # Raw series with zeros
  for (i in (p + 1):(n + 2*p)){ # Generate series with e ~ N(0,1)
    DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + rnorm(k, 0, 1)
  }
  DT <- ts(t(DT[, -(1:p)])) # Convert to time series format
  colnames(DT) <- c("Y1", "Y2", "Y3") # Rename variables
  #plot.ts(DT) # Plot the series
  # estimate VECM (VECM() is from the tsDyn package)
  vecm1 <- VECM(DT, lag = 2, r = 2, include = "const", estim = "ML")
  vecm2 <- VECM(DT, lag = 2, r = 1, include = "const", estim = "ML")
  # mse
  mse1 <- mean(vecm1$residuals^2)
  mse2 <- mean(vecm2$residuals^2)
  return(list("mse1" = mse1, "mse2" = mse2)) # mse3 was never computed, so it is not returned
}
# define the parameter grids (the parameter ranges to run the function over)
n_grid <- c(50, 80, 200, 400)
out_grid <- c(0, 5, 10)
# collect the parameter grids in a list (to pass to the Monte Carlo function)
prml <- list("n" = n_grid, "out" = out_grid)
# run simulation (MonteCarlo() and MakeTable() are from the MonteCarlo package)
library(MonteCarlo)
RRS <- MonteCarlo(func = RR, nrep = 1000, param_list = prml)
summary(RRS)
# make table:
rows <- "n"
cols <- "out"
MakeTable(output = RRS, rows = rows, cols = cols)

extract(model, what = dic) from JAGS model returns NA for penalty

Using JAGS, I am fitting different models to data and would like to compare their fits using the deviance information criterion (DIC). I am using run.jags to fit a model and then extract() to get the DIC after it runs. My models converge without a problem, but I am only getting values for the deviance portion of the DIC; all of my penalty values are either 0 or NA. I think I understand why I am getting NA: those are the scenarios where the predicted value and the observed value are both 0. I do not understand why I am getting 0 in the other instances. Any ideas on how to fix this?
Other posts where someone was getting NA for the penalty suggested altering the priors (https://sourceforge.net/p/mcmc-jags/discussion/610037/thread/2fcd66ea/), but they were using dic.samples(), not extract(). I tried changing my priors, but did not find that it altered my outcome.
Here is some code that reproduces the situation (run time < 1 min):
# install.packages("remotes")
# remotes::install_github("gilesjohnr/hmob")
library(hmob)
library(dplyr)
library(stringr)
library(foreach)
library(parallel)
library(doParallel)
library(zoo)
library(sp)
library(rgdal)
library(rgeos)
library(abind)
library(rjags)
library(coda)
library(runjags)
library(truncnorm)
library(rmutil)
library(dclone)
library(R2WinBUGS)
# subset of data to run
M <- matrix(c(0, 5514, 5290, 88, 5501, 0, 10868, 392, 5388, 10830, 0, 6641, 91, 400, 6660, 0),
            nrow = 4, ncol = 4, byrow = TRUE)
D <- matrix(c(0, 38, 58, 162, 38, 0, 35, 125, 58, 35, 0, 111, 162, 125, 111, 0),
            nrow = 4, ncol = 4, byrow = TRUE)
N <- c(15350, 17803, 29825, 5772)
n.districts <- nrow(M)
jags.data <- list(M = M,
                  D = D,
                  N = N,
                  n.districts = n.districts)
# JAGS model
model.test <- "
model {
  for (i in 1:n.districts) {
    for (j in 1:n.districts) {
      M[i,j] ~ dpois(pi[i,j]*N[i])
    }
    pi[i,1:n.districts] <- c[i,]/sum(c[i,])
  }
  for (i in 1:n.districts) {
    for (j in 1:n.districts) {
      c[i,j] <- ifelse(
        i == j,
        0,
        exp(log(theta) + (omega.1*log(N[i]) + omega.2*log(N[j]) - log(f.d[i,j])))
      )
      f.d[i,j] <- D[i,j]^gamma
    }
  }
  ### Priors ###
  theta ~ dgamma(1, 1)
  omega.1 ~ dgamma(1, 1)
  omega.2 ~ dgamma(1, 1)
  gamma ~ dgamma(1, 1)
}"
params <- c('omega.1', 'omega.2', 'theta', 'gamma')
# Burnin and samples are intentionally low when troubleshooting
nc <- 4 # number of chains
na <- 1000 # adaptations
nb <- 4000 # burn in
ni <- 10000 # samples
nt <- 5 # thin
init.list <- replicate(nc,
                       list(.RNG.name = 'lecuyer::RngStream',
                            .RNG.seed = 423486), # replace with sample(1:1e6, 1) for a random seed
                       simplify = FALSE)
out <- run.jags(model = model.test,
                data = jags.data,
                monitor = params,
                n.chains = nc,
                adapt = na,
                burnin = nb,
                sample = ni,
                thin = nt,
                inits = init.list,
                modules = c('lecuyer'),
                method = "parallel",
                summarise = FALSE)
dic.basic <- extract(out, what="dic")
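As a cross-check (a sketch of an assumption, not something from the original post), the runjags fit can be converted to an rjags model and the DIC sampled directly with dic.samples(), which reports the mean deviance and the pD penalty separately:
library(rjags)
# convert the runjags object to an rjags model (runjags provides as.jags())
jags.model.obj <- as.jags(out)
# sample the deviance and penalty; compare pD here with the extract() result
dic.alt <- dic.samples(jags.model.obj, n.iter = 1000, type = "pD")
dic.alt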

How to get an R^2 and P-Value for R ELISA Analysis 4PL

I am trying to mimic the GraphPad ELISA analysis using R; however, I am having a bit of difficulty getting a p-value and an R^2 value.
I have followed this tutorial: http://weightinginbayesianmodels.github.io/poctcalibration/calib_tut4_curve_ocon.html#unweighted-nonlinear-regression-in-r
It got me the majority of the information needed, using the "minpack.lm" package; however, I am not sure how to obtain the R^2 and p-value from there.
ODCalc1 <- c(.007, .072, .328, .988, 1.534, 1.983)
ODCalc2 <- c(.006, .074, .361, .858, 1.612, 1.993)
ODCalc <- (ODCalc1 + ODCalc2)/2
concentration <- log10(c(1, 36, 180, 540, 1080, 1800))
ocon <- data.frame(10^(concentration), "rep", ODCalc, stringsAsFactors = F)
ocon$X.rep. <- as.numeric(ocon$X.rep.)
ocon$X.rep. <- 1
names(ocon) <- c("conc", "rep", "od")
# Plot the O'Connell data
par(mfrow = c(1, 2), cex.main = 1, mar = c(4, 4, 1, 2), oma = c(0.5, 0.5, 2.5, 0))
plot(ocon$conc, ocon$od, pch = 21, bg = "grey", ylab = "Response (od)",
     xlab = "Concentration")
grid()
# Plot on the log(x) scale
plot(log(ocon$conc), ocon$od, pch = 21, bg = "grey", ylab = "Response (od)",
     xlab = "log(concentration)")
grid()
title("O'Connell's ELISA: concentration on absolute (left) and log (right) scales",
      outer = T)
par(mfrow = c(1, 1))
# ------------ Function: 4PL curve function ---------------------------------
M.4pl <- function(x, small.x.asymp, inf.x.asymp, inflec, hill){
  f <- small.x.asymp + ((inf.x.asymp - small.x.asymp)/
                          (1 + (x / inflec)^hill))
  return(f)
}
# ------------- end ---------------------------------------------------------
start.ocon <- c(small.x.asymp = 0.1, inf.x.asymp = 1, inflec = 3000, hill = -1)
library(minpack.lm)
uw.4pl <- nlsLM(od ~ M.4pl(conc, small.x.asymp, inf.x.asymp, inflec, hill),
                data = ocon,
                start = start.ocon)
data.4pl <- summary(uw.4pl)
bottom.4pl <- data.4pl$parameters[1,1]
top.4pl <- data.4pl$parameters[2,1]
IC50.4pl <- data.4pl$parameters[3,1]
HillSlope.4pl <- abs(data.4pl$parameters[4,1])
RSS.p <- sum(residuals(uw.4pl)^2)
TSS <- sum((ocon$od - mean(ocon$od))^2)
r.squared <- 1 - (RSS.p/TSS) # is this the proper way to get an R^2 value? It does not match what GraphPad reports, which is an issue.
# I have also read that the following should work, but it fits a linear (polynomial)
# model rather than the sigmoidal 4PL model with X = log(concentration):
model <- lm(concentration ~ poly(ODCalc, degree = 4, raw = T))
summary(model) # R^2 is not the value I am looking for.
# Not sure if sample data is needed but these were the values we were using to produce the values below
sample.od.values1 <- c(0.275, 1.18, 0.085, 0.054, 0.119)
sample.od.values2 <- c(0.263, 1.149, 0.068, 0.062, 0.109)
sample.od.values <- (sample.od.values1+sample.od.values2)/2
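For completeness, back-calculating sample concentrations from those mean ODs just inverts M.4pl (a sketch; inv.4pl is my own helper, not part of the original post, applied to the uw.4pl fit above):
# solve od = d + (a - d)/(1 + (x/c)^b) for x, given the fitted 4PL parameters
inv.4pl <- function(y, small.x.asymp, inf.x.asymp, inflec, hill){
  inflec * ((inf.x.asymp - small.x.asymp)/(y - small.x.asymp) - 1)^(1/hill)
}
cf <- coef(uw.4pl)
sample.conc <- inv.4pl(sample.od.values,
                       small.x.asymp = cf[["small.x.asymp"]],
                       inf.x.asymp = cf[["inf.x.asymp"]],
                       inflec = cf[["inflec"]],
                       hill = cf[["hill"]])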
Values to prove the methods are the same:
bottom.4pl = 0.01657
top.4pl = 3.002
HillSlope = 1.222
R^2 = 0.9978
R^2(adjusted) = 0.9969
P-Value = 0.5106
Thank you in advance for any helpful tips!
Since R^2 measures linear association, it is normally used for linear regression; ignoring that, the following seems to give the numbers you want, or at least numbers close to them. For the adjusted R-squared formula, see https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2; for the p-value, I have assumed you want the p-value for the hypothesis that the first coefficient is zero.
RSS <- deviance(uw.4pl); RSS
## [1] 0.001514624
coef(uw.4pl) # coefficients/parameters
## small.x.asymp inf.x.asymp inflec hill
## 0.01654996 3.00261439 1033.53324214 -1.22171740
R2 <- cor(ocon$od, fitted(uw.4pl))^2; R2
## [1] 0.9995529
n <- nobs(uw.4pl)
p <- length(coef(uw.4pl))
adjR2 <- 1 - (1-R2) * (n - 1) / (n - p - 1); adjR2
## [1] 0.9977645
pvalue <- coef(summary(uw.4pl))[1, 4]; pvalue
## [1] 0.5486584

Two different multiple GLM poisson model regression, mean points and confidence interval

I'd like to create a plot in ggplot2 that combines two separately fitted multiple Poisson GLM models, mean points, and confidence intervals (95% CI). But my mean-point layer doesn't work.
#Artificial data set
Consumption <- c(501, 502, 503, 504, 26, 27, 55, 56, 68, 69, 72, 93)
Gender <- gl(n = 2, k = 6, length = 2*6, labels = c("Male", "Female"), ordered = FALSE)
Income <- c(5010, 5020, 5030, 5040, 260, 270, 550, 560, 680, 690, 720, 930)
df3 <- data.frame(Consumption, Gender, Income)
df3
# GLM Regression
fm1 <- glm(Consumption~Gender+Income, data=df3, family=poisson)
summary(fm1)
# ANOVA
anova(fm1,test="Chi")
#Genders differ, so I fitted one model for Male and another for Female
#Male model
df4<-df3[df3$Gender=="Male",]
fm2 <- glm(Consumption~Income, data=df4, family=poisson)
summary(fm2)
#Female model
df5<-df3[df3$Gender=="Female",]
fm3 <- glm(Consumption~Income, data=df5, family=poisson)
summary(fm3)
#Create predictions and confidence intervals
Predictions <- c(predict(fm2, type = "link", se.fit = TRUE),
                 predict(fm3, type = "link", se.fit = TRUE))
df3_combined <- cbind(df3, Predictions)
df3_combined$UCL <- df3_combined$fit + 1.96*df3_combined$se.fit
df3_combined$LCL <- df3_combined$fit - 1.96*df3_combined$se.fit
df3_combined <- df3_combined[,-(6:9)]
df3_combined <- as.data.frame(df3_combined)
#Create mean values to plot
library(dplyr)
df <- df3_combined %>%
  group_by(Income, Gender) %>%
  summarize(Consumption = mean(Consumption, na.rm = TRUE))
df <- as.data.frame(df)
#Plot
library(tidyverse)
library(ggplot2)
df3_combined %>%
  gather(type, value, Consumption) %>%
  ggplot(mapping = aes(x = Income, y = Consumption, color = Gender)) +
  geom_point(data = df, mapping = aes(x = Income, y = Consumption, color = Gender)) +
  geom_line(mapping = aes(x = Income, y = exp(fit))) +
  geom_smooth(mapping = aes(ymin = exp(LCL), ymax = exp(UCL)), stat = "identity")
I don't see the mean values from the df object in my output plot, and I don't know why.
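No accepted fix is recorded here, but one sketch that sidesteps the gather() step (my own rearrangement, assuming df3_combined really ends up with fit, LCL and UCL columns as intended) is to plot df3_combined directly and add the means from df as a separate, more visible layer:
library(ggplot2)
ggplot(df3_combined, aes(x = Income, y = Consumption, color = Gender)) +
  geom_ribbon(aes(ymin = exp(LCL), ymax = exp(UCL), fill = Gender),
              alpha = 0.2, colour = NA) +
  geom_line(aes(y = exp(fit))) +
  geom_point(data = df, aes(x = Income, y = Consumption, color = Gender),
             size = 3, shape = 17) # means drawn last so they sit on top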

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot's sjp.int function to plot an interaction from an lme model.
The options for the moderator values are mean ± SD, quartiles, all values, and max/min. Is there a way to plot the mean ± 2 SD?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(SID = sample(1:150, 400, replace = TRUE),
                     age = sample(50:70, 400, replace = TRUE),
                     sex = sample(c("Male", "Female"), 200, replace = TRUE),
                     time = seq(0.7, 6.2, length.out = 400),
                     Vol = rnorm(400),
                     HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop*n*m)
  id <- sample(0:(m*n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme, which gives an error message
model <- lme(Vol ~ age+sex*time+time*HCD, random = ~time|SID,
             na.action = "na.omit", data = mydata2)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 <- lmer(Vol ~ age+sex*time+time*HCD + (time|SID),
               control = lmerControl(check.nobs.vs.nlev = "ignore",
                                     check.nobs.vs.rankZ = "ignore",
                                     check.nobs.vs.nRE = "ignore"),
               na.action = "na.omit", data = mydata2)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible (for marginal-effects plots).
The ggeffects package has a ggpredict() function with which you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify those values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
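For the mean ± 2·SD case specifically, one option (a sketch; it assumes the moderator is called HCD, as in the reproducible example below) is to compute the three values first and paste them into the terms string:
library(ggeffects)
# mean - 2sd, mean, mean + 2sd of the moderator
v <- round(mean(mydata2$HCD, na.rm = TRUE) + c(-2, 0, 2) * sd(mydata2$HCD, na.rm = TRUE), 2)
terms_str <- sprintf("HCD [%s]", paste(v, collapse = ", "))
mydf <- ggpredict(model, terms = c("time", terms_str))
plot(mydf)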
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <- data.frame(
  SID = sample(1:150, 400, replace = TRUE),
  age = sample(50:70, 400, replace = TRUE),
  sex = sample(c("Male", "Female"), 200, replace = TRUE),
  time = seq(0.7, 6.2, length.out = 400),
  Vol = rnorm(400),
  HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model <- lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# glmmTMB also works
model3 <- glmmTMB(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots
