I have created a logistic regression model and a corresponding ROC using pROC. I got the threshold for the "best" value that maximizes sensitivity and specificity. The predictor is a score that goes from 4 to 13 points. The predicted variable is survival. I need to know which value in my score (from 4 to 13) is represented by the threshold value (e.g. 0.043). Will appreaciate your help.
Code looks like this>
#MULTIVARIATE ANALYSIS
summary(glm((VIVO_AL_ALTA==0)~ CALCULADORA_CALL_SCORE +
SOPORTE_VENT_AL_INGRESO + ETE_DURANTE_HOSP +
SOBREINFECC_BACT_DURANTE_HOSP + COP_DURANTE_HOSP + TOCILIZUMAB +
CORTICOIDES_HOSP, family = binomial, data = work_data))
exp(coef(glm((VIVO_AL_ALTA==0)~ CALCULADORA_CALL_SCORE +
SOPORTE_VENT_AL_INGRESO + ETE_DURANTE_HOSP +
SOBREINFECC_BACT_DURANTE_HOSP + COP_DURANTE_HOSP + TOCILIZUMAB +
CORTICOIDES_HOSP , family = binomial, data = work_data)))
exp(confint.default(glm((VIVO_AL_ALTA==0) ~
CALCULADORA_CALL_SCORE + SOPORTE_VENT_AL_INGRESO +
ETE_DURANTE_HOSP + SOBREINFECC_BACT_DURANTE_HOSP +
COP_DURANTE_HOSP + TOCILIZUMAB + CORTICOIDES_HOSP , family=
binomial, data= work_data), level = .95))
mod_vivo_alta_multi<-glm((VIVO_AL_ALTA==0)~
CALCULADORA_CALL_SCORE + SOPORTE_VENT_AL_INGRESO +
ETE_DURANTE_HOSP + SOBREINFECC_BACT_DURANTE_HOSP +
COP_DURANTE_HOSP + TOCILIZUMAB + CORTICOIDES_HOSP, family =
binomial, data = work_data, na.action = "na.exclude")
#ROC Curves
library(pROC)
#ROC VIVO ALTA MULTI
work_data$pred_vivo_alta_multi<-predict(mod_vivo_alta_multi, type
= "response", na.action = "na.omit")
pROC_obj_pred_vivo_alta_multi <-
roc((work_data$VIVO_AL_ALTA==0),work_data$pred_vivo_alta_multi,
smoothed = TRUE, direction="<",
# arguments for ci
ci=TRUE, ci.alpha=0.95,
stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=F,
max.auc.polygon=TRUE, grid=TRUE,
print.thres=T,
print.auc=TRUE, show.thres=TRUE)
coords(pROC_obj_pred_vivo_alta_multi,x= "best",
input="threshold", ret=c("threshold", "specificity",
"sensitivity", "npv", "ppv","youden"), as.list=FALSE, drop=TRUE,
best.method=c("youden"), best.weights=c(1, 0.5), transpose =
FALSE, as.matrix=FALSE)
Sorry for some of the variables are in Spanish. Basically, these are my variables for the model: CALCULADORA_CALL_SCORE + SOPORTE_VENT_AL_INGRESO +
ETE_DURANTE_HOSP + SOBREINFECC_BACT_DURANTE_HOSP +
COP_DURANTE_HOSP + TOCILIZUMAB + CORTICOIDES_HOSP
And my predicted variable is VIVO_AL_ALTA
You can use coords(my_roc, "best").
Here's a demonstration using a built in example from pROC
library(pROC)
ROC <- roc(aSAH$outcome, aSAH$s100b, levels=c("Good", "Poor"))
#> Setting direction: controls < cases
coords(ROC, "best")
#> threshold specificity sensitivity
#> 1 0.205 0.8055556 0.6341463
Created on 2022-05-20 by the reprex package (v2.0.1)
Related
I have 2 different datasets. To each one i apply the same plm regression. I would like to know how can i visualize, in the same plot, the estimated coefficients of each model.
mainstream <- plm(log(sum_plays) ~ cancel_public_events + close_public_transport + internationaltravel + restrictions_on_gatherings + school_closing + stay_at_home_requirements + workplace_closing + new_cases_per_million + new_deaths_per_million +
data = top200, model = "within")
long_tail <- plm(log(sum_plays) ~ cancel_public_events + close_public_transport + internationaltravel + restrictions_on_gatherings + school_closing + stay_at_home_requirements + workplace_closing + new_cases_per_million + new_deaths_per_million +
data = bottom, model = "within")
I can make the plot for each individual model, however i want to have the info of this both plots in just one. Probably differentiate the coefficients by color (i.e coefficients from "mainstream" in red and the coefficients from "longtail" in blue)
a <- plot_model(long_tail, transform = NULL, show.values = TRUE, value.offset =.3, terms = c("workplace_closing" , "stay_at_home_requirements", "school_closing", "close_public_transport", "internationaltravel", "restrictions_on_gatherings", "cancel_public_events"), title = "Coefficients for Long-Tail Music Consumption")
b <- plot_model(mainstream, transform = NULL, show.values = TRUE, value.offset =.3, terms = c("workplace_closing" , "stay_at_home_requirements", "school_closing", "close_public_transport", "internationaltravel", "restrictions_on_gatherings", "cancel_public_events"), title = "Coefficients for Long-Tail Music Consumption")
I initially did a logistic model using the glm package but wanted to correct for separation so I have used the logistf function and I'm now trying to redo my plots. I'm unsure how to make a plot like the one below with a logistf object. A lot of packages don't seem to support it, I've tried using sjPlot packages' plot_model() function which plots a dot for the predicted probability but doesn't add the error bars as it does automatically with a glm object. How can I get round this? Is there perhaps another package that would make this easier or is there a way to manually add the error bars?
The code for the plot I wish to add error bars to is:
sjPlot::plot_model(lr3, type="int", mdrt.values = "meansd", show.values = TRUE, value.offset = .3)
The output of my model lr3 is:
logistf(formula = foodbank_cv ~ wave + ff_country + relevel(race_grp,
ref = "White") + sex_cv + age_r + relevel(numchildren,
ref = "None") + wave * ff_hcondhas + relevel(carer,
ref = "Not") + sempderived + wave * cd_ff_furlough +
log(ff_hours) + qual + num + relevel(keyworksector, ref = "Not keyworker") +
ca_clinvuln_dv + freemeals + ca_blbenefits1 + log(hhincome_week),
data = data, firth = TRUE, family = binomial(link = "logit"))
Model fitted by Penalized ML
Coefficients:
coef se(coef) lower 0.95 upper 0.95 Chisq p method
(Intercept) -5.237542354 0.46736532 -6.23016284 -4.30807241 Inf 0.000000e+00 2
wave5 -0.377956413 0.32598420 -1.07410577 0.28545651 1.232122e+00 2.669947e-01 2
wave7 -0.929934987 0.40813067 -1.84652632 -0.12926473 5.260388e+00 2.181615e-02 2
ff_country2 -0.118780142 0.33317501 -0.86893024 0.51197342 1.196576e-01 7.294061e-01 2
ff_country3 0.393456771 0.25097814 -0.15010616 0.88210537 2.077828e+00 1.494527e-01 2
ff_country4 -0.219066153 0.43493435 -1.23008781 0.57774984 2.481153e-01 6.184053e-01 2
relevel(race_grp, ref = "White")Asian or Asian British 0.882833792 0.22906054 0.39628625 1.33641305 1.183859e+01 5.801581e-04 2
relevel(race_grp, ref = "White")Black or Black British 1.759374627 0.27942672 1.16321835 2.29702048 2.678592e+01 2.272869e-07 2
relevel(race_grp, ref = "White")Mixed 1.786978145 0.27773294 1.19285979 2.32350705 2.763841e+01 1.462461e-07 2
relevel(race_grp, ref = "White")Other -0.345106379 1.38712570 -5.19048868 1.62733736 6.509258e-02 7.986208e-01 2
ff_hcondhas 0.691244774 0.26776923 0.14697164 1.25269746 6.228205e+00 1.257311e-02 2
Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
The code that I used to make the hunger and race plot. I did some manual editing to make it look nicer but this is what I ideally want my plot to look like:
plot_model(model12, type = "pred", terms = c("race_grp"), mdrt.values = "meansd", axis.textsize = .3, wrap.labels = 5)+ theme_sjplot2() + scale_color_sjplot("simply") + ggplot2::labs(title= "Predicted probabilities of Hunger", x= "Race", y="Percentage")
I have found a way to get around this issue, however, not with the logistf package. In case anyone in the future wants to know the answer to this question, my suggestion is that you use the brglm package. I have checked and the results from the brglm package are exactly the same as the logistf package. This is how I recreated the Hunger plot posted above:
hi2<- brglm(formula= hungry_cv~ wave + ff_country + race_grp + sex_cv + age_r + numchildren + wave*ff_hcondhas + carer + sempderived + wave*cd_ff_furlough + log(ff_hours) + qual + num + keyworksector + ca_clinvuln_dv + freemeals + ca_blbenefits1 + log(hhincome_week), data=data, family=binomial(logit), method = "brglm.fit", pl = TRUE)
racehunger<- plot_model(hi2, type = "pred", terms = c("race_grp"), mdrt.values = "meansd", axis.textsize = .3, wrap.labels = 5, show.values = TRUE)+ theme_sjplot2() + ggplot2::labs(title= "Predicted probabilities of Hunger", x= "Race", y="Percentage")
racehunger
png(file="racehunger.png", units="in", width=11, height=8.5, res=300)
print(racehunger)
dev.off()
The output of the code is:
I am personally very happy with the result.
I want reduce the expression in r code
model1 <- pglm::pglm(formula = lfp ~ lfp_1+lfp1+ kids + *kids2 + kids3 + kids4 + kids5+ lhinc + lhinc2 + lhinc3 +lhinc4 + lhinc5 +educ+ black + age + agesq + per2+ per3 + per4+ per5,
family = binomial("probit"),
data = lfp1,
model = "random")
on stata will put kids2 - kids5 and list the variables kids from 2 to 5 in the regression.
Same to lhinc2-lhinc5 and to per2 - per5
Try this one:
model1 <- pglm::pglm(formula = lfp ~.,
family = binomial("probit"),
data = lfp1,
model = "random")
I've created a logistic regression with the glm function
mynewlogit <- glm(is_bad ~ ulmp_s_ratio + plmp_mac_all_60d + plmp_est_mac_all_90d + plmp_c_mac_all_90d + lmp_s_ratio + plmp_c_mac_hrsk_60d + ulmp_c_ycount + pp_usr_lmp_count + l2pp_pp_age_min + lmp_c_ratio + lmp_age_max + lmp_age_avg
, data = rajsub, family = "binomial")
and I've got this result:
Coefficients:
(Intercept) ulmp_s_ratio plmp_mac_all_60d plmp_est_mac_all_90d
-1.6226917 1.8704011 0.1037387 0.1583566
plmp_c_mac_all_90d lmp_s_ratio plmp_c_mac_hrsk_60d ulmp_c_ycount
-0.1333490 1.0456631 1.1447296 1.6073142
pp_usr_lmp_count l2pp_pp_age_min lmp_c_ratio lmp_age_max
0.0404034 0.0000457 -0.1052236 0.0002902
lmp_age_avg
-0.0010493
How to present the outcome as an equation format, as:
x = -1.6226917 + 1.8704011*ulmp_s_ratio ...
I use the example from the sampleSelection package
## Greene( 2003 ): example 22.8, page 786
data( Mroz87 )
Mroz87$kids <- ( Mroz87$kids5 + Mroz87$kids618 > 0 )
# Two-step estimation
test1 = heckit( lfp ~ age + I( age^2 ) + faminc + kids + educ,
wage ~ exper + I( exper^2 ) + educ + city, Mroz87 )
# ML estimation
test2 = selection( lfp ~ age + I( age^2 ) + faminc + kids + educ,
wage ~ exper + I( exper^2 ) + educ + city, Mroz87 )
pr2 <- predict(test2,Mroz87)
pr1 <- predict(test1,Mroz87)
My problem is that the predict function does not work. I get this error:
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('selection', 'maxLik', 'maxim', 'list')"
The predict function works for many models so I wonder why I get an error for heckman regression models.
-----------UPDATE-----------
I made some progress but I still need your help. I build an original heckman model for comparsion:
data( Mroz87 )
Mroz87$kids <- ( Mroz87$kids5 + Mroz87$kids618 > 0 )
test1 = heckit( lfp ~ age + I( age^2 ) + faminc + kids + educ,
wage ~ exper + I( exper^2 ) + educ + city, Mroz87[1:600,] )
After that I start building it on my own. Heckman model requires a selection equation:
zi* = wi γ + ui
where zi =1 if zi* >0 and zi = 0 if zi* <=0
after you calculate yi = xi*beta +ei ONLY for the cases where zi*>0
I build the probit model first:
library(MASS)
#probit1 = probit(lfp ~ age + I( age^2 ) + faminc + kids + educ, Mroz87, x = TRUE, print.level = print.level - 1, iterlim = 30)
myprobit <- glm(lfp ~ age + I( age^2 ) + faminc + kids + educ, family = binomial(link = "probit"),
data = Mroz87[1:600,])
summary(myprobit)
The model is exactly the same just as with the heckit command.
Then I build a lm model:
#get predictions for the variables (the data is not needed but I specify it anyway)
selectvar <- predict(myprobit,data = Mroz87[1:600,])
#bind the prediction to the table (I build a new one in my case)
newdata = cbind(Mroz87[1:600,],selectvar)
#Build an lm model for the subset where zi>0
lm1 = lm(wage ~ exper + I( exper^2 ) + educ + city , newdata, subset = selectvar > 0)
summary(lm1)
My issue now is that the lm model does not much the one created by heckit. I have no idea why. Any ideas?
Implementation
Here is an implementation of the predict.selection function -- it produces 4 different types of predictions (which are explained here):
library(Formula)
library(sampleSelection)
predict.selection = function(objSelection, dfPred,
type = c('link', 'prob', 'cond', 'uncond')) {
# construct the Formula object
tempS = evalq(objSelection$call$selection)
tempO = evalq(objSelection$call$outcome)
FormHeck = as.Formula(paste0(tempO[2], '|', tempS[2], '~', tempO[3], '|', tempS[3]))
# regressor matrix for the selection equation
mXSelection = model.matrix(FormHeck, data = dfPred, rhs = 2)
# regressor matrix for the outcome equation
mXOutcome = model.matrix(FormHeck, data = dfPred, rhs = 1)
# indices of the various parameters in selectionObject$estimate
vIndexBetaS = objSelection$param$index$betaS
vIndexBetaO = objSelection$param$index$betaO
vIndexErr = objSelection$param$index$errTerms
# get the estimates
vBetaS = objSelection$estimate[vIndexBetaS]
vBetaO = objSelection$estimate[vIndexBetaO]
dLambda = objSelection$estimate[vIndexErr['rho']]*
objSelection$estimate[vIndexErr['sigma']]
# depending on the type of prediction requested, return
# TODO allow the return of multiple prediction types
pred = switch(type,
link = mXSelection %*% vBetaS,
prob = pnorm(mXSelection %*% vBetaS),
uncond = mXOutcome %*% vBetaO,
cond = mXOutcome %*% vBetaO +
dnorm(temp <- mXSelection %*% vBetaS)/pnorm(temp) * dLambda)
return(pred)
}
Test
Suppose you estimate the following Heckman sample selection model using MLE:
data(Mroz87)
# define a new variable
Mroz87$kids = (Mroz87$kids5 + Mroz87$kids618 > 0)
# create the estimation sample
Mroz87Est = Mroz87[1:600, ]
# create the hold out sample
Mroz87Holdout = Mroz87[601:nrow(Mroz87), ]
# estimate the model using MLE
heckML = selection(selection = lfp ~ age + I(age^2) + faminc + kids + educ,
outcome = wage ~ exper + I(exper^2) + educ + city, data = Mroz87Est)
summary(heckML)
The different types of predictions are computed as below:
vProb = predict(objSelection = heckML, dfPred = Mroz87Holdout, type = 'prob')
vLink = predict(objSelection = heckML, dfPred = Mroz87Holdout, type = 'link')
vCond = predict(objSelection = heckML, dfPred = Mroz87Holdout, type = 'cond')
vUncond = predict(objSelection = heckML, dfPred = Mroz87Holdout, type = 'uncond')
You can verify these computation on a platform that produces these outputs, such as Stata.