For my homework, I am working with a dataset titled Default. I split my data into training and test sets, and ran a logistic regression for the relationship of default1 and the other 3 predictors(income (continuous), balance(continuous), student(0/1)).
I am supposed to plot the regression model, but it keeps showing a straight horizontal line on the graph and I don't think that's correct.
How can I graph multiple predictors with a singular binary outcome using my Default_train_logistic glm?
Also, how can I obtain those coefficients and error rates of the model?
TIA!
set.seed(1234)
Default$subsample <- runif(nrow(Default))
Default$test <- ifelse(Default$subsample < 0.80, "train", "test")
Default_train <- filter(Default, test == "train")
Default_test <- filter(Default, test == "test")
###Q1 Part B: b. Construct a logistic regression to predict if an individual will default based on all of the provided predictors, and visualize your final predicted model.
#Immediately after loading data, I created default1 to use default as a numerical binary variable for logistic regression.
Default_train_logistic <- glm(default1 ~ ., data = Default_train %>% select(-test), family = "binomial")
summary(Default_train_logistic)
plot(Default_train_logistic)
G1 <- ggplot(Default_train_logistic, aes(balance + income + student1, default1)) +
geom_point() +
geom_smooth(method = "glm",
method.args = list(family = "binomial"),
se = FALSE)
print(G1)
I want to know if it's possible to plot quadratic curves with Poisson glm with interactions in categorical/numeric variables. In my case:
##Data set artificial
set.seed(20)
d <- data.frame(
behv = c(rpois(100,10),rpois(100,100)),
mating=sort(rep(c("T1","T2"), 200)),
condition = scale(rnorm(200,5))
)
#Condition quadratic
d$condition2<-(d$condition)^2
#Binomial GLM ajusted
md<-glm(behv ~ mating + condition + condition2, data=d, family=poisson)
summary(md)
In a situation where mating, condition and condition2 are significant in the model, I make:
#Create x's vaiues
x<-d$condition##
x2<-(d$condition)^2
# T1 estimation
y1<-exp(md$coefficients[1]+md$coefficients[3]*x+md$coefficients[4]*x2)
#
# T2 estimation
y2<-exp(md$coefficients[1]+md$coefficients[2]+md$coefficients[3]*x+md$coefficients[4]*x2)
#
#
#Separete data set
d_T1<-d[d[,2]!="T2",]
d_T2<-d[d[,2]!="T1",]
#Plot
plot(d_T1$condition,d_T1$behv,main="", xlab="condition", ylab="behv",
xlim=c(-4,3), ylim=c(0,200), col= "black")
points(d_T2$condition,d_T2$behv, col="gray")
lines(x,y1,col="black")
lines(x,y2,col="grey")
#
Doesn't work and I don't have my desirable curves. I'd like a curve for T1 and other for T2 in mating variable. There are any solution for this?
In the code below, we use the poly function to generate a quadratic model without needing to create an extra column in the data frame. In addition, we create a prediction data frame to generate model predictions across the range of condition values and for each level of mating. The predict function with type="response" generates predictions on the scale of the outcome, rather than on the linear predictor scale, which is the default. Also, we change 200 to 100 in creating the data for mating in order to avoid having the exact same outcome data for each level of mating.
library(ggplot2)
# Fake data
set.seed(20)
d <- data.frame(
behv = c(rpois(100,10),rpois(100,100)),
mating=sort(rep(c("T1","T2"), 100)), # Changed from 200 to 100
condition = scale(rnorm(200,5))
)
# Model with quadratic condition
md <- glm(behv ~ mating + poly(condition, 2, raw=TRUE), data=d, family=poisson)
#summary(md)
# Get predictions at range of condition values
pred.data = data.frame(condition = rep(seq(min(d$condition), max(d$condition), length=50), 2),
mating = rep(c("T1","T2"), each=50))
pred.data$behv = predict(md, newdata=pred.data, type="response")
Now plot with ggplot2 and with base R:
ggplot(d, aes(condition, behv, colour=mating)) +
geom_point() +
geom_line(data=pred.data)
plot(NULL, xlim=range(d$condition), ylim=range(d$behv),
xlab="Condition", ylab="behv")
with(subset(d, mating=="T1"), points(condition, behv, col="red"))
with(subset(d, mating=="T2"), points(condition, behv, col="blue"))
with(subset(pred.data, mating=="T1"), lines(condition, behv, col="red"))
with(subset(pred.data, mating=="T2"), lines(condition, behv, col="blue"))
legend(-3, 70, title="Mating", legend=c("T1","T2"), pch=1, col=c("blue", "red"))
I want to know if is possible to plotting binomial glm with interactions in numeric variables. In my case:
##Data set artificial
set.seed(20)
d <- data.frame(
mating=sample(0:1, 200, replace=T),
behv = scale(rpois(200,10)),
condition = scale(rnorm(200,5))
)
#Binomial GLM ajusted
model<-glm(mating ~ behv + condition, data=d, family=binomial)
summary(model)
In a situation where behv and condition are significant in the model
#Plotting first for behv
x<-d$behv ###Take behv values
x2<-rep(mean(d$condition),length(d_p[,1])) ##Fixed mean condition
# Points
plot(d$mating~d$behv)
#Curve
curve(exp(model$coefficients[1]+model$coefficients[2]*x+model$coefficients[3]*x2)
/(1+exp(model$coefficients[1]+model$coefficients[2]*x+model$coefficients[3]*x2)))
But doesn't work!! There is another correct approach?
Thanks
It seems like your desired output is a plot of the conditional means (or best-fit line). You can do this by computing predicted values with the predict function.
I'm going to change your example a bit, to get a nicer looking result.
d$mating <- ifelse(d$behv > 0, rbinom(200, 1, .8), rbinom(200, 1, .2))
model <- glm(mating ~ behv + condition, data = d, family = binomial)
summary(model)
Now, we make a newdata dataframe with your desired values:
newdata <- d
newdata$condition <- mean(newdata$condition)
newdata$yhat <- predict(model, newdata, type = "response")
Finally, we sort newdata by the x-axis variable (if not, we'll get lines that zig-zag all over the plot), and then plot:
newdata <- newdata[order(newdata$behv), ]
plot(newdata$mating ~ newdata$behv)
lines(x = newdata$behv, y = newdata$yhat)
Output:
I would get a coefplot only with part of independent variables. My regression equation is a fixed effects regression as follows:
aa1 <-glm(Eighty_Twenty ~ Market_Share_H+Market_Share_L+Purchase_Frequency_H+Purchase_Frequency_L+factor(product_group))
coefplot(aa1)
However, I do NOT want to plot coefficients of factor(product_group) variables since there are product groups. Instead, I would get a coefplot with only the coefficients of other variables. How can I do this?
From the help pages (see ?coefplot.default) you can select what predictors or coefficients that you want in your plot.
# some example data
df <- data.frame(Eighty_Twenty = rbinom(100,1,0.5),
Market_Share_H = runif(100),
Market_Share_L = runif(100),
Purchase_Frequency_H = rpois(100, 40),
Purchase_Frequency_L = rpois(100, 40),
product_group = sample(letters[1:3], 100, TRUE))
# model
aa1 <- glm(Eighty_Twenty ~ Market_Share_H+Market_Share_L +
Purchase_Frequency_H + Purchase_Frequency_L +
factor(product_group), df, family="binomial")
library(coefplot)
# coefficient plot with the intercept
coefplot(aa1, coefficients=c("(Intercept)","Market_Share_H","Market_Share_L",
"Purchase_Frequency_H","Purchase_Frequency_L"))
# coefficient plot specifying predictors (no intercept)
coefplot(aa1, predictors=c("Market_Share_H","Market_Share_L" ,
"Purchase_Frequency_H","Purchase_Frequency_L"))
I’m trying to fit and plot a Weibull model to a survival data. The data has just one covariate, cohort, which runs from 2006 to 2010. So, any ideas on what to add to the two lines of code that follows to plot the survival curve of the cohort of 2010?
library(survival)
s <- Surv(subSetCdm$dur,subSetCdm$event)
sWei <- survreg(s ~ cohort,dist='weibull',data=subSetCdm)
Accomplishing the same with the Cox PH model is rather straightforward, with the following lines. The problem is that survfit() doesn’t accept objects of type survreg.
sCox <- coxph(s ~ cohort,data=subSetCdm)
cohort <- factor(c(2010),levels=2006:2010)
sfCox <- survfit(sCox,newdata=data.frame(cohort))
plot(sfCox,col='green')
Using the data lung (from the survival package), here is what I'm trying to accomplish.
#create a Surv object
s <- with(lung,Surv(time,status))
#plot kaplan-meier estimate, per sex
fKM <- survfit(s ~ sex,data=lung)
plot(fKM)
#plot Cox PH survival curves, per sex
sCox <- coxph(s ~ as.factor(sex),data=lung)
lines(survfit(sCox,newdata=data.frame(sex=1)),col='green')
lines(survfit(sCox,newdata=data.frame(sex=2)),col='green')
#plot weibull survival curves, per sex, DOES NOT RUN
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
lines(survfit(sWei,newdata=data.frame(sex=1)),col='red')
lines(survfit(sWei,newdata=data.frame(sex=2)),col='red')
Hope this helps and I haven't made some misleading mistake:
copied from above:
#create a Surv object
s <- with(lung,Surv(time,status))
#plot kaplan-meier estimate, per sex
fKM <- survfit(s ~ sex,data=lung)
plot(fKM)
#plot Cox PH survival curves, per sex
sCox <- coxph(s ~ as.factor(sex),data=lung)
lines(survfit(sCox,newdata=data.frame(sex=1)),col='green')
lines(survfit(sCox,newdata=data.frame(sex=2)),col='green')
for Weibull, use predict, re the comment from Vincent:
#plot weibull survival curves, per sex,
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
lines(predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01)),seq(.99,.01,by=-.01),col="red")
lines(predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01)),seq(.99,.01,by=-.01),col="red")
The trick here was reversing the quantile orders for plotting vs predicting. There is likely a better way to do this, but it works here. Good luck!
An alternative option is to make use of the package flexsurv. This offers some additional functionality over the survival package - including that the parametric regression function flexsurvreg() has a nice plot method which does what you ask.
Using lung as above;
#create a Surv object
s <- with(lung,Surv(time,status))
require(flexsurv)
sWei <- flexsurvreg(s ~ as.factor(sex),dist='weibull',data=lung)
sLno <- flexsurvreg(s ~ as.factor(sex),dist='lnorm',data=lung)
plot(sWei)
lines(sLno, col="blue")
You can plot on the cumulative hazard or hazard scale using the type argument, and add confidence intervals with the ci argument.
This is just a note clarifying Tim Riffe's answer, which uses the following code:
lines(predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01)),seq(.99,.01,by=-.01),col="red")
lines(predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01)),seq(.99,.01,by=-.01),col="red")
The reason for the two mirror-image sequences, seq(.01,.99,by=.01) and seq(.99,.01,by=-.01), is because the predict() method is giving quantiles for the event distribution f(t) - that is, values of the inverse CDF of f(t) - while a survival curve is plotting 1-(CDF of f) versus t. In other words, if you plot p versus predict(p), you'll get the CDF, and if you plot 1-p versus predict(p) you'll get the survival curve, which is 1-CDF. The following code is more transparent and generalizes to arbitrary vectors of p values:
pct <- seq(.01,.99,by=.01)
lines(predict(sWei, newdata=list(sex=1),type="quantile",p=pct),1-pct,col="red")
lines(predict(sWei, newdata=list(sex=2),type="quantile",p=pct),1-pct,col="red")
In case someone wants to add a Weibull distribution to the Kaplan-Meyer curve in the ggplot2 ecosystem, we can do the following:
library(survminer)
library(tidyr)
s <- with(lung,Surv(time,status))
fKM <- survfit(s ~ sex,data=lung)
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
pred.sex1 = predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01))
pred.sex2 = predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01))
df = data.frame(y=seq(.99,.01,by=-.01), sex1=pred.sex1, sex2=pred.sex2)
df_long = gather(df, key= "sex", value="time", -y)
p = ggsurvplot(fKM, data = lung, risk.table = T)
p$plot = p$plot + geom_line(data=df_long, aes(x=time, y=y, group=sex))
In case you'd like to use the survival function itself S(t) (instead of the inverse survival function S^{-1}(p) used in other answers here) I've written a function to implement that for the case of the Weibull distribution (following the same inputs as the pec::predictSurvProb family of functions:
survreg.predictSurvProb <- function(object, newdata, times){
shape <- 1/object$scale # also equals 1/exp(fit$icoef[2])
lps <- predict(object, newdata = newdata, type = "lp")
surv <- t(sapply(lps, function(lp){
sapply(times, function(t) 1 - pweibull(t, shape = shape, scale = exp(lp)))
}))
return(surv)
}
You can then do:
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
times <- seq(min(lung$time), max(lung$time), length.out = 1000)
new_dat <- data.frame(sex = c(1,2))
surv <- survreg.predictSurvProb(sWei, newdata = new_dat, times = times)
lines(times, surv[1, ],col='red')
lines(times, surv[2, ],col='red')