Effect plot prediction in R

So far, I have managed to fit the model with the survreg function, as below:
model <- survreg(formula = Surv(TimeDeath, event) ~ age + BM1 + BM2 +
mutation + sex + BM1:BM2 + BM1:mutation,
data = DF, dist = "lognormal")
Now, I need to predict the failure time of a male patient who is 51 years old, does not have the gene mutation, and has a BM1 value of 3.7 mg/dL and a BM2 value of 251 mg/dL.
I continued as below:
ND <- with(DF, data.frame(
age = rep(seq(min(age), max(age), length.out = 20), 2),
BM1 = rep(seq(min(BM1), max(BM1), length.out = 20), 2),
BM2 = rep(seq(min(BM2), max(BM2), length.out = 20), 2),
mutation = c("No", "Yes"),
sex = c("male", "40")
))
prs <- predict(model, ND, se.fit = TRUE, type = "lp")
ND$pred <- prs[[1]]
ND$se <- prs[[2]]
ND$lo <- exp(ND$pred - 1.96 * ND$se)
ND$up <- exp(ND$pred + 1.96 * ND$se)
ND$pred <- exp(ND$pred)
library(lattice)
xyplot(pred + lo + up ~ age + BM1, data = ND, type = "l",
lty = c(1,2,2), col = "black", lwd = 4, xlab = "Age",
ylab = "Survival Time")
I know I have not defined the ND object correctly, but I don't know how to do it, nor how to set up the plot call.
Some help, please?

Look at ?predict.survreg. The construction of the CIs does look suspicious; I would have thought you would instead have set se.fit = TRUE. The covariate values needed for prediction are supplied via the newdata argument:
all.combos <- expand.grid(mutation = c("No", "Yes"), BM1 = 3.7, BM2 = 251,
                          sex = "male",
                          age = seq(min(DF$age), max(DF$age), length.out = 20))
preds.combos <- predict(model, all.combos, se.fit = TRUE)
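To complete that idea and answer the plotting part of the question, here is a minimal sketch of a corrected ND and plot call, assuming mutation has levels "No"/"Yes" and sex has a "male" level in DF: hold BM1, BM2, mutation, and sex at the patient's values, vary only age, and back-transform the linear predictor from the log scale.
library(survival)
library(lattice)
# Vary age only; hold the other covariates at the patient's values
ND <- data.frame(
  age      = seq(min(DF$age), max(DF$age), length.out = 20),
  BM1      = 3.7,
  BM2      = 251,
  mutation = factor("No", levels = levels(DF$mutation)),
  sex      = factor("male", levels = levels(DF$sex))
)
prs <- predict(model, newdata = ND, se.fit = TRUE, type = "lp")
ND$pred <- exp(prs$fit)                      # back-transform from the log scale
ND$lo   <- exp(prs$fit - 1.96 * prs$se.fit)
ND$up   <- exp(prs$fit + 1.96 * prs$se.fit)
xyplot(pred + lo + up ~ age, data = ND, type = "l",
       lty = c(1, 2, 2), col = "black", lwd = 2,
       xlab = "Age", ylab = "Predicted survival time")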

Related

R GLM: Modify coefficients of an existing glm model

I have been trying to adjust the coefficients of an existing glm model, but the predictions don't seem to change. The idea is to enhance an existing logistic model by incorporating 'qualitative' parameters into the quantitative coefficients (see the 'adjust log-reg model' block). I replicated the problem below.
I really appreciate any help. Thank you!
set.seed(100)
#create sim data (correlated)
input_size <- 200
scale <- 10000
y_var = sample(0:1, input_size, replace = TRUE)
input_data <- cbind.data.frame(y_var, x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*200), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*30))
cor(input_data)
#build log-reg model
reg1 <- glm(input_data$y ~ input_data$x1 + input_data$x2, data = input_data, family = "binomial")
reg1$coefficients
#test log-reg model
input_test <- cbind.data.frame(x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*400), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*10))
y_predict <- predict(reg1, input_test, type="response")
#adjust log-reg model
adj_coeff <- round(c(intercept = reg1$coefficients[1], x1 = reg1$coefficients[2] * 3, x2 = -reg1$coefficients[3] * 0.5), 4)
reg2 <- reg1
reg2$coefficients <- as.numeric(adj_coeff)
reg2$coefficients
#visualize prediction of the log-reg models
y2_predict <- predict(reg1, input_test, type="response")
plot(y_predict, type = "p", lwd = 2)
lines(y2_predict, type = "p", pch = 3, col = "orange")
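A sketch of why the predictions never change, with a corrected version (assumption: y2_predict was meant to come from reg2): above, y2_predict is computed from reg1, and the formula references input_data$x1 directly, so predict() evaluates those columns from the training data rather than from newdata. Fitting with bare column names and predicting from reg2 shows the adjusted coefficients taking effect.
# Fit with bare column names so predict() honors newdata
reg1 <- glm(y_var ~ x1 + x2, data = input_data, family = "binomial")
# Copy the model and overwrite its coefficients
reg2 <- reg1
reg2$coefficients <- as.numeric(round(c(coef(reg1)[1], coef(reg1)[2] * 3, -coef(reg1)[3] * 0.5), 4))
# Predict from the adjusted model, not the original one
y_predict  <- predict(reg1, input_test, type = "response")
y2_predict <- predict(reg2, input_test, type = "response")
all.equal(y_predict, y2_predict)  # now reports differences
plot(y_predict, lwd = 2)
points(y2_predict, pch = 3, col = "orange")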

Using `ordinal::clmm` model to make predictions on new data

I have some repeated measures, ordinal response data:
dat <- data.frame(
id = factor(sample(letters[1:5], 50, replace = T)),
response = factor(sample(1:7, 50, replace = T), ordered = T),
x1 = runif(n = 50, min = 1, max = 10),
x2 = runif(n = 50, min = 100, max = 1000)
)
I have built the following model:
library(ordinal)
model <- clmm(response ~ x1 + x2 + (1|id), data = dat)
I have some new data:
new_dat <- data.frame(
id = factor(sample(letters[1:5], 5, replace = T)),
x1 = runif(n = 5, min = 1, max = 10),
x2 = runif(n = 5, min = 100, max = 1000)
)
I want to be able to use the model to predict the probability of each level of dat$response occurring for new_dat, whilst still also accounting for id.
Unfortunately predict() does not work for clmm objects. predict() does work for clmm2 objects but it ignores any random effects included.
What I want to achieve is something similar to what has been done in Figure 3 of the ordinal package's clmm2 tutorial, using this code:
library(ordinal)
fm2 <- clmm2(rating ~ temp + contact, random=judge, data=wine, Hess=TRUE, nAGQ=10)
pred <- function(eta, theta, cat = 1:(length(theta)+1), inv.link = plogis){
Theta <- c(-1e3, theta, 1e3)
sapply(cat, function(j)
inv.link(Theta[j+1] - eta) - inv.link(Theta[j] - eta))
}
mat <- expand.grid(judge = qnorm(0.95) * c(-1, 0, 1) * fm2$stDev,
contact = c(0, fm2$beta[2]),
temp = c(0, fm2$beta[1]))
pred.mat <- pred(eta=rowSums(mat), theta=fm2$Theta)
lab <- paste("contact=", rep(levels(wine$contact), 2), ", ", "temp=", rep(levels(wine$temp), each=2), sep="")
par(mfrow=c(2, 2))
for(k in c(1, 4, 7, 10)) {
plot(1:5, pred.mat[k,], lty=2, type = "l", ylim=c(0,1),
xlab="Bitterness rating scale", axes=FALSE,
ylab="Probability", main=lab[ceiling(k/3)], las=1)
axis(1); axis(2)
lines(1:5, pred.mat[k+1, ], lty=1)
lines(1:5, pred.mat[k+2, ], lty=3)
legend("topright",
c("avg. judge", "5th %-tile judge", "95th %-tile judge"),
lty=1:3, bty="n")
}
Except, my model contains multiple continuous covariates (as opposed to binary factors).
How can I use the model data to predict the probability of each level of dat$response occurring for new_dat, whilst still also accounting for id?
Many thanks.
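One common workaround is to compute the probabilities by hand, sketched below under the assumption that the clmm fit stores its fixed-effect coefficients in model$beta and its thresholds in model$Theta (as the clmm2 code above does; adjust the component names if your version stores them differently): build the linear predictor for new_dat, add each id's conditional mode of the random intercept from ranef(), and reuse the pred() helper to turn it into per-category probabilities.
# Fixed-effect part of the linear predictor for new_dat
eta_fixed <- as.matrix(new_dat[, c("x1", "x2")]) %*% model$beta
# Add each id's conditional mode (BLUP) of the random intercept
re  <- ranef(model)$id[as.character(new_dat$id), 1]
eta <- as.vector(eta_fixed) + re
# Probability of each response level, reusing pred() from the tutorial code
probs <- pred(eta = eta, theta = model$Theta)
colnames(probs) <- levels(dat$response)
probs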

How to calculate mean sojourn time in each non-absorbing state using the R package mstate

I am working on a survival analysis and cannot seem to figure out how to do this.
From the mstate tutorial, the following is a block of code for a simple Cox regression. How does one calculate the mean sojourn time in each non-absorbing state?
Code:
library(mstate)
data(ebmt3)
tmat <- trans.illdeath(names=c("Tx","PR","RelDeath"))
ebmt3$prtime <- ebmt3$prtime/365.25
ebmt3$rfstime <- ebmt3$rfstime/365.25
covs <- c("dissub", "age", "drmatch", "tcd", "prtime")
msbmt <- msprep(time = c(NA, "prtime", "rfstime"), status = c(NA, "prstat", "rfsstat"), data = ebmt3, trans = tmat, keep = covs)
expcovs <- expand.covs(msbmt, covs[2:3], append = FALSE)
msbmt <- expand.covs(msbmt, covs, append = TRUE, longnames = FALSE)
c1 <- coxph(Surv(Tstart, Tstop, status) ~ dissub1.1 + dissub2.1 +
age1.1 + age2.1 + drmatch.1 + tcd.1 + dissub1.2 + dissub2.2 +
age1.2 + age2.2 + drmatch.2 + tcd.2 + dissub1.3 + dissub2.3 +
age1.3 + age2.3 + drmatch.3 + tcd.3 + strata(trans), data = msbmt,
method = "breslow")
newd <- data.frame(dissub = rep(0, 3), age = rep(0, 3), drmatch = rep(0,
3), tcd = rep(0, 3), trans = 1:3)
newd$dissub <- factor(newd$dissub, levels = 0:2, labels = levels(ebmt3$dissub))
newd$age <- factor(newd$age, levels = 0:2, labels = levels(ebmt3$age))
newd$drmatch <- factor(newd$drmatch, levels = 0:1, labels = levels(ebmt3$drmatch))
newd$tcd <- factor(newd$tcd, levels = 0:1, labels = levels(ebmt3$tcd))
attr(newd, "trans") <- tmat
class(newd) <- c("msdata", "data.frame")
newd <- expand.covs(newd, covs[1:4], longnames = FALSE)
newd$strata = 1:3
newd
msf1 <- msfit(c1, newdata = newd, trans = tmat)
Thanks!
I think you are looking for the ELOS function in mstate; it stands for the Expected Length Of Stay in a state. To complete your example, you would need to calculate the transition probabilities using probtrans, and then you can calculate ELOS for every state.
pt <- probtrans(msf1,predt=0)
# ELOS until last observed time point
ELOS(pt)
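If you want the expected length of stay up to a fixed horizon rather than up to the last observed time point, ELOS also takes a tau argument; for example, on the rescaled (years) time axis used above:
# Expected length of stay in each state up to 5 years
ELOS(pt, tau = 5)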

lme4::glmer.nb function produces "Error in family$family : $ operator not defined for this S4 class" depending on the order I run models

library(lme4)
dummy <- as.data.frame(cbind(speed = rpois(100, 10), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
dummy2 <- as.data.frame(cbind(speed = c(rnbinom(50, 10, 0.6), rnbinom(50, 10, 0.1)), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
poisson <- glmer(speed~pop*season + (1|id),
data=dummy, family="poisson")
neg.bin <- glmer.nb(speed ~ pop*season + (1|id),
data=dummy2, control=glmerControl(optimizer="bobyqa"))
When I run a script creating a Poisson model before a negative binomial model using the lme4 package, I get the following error when running the neg.bin model:
Error in family$family : $ operator not defined for this S4 class
However, if I run the models in the opposite order, I don't get the error message.
library(lme4)
dummy <- as.data.frame(cbind(speed = rpois(100, 10), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
dummy2 <- as.data.frame(cbind(speed = c(rnbinom(50, 10, 0.6), rnbinom(50, 10, 0.1)), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
neg.bin <- glmer.nb(speed ~ pop*season + (1|id),
data=dummy2, control=glmerControl(optimizer="bobyqa"))
poisson <- glmer(speed~pop*season + (1|id),
data=dummy, family="poisson")
The neg.bin model example does have convergence warnings, but the same pattern is happening with my actual models, which converge fine. How is running the Poisson model first affecting the neg.bin model?
Because you have masked the R function poisson. The following works fine (except that there is a convergence warning for neg.bin):
library(lme4)
set.seed(0)
dummy <- as.data.frame(cbind(speed = rpois(100, 10), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
dummy2 <- as.data.frame(cbind(speed = c(rnbinom(50, 10, 0.6), rnbinom(50, 10, 0.1)), pop = rep(1:4, each = 25), season = rep(1:2, each = 50), id = seq(1, 100, by = 1)))
## use a different name for your model, say `poisson_fit`
poisson_fit <- glmer(speed~pop*season + (1|id),
data=dummy, family="poisson")
negbin_fit <- glmer.nb(speed ~ pop*season + (1|id),
data=dummy2, control=glmerControl(optimizer="bobyqa"))
Here is the issue. Among the very first few lines of glmer.nb there is one line:
mc$family <- quote(poisson)
So, if you mask poisson, the correct poisson function from the stats package cannot be found.
Ben has just fixed this issue by replacing that line with:
mc$family <- quote(stats::poisson)
My original observation about family = "poisson" and the match.fun machinery is not the real issue here; it only explains why, in routines like glm and mgcv::gam, it is legitimate to pass the family as a string.
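A minimal illustration of the masking mechanism, using a throwaway S4 class (hypothetical, purely for demonstration) to stand in for the fitted merMod:
library(methods)
# stats::poisson is a function that returns a family object
eval(quote(poisson))()$family  # "poisson"
# Mask the name with an S4 object, as assigning a glmer fit to `poisson` does
setClass("Dummy", slots = c(x = "numeric"))
poisson <- new("Dummy", x = 1)
f <- eval(quote(poisson))      # now resolves to the S4 object, not stats::poisson
f$family                       # Error in f$family : $ operator not defined for this S4 class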

predicting from flexmix object (R)

I fit some data to a mixture of two Gaussians in flexmix:
data("NPreg", package = "flexmix")
mod <- flexmix(yn ~ x, data = NPreg, k = 2,
model = list(FLXMRglm(yn ~ x, family= "gaussian"),
FLXMRglm(yn ~ x, family = "gaussian")))
the model fit is as follows:
> mod
Call:
flexmix(formula = yn ~ x, data = NPreg, k = 2, model = list(FLXMRglm(yn ~ x, family = "gaussian"),
FLXMRglm(yn ~ x, family = "gaussian")))
Cluster sizes:
1 2
74 126
convergence after 31 iterations
But how do I actually predict from this model?
when I do
pred <- predict(mod, NPreg)
I get a list with the predictions from each of the two components.
To get a single prediction, do I have to weight them by the cluster sizes like this?
single <- (74/200)* pred$Comp.1[,1] + (126/200)*pred$Comp.2[,2]
I use flexmix for prediction in the following way:
pred = predict(mod, NPreg)
clust = clusters(mod,NPreg)
result = cbind(NPreg,data.frame(pred),data.frame(clust))
plot(result$yn,col = c("red","blue")[result$clust],pch = 16,ylab = "yn")
And the confusion matrix:
table(result$class,result$clust)
To get the predicted value of yn, I select the prediction from the component corresponding to the cluster to which each data point belongs.
for(i in 1:nrow(result)){
result$pred_model1[i] = result[,paste0("Comp.",result$clust[i],".1")][i]
result$pred_model2[i] = result[,paste0("Comp.",result$clust[i],".2")][i]
}
The actual vs. predicted results show the fit (I am adding only one of them here, as both of your models are the same; you would use pred_model2 for the second model).
library(ggplot2)
qplot(result$yn, result$pred_model1, xlab = "Actual", ylab = "Predicted") + geom_abline()
RMSE = sqrt(mean((result$yn-result$pred_model1)^2))
gives a root mean square error of 5.54.
This answer is based on many SO answers I read through while working with flexmix. It worked well for my problem.
You may also be interested in visualizing the two distributions. My model was the following, which shows some overlap, as the ratios of the components are not close to 1.
Call:
flexmix(formula = yn ~ x, data = NPreg, k = 2,
model = list(FLXMRglm(yn ~ x, family = "gaussian"),
FLXMRglm(yn ~ x, family = "gaussian")))
prior size post>0 ratio
Comp.1 0.481 102 129 0.791
Comp.2 0.519 98 171 0.573
'log Lik.' -1312.127 (df=13)
AIC: 2650.255 BIC: 2693.133
I also generated a density distribution with histograms to visualize both components. This was inspired by an SO answer from the maintainer of betareg.
a = subset(result, clust == 1)
b = subset(result, clust == 2)
hist(a$yn, col = hcl(0, 50, 80), main = "",xlab = "", freq = FALSE, ylim = c(0,0.06))
hist(b$yn, col = hcl(240, 50, 80), add = TRUE,main = "", xlab = "", freq = FALSE, ylim = c(0,0.06))
ys = seq(0, 50, by = 0.1)
lines(ys, dnorm(ys, mean = mean(a$yn), sd = sd(a$yn)), col = hcl(0, 80, 50), lwd = 2)
lines(ys, dnorm(ys, mean = mean(b$yn), sd = sd(b$yn)), col = hcl(240, 80, 50), lwd = 2)
# Joint Histogram
p <- prior(mod)
hist(result$yn, freq = FALSE,main = "", xlab = "",ylim = c(0,0.06))
lines(ys, p[1] * dnorm(ys, mean = mean(a$yn), sd = sd(a$yn)) +
p[2] * dnorm(ys, mean = mean(b$yn), sd = sd(b$yn)))
You can pass an additional argument, aggregate = TRUE, to your prediction call; this returns a single prediction per model, averaging over the components:
pred <- predict(mod, NPreg, aggregate = TRUE)[[1]][,1]
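A quick check of what the aggregation does, assuming the mod fit above: the aggregated prediction should equal the component predictions weighted by the estimated mixing proportions from prior(mod), not by the observed cluster sizes used in the question.
p    <- prior(mod)            # estimated mixing proportions
pred <- predict(mod, NPreg)   # per-component predictions
manual <- p[1] * pred$Comp.1[, 1] + p[2] * pred$Comp.2[, 1]
agg    <- predict(mod, NPreg, aggregate = TRUE)[[1]][, 1]
all.equal(as.vector(manual), as.vector(agg))  # TRUE if the weights are the priors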
