Adapting the meansd moderator option in sjPlot interaction - r

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)

With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots

Related

Predicting from glmmTMB with truncated counts

I'm running a glmmTMB model with various truncated count distributions (truncated_poisson, truncated_compois, truncated_nbinom1, truncated_nbinom2). When I predict from the model, the values seem to be lower than expected, as if the prediction is not accounting for the truncation. Where am I going wrong? A toy example is provided, showing that predicted values are lower than observed means.
Any advice would be appreciated. Extra points if the advice can extend to the other truncated count distributions (see above) and if it shows how to correctly get the 95% confidence band around the estimated values in these cases.
library(dplyr)
library( extraDistr)
library(glmmTMB)
set.seed(1)
df <- data.frame(Group = rep(c("a", "b"), each = 20), N = rtpois(40, 1, a = 0), ran = "a") %>%
mutate(N = ifelse(N == 0, 1, N))
m <- glmmTMB(N ~ Group + (1|ran), data = df, family = "truncated_poisson")
df %>% group_by(Group) %>% summarize(mean(N))
predict(m, newdata = data.frame(Group = c("a", "b"), ran = NA), type = "response")
I think the main issue is probably that you're using a slightly older version of glmmTMB (< 1.1.5, where a bug was fixed, see e.g. e.g. https://github.com/glmmTMB/glmmTMB/issues/860).
sample data
streamlined slightly (we don't need to include a random effect for this example), and adding a truncated nbinom2.
library(dplyr)
library(extraDistr)
library(glmmTMB)
set.seed(1)
df <- data.frame(Group = rep(c("a", "b"), each = 20),
Np = rtpois(40, 1, a = 0))
## clunky trunc nbinom generator
tnb <- rep(0, 40)
z <- (tnb==0)
while(any(z)) {
tnb[z] <- rnbinom(sum(z), mu = 1, size = 1)
z <- (tnb==0)
}
df$Nnb <- tnb
## summarize
df %>% group_by(Group) %>% summarize(across(starts_with("N"), mean))
## Group Np Nnb
## 1 a 1.75 1.8
## 2 b 1.45 2.35
fit models
m1 <- glmmTMB(Np ~ Group, data = df, family = "truncated_poisson")
m2 <- update(m1, Nnb ~ ., family = truncated_nbinom2)
Predicting with se.fit = TRUE will give you standard errors for the predictions, from which you can compute confidence intervals (assuming Normality/Wald intervals/blah blah blah ...) ...
pfun <- function(m, level = 0.95) {
pp <- predict(m, newdata = data.frame(Group = c("a", "b")),
type = "response",
se.fit = TRUE)
list(est = unname(pp$fit),
lwr = unname(pp$fit + qnorm((1-level)/2)*pp$se.fit),
upr = unname(pp$fit + qnorm((1+level)/2)*pp$se.fit))
}
pfun(m1)
pfun(m2)

Find value of covariate given a probability in R

Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
# FUNCTIONS ====================================================================
logit <- function(p){
x = log(p/(1-p))
x
}
sigmoid <- function(x){
p = 1/(1 + exp(-x))
p
}
beta_duration <- function(D, select){
logit(
switch(select,
0.05 + 0.9 / (1 + exp(-2*D + 25)),
0.9 * exp(-exp(-0.5 * (D - 11))),
0.9 * exp(-exp(-(D - 11))),
0.9 * exp(-2 * exp(-(D - 9))),
sigmoid(0.847 + 0.210 * (D - 10)),
0.7 + 0.0015 * (D - 10) ^ 2,
0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
)
)
}
beta_sex <- function(sex, OR = 1){
ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
x <- seq(10, 20, by = 0.01)
y <- beta_duration(x, select)
data.frame(x = x,
y = y) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) + rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
set.seed(seed)
data.frame(
duration = sample(duration, n, TRUE),
sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
) %>%
rowwise() %>%
mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
p = sigmoid(eta),
cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
sex_OR = 3, # Odds of cure for male vs female (ref)
noise_sd = 1,
p_female = 0.7, # proportion of females in the sample
n = 500,
seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
family = binomial(link = "logit"),
data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of using an approximation using a vector of possibilities:
pred_duration <- seq(10, 20, by = 0.1)
pred <- data.frame(expand.grid(duration = pred_duration,
sex = sex),
p = predict(model1,
newdata = expand.grid(duration = pred_duration,
sex = sex),
type = "response"))
pred[which(pred$p > 0.8), ] %>%
group_by(sex) %>%
summarize(min(duration))
But I am really after an exact solution.
The function uniroot allows you to detect the point at which the output of a function equals 0. If you create a function that takes duration as input, calculates the predicted probability from that duration, then subtracts the desired probability, then this function will have an output of 0 at the desired value of duration. uniroot will find this value for you. If you wrap this process in a little function, it makes it very easy to use:
find_prob <- function(p) {
f <- function(v) {
predict(model1, type = 'response',
newdata = data.frame(duration = v, sex = 'Male')) - p
}
uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To prove that this is the correct value, we can feed it directly into predict to see what the predicted probability will be given sex = male and duration = 12.86089
predict(model1, type = 'response',
newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8

Using $ to refer to multiple variables in user defined function R

I am trying to run a function which calculates the marginal effects for different mixed effects models, based on two different main predictors (var1 vs. var2). The original code can be found here:
https://stats.idre.ucla.edu/r/dae/mixed-effects-logistic-regression/. Below is a reproducible example:
I create a dataframe (ex):
time <- seq(from = 1, to = 500, by =1)
var1 <- factor(sample(0:1, 500, replace = TRUE))
var2 <- factor(sample(0:1, 500, replace = TRUE))
var3 <- sample(1:500, 500, replace = TRUE)
group <- rep(1001:1005, 500)
out <- sample(0:1, 500, replace = TRUE)
group <- as.factor(group)
ex <- data.frame(time,var1,var2,var3,group,out)
Run the models:
m1a <- glmer(out ~ time + var1 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
m1b <- glmer(out ~ time + var2 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
Create subsets of the data with only the predictors for complete cases:
sub1a <- na.omit(ex[, c("time", "var1", "group")])
sub1b <- na.omit(ex[, c("time", "var2", "group")])
I cannot attach my data frame, ex, because R says var1 and var2 are masked. Therefore, the only way I know to refer to the variables is using $. However, every function I create produces a wrong or null result. I first tried:
marg <- function(v1, v2, d, m) {
biprobs <- lapply(levels(v1), function(var) {
v2[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
plotdat <- lapply(biprobs, function(X) {
temp <- t(sapply(X, function(x) {
c(M=mean(x), quantile(x, c(.25, .75)))
}))
temp <- as.data.frame(cbind(temp,time))
colnames(temp) <- c("PP", "Lower", "Upper", "Dayssince")
return(temp)
})
plotdat <- do.call(rbind, plotdat)
}
result1 <- marg(ex$var1, sub1a$var1, sub1a, m1a)
Although this creates a data frame, it produces the same predicted probabilities for each level of var1 (0 vs. 1) at a given time (1-500), which is not what I want. So then I tried:
marg <- function(v, d, m) {
biprobs <- lapply(levels(ex$v), function(var) {
d$v[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
.....
}
result2 <- marg(var1,sub1a, m1a)
This produces a null result. I also tried, which produces a null result:
marg <- function(d1,v,d2,m) {
biprobs <- lapply(levels(d1$v), function(var) {
d2$v[ ] <- var
lapply(time, function(ti) {
d2$time <- ti
predict(m, newdata = d2, type = "response")
})
})
......
}
result3 <- marg(ex,var1,sub1a,m1a)
I also tried creating a new object to input directly into the function:
v1 <- ex$var1
marg <- function(d, m) {
biprobs <- lapply(levels(v1), function(var) {
.....
})
})
That also produces a null result. How do I refer to different variables in an unattached data frame?? The code works with direct inputs, so it's a matter of correctly defining the function arguments. I appreciate any help!

How to pass the response variable to lm which is inside an expression within my own function

I try to pass the repsonse variable tv as a function argument into lm within an expression. I hope the code below makes it clearer what I try to achieve.
I preferrably would like to do that using tidy evaluation.
Furthermore, I tried to replace expression from base R with tidyeval terminology but I did not succeed to do so.
library(tidyverse)
library(mice)
data <- boys[boys$age >= 8, -4]
imp <- mice(data, seed = 28382, m = 10, print = FALSE)
choose_vars <- function(predictor_vars) {
predictors <- my_vars %>%
str_c(collapse = " + ") %>%
str_c("~", .) %>%
rlang::parse_expr(.)
scope <- list(upper = predictors, lower = ~1)
my_expression <- expression(
f1 <- lm(tv ~ 1),
f2 <- step(f1, scope = scope))
fit <- with(imp, my_expression)
formulas <- lapply(fit$analyses, formula)
terms <- lapply(formulas, terms)
votes <- unlist(lapply(terms, labels))
table(votes)
}
my_vars <- c("age", "hgt", "wgt", "hc", "gen", "phb", "reg")
choose_vars(predictor_vars = my_vars)
I would like to be able to pass tv via my own function.
choose_vars(predictor_vars = my_vars, response_var = tv)
The original code derives from Stef van Buuren's book Flexible Imputation of Missing Data.
data <- boys[boys$age >= 8, -4]
imp <- mice(data, seed = 28382, m = 10, print = FALSE)
scope <- list(upper = ~ age + hgt + wgt + hc + gen + phb + reg,
lower = ~1)
expr <- expression(f1 <- lm(tv ~ 1),
f2 <- step(f1, scope = scope))
fit <- with(imp, expr)
formulas <- lapply(fit$analyses, formula)
terms <- lapply(formulas, terms)
votes <- unlist(lapply(terms, labels))
table(votes)
Not exactly what I wanted but I found a way to pass the response variable into the function. The result is the same as in the example from the book.
library(tidyverse)
library(mice)
data <- boys[boys$age >= 8, -4]
imp <- mice(data, seed = 28382, m = 10, print = FALSE)
My code
choose_vars <- function(imp_data, predictor_vars, response_var) {
predictors <- predictor_vars %>%
str_c(collapse = " + ") %>%
str_c("~", .) %>%
rlang::parse_expr(.)
scope <- list(upper = predictors, lower = ~1)
form <- str_c(response_var, " ~ 1")
fit <- imp_data %>%
mice::complete("all") %>%
lapply(function(x) { step(lm(formula = as.formula(form), data = x), scope = scope) } )
formulas <- lapply(fit, formula)
terms <- lapply(formulas, terms)
votes <- unlist(lapply(terms, labels))
table(votes)
}
my_vars <- c("age", "hgt", "wgt", "hc", "gen", "phb", "reg")
my_table <- choose_vars(imp_data = imp, predictor_vars = my_vars, response_var = "tv")
Book example
scope <- list(upper = ~ age + hgt + wgt + hc + gen + phb + reg,
lower = ~1)
expr <- expression(f1 <- lm(tv ~ 1),
f2 <- step(f1, scope = scope))
fit <- with(imp, expr)
formulas <- lapply(fit$analyses, formula)
terms <- lapply(formulas, terms)
votes <- unlist(lapply(terms, labels))
stefs_table <- table(votes)
Compare results
identical(my_table, stefs_table)
[1] TRUE

reiterating a script using r

I have the following script
Posdef <- function (n, ev = runif(n, 0, 10))
{
Z <- matrix(ncol=n, rnorm(n^2))
decomp <- qr(Z)
Q <- qr.Q(decomp)
R <- qr.R(decomp)
d <- diag(R)
ph <- d / abs(d)
O <- Q %*% diag(ph)
Z <- t(O) %*% diag(ev) %*% O
return(Z)
}
Sigma <- Posdef(n = 11)
mu <- runif(11,0,10)
data <- as.data.frame(mvrnorm(n=1000, mu, Sigma))
data[data < 0] <- 0 #setting a floor#
data[data > 10] <- 10 #setting a ceiling#
names(data) = c('criteria_1', 'criteria_2', 'criteria_3', 'criteria_4', 'criteria_5',
'criteria_6', 'criteria_7', 'criteria_8', 'criteria_9', 'criteria_10',
'outcome')
data$outcome <- ifelse(data$outcome > 5, 1, 0)
data <- data[, sapply(data, is.numeric)]
maxValue <- as.numeric(apply (data, 2, max))
minValue <- as.numeric(apply (data, 2, min))
data_scaled <- as.data.frame(scale(data, center = minValue,
scale = maxValue-minValue))
ind <- sample (1:nrow(data_scaled), 600)
train <- data_scaled[ind,]
test <- data_scaled[-ind,]
model <- glm (formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
family = "binomial",
data = train)
summary (model)
predicted_model <- predict(model, test)
neural_model <- neuralnet(formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
hidden = c(2,2) ,
threshold = 0.01,
stepmax = 1e+07,
startweights = NULL,
rep = 1,
learningrate = NULL,
algorithm = "rprop+",
linear.output=FALSE,
data= train)
plot (neural_model)
results <- compute (neural_model, test[1:10])
results <- results$net.result*(max(data$outcome)-
min(data$outcome))+ min(data$outcome)
Values <- (test$outcome)*(max(data$outcome)-
min(data$outcome)) + min(data$outcome)
MSE_nueral_model <- sum((results - Values)^2)/nrow(test)
MSE_model <- sum((predicted_model - test$outcome)^2)/nrow(test)
print(MSE_model - MSE_nueral_model)
R1 <- (MSE_model - MSE_nueral_model)
The purpose of this script is to generate some arbitrary multivariate distribution and then compare two methods. In this case its a neural net and logistic regression. The end result is a difference in mean square error.
Now my issue with creating a loop has been with generating the 1000 observations.
I am able to create a loop without the data simulation portion of the script, putting that into the loop seems to make things go haywire. I tried creating a column vector filled with NA's but all I ended up getting was a single value returned rather than a vector of length n populated by the MSE reductions for each iteration of the loop.
Any help would be greatly appreciated.

Resources