How to calculate Standardized Mean Difference for Table1 Package in R? - r

I am using the package "table1" to create a fancy table one with extra column containing the standardized mean difference of continuous variables in my dataset.
The SMD should be a combination between the treatment and control groups stratified for a given variable within the table.
I am struggling to figure out a good way of doing this and would love some help creating the function to calculate SMD.
Here is some sample code:
f <- function(x, n, ...) factor(sample(x, n, replace=T, ...), levels=x)
set.seed(427)
n <- 146
dat <- data.frame(id=1:n)
dat$treat <- f(c("Placebo", "Treated"), n, prob=c(1, 2)) # 2:1 randomization
dat$age <- sample(18:65, n, replace=TRUE)
dat$sex <- f(c("Female", "Male"), n, prob=c(.6, .4)) # 60% female
dat$wt <- round(exp(rnorm(n, log(70), 0.23)), 1)
# Add some missing data
dat$wt[sample.int(n, 5)] <- NA
label(dat$age) <- "Age"
label(dat$sex) <- "Sex"
label(dat$wt) <- "Weight"
label(dat$treat) <- "Treatment Group"
units(dat$age) <- "years"
units(dat$wt) <- "kg"
my.render.cont <- function(x) {
with(stats.apply.rounding(stats.default(x), digits=2), c("",
"Mean (SD)"=sprintf("%s (± %s)", MEAN, SD)))
}
my.render.cat <- function(x) {
c("", sapply(stats.default(x), function(y) with(y,
sprintf("%d (%0.0f %%)", FREQ, PCT))))
}
#My attempt at an SMD function
smd_value <- function(x, ...) {
x <- x[-length(x)] # Remove "overall" group
# Construct vectors of data y, and groups (strata) g
y <- unlist(x)
g <- factor(rep(1:length(x), times=sapply(x, length)))
if (is.numeric(y) & g==1) {
# For numeric variables, calculate SMD
smd_val1 <- (mean(y)/sd(y))
} else if (is.numeric(y) & g==2) {
# For numeric variables, calculate SMD
smd_val2 <- (mean(y)/sd(y))
} else {print("--")
}
smd_val <- smdval2 - smdval1
}
table1(~ age + sex + wt | treat, data=dat, render.continuous=my.render.cont, render.categorical=my.render.cat, extra.col=list(`SMD`=smd_value))
I get the following error:
"Error in if (is.numeric(y) & g == 1) { : the condition has length > 1"
Any insight into a potential solution?
Thanks!

Here you go!
# Install Packages---------------------------------------------------
library(stddiff)
library(cobalt)
library(table1)
library(Hmisc)
#Using 'mtcars' as an example
my_data<-mtcars
# Format variables--------------------------------------------------------------
# amd - Transmission (0 = automatic; 1 = manual)
my_data$am <-factor(my_data$am,
levels = c(0,1),
labels =c("Automatic","Manual"))
label(my_data$am) <-"Transmission Type" #adding a label for the variable
# vs - Engine (0 = V-shaped, 1 = Straight)
my_data$vs <-factor(my_data$vs,
levels = c(0,1),
labels =c("V-shaped","Straight"))
label(my_data$vs) <-"Engine"
# Adding a label to the numeric variables
label(my_data$mpg)<-"Miles per gallon"
label(my_data$hp)<-"Horsepower"
# SMD FUNCTION------------------------------------------------------------------
SMD_value <- function(x, ...) {
# Construct vectors of data y, and groups (strata) g
y <- unlist(x)
g <- factor(rep(1:length(x), times=sapply(x, length)))
if (is.numeric(y)) {
# For numeric variables
try({a<-data.frame(y)
a$g<-g
smd<-(as.data.frame(stddiff.numeric(data=a,gcol = "g", vcol = "y")))$stddiff
},silent=TRUE)
} else {
# For categorical variables
try({
a<-data.frame(y)
a$g<-g
smd<-(abs((bal.tab(a, treat = "g",data=a,binary="std",continuous =
"std",s.d.denom = "pooled",stats=c("mean.diffs"))$Balance)$Diff.Un))
},silent=TRUE)
}
c("",format(smd,digits=2)) #Formatting number of digits
}
# CONTINUOUS VARIABLES FORMATTING-----------------------------------------------
my.render.cont <- function(x) {
with(stats.default(x),
c("",
"Mean (SD)" = sprintf("%s (%s)",
round_pad(MEAN, 1),
round_pad(SD, 1)),
"Median (IQR)" = sprintf("%s (%s, %s)",
round_pad(MEDIAN, 1),
round_pad(Q1, 1),
round_pad(Q3, 1)))
)}
# Creating the final table-----------------------------------------------------
Table1<-table1(~ vs + mpg + hp | am,
data=my_data,
overall = FALSE,
render.continuous = my.render.cont,
extra.col=list(`SMD`=SMD_value)) #SMD Column
Table1 #displays final table

Related

Making a function for matching on multiple dependent variables, purrr

I want to estimate the matched treatment effect using the Matching package on multiple dependent variables.
For just a single dependent variable, I can run the below which returns what I want:
library(carData)
library(purrr)
library(tidyverse)
library(Matching)
matching_df <- Mroz %>%
mutate(wc = case_when(wc == "yes" ~ "TRUE",
wc == "no" ~ "FALSE")) %>%
drop_na(k5, k618, age, wc, hc, lfp)
matching_df$wc <- as.logical(matching_df$wc)
ps1 <- glm(wc ~ k5 + k618 + age + hc,
family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)
Y <- matching_df$lfp
Tr <- as.logical(matching_df$wc)
psm1 <- Matching::Match(
Y = Y,
Tr = Tr,
X = pscore,
estimand = "ATT",
M = 1,
replace = TRUE,
caliper = 0.05,
version = "fast")
summary(psm1)
Estimate... 0.17479
SE......... 0.044963
T-stat..... 3.8873
p.val...... 0.00010135
Original number of observations.............. 753
Original number of treated obs............... 212
Matched number of observations............... 207
Matched number of observations (unweighted). 1074
Caliper (SDs)........................................ 0.05
Number of obs dropped by 'exact' or 'caliper' 5
But when I try and make a function using purrr:map_dfr so I can repeat this operation for multiple dependent variables, it returns an error. This is my attempt at the function:
vars <- c("lfp", "lwg", "inc")
names(vars) <- vars
matching_fcn <- function(.x){
matching_df <- Mroz %>%
mutate(wc = case_when(wc == "yes" ~ "TRUE",
wc == "no" ~ "FALSE")) %>%
drop_na(k5, k618, age, wc, hc, .x)
matching_df$wc <- as.logical(matching_df$wc)
ps1 <- glm(wc ~ k5 + k618 + age + hc,
family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)
Y <- matching_df$.x
Tr <- as.logical(matching_df$wc)
psm1 <- Matching::Match(
Y = Y,
Tr = Tr,
X = pscore,
estimand = "ATT",
M = 1,
replace = TRUE,
caliper = 0.05,
version = "fast")
summary(psm1)
}
purrr::map_dfr(
.x = all_of(vars),
.f = matching_fcn)
Error: All columns in a tibble must be vectors.
x Column `lfp` is a `summary.Match` object.
x Column `lwg` is a `summary.Match` object.
x Column `inc` is a `summary.Match` object.
Run `rlang::last_error()` to see where the error occurred.
Ultimately, I would like a tibble which includes the name of the dependent variable in one column, then the estimate, se, T-stat, and p.val that are returned by the Matching::Match function in other columns
The summary(psm1) can't be put into a tibble. So choose some values of psm1 and make your own. Further, drop_na is no good idea and will bias your results.
library(Matching)
vars <- c("dependent_var_1", "dependent_var_2", "dependent_var_3")
names(vars) <- vars
matching_fcn <- function(.x){
# matching_df <- matching_df %>%
# drop_na(covar_1, covar_2, covar_3, covar_4, covar_5, covar_6, covar_7, treat_1, .x)
ps1 <- glm(treat_1 ~ covar_1 + covar_2 + covar_3 + covar_4 + covar_5 + covar_6 + covar_7,
family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)
Y <- matching_df[[.x]]
Tr <- matching_df$treat_1
psm1 <- Matching::Match(
Y = Y,
Tr = Tr,
X = pscore,
estimand = "ATT",
M = 1,
replace = TRUE,
caliper = 0.05,
version = "fast")
p <- 1 - pnorm(abs(psm1$est.noadj/psm1$se.standard))
with(psm1, tibble(dv=.x, est=est.noadj, se=se.standard, p=p, ndrops=ndrops))
}
Usage and result
library(dplyr)
library(tidyr)
purrr::map_df(
.x = tidyselect::all_of(vars),
.f = matching_fcn)
# # A tibble: 3 × 5
# dv est se p ndrops
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 dependent_var_1 0.652 0.231 0.00238 8
# 2 dependent_var_2 -0.216 0.188 0.125 8
# 3 dependent_var_3 -0.506 0.249 0.0210 8
Data
v <- c('covar_1', 'covar_2', 'covar_3', 'covar_4', 'covar_5', 'covar_6',
'covar_7', 'treat_1', 'dependent_var_1', 'dependent_var_2', 'dependent_var_3')
set.seed(830595665)
matching_df <- data.frame(matrix(rnorm(100*length(v)), 100, length(v), dimnames=list(c(), v)))
matching_df$treat_1 <- +(matching_df$treat_1 > 0)

Error in confidence interval mice R package

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

Adding descriptive statistics to a function when making use of the ellipsis as input variable

For an assignment I have created a function in R that calculates the regression coefficients, predicted values and residuals of data that is useful for multiple linear regression. It did that as follows:
MLR <- function(y_var, ...){
y <- y_var
X <- as.matrix(cbind(...))
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot
)
}
Now, my struggle is to add descriptive statistics of my input variables. Since I want my independent variables to be able to be any number, I used the ellipsis as input variable. Is there a way to calculate useful descriptive statistics (mean, variance, standard deviation) of my independent variables (defined by the ...)?
This
mean(...)
does not work...
Thank you for the replies already!
Try this slight changes on your function. I have applied to some variables of iris dataset. You can compute the desired statistics over X and then output as an additional slot for your output. Here the code:
#Function
MLR <- function(y_var, ...){
y <- y_var
X <- as.matrix(cbind(...))
RX <- X
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
#Summary
#Stats
DMeans <- apply(RX,2,mean,na.rm=T)
DSD <- apply(RX,2,sd,na.rm=T)
DVar <- apply(RX,2,var,na.rm=T)
DSummary <- rbind(DMeans,DSD,DVar)
#Out
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot,
'Summary' = DSummary
)
}
#Apply
MLR(y_var = iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Length)
The final slot of the output will look like this:
$Scatterplot
NULL
$Summary
[,1] [,2]
DMeans 3.0573333 3.758000
DSD 0.4358663 1.765298
DVar 0.1899794 3.116278
I think I've got it. Unfortunately, the ellipsis seems to be quite quirky to work with them. Check if the cbind(...) functions correctly inside your function (when I've checked it at the output, it was only 1 column wide, while I input 2 variables into it, and that don't seem right.
My solution don't read variable names - it uses placeholder names (Var_1, Var_2, ... , Var_n)
MLR <- function(y_var, ...){
# these two packages will come in handy
require(dplyr)
require(tidyr)
y <- y_var
X <- as.matrix(cbind(...))
# firstly, we need to make df/tibble out of ellipsis
X2 <- list(...)
n <- tibble(n = rep(0, times = length(y)))
index <- 0
for(Var in X2){
index <- index + 1
n[, paste0("Var_", index)] <- Var
}
# after the df was created, now it's time for calculating desc
# Using tidyr::gather with dplyr::summarize creates nice summary,
# where each row is another variable
descriptives <- tidyr::gather(n, key = "Variable", value = "Value") %>%
group_by(Variable) %>%
summarize(mean = mean(Value), var = var(Value), sd = sd(Value), .groups = "keep")
# everything except the output list is the same
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot,
'descriptives' = descriptives[-1,] # need to remove the first row
# because it is "n" placeholder
)
}

How do I speed up my function, specifically the ggplot commands?

I put together a function to identify outliers. It takes a dataframe and then shows plots of the data with lines to indicate potential outliers. It'll give a table with outliers marked, too.
But, it is SLOOOW. The problem is it takes a really long time for the plots to load.
I was curious if you might have advice on how to speed this up.
Related: Is the default plotting system faster than ggplot?
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xU-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines
It will also accept input so you can mark some outliers as reasonable and other as extreme

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots

Resources