Making a function for matching on multiple dependent variables, purrr - r

I want to estimate the matched treatment effect using the Matching package on multiple dependent variables.
For just a single dependent variable, I can run the below which returns what I want:
library(carData)
library(purrr)
library(tidyverse)
library(Matching)
matching_df <- Mroz %>%
  mutate(wc = case_when(wc == "yes" ~ "TRUE",
                        wc == "no" ~ "FALSE")) %>%
  drop_na(k5, k618, age, wc, hc, lfp)
matching_df$wc <- as.logical(matching_df$wc)
ps1 <- glm(wc ~ k5 + k618 + age + hc,
           family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)
Y <- matching_df$lfp
Tr <- as.logical(matching_df$wc)
psm1 <- Matching::Match(
  Y = Y,
  Tr = Tr,
  X = pscore,
  estimand = "ATT",
  M = 1,
  replace = TRUE,
  caliper = 0.05,
  version = "fast")
summary(psm1)
summary(psm1)
Estimate... 0.17479
SE......... 0.044963
T-stat..... 3.8873
p.val...... 0.00010135
Original number of observations.............. 753
Original number of treated obs............... 212
Matched number of observations............... 207
Matched number of observations (unweighted). 1074
Caliper (SDs)........................................ 0.05
Number of obs dropped by 'exact' or 'caliper' 5
But when I try to make a function using purrr::map_dfr so I can repeat this operation for multiple dependent variables, it returns an error. This is my attempt at the function:
vars <- c("lfp", "lwg", "inc")
names(vars) <- vars
matching_fcn <- function(.x){
  matching_df <- Mroz %>%
    mutate(wc = case_when(wc == "yes" ~ "TRUE",
                          wc == "no" ~ "FALSE")) %>%
    drop_na(k5, k618, age, wc, hc, .x)
  matching_df$wc <- as.logical(matching_df$wc)
  ps1 <- glm(wc ~ k5 + k618 + age + hc,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df$.x
  Tr <- as.logical(matching_df$wc)
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  summary(psm1)
}
purrr::map_dfr(
  .x = all_of(vars),
  .f = matching_fcn)
Error: All columns in a tibble must be vectors.
x Column `lfp` is a `summary.Match` object.
x Column `lwg` is a `summary.Match` object.
x Column `inc` is a `summary.Match` object.
Run `rlang::last_error()` to see where the error occurred.
Ultimately, I would like a tibble which includes the name of the dependent variable in one column, then the estimate, se, T-stat, and p.val that summary() reports for the Matching::Match fit in other columns.

The summary(psm1) output can't be put into a tibble, so pick the components of psm1 you need and build your own. (Note also that matching_df$.x looks for a column literally named ".x"; use matching_df[[.x]] for indirection, as below.) Further, drop_na is not a good idea here and will bias your results.
library(Matching)
vars <- c("dependent_var_1", "dependent_var_2", "dependent_var_3")
names(vars) <- vars
matching_fcn <- function(.x){
  # matching_df <- matching_df %>%
  #   drop_na(covar_1, covar_2, covar_3, covar_4, covar_5, covar_6, covar_7, treat_1, .x)
  ps1 <- glm(treat_1 ~ covar_1 + covar_2 + covar_3 + covar_4 + covar_5 + covar_6 + covar_7,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df[[.x]]
  Tr <- matching_df$treat_1
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  p <- 2*(1 - pnorm(abs(psm1$est.noadj/psm1$se.standard))) # two-sided, as reported by summary.Match
  with(psm1, tibble(dv = .x, est = est.noadj, se = se.standard, p = p, ndrops = ndrops))
}
Usage and result
library(dplyr)
library(tidyr)
purrr::map_df(
  .x = tidyselect::all_of(vars),
  .f = matching_fcn)
# # A tibble: 3 × 5
#   dv                 est    se       p ndrops
#   <chr>            <dbl> <dbl>   <dbl>  <dbl>
# 1 dependent_var_1  0.652 0.231 0.00476      8
# 2 dependent_var_2 -0.216 0.188 0.250        8
# 3 dependent_var_3 -0.506 0.249 0.0420       8
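If you also want the T-stat column from the original request, it is just the ratio of the two components used above; a small sketch of what the last line of matching_fcn could return instead (t is what summary.Match labels "T-stat"):
with(psm1, tibble(dv = .x,
                  est = est.noadj,
                  se = se.standard,
                  t = est.noadj/se.standard, # reported as "T-stat" by summary()
                  p = p,
                  ndrops = ndrops))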
Data
v <- c('covar_1', 'covar_2', 'covar_3', 'covar_4', 'covar_5', 'covar_6',
'covar_7', 'treat_1', 'dependent_var_1', 'dependent_var_2', 'dependent_var_3')
set.seed(830595665)
matching_df <- data.frame(matrix(rnorm(100*length(v)), 100, length(v), dimnames=list(c(), v)))
matching_df$treat_1 <- +(matching_df$treat_1 > 0)

Related

How to calculate Standardized Mean Difference for Table1 Package in R?

I am using the package "table1" to create a fancy Table 1 with an extra column containing the standardized mean difference (SMD) of the continuous variables in my dataset.
The SMD should be a combination between the treatment and control groups stratified for a given variable within the table.
I am struggling to figure out a good way of doing this and would love some help creating the function to calculate SMD.
Here is some sample code:
f <- function(x, n, ...) factor(sample(x, n, replace=T, ...), levels=x)
set.seed(427)
n <- 146
dat <- data.frame(id=1:n)
dat$treat <- f(c("Placebo", "Treated"), n, prob=c(1, 2)) # 2:1 randomization
dat$age <- sample(18:65, n, replace=TRUE)
dat$sex <- f(c("Female", "Male"), n, prob=c(.6, .4)) # 60% female
dat$wt <- round(exp(rnorm(n, log(70), 0.23)), 1)
# Add some missing data
dat$wt[sample.int(n, 5)] <- NA
label(dat$age) <- "Age"
label(dat$sex) <- "Sex"
label(dat$wt) <- "Weight"
label(dat$treat) <- "Treatment Group"
units(dat$age) <- "years"
units(dat$wt) <- "kg"
my.render.cont <- function(x) {
  with(stats.apply.rounding(stats.default(x), digits=2), c("",
    "Mean (SD)" = sprintf("%s (± %s)", MEAN, SD)))
}
my.render.cat <- function(x) {
  c("", sapply(stats.default(x), function(y) with(y,
    sprintf("%d (%0.0f %%)", FREQ, PCT))))
}
# My attempt at an SMD function
smd_value <- function(x, ...) {
  x <- x[-length(x)] # Remove "overall" group
  # Construct vectors of data y, and groups (strata) g
  y <- unlist(x)
  g <- factor(rep(1:length(x), times=sapply(x, length)))
  if (is.numeric(y) & g==1) {
    # For numeric variables, calculate SMD
    smd_val1 <- (mean(y)/sd(y))
  } else if (is.numeric(y) & g==2) {
    # For numeric variables, calculate SMD
    smd_val2 <- (mean(y)/sd(y))
  } else {
    print("--")
  }
  smd_val <- smdval2 - smdval1
}
table1(~ age + sex + wt | treat, data=dat,
       render.continuous=my.render.cont,
       render.categorical=my.render.cat,
       extra.col=list(`SMD`=smd_value))
I get the following error:
"Error in if (is.numeric(y) & g == 1) { : the condition has length > 1"
Any insight into a potential solution?
Thanks!
Here you go! (The error comes from if (is.numeric(y) & g == 1): g is a vector, so the condition has length > 1. The version below passes the whole grouping vector to stddiff/cobalt instead of branching on it.)
# Install Packages---------------------------------------------------
library(stddiff)
library(cobalt)
library(table1)
library(Hmisc)
#Using 'mtcars' as an example
my_data<-mtcars
# Format variables--------------------------------------------------------------
# am - Transmission (0 = automatic; 1 = manual)
my_data$am <- factor(my_data$am,
                     levels = c(0,1),
                     labels = c("Automatic","Manual"))
label(my_data$am) <- "Transmission Type" # adding a label for the variable
# vs - Engine (0 = V-shaped, 1 = Straight)
my_data$vs <- factor(my_data$vs,
                     levels = c(0,1),
                     labels = c("V-shaped","Straight"))
label(my_data$vs) <- "Engine"
# Adding a label to the numeric variables
label(my_data$mpg)<-"Miles per gallon"
label(my_data$hp)<-"Horsepower"
# SMD FUNCTION------------------------------------------------------------------
SMD_value <- function(x, ...) {
  # Construct vectors of data y, and groups (strata) g
  y <- unlist(x)
  g <- factor(rep(1:length(x), times=sapply(x, length)))
  if (is.numeric(y)) {
    # For numeric variables
    try({
      a <- data.frame(y)
      a$g <- g
      smd <- (as.data.frame(stddiff.numeric(data=a, gcol = "g", vcol = "y")))$stddiff
    }, silent=TRUE)
  } else {
    # For categorical variables
    try({
      a <- data.frame(y)
      a$g <- g
      smd <- (abs((bal.tab(a, treat = "g", data=a, binary="std", continuous = "std",
                           s.d.denom = "pooled", stats=c("mean.diffs"))$Balance)$Diff.Un))
    }, silent=TRUE)
  }
  c("", format(smd, digits=2)) # Formatting number of digits
}
# CONTINUOUS VARIABLES FORMATTING-----------------------------------------------
my.render.cont <- function(x) {
  with(stats.default(x),
       c("",
         "Mean (SD)" = sprintf("%s (%s)",
                               round_pad(MEAN, 1),
                               round_pad(SD, 1)),
         "Median (IQR)" = sprintf("%s (%s, %s)",
                                  round_pad(MEDIAN, 1),
                                  round_pad(Q1, 1),
                                  round_pad(Q3, 1))))
}
# Creating the final table-----------------------------------------------------
Table1 <- table1(~ vs + mpg + hp | am,
                 data = my_data,
                 overall = FALSE,
                 render.continuous = my.render.cont,
                 extra.col = list(`SMD` = SMD_value)) # SMD column
Table1 # displays final table
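Applied back to the question's dat, the same extra.col approach works; a sketch reusing the render functions from the question (note overall = FALSE, so that only the two treatment groups enter the SMD computation):
table1(~ age + sex + wt | treat, data = dat,
       overall = FALSE, # drop the default "Overall" column so SMD compares exactly two groups
       render.continuous = my.render.cont,
       render.categorical = my.render.cat,
       extra.col = list(`SMD` = SMD_value))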

Extract categorical coefficients and all p-values from a mixed model into a data table

Here is reproducible code and sample data.
I want to achieve a final data table with 3 columns: 1. exposure quantile, 2. OR/RR, 3. p-value (PV).
library(dplyr)
library(nlme)
library(data.table)
set.seed(42)
n <- 100
dat = data.frame(ID = rep(c(1:25), times=4),
                 Score = rnorm(n, mean=0.3, sd=0.8))
dat = dat %>%
  group_by(ID) %>%
  dplyr::mutate(exposure1 = rep(c(rnorm(1, mean=6, sd=1.8))),
                exposure2 = rep(c(rnorm(1, mean=3, sd=0.6))),
                age = rep(c(rnorm(1, mean=40, sd=15)))) %>%
  ungroup() %>%
  dplyr::mutate(exposure1_quantile = cut(exposure1, breaks = 4, labels = c("Q1","Q2","Q3","Q4")),
                exposure2_quantile = cut(exposure2, breaks = 4, labels = c("Q1","Q2","Q3","Q4")))
exposures_var = c("exposure1_quantile","exposure2_quantile")
exposure_var_labels = c("exposure1 Q1", "exposure1 Q2", "exposure1 Q3",
                        "exposure2 Q1", "exposure2 Q2", "exposure2 Q3")
age = "age"
outcome = "Score"
exposure_data_table = c()
for(i in 1:length(exposures_var)){
  exp = exposures_var[i]
  fixed_effects_formula = paste0(outcome, "~", exp, "+", age)
  fixed_effects_formula = as.formula(fixed_effects_formula)
  mixedmodel = lme(fixed = fixed_effects_formula, random = ~1|ID, data=dat, method = "ML")
  for(m in 2:4){
    v = mixedmodel$coefficients$fixed[m]
    vector = c(exp, v)
    # P = p value for every quantile (HOW TO ADD?)
    # exposure_name = exposure_var_labels[?] (HOW TO ADD LABEL)
    exposure_data_table = rbind(exposure_data_table, vector)
  }
}
exposure_data_table = as.data.table(exposure_data_table)
colnames(exposure_data_table)=c("Exposure","RR")#,"pv")
view(exposure_data_table)
I first used anova() to try to get the p-value, but it didn't work.
I think a tidymodels approach using lme would work well here:
library(nlme)
library(tidymodels)
library(multilevelmod)
library(data.table)
lme_spec <-
  linear_reg() %>%
  set_engine("lme", random = ~ 1 | ID)
Map(function(exp) {
  fixed_effects_formula <- as.formula(paste0("Score~", exp, "+ age +", 0))
  lme_spec %>%
    fit(fixed_effects_formula, data = dat) %>%
    broom.mixed::tidy() %>%
    filter(effect == "fixed", grepl("exposure", term)) %>%
    select(term, estimate, std.error, p.value)
}, exposures_var) %>%
  bind_rows() %>%
  as.data.table()
#> term estimate std.error p.value
#> 1: exposure1_quantileQ1 -0.16147364 0.3532834 0.6525497
#> 2: exposure1_quantileQ2 0.22318505 0.2719366 0.4214784
#> 3: exposure1_quantileQ3 0.24976757 0.3484126 0.4817411
#> 4: exposure1_quantileQ4 0.14177064 0.4020702 0.7280757
#> 5: exposure2_quantileQ1 0.28976458 0.4191198 0.4972840
#> 6: exposure2_quantileQ2 0.19907863 0.2699164 0.4693496
#> 7: exposure2_quantileQ3 0.35040767 0.2827229 0.2295436
#> 8: exposure2_quantileQ4 -0.09587234 0.3533819 0.7889412
Created on 2022-08-07 by the reprex package (v2.0.1)
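If you would rather keep the original nlme loop, the per-term p-values are available in summary()$tTable, whose columns are Value, Std.Error, DF, t-value and p-value. A minimal sketch along those lines, reusing dat and exposures_var from the question (extract_quantile_rows is a hypothetical helper; with default contrasts Q1 is the reference level, so only the Q2-Q4 rows appear, as in the question's loop over m in 2:4):
library(nlme)
library(data.table)
extract_quantile_rows <- function(exp) {
  f <- as.formula(paste0("Score ~ ", exp, " + age"))
  m <- lme(fixed = f, random = ~ 1 | ID, data = dat, method = "ML")
  tt <- summary(m)$tTable # columns: Value, Std.Error, DF, t-value, p-value
  keep <- grepl(exp, rownames(tt), fixed = TRUE) # keep only the quantile terms
  data.table(Exposure = rownames(tt)[keep],
             RR = tt[keep, "Value"],
             pv = tt[keep, "p-value"])
}
exposure_data_table <- rbindlist(lapply(exposures_var, extract_quantile_rows))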

Optimize function in r

Here is my code:
cee = abs(qnorm(.5*0.1)) # Bonferroni threshold for achieving study-wide significance = 0.1
p.value = (simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
# we use "cee" so R does not get confused with the function 'c'
betahat = log(OR) # Reported OR
z = sign(betahat)*abs(qnorm(0.5*p.value1)) # Reported p-value = 5.7e-4, which we convert to a z-value
###################################################
# THE PROPOSED APPROACH #
###################################################
se = betahat/z # standard error of betahat
mutilde1 = optimize(f=conditional.like,c(-20,20),maximum=T,z=z,cee=cee)$maximum # the conditional mle
p.value holds the p-values for 1000 simulations, as does OR; for the se part I get 1000 different se values. But the mutilde1 line throws an error: "Error in optimize(f = conditional.like, c(-20, 20), maximum = T, z = z, : invalid function value in 'optimize'"
How can I fix the issue?
The conditional.like() function:
conditional.like = function(mu, cee, z){
  like = dnorm(z-mu)/(pnorm(mu-cee)+pnorm(-cee-mu))
  return((abs(z)>cee)*like)
}
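Note that optimize() expects f to return a single finite value for each candidate mu, but z here is a vector of 1000 z-values, so conditional.like() returns a length-1000 vector and optimize() stops with "invalid function value". A sketch of one way around this, assuming you want one conditional MLE per simulation:
# optimize each simulation's z-value separately, yielding 1000 conditional MLEs
mutilde1 <- sapply(z, function(zi)
  optimize(f = conditional.like, interval = c(-20, 20), maximum = TRUE,
           z = zi, cee = cee)$maximum)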
simAll is a table that looks like this (1,000 rows in total):
# A tibble: 1,000 x 6
id term estimate std.error statistic p.value
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 .x 0.226 0.127 1.78 0.0747
2 2 .x 0.137 0.127 1.08 0.280
3 3 .x 0.304 0.127 2.38 0.0171
4 4 .x 0.497 0.128 3.87 0.000111
OR (total 1000 lines):
> OR
[1] 1.5537098 1.0939850 1.4491432 1.6377551 1.1646904 1.3387534 1.6377551 1.5009351 1.7918552
Also, here is my overall code:
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
              x = list(rbinom(1000, 1, 0.5))) %>%
  # to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
  # `~ ... ` is similar to `function(.x) {...}`
  # `.x` represents the variable you are using map on
  mutate(z = map(x, ~ log(1.3) * .x),
         pr = map(z, ~ 1 / (1 + exp(-.x))),
         y = map(pr, ~ rbinom(1000, 1, .x)),
         k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
         # use broom::tidy to get the model summary in form of a tibble
         sum = map(k, broom::tidy)) %>%
  # select id and sum and unnest the tibbles
  select(id, sum) %>%
  unnest(cols = c(sum))
simOR <- sim %>%
  # drop the intercepts and keep every .x with p < 0.05
  filter(term != "(Intercept)",
         p.value < 0.05)
sim
j1 = exp(simOR %>% select("estimate"))
OR1 = as.numeric(unlist(j1))
mean(OR1)
simAll <- sim %>%
  filter(term != "(Intercept)")
j <- exp(simAll %>% select("estimate"))
OR2 <- as.numeric(unlist(j))
mean(OR2)
simOR2 <- sim %>%
  filter(term != "(Intercept)",
         p.value < 0.005)
j2 <- exp(simOR2 %>% select("estimate"))
OR3 <- as.numeric(unlist(j2))
mean(OR3)
#op <- par(mfrow = c(3, 1))
hga=hist(OR2, main = NULL, freq = T, breaks = 10) #OR2:Overall OR
hgb=hist(OR1, freq = T,col=2,breaks=10, main="OR:p-value<0.05") #OR1:p-value<0.05
hgc=hist(OR3, freq = T,col=2,breaks=10, main="OR:p-value<0.005") #OR3:p-value<0.005
plot(hga,col=rgb(0,1,0,0.5),main = "OR",xlim=c(0.8,2),ylim=c(0,250))
plot(hgb, add = TRUE,col=rgb(0,0,0.8,0.5),xlim=c(0.8,2),ylim=c(0,250))
plot(hgc, add = TRUE,col=rgb(1,0,0,0.5),xlim=c(0.8,2))
abline(v = mean(OR2), lwd = 4, col = 3)
abline(v = mean(OR3), lwd = 4, col=2)
text(1.65,240,"1.31",col=1)
arrows(1.5,240,1.31,240,length=0.1,col=1,lwd=2)
abline(v = mean(OR1), lwd = 4, col=4)
text(2.1,220,"1.43",col=4)
arrows(1.98,220,1.43,220,length=0.1,col=4,lwd=2)
text(2.1,220,"1.55",col=2)
arrows(1.98,220,1.55,220,length=0.1,col=2,lwd=2)
#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED ESTIMATES
#########################################
conditional.like = function(mu, cee, z){
  like = dnorm(z-mu)/(pnorm(mu-cee)+pnorm(-cee-mu))
  return((abs(z)>cee)*like)
}
conditional.like.z = function(mu, cee, z){
  return(conditional.like(mu, cee, z)*mu)
}
#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED CONFIDENCE INTERVAL
#########################################
ptruncnorm.lower = function(z, mu, cee, alpha){
  A = pnorm(-cee+mu) + pnorm(-cee-mu)
  term1 = pnorm(z-mu)
  term2 = pnorm(-cee-mu)
  term3 = pnorm(-cee-mu) + pnorm(z-mu) - pnorm(cee-mu)
  result = (1/A)*(term1*(z <= -cee) + term2*(abs(z) < cee) + term3*(z >= cee))
  return(result - (alpha/2))
}
ptruncnorm.upper = function(z, mu, cee, alpha){
  A = pnorm(-cee+mu) + pnorm(-cee-mu)
  term1 = pnorm(z-mu)
  term2 = pnorm(-cee-mu)
  term3 = pnorm(-cee-mu) + pnorm(z-mu) - pnorm(cee-mu)
  result = (1/A)*(term1*(z <= -cee) + term2*(abs(z) < cee) + term3*(z >= cee))
  return(result - (1 - alpha/2))
}
find.lowerz = function(mu, z, cee, alpha){
  lowerz = uniroot(ptruncnorm.lower, lower=-20, upper=20, mu=mu, cee=cee, alpha=alpha)$root
  return(lowerz - z)
}
find.upperz = function(mu, z, cee, alpha){
  upperz = uniroot(ptruncnorm.upper, lower=-20, upper=20, mu=mu, cee=cee, alpha=alpha)$root
  return(upperz - z)
}
getCI = function(z, cee, alpha){
  uppermu = uniroot(find.lowerz, interval=c(-15,15), cee=cee, z=z, alpha=alpha)$root
  lowermu = uniroot(find.upperz, interval=c(-15,15), cee=cee, z=z, alpha=alpha)$root
  out = list(lowermu, uppermu)
  names(out) = c("lowermu", "uppermu")
  return(out)
}
source("GW-functions.R")# YOU READ IN THE FUNCTIONS FOR OUR METHOD
cee=abs(qnorm(.5*0.1)) # Bonferroni threshold for achieving study-wide significance = 0.1
p.value=(simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
# we use "cee" so R does not get confused with the function 'c'
betahat=log(OR) # Reported OR
z=sign(betahat)*abs(qnorm(0.5*p.value1)) # Reported p-value = 5.7e-4, which we convert to a z-value
###################################################
# THE PROPOSED APPROACH #
###################################################
se=betahat/z # standard error of betahat
mutilde1=optimize(f=conditional.like,c(-20,20),maximum=T,z=z,cee=cee)$maximum
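The same scalar-argument caveat applies to getCI(), which also expects a single z. A sketch for the first simulation (assuming the element-wise fix above, and a z-value that clears the threshold cee):
ci1 <- getCI(z = z[1], cee = cee, alpha = 0.05)
c(lower = ci1$lowermu, upper = ci1$uppermu)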

Calculate/approach individual face probabilities of 10-faced dice, based on summed 2-roll dice experiment

I have a biochemistry problem that can be simplified to a two-roll dice experiment (I think...).
Assume there is an uneven dice with 10 faces, i.e. the individual face probabilities are not 1/10. We want to know these probabilities.
The dataset that we are given, however, is a histogram of summed faces from rolling the (same) dice twice. So the range of the observed bins is 2-20 (2 = 1+1; 3 = 1+2, 2+1; 4 = 2+2, 1+3, 3+1; etc.).
The probabilities of the summed faces are sums of products of the individual face probabilities (s: observed probabilities of summed faces; p: probabilities of individual faces) and can be written as follows:
s2 ~ p1^2
s3 ~ 2*p1*p2
s4 ~ 2*p1*p3 + p2^2
s5 ~ 2*p1*p4 + 2*p2*p3
s6 ~ 2*p1*p5 + 2*p2*p4 + p3^2
s7 ~ 2*p1*p6 + 2*p2*p5 + 2*p3*p4
s8 ~ 2*p1*p7 + 2*p2*p6 + 2*p3*p5 + p4^2
s9 ~ 2*p1*p8 + 2*p2*p7 + 2*p3*p6 + 2*p4*p5
s10 ~ 2*p1*p9 + 2*p2*p8 + 2*p3*p7 + 2*p4*p6 + p5^2
s11 ~ 2*p1*p10 + 2*p2*p9 + 2*p3*p8 + 2*p4*p7 + 2*p5*p6
s12 ~ 2*p2*p10 + 2*p3*p9 + 2*p4*p8 + 2*p5*p7 + p6^2
s13 ~ 2*p3*p10 + 2*p4*p9 + 2*p5*p8 + 2*p6*p7
s14 ~ 2*p4*p10 + 2*p5*p9 + 2*p6*p8 + p7^2
s15 ~ 2*p5*p10 + 2*p6*p9 + 2*p7*p8
s16 ~ 2*p6*p10 + 2*p7*p9 + p8^2
s17 ~ 2*p7*p10 + 2*p8*p9
s18 ~ 2*p8*p10 + p9^2
s19 ~ 2*p9*p10
s20 ~ p10^2
In this case there are 20-1 = 19 knowns (s2 through s20) and 10 unknowns, so the system is over-determined. It should also be solvable by hand with algebra. As far as I can remember, the quadratic terms give 2 possible solutions per individual face; since probabilities are always positive, practically there should be one solution. Right?
Is there a way to solve this system in R? I am familiar with linear inverse problems in R, but I don't know how to approach this (quadratic?) system in R.
Here is some code to simulate the problem:
options(stringsAsFactors = FALSE)
library(gtools)
library(dplyr)
dice <- data.frame(face = 1:10)
### functions
split_dice_faces <- function(summed_face){
  face_face <- strsplit(x = as.character(summed_face), split = "[/_\\|]")[[1]]
  names(face_face) <- c("face1","face2")
  as.numeric(face_face)
}
sum_dice_faces <- function(face_face){
  sapply(face_face, function(face_face_i){
    face1 <- split_dice_faces(face_face_i)[1]
    face2 <- split_dice_faces(face_face_i)[2]
    sum(c(face1[1], face2[1]))
  })
}
simulate_2_rolls <- function(dice_pool){
  dice_perm <- data.frame(permutations(n = dim(dice_pool)[1], r = 2,
                                       v = as.character(dice_pool$face),
                                       repeats.allowed = T))
  dice_perm$face_face <- paste(dice_perm[[1]], "|", dice_perm[[2]], sep = "")
  dice_perm$prob <- dice_pool$prob[match(dice_perm[[1]], dice_pool$face)] *
    dice_pool$prob[match(dice_perm[[2]], dice_pool$face)]
  dice_perm$summed_face <- sum_dice_faces(dice_perm$face_face)
  dice_perm <- dice_perm %>% arrange(summed_face) %>%
    select(one_of(c("face_face", "summed_face", "prob")))
  dice_perm
}
summarise_2_rolls_experiment <- function(simulate_2_rolls_df){
  simulate_2_rolls_df %>% group_by(summed_face) %>% summarise(prob = sum(prob))
}
from_face_probs_to_summed_observations <- function(face_probs){
  face_probs %>%
    data.frame(face = dice$face, prob = .) %>%
    simulate_2_rolls() %>%
    summarise_2_rolls_experiment() %>%
    pull(prob)
}
generate_formulas <- function() {
  output <-
    dice_sum_probs %>% group_by(summed_face) %>% group_split() %>%
    sapply(function(i){
      left_hand <- paste("s", i$summed_face[1], sep="")
      right_hand <-
        sapply(strsplit(i$face_face, "\\|"), function(row){
          row_i <- as.numeric(row)
          row_i <- row_i[order(row_i)]
          row_i <- paste("p", row_i, sep = "")
          if(row_i[1] == row_i[2]){
            paste(row_i[1], "^2", sep="")
          } else {
            paste(row_i, collapse="*")
          }
        })
      right_hand <-
        paste(sapply(unique(right_hand), function(right_hand_i){
          fact <- sum(right_hand == right_hand_i)
          if(fact > 1){fact <- paste(fact, "*", sep = "")} else {fact <- ""}
          paste(fact, right_hand_i, sep = "")
        }), collapse = " + ")
      paste(left_hand, "~", right_hand)
    })
  return(output)
}
To simulate a dataset:
### random individual probabilites
dice_probs <- data.frame(face = dice$face,
                         prob = runif(n = dim(dice)[1]) %>% (function(x){x / sum(x)}))
dice_probs
### simulate infinite amount of trials, observations expressed as probabilities
dice_sum_probs <- simulate_2_rolls(dice_probs)
dice_sum_probs
### sum experiment outcomes with the same sum
dice_sum_probs_summary <- dice_sum_probs %>% group_by(summed_face) %>% summarise(prob = sum(prob))
### plot, this is the given dataset
with(data = dice_sum_probs_summary, barplot(prob, names.arg = summed_face))
### how to calculate / approach p1, p2, ..., p10?
Thanks!
If we create a multiplication table of the probabilities, outer(p, p), and then sum those over constant values of outer(1:10, 1:10, "+") using tapply, we get the following nonlinear regression problem:
nls(prob ~ tapply(outer(p, p), outer(1:10, 1:10, `+`), sum),
    dice_sum_probs_summary, algorithm = "port",
    start = list(p = sqrt(dice_sum_probs_summary$prob[seq(1, 19, 2)])),
    lower = numeric(10), upper = rep(1, 10))
giving:
Nonlinear regression model
model: prob ~ tapply(outer(p, p), outer(1:10, 1:10, `+`), sum)
data: dice_sum_probs_summary
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
0.06514 0.04980 0.14439 0.06971 0.06234 0.19320 0.09491 0.01237 0.11936 0.18878
residual sum-of-squares: 1.33e-30
which is consistent with
> dice_probs
face prob
1 1 0.06513537
2 2 0.04980455
3 3 0.14438749
4 4 0.06971313
5 5 0.06234477
6 6 0.19319613
7 7 0.09491289
8 8 0.01236557
9 9 0.11936244
10 10 0.18877766
We can alternatively express it as follows, where X is a 19 x 100 matrix of zeros and ones: each row corresponds to a possible outcome of rolling the two dice (i.e. 2:20) and each column to a pair of indexes from 1:10 and 1:10. An entry equals one if the column's pair sums to the row's outcome and zero otherwise.
g <- c(outer(1:10, 1:10, `+`))
X <- + outer(2:20, g, `==`)
nls(prob ~ X %*% kronecker(p, p), dice_sum_probs_summary, alg = "port",
    start = list(p = sqrt(dice_sum_probs_summary$prob[seq(1, 19, 2)])),
    lower = numeric(10), upper = rep(1, 10))
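A note on the start values: seq(1, 19, 2) picks out the even sums s2, s4, ..., s20, each of which contains a square term pk^2, so sqrt() of those observed probabilities gives a rough initial guess for each pk. A quick sanity check of the fit against the simulated truth (a sketch, reusing the objects above):
fit <- nls(prob ~ X %*% kronecker(p, p), dice_sum_probs_summary,
           algorithm = "port",
           start = list(p = sqrt(dice_sum_probs_summary$prob[seq(1, 19, 2)])),
           lower = numeric(10), upper = rep(1, 10))
round(cbind(estimated = coef(fit), true = dice_probs$prob), 5)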

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(SID = sample(1:150, 400, replace=TRUE),
                     age = sample(50:70, 400, replace=TRUE),
                     sex = sample(c("Male","Female"), 200, replace=TRUE),
                     time = seq(0.7, 6.2, length.out=400),
                     Vol = rnorm(400),
                     HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop*n*m)
  id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time*HCD, random=~time|SID, na.action="na.omit", data=mydata2); summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time*HCD+(time|SID),
              control=lmerControl(check.nobs.vs.nlev = "ignore",
                                  check.nobs.vs.rankZ = "ignore",
                                  check.nobs.vs.nRE = "ignore"),
              na.action="na.omit", data=mydata2); summary(model2)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects package there's a ggpredict() function, where you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
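For the original mean ± 2 SD request specifically, you can compute the moderator values from the data and paste them into the terms specification; a sketch, assuming the generic model above (moderator var2 in mydata):
library(ggeffects)
m <- mean(mydata$var2, na.rm = TRUE)
s <- sd(mydata$var2, na.rm = TRUE)
vals <- round(m + c(-2, 0, 2) * s, 2)
# builds e.g. "var2 [-1.8, 0.2, 2.2]", the same syntax as above
mydf <- ggpredict(model, terms = c("time", sprintf("var2 [%s]", paste(vals, collapse = ", "))))
plot(mydf)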
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
  data.frame(
    SID = sample(1:150, 400, replace = TRUE),
    age = sample(50:70, 400, replace = TRUE),
    sex = sample(c("Male", "Female"), 200, replace = TRUE),
    time = seq(0.7, 6.2, length.out = 400),
    Vol = rnorm(400),
    HCD = rnorm(400)
  )
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# glmmTMB also works
model3 <- glmmTMB(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots
