Inflection point for a binomial mixed GLM model in R

I'd like to explore some possible approaches for calculating the inflection point of a binomial mixed GLM model, and to compare them. I found the inflection package, which implements the Extremum Surface Estimator (ESE) and the Extremum Distance Estimator (EDE). Here is what I did:
library(inflection)
library(dplyr)
library(glmmTMB)
library(DHARMa)
library(ggplot2)
library(ggeffects)
# My binomial data set
binom.ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/mort_binon.csv")
str(binom.ds)
# 'data.frame': 400 obs. of 4 variables:
# $ temp : num 0 0 0 0 0 0 0 0 0 0 ...
# $ days : int 5 5 5 5 5 5 5 5 5 5 ...
# $ rep : chr "r1" "r2" "r3" "r4" ...
# $ mortality: int 0 1 1 1 1 1 1 1 0 1 ...
# Fit a binomial mixed GLM model
m_F <- glmmTMB(mortality ~ temp + days + (1 | days),
               data = binom.ds,
               family = "binomial")
# Check the fitted model using DHARMa
plot(s1 <- simulateResiduals(m_F))
# All looks OK
# Find an inflection point for temp
# (note: predict() defaults to the link scale, so exp() gives odds)
ds_F <- cbind(x = binom.ds$temp, y = exp(predict(m_F)))
ds_F <- as.data.frame(ds_F)
bb <- bede(ds_F$x, ds_F$y, 0); bb
bb$iplast
# [1] 12.5
# $iters
# n a b EDE
# 1 400 0 25 12.5
# Visualize the inflection point for temp
ggpredict(m_F, terms = "temp [all]") %>%
  plot(add.data = TRUE) +
  geom_vline(xintercept = bb$iplast, colour = "red", linetype = "longdash")
# for days
ds_F <- cbind(x = binom.ds$days, y = exp(predict(m_F)))
ds_F <- as.data.frame(ds_F)
bb2 <- bede(ds_F$x, ds_F$y, 0); bb2
bb2$iplast
# [1] 22.5
# $iters
# n a b EDE
# 1 400 5 30 17.5
# 2 221 5 30 17.5
# 3 181 15 5 10.0
# 4 61 15 30 22.5
# Visualize the inflection point for days
ggpredict(m_F, terms = "days [all]") %>%
  plot(add.data = TRUE) +
  geom_vline(xintercept = bb2$iplast, colour = "red", linetype = "longdash")
My question: are there other approaches or packages for this calculation?
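One extra baseline worth comparing against (my own sketch, not part of the original post): for a logit-link model the fitted probability curve plogis(eta) has its inflection point where the probability equals 0.5, i.e. where the linear predictor eta is zero. Holding days at its mean, the inflection point in temp can therefore be solved for analytically from the fixed effects:
# Analytic inflection point for temp, holding days at its mean.
# fixef() on a glmmTMB fit returns a list; $cond holds the conditional-model
# coefficients of the m_F model fitted above.
fe <- fixef(m_F)$cond
temp_star <- -(fe["(Intercept)"] + fe["days"] * mean(binom.ds$days)) / fe["temp"]
unname(temp_star)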

How to add stars for significance level with odds ratio for polr?

My question is at the very bottom of this post.
Here is an example of code that is very similar to the method I use:
url <- "http://peopleanalytics-regression-book.org/data/soccer.csv"
soccer <- read.csv(url)
head(soccer)
## discipline n_yellow_25 n_red_25 position result country level
## 1 None 4 1 S D England 1
## 2 None 2 2 D W England 2
## 3 None 2 1 M D England 1
## 4 None 2 1 M L Germany 1
## 5 None 2 0 S W Germany 1
## 6 None 3 2 M W England 1
str(soccer)
## 'data.frame': 2291 obs. of 7 variables:
## $ discipline : chr "None" "None" "None" "None" ...
## $ n_yellow_25: int 4 2 2 2 2 3 4 3 4 3 ...
## $ n_red_25 : int 1 2 1 1 0 2 2 0 3 3 ...
## $ position : chr "S" "D" "M" "M" ...
## $ result : chr "D" "W" "D" "L" ...
## $ country : chr "England" "England" "England" "Germany" ...
## $ level : int 1 2 1 1 1 1 2 1 1 1 ...
# convert discipline to ordered factor
soccer$discipline <- ordered(soccer$discipline,
                             levels = c("None", "Yellow", "Red"))
# check conversion
str(soccer)
# apply as.factor to four columns
cats <- c("position", "country", "result", "level")
soccer[ ,cats] <- lapply(soccer[ ,cats], as.factor)
# check again
str(soccer)
# run proportional odds model
library(MASS)
model <- polr(
  formula = discipline ~ n_yellow_25 + n_red_25 + position +
    country + level + result,
  data = soccer
)
# get summary
summary(model)
## Call:
## polr(formula = discipline ~ n_yellow_25 + n_red_25 + position +
## country + level + result, data = soccer)
##
## Coefficients:
## Value Std. Error t value
## n_yellow_25 0.32236 0.03308 9.7456
## n_red_25 0.38324 0.04051 9.4616
## positionM 0.19685 0.11649 1.6899
## positionS -0.68534 0.15011 -4.5655
## countryGermany 0.13297 0.09360 1.4206
## level2 0.09097 0.09355 0.9724
## resultL 0.48303 0.11195 4.3147
## resultW -0.73947 0.12129 -6.0966
##
## Intercepts:
## Value Std. Error t value
## None|Yellow 2.5085 0.1918 13.0770
## Yellow|Red 3.9257 0.2057 19.0834
##
## Residual Deviance: 3444.534
## AIC: 3464.534
# get coefficients (it's in matrix form)
coefficients <- summary(model)$coefficients
# calculate p-values
p_value <- (1 - pnorm(abs(coefficients[ ,"t value"]), 0, 1))*2
# bind back to coefficients
coefficients <- cbind(coefficients, p_value)
# calculate odds ratios
odds_ratio <- exp(coefficients[ ,"Value"])
# combine with coefficient and p_value
(coefficients <- cbind(
  coefficients[, c("Value", "p_value")],
  odds_ratio
))
Doing this I get the following output:
## Value p_value odds_ratio
## n_yellow_25 0.32236030 0.000000e+00 1.3803820
## n_red_25 0.38324333 0.000000e+00 1.4670350
## positionM 0.19684666 9.105456e-02 1.2175573
## positionS -0.68533697 4.982908e-06 0.5039204
## countryGermany 0.13297173 1.554196e-01 1.1422177
## level2 0.09096627 3.308462e-01 1.0952321
## resultL 0.48303227 1.598459e-05 1.6209822
## resultW -0.73947295 1.083595e-09 0.4773654
## None|Yellow 2.50850778 0.000000e+00 12.2865822
## Yellow|Red 3.92572124 0.000000e+00 50.6896241
My question
However, I want stars with the odds ratios. How is this possible with THIS method? If possible, I would also like to add the standard error.
I do not want to use modelsummary() or gtsummary().
How about this:
url <- "http://peopleanalytics-regression-book.org/data/soccer.csv"
soccer <- read.csv(url)
soccer$discipline <- ordered(soccer$discipline,
                             levels = c("None", "Yellow", "Red"))
cats <- c("position", "country", "result", "level")
soccer[ ,cats] <- lapply(soccer[ ,cats], as.factor)
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked _by_ '.GlobalEnv':
#>
#> cats
model <- polr(
  formula = discipline ~ n_yellow_25 + n_red_25 + position +
    country + level + result,
  data = soccer
)
coefficients <- summary(model)$coefficients
#>
#> Re-fitting to get Hessian
# calculate p-values
p_value <- (1 - pnorm(abs(coefficients[ ,"t value"]), 0, 1))*2
# bind back to coefficients
coefficients <- cbind(coefficients, p_value)
# calculate odds ratios
coefficients <- cbind(coefficients, odds_ratio = exp(coefficients[ ,"Value"]))
# combine with coefficient and p_value
printCoefmat(coefficients[, c("Value", "Std. Error", "odds_ratio", "p_value")],
             P.values = TRUE,
             has.Pvalue = TRUE)
#> Value Std. Error odds_ratio p_value
#> n_yellow_25 0.322360 0.033078 1.3804 < 2.2e-16 ***
#> n_red_25 0.383243 0.040505 1.4670 < 2.2e-16 ***
#> positionM 0.196847 0.116487 1.2176 0.09105 .
#> positionS -0.685337 0.150112 0.5039 4.983e-06 ***
#> countryGermany 0.132972 0.093599 1.1422 0.15542
#> level2 0.090966 0.093547 1.0952 0.33085
#> resultL 0.483032 0.111951 1.6210 1.598e-05 ***
#> resultW -0.739473 0.121293 0.4774 1.084e-09 ***
#> None|Yellow 2.508508 0.191826 12.2866 < 2.2e-16 ***
#> Yellow|Red 3.925721 0.205714 50.6896 < 2.2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2022-12-06 by the reprex package (v2.0.1)
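For reference, a hand-rolled alternative (my sketch, not from the answer above): the stars can also be built with symnum(), the same helper printCoefmat() uses internally, which keeps the result as a plain data frame:
# Build the significance-star column manually with symnum(); assumes the
# `coefficients` matrix constructed above (Value, Std. Error, t value,
# p_value, odds_ratio).
stars <- symnum(coefficients[, "p_value"], corr = FALSE, na = FALSE,
                cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                symbols = c("***", "**", "*", ".", " "))
data.frame(coefficients[, c("Value", "Std. Error", "odds_ratio", "p_value")],
           signif = as.character(stars), check.names = FALSE)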

Caret: There were missing values in resampled performance measures

I am running caret's neural network on the Bike Sharing dataset and I get the following warning message:
In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
I am not sure what the problem is. Can anyone help please?
The dataset is from:
https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Here is the code:
library(caret)
library(bestNormalize)
data_hour = read.csv("hour.csv")
# Split dataset
set.seed(3)
split = createDataPartition(data_hour$casual, p=0.80, list=FALSE)
validation = data_hour[-split,]
dataset = data_hour[split,]
dataset = dataset[,c(-1,-2,-4)]
# View structure of data
str(dataset)
# 'data.frame': 13905 obs. of 14 variables:
# $ season : int 1 1 1 1 1 1 1 1 1 1 ...
# $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
# $ hr : int 1 2 3 5 8 10 11 12 14 15 ...
# $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
# $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
# $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
# $ weathersit: int 1 1 1 2 1 1 1 1 2 2 ...
# $ temp : num 0.22 0.22 0.24 0.24 0.24 0.38 0.36 0.42 0.46 0.44 ...
# $ atemp : num 0.273 0.273 0.288 0.258 0.288 ...
# $ hum : num 0.8 0.8 0.75 0.75 0.75 0.76 0.81 0.77 0.72 0.77 ...
# $ windspeed : num 0 0 0 0.0896 0 ...
# $ casual : int 8 5 3 0 1 12 26 29 35 40 ...
# $ registered: int 32 27 10 1 7 24 30 55 71 70 ...
# $ cnt : int 40 32 13 1 8 36 56 84 106 110 ...
## transform numeric data to Gaussian
dataset_selected = dataset[, c(-13, -14)]
for (i in 8:12) {
  dataset_selected[, i] = predict(boxcox(dataset_selected[, i] + 0.1))
}
# View transformed dataset
str(dataset_selected)
# 'data.frame': 13905 obs. of 12 variables:
# $ season    : int 1 1 1 1 1 1 1 1 1 1 ...
# $ mnth      : int 1 1 1 1 1 1 1 1 1 1 ...
# $ hr        : int 1 2 3 5 8 10 11 12 14 15 ...
# $ holiday   : int 0 0 0 0 0 0 0 0 0 0 ...
# $ weekday   : int 6 6 6 6 6 6 6 6 6 6 ...
# $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
# $ weathersit: int 1 1 1 2 1 1 1 1 2 2 ...
# $ temp      : num -1.47 -1.47 -1.35 -1.35 -1.35 ...
# $ atemp     : num -1.18 -1.18 -1.09 -1.27 -1.09 ...
# $ hum       : num 0.899 0.899 0.637 0.637 0.637 ...
# $ windspeed : num -1.8 -1.8 -1.8 -0.787 -1.8 ...
# $ casual    : num -0.361 -0.588 -0.81 -1.867 -1.208 ...
# Train data with Neural Network model from caret
control = trainControl(method = 'repeatedcv', number = 10, repeats = 3)
metric = 'RMSE'
set.seed(3)
fit = train(casual ~ ., data = dataset_selected, method = 'nnet',
            metric = metric, trControl = control, trace = FALSE)
Thanks for your help!
phiver's comment is spot on; however, I would still like to provide a more verbose answer for this concrete example.
To investigate what is going on in more detail, add the argument savePredictions = "all" to trainControl:
control = trainControl(method = 'repeatedcv',
                       number = 10,
                       repeats = 3,
                       returnResamp = "all",
                       savePredictions = "all")
metric = 'RMSE'
set.seed(3)
fit = train(casual ~ .,
            data = dataset_selected,
            method = 'nnet',
            metric = metric,
            trControl = control,
            trace = FALSE,
            form = "traditional")
Now, when running:
fit$results
#output
size decay RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 1 0e+00 0.9999205 NaN 0.8213177 0.009655872 NA 0.007919575
2 1 1e-04 0.9479487 0.1850270 0.7657225 0.074211541 0.20380571 0.079640883
3 1 1e-01 0.8801701 0.3516646 0.6937938 0.074484860 0.20787440 0.077960642
4 3 0e+00 0.9999205 NaN 0.8213177 0.009655872 NA 0.007919575
5 3 1e-04 0.9272942 0.2482794 0.7434689 0.091409600 0.24363651 0.098854133
6 3 1e-01 0.7943899 0.6193242 0.5944279 0.011560524 0.03299137 0.013002708
7 5 0e+00 0.9999205 NaN 0.8213177 0.009655872 NA 0.007919575
8 5 1e-04 0.8811411 0.3621494 0.6941335 0.092169810 0.22980560 0.098987058
9 5 1e-01 0.7896507 0.6431808 0.5870894 0.009947324 0.01063359 0.009121535
we notice the problem occurs when decay = 0.
Let's filter the observations and predictions for decay = 0:
library(tidyverse)
fit$pred %>%
  filter(decay == 0) -> for_r2
var(for_r2$pred)
#output
0
We can observe that all of the predictions when decay == 0 are the same (they have zero variance). The model exclusively predicts 0:
unique(for_r2$pred)
#output
0
So when the summary function tries to compute R squared:
caret::R2(for_r2$obs, for_r2$pred)
#output
[1] NA
Warning message:
In cor(obs, pred, use = ifelse(na.rm, "complete.obs", "everything")) :
the standard deviation is zero
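A tiny standalone illustration of why this comes out NA (my addition, not from the thread): R squared is based on the correlation between observations and predictions, and cor() is undefined when one of its inputs is constant:
# cor() with a zero-variance vector returns NA and warns that the standard
# deviation is zero -- exactly what caret::R2 runs into above.
obs <- c(1, 2, 3, 4)
pred <- rep(0, 4)  # constant predictions, zero variance
cor(obs, pred)     # NA, with a warning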
Answer by @topepo (the caret package's main developer). See the detailed GitHub thread here.
It looks like it happens when you have one hidden unit and almost no
regularization. What is happening is that the model is predicting a
value very close to a constant (so that the RMSE is a little worse
than the basic standard deviation of the outcome):
> ANN_cooling_fit$resample %>% dplyr::filter(is.na(Rsquared))
RMSE Rsquared MAE size decay Resample
1 8.414010 NA 6.704311 1 0e+00 Fold04.Rep01
2 8.421244 NA 6.844363 1 0e+00 Fold01.Rep03
3 7.855925 NA 6.372947 1 1e-04 Fold10.Rep07
4 7.963816 NA 6.428947 1 0e+00 Fold07.Rep09
5 8.492898 NA 6.901842 1 0e+00 Fold09.Rep09
6 7.892527 NA 6.479474 1 0e+00 Fold10.Rep10
> sd(mydata$V7)
[1] 7.962888
So it's nothing to really worry about; just some parameters that do very poorly.
The answer by @missuse is already very insightful for understanding why this warning happens, so I just want to add some straightforward ways to get rid of it.
If the predictions have zero variance in some cross-validation folds, the model didn't converge. In such cases, you can try the neuralnet package, which offers two parameters you can tune:
threshold : default value = 0.01. Set it to 0.3 and then try lower values 0.2, 0.1, 0.05.
stepmax : default value = 1e+05. Set it to 1e+08 and then try lower values 1e+07, 1e+06.
In most cases, it is sufficient to change the threshold parameter like this:
model.nn <- caret::train(formula1,
                         method = "neuralnet",
                         data = training.set[,],
                         # apply preProcess within cross-validation folds
                         preProcess = c("center", "scale"),
                         trControl = trainControl(method = "repeatedcv",
                                                  number = 10,
                                                  repeats = 3),
                         threshold = 0.3)
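Another simple remedy, staying with method = 'nnet' (my suggestion, not from the thread): exclude decay = 0 from the tuning grid so the degenerate constant fits never occur:
# The grid columns must be named size and decay for caret's nnet method;
# this reuses the control object defined earlier in the answer.
grid <- expand.grid(size = c(1, 3, 5), decay = c(1e-4, 1e-2, 1e-1))
fit <- train(casual ~ ., data = dataset_selected, method = 'nnet',
             metric = 'RMSE', trControl = control, trace = FALSE,
             tuneGrid = grid)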

R Flexsurv and time-dependent covariates

I read that the R flexsurv package can also be used for modeling time-dependent covariates, according to Christopher Jackson (2016), "flexsurv: A Platform for Parametric Survival Modeling in R", Journal of Statistical Software, 70(1).
However, I was not able to figure out how, even after several adjustments and searches in online forums.
Before turning to the estimation of time-dependent covariates I tried to create a simple model with only time-independent covariates to test whether I specified the Surv object correctly. Here is a small example.
library(splitstackshape)
library(flexsurv)
## create sample data
n=50
set.seed(2)
t <- rpois(n,15)+1
x <- rnorm(n,t,5)
df <- data.frame(t,x)
df$id <- 1:n
df$rep <- df$t-1
Which looks like this:
t x id rep
1 12 17.696149 1 11
2 12 20.358094 2 11
3 11 2.058789 3 10
4 16 26.156213 4 15
5 13 9.484278 5 12
6 15 15.790824 6 14
...
And the long data:
long.df <- expandRows(df, "rep")
rep.vec <- c()
for (i in 1:n) {
  rep.vec <- c(rep.vec, 1:(df[i, "t"] - 1))
}
long.df$start <- rep.vec
long.df$stop <- rep.vec +1
long.df$censrec <- 0
long.df$censrec<-ifelse(long.df$stop==long.df$t,1,long.df$censrec)
Which looks like this:
t x id start stop censrec
1 12 17.69615 1 1 2 0
1.1 12 17.69615 1 2 3 0
1.2 12 17.69615 1 3 4 0
1.3 12 17.69615 1 4 5 0
1.4 12 17.69615 1 5 6 0
1.5 12 17.69615 1 6 7 0
1.6 12 17.69615 1 7 8 0
1.7 12 17.69615 1 8 9 0
1.8 12 17.69615 1 9 10 0
1.9 12 17.69615 1 10 11 0
1.10 12 17.69615 1 11 12 1
2 12 20.35809 2 1 2 0
...
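As an aside (my suggestion, not part of the original post): survival::survSplit can build the same counting-process long format without the explicit loop, under the assumption that every subject experiences the event at time t:
# survSplit cuts each record at the supplied time points; zero = 1 starts
# the first interval at 1 to mirror the loop above.
library(survival)
df$event <- 1  # every observation is an event here
long2 <- survSplit(Surv(t, event) ~ x + id, data = df, cut = 1:max(df$t),
                   zero = 1, start = "start", end = "stop", event = "censrec")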
Now I can estimate a simple Cox model to see whether it works:
coxph(Surv(t)~x,data=df)
This yields:
coef exp(coef) se(coef) z p
x -0.0588 0.9429 0.0260 -2.26 0.024
And in the long format:
coxph(Surv(start,stop,censrec)~x,data=long.df)
I get:
coef exp(coef) se(coef) z p
x -0.0588 0.9429 0.0260 -2.26 0.024
Taken together I conclude that my transformation into the long format was correct. Now, turning to the flexsurv framework:
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")
yields:
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 5.00086 4.05569 6.16631 0.53452 NA NA NA
scale NA 13.17215 11.27876 15.38338 1.04293 NA NA NA
x 15.13380 0.01522 0.00567 0.02477 0.00487 1.01534 1.00569 1.02508
But
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull")
causes an error:
Error in flexsurvreg(Surv(start, stop, censrec) ~ x, data = long.df, dist = "weibull") :
Initial value for parameter 1 out of range
Would anyone happen to know the correct syntax for the latter Surv object? If you use the correct syntax, do you get the same estimates?
Thank you very much,
best,
David
===============
EDIT AFTER FEEDBACK FROM 42
===============
library(splitstackshape)
library(flexsurv)
x<-c(8.136527, 7.626712, 9.809122, 12.125973, 12.031536, 11.238394, 4.208863, 8.809854, 9.723636)
t<-c(2, 3, 13, 5, 7, 37 ,37, 9, 4)
df <- data.frame(t,x)
#transform into long format for time-dependent covariates
df$id <- 1:length(df$t)
df$rep <- df$t-1
long.df <- expandRows(df, "rep")
rep.vec <- c()
for (i in 1:length(df$t)) {
  rep.vec <- c(rep.vec, 1:(df[i, "t"] - 1))
}
long.df$start <- rep.vec
long.df$stop <- rep.vec +1
long.df$censrec <- 0
long.df$censrec<-ifelse(long.df$stop==long.df$t,1,long.df$censrec)
coxph(Surv(t)~x,data=df)
coxph(Surv(start,stop,censrec)~x,data=long.df)
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull",inits=c(shape=.1, scale=1))
Which yields the same estimates for both coxph models but
Call:
flexsurvreg(formula = Surv(time = t) ~ x, data = df, dist = "weibull")
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 1.0783 0.6608 1.7594 0.2694 NA NA NA
scale NA 27.7731 3.5548 216.9901 29.1309 NA NA NA
x 9.3012 -0.0813 -0.2922 0.1295 0.1076 0.9219 0.7466 1.1383
N = 9, Events: 9, Censored: 0
Total time at risk: 117
Log-likelihood = -31.77307, df = 3
AIC = 69.54614
and
Call:
flexsurvreg(formula = Surv(start, stop, censrec) ~ x, data = long.df,
dist = "weibull", inits = c(shape = 0.1, scale = 1))
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 0.8660 0.4054 1.8498 0.3353 NA NA NA
scale NA 24.0596 1.7628 328.3853 32.0840 NA NA NA
x 8.4958 -0.0912 -0.3563 0.1739 0.1353 0.9128 0.7003 1.1899
N = 108, Events: 9, Censored: 99
Total time at risk: 108
Log-likelihood = -30.97986, df = 3
AIC = 67.95973
Reading the error message:
Error in flexsurvreg(Surv(start, stop, censrec) ~ x, data = long.df, dist = "weibull", :
initial values must be a numeric vector
And then reading the help page, ?flexsurvreg, it seemed worth trying to set inits to a named numeric vector:
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull", inits=c(shape=.1, scale=1))
Call:
flexsurvreg(formula = Surv(start, stop, censrec) ~ x, data = long.df,
dist = "weibull", inits = c(shape = 0.1, scale = 1))
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 5.00082 4.05560 6.16633 0.53454 NA NA NA
scale NA 13.17213 11.27871 15.38341 1.04294 NA NA NA
x 15.66145 0.01522 0.00567 0.02477 0.00487 1.01534 1.00569 1.02508
N = 715, Events: 50, Censored: 665
Total time at risk: 715
Log-likelihood = -131.5721, df = 3
AIC = 269.1443
Extremely similar results. My guess was basically a stab in the dark, so I have no guidance on how to make a choice if this had not succeeded other than to "expand the search."
I just want to mention that in flexsurv v1.1.1, running this code:
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull")
doesn't return any errors. It also gives the same estimates as the non-time-varying call:
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")
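Given that, a minimal sketch of where this leads for the original question (my illustration, not from the answers): with the counting-process Surv() accepted, a time-dependent covariate is simply a column that varies across a subject's (start, stop] rows in the long data:
# Hypothetical time-varying covariate, made to drift with time purely for
# illustration; with real data each interval row would carry the covariate
# value actually observed in that interval.
long.df$x_td <- long.df$x + 0.1 * long.df$start
flexsurvreg(Surv(start, stop, censrec) ~ x_td, data = long.df, dist = "weibull")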

ggplot function factor level as title

I want to create plots from the following data, so that each plot's title is the Site. I have the following data frame:
> head(sum_stats)
Season Site Isotope Time n mean sd se
1 Summer Afon Cadnant 14CAA 0 3 100.00000 0.000000 0.0000000
2 Summer Afon Cadnant 14CAA 2 3 68.26976 4.375331 2.5260988
3 Summer Afon Cadnant 14CAA 5 3 69.95398 7.885443 4.5526627
4 Summer Afon Cadnant 14CAA 24 3 36.84054 2.421846 1.3982532
5 Summer Afon Cadnant 14CAA 48 3 27.96619 0.829134 0.4787008
6 Summer Afon Cadnant 14CAA 72 3 26.28713 1.454819 0.8399404
> str(sum_stats)
'data.frame': 648 obs. of 8 variables:
$ Season : Factor w/ 1 level "Summer": 1 1 1 1 1 1 1 1 1 1 ...
$ Site : Factor w/ 27 levels "Afon Cadnant",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Isotope: Factor w/ 4 levels "14CAA","14CGlu",..: 1 1 1 1 1 1 2 2 2 2 ...
$ Time : num 0 2 5 24 48 72 0 2 5 24 ...
$ n : int 3 3 3 3 3 3 3 3 3 3 ...
$ mean : num 100 68.3 70 36.8 28 ...
$ sd : num 0 4.375 7.885 2.422 0.829 ...
$ se : num 0 2.526 4.553 1.398 0.479 ...
I have written a function to create plots of the above data:
# Note: the closing brace originally sat after facet_wrap(), stranding the
# theme() calls outside the function; it belongs at the end.
plot_func <- function(T) {
  ggplot(data = T) +
    geom_point(aes(Time, mean, colour = Season)) +
    geom_line(aes(Time, mean, colour = Season)) +
    geom_errorbar(aes(Time, mean, ymax = (mean + se), ymin = (mean - se)), width = 0.1) +
    labs(title = unique(levels(sum_stats$Site)),
         y = "Percentage of isotope remaining in solution", x = "Time (h)") +
    facet_wrap(~Isotope, ncol = 2) +
    theme(axis.title.y = element_text(vjust = 1)) +
    theme(axis.title.x = element_text(vjust = -0.1)) +
    theme(plot.title = element_text(vjust = 1)) +
    theme_bw()
}
I then use the function in a by call to run the function over each level of the Site factor:
by(sum_stats, sum_stats$Site, plot_func)
I get 27 graphs of the expected form, but all the titles are the same. How can I make each title reflect the factor level that is being plotted? Can this be done inside the plotting function?
Thanks
Right now you are setting the title using the original data.frame, and not the subset of data passed to your function. If all the sites are the same in the subset you receive, you can just use the first as the title. Use
...
labs(title = T$Site[1], ...)
...
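Putting that fix into the function gives something like this (a sketch of the corrected version; the theme() tweaks from the question are omitted for brevity):
plot_func <- function(T) {
  ggplot(data = T) +
    geom_point(aes(Time, mean, colour = Season)) +
    geom_line(aes(Time, mean, colour = Season)) +
    geom_errorbar(aes(x = Time, ymin = mean - se, ymax = mean + se), width = 0.1) +
    labs(title = T$Site[1],  # title now comes from the subset, not the global
         y = "Percentage of isotope remaining in solution", x = "Time (h)") +
    facet_wrap(~Isotope, ncol = 2) +
    theme_bw()
}
by(sum_stats, sum_stats$Site, plot_func)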

How do I extract lmer fixed effects by observation?

I have an lmer object, constructed from some repeated-measures nutrient intake data (two 24-hour intake periods per RespondentID):
Male.lme2 <- lmer(BoxCoxXY ~ -1 + AgeFactor + IntakeDay + (1 | RespondentID),
                  data = Male.Data,
                  weights = SampleWeight)
and I can successfully retrieve the random effects by RespondentID using ranef(Male.lme1). I would also like to collect the result of the fixed effects by RespondentID. coef(Male.lme1) does not provide exactly what I need, as I show below.
> summary(Male.lme1)
Linear mixed model fit by REML
Formula: BoxCoxXY ~ AgeFactor + IntakeDay + (1 | RespondentID)
Data: Male.Data
AIC BIC logLik deviance REMLdev
9994 10039 -4990 9952 9980
Random effects:
Groups Name Variance Std.Dev.
RespondentID (Intercept) 0.19408 0.44055
Residual 0.37491 0.61230
Number of obs: 4498, groups: RespondentID, 2249
Fixed effects:
Estimate Std. Error t value
(Intercept) 13.98016 0.03405 410.6
AgeFactor4to8 0.50572 0.04084 12.4
AgeFactor9to13 0.94329 0.04159 22.7
AgeFactor14to18 1.30654 0.04312 30.3
IntakeDayDay2Intake -0.13871 0.01809 -7.7
Correlation of Fixed Effects:
(Intr) AgFc48 AgF913 AF1418
AgeFactr4t8 -0.775
AgeFctr9t13 -0.761 0.634
AgFctr14t18 -0.734 0.612 0.601
IntkDyDy2In -0.266 0.000 0.000 0.000
I have appended the fitted results to my data; head(Male.Data) shows:
NutrientID RespondentID Gender Age SampleWeight IntakeDay IntakeAmt AgeFactor BoxCoxXY lmefits
2 267 100020 1 12 0.4952835 Day1Intake 12145.852 9to13 15.61196 15.22633
7 267 100419 1 14 0.3632839 Day1Intake 9591.953 14to18 15.01444 15.31373
8 267 100459 1 11 0.4952835 Day1Intake 7838.713 9to13 14.51458 15.00062
12 267 101138 1 15 1.3258785 Day1Intake 11113.266 14to18 15.38541 15.75337
14 267 101214 1 6 2.1198688 Day1Intake 7150.133 4to8 14.29022 14.32658
18 267 101389 1 5 2.1198688 Day1Intake 5091.528 4to8 13.47928 14.58117
The first couple of lines from coef(Male.lme1) are:
$RespondentID
(Intercept) AgeFactor4to8 AgeFactor9to13 AgeFactor14to18 IntakeDayDay2Intake
100020 14.28304 0.5057221 0.9432941 1.306542 -0.1387098
100419 14.00719 0.5057221 0.9432941 1.306542 -0.1387098
100459 14.05732 0.5057221 0.9432941 1.306542 -0.1387098
101138 14.44682 0.5057221 0.9432941 1.306542 -0.1387098
101214 13.82086 0.5057221 0.9432941 1.306542 -0.1387098
101389 14.07545 0.5057221 0.9432941 1.306542 -0.1387098
To demonstrate how the coef results relate to the fitted estimates in Male.Data (which were grabbed using Male.Data$lmefits <- fitted(Male.lme1)): for the first RespondentID, who has the AgeFactor level 9to13,
- the fitted value is 15.22633, which equals, from the coefs, (Intercept) + AgeFactor9to13 = 14.28304 + 0.9432941.
Is there a clever command that will do what I want automatically, which is to extract the fixed-effect estimate for each subject? Or am I faced with a series of if statements trying to apply the correct AgeFactor level to each subject to get the correct fixed-effect estimate, after deducting the random-effect contribution from the Intercept?
Update: apologies, I was trying to cut down on the output I was providing and forgot about str(). The output is:
>str(Male.Data)
'data.frame': 4498 obs. of 11 variables:
$ NutrientID : int 267 267 267 267 267 267 267 267 267 267 ...
$ RespondentID: Factor w/ 2249 levels "100020","100419",..: 1 2 3 4 5 6 7 8 9 10 ...
$ Gender : int 1 1 1 1 1 1 1 1 1 1 ...
$ Age : int 12 14 11 15 6 5 10 2 2 9 ...
$ BodyWeight : num 51.6 46.3 46.1 63.2 28.4 18 38.2 14.4 14.6 32.1 ...
$ SampleWeight: num 0.495 0.363 0.495 1.326 2.12 ...
$ IntakeDay : Factor w/ 2 levels "Day1Intake","Day2Intake": 1 1 1 1 1 1 1 1 1 1 ...
$ IntakeAmt : num 12146 9592 7839 11113 7150 ...
$ AgeFactor : Factor w/ 4 levels "1to3","4to8",..: 3 4 3 4 2 2 3 1 1 3 ...
$ BoxCoxXY : num 15.6 15 14.5 15.4 14.3 ...
$ lmefits : num 15.2 15.3 15 15.8 14.3 ...
The BodyWeight and Gender aren't being used (this is the males data, so all the Gender values are the same) and the NutrientID is similarly fixed for the data.
I have been doing horrible ifelse statements since I posted, so I will try out your suggestion immediately. :)
Update2: this works perfectly with my current data and should be future-proof for new data, thanks to DWin for the extra help in the comment for this. :)
AgeLevels <- length(unique(Male.Data$AgeFactor))
Temp <- as.data.frame(fixef(Male.lme1)['(Intercept)'] +
  c(0, fixef(Male.lme1)[2:AgeLevels])[
    match(Male.Data$AgeFactor,
          c("1to3", "4to8", "9to13", "14to18", "19to30", "31to50", "51to70", "71Plus"))] +
  c(0, fixef(Male.lme1)[(AgeLevels + 1)])[
    match(Male.Data$IntakeDay, c("Day1Intake", "Day2Intake"))])
names(Temp) <- c("FxdEffct")
Below is how I've always found it easiest to extract the individuals' fixed-effects and random-effects components in the lme4 package. It actually extracts the corresponding fit for each observation. Assuming we have a mixed-effects model of form:
y = Xb + Zu + e
where Xb are the fixed effects and Zu are the random effects, we can extract the components (using lme4's sleepstudy as an example):
library(lme4)
fm1 <- lmer(Reaction ~ Days + (Days|Subject), sleepstudy)
# Xb
fix <- getME(fm1,'X') %*% fixef(fm1)
# Zu
ran <- t(as.matrix(getME(fm1,'Zt'))) %*% unlist(ranef(fm1))
# Xb + Zu
fixran <- fix + ran
I know that this works as a generalized approach to extracting components from linear mixed-effects models. For non-linear models, the model matrix X contains repeats and you may have to tailor the above code a bit. Here's some validation output as well as a visualization using lattice:
> head(cbind(fix, ran, fixran, fitted(fm1)))
[,1] [,2] [,3] [,4]
[1,] 251.4051 2.257187 253.6623 253.6623
[2,] 261.8724 11.456439 273.3288 273.3288
[3,] 272.3397 20.655691 292.9954 292.9954
[4,] 282.8070 29.854944 312.6619 312.6619
[5,] 293.2742 39.054196 332.3284 332.3284
[6,] 303.7415 48.253449 351.9950 351.9950
# Xb + Zu
> all(round((fixran),6) == round(fitted(fm1),6))
[1] TRUE
# e = y - (Xb + Zu)
> all(round(resid(fm1),6) == round(sleepstudy[,"Reaction"]-(fixran),6))
[1] TRUE
nobs <- 10  # 10 observations per subject
legend <- list(text = list(c("y", "Xb + Zu", "Xb")),
               lines = list(col = c("blue", "red", "black"), pch = c(1, 1, 1),
                            lwd = c(1, 1, 1), type = c("b", "b", "b")))
require(lattice)
xyplot(
  Reaction ~ Days | Subject, data = sleepstudy,
  panel = function(x, y, ...) {
    panel.points(x, y, type = 'b', col = 'blue')
    panel.points(x, fix[(1 + nobs * (panel.number() - 1)):(nobs * panel.number())],
                 type = 'b', col = 'black')
    panel.points(x, fixran[(1 + nobs * (panel.number() - 1)):(nobs * panel.number())],
                 type = 'b', col = 'red')
  },
  key = legend
)
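A shortcut worth knowing (my addition, not part of the original answer): current lme4 also exposes the fixed-effects-only fit directly via predict() with re.form = NA, which should reproduce the Xb column computed above:
# Population-level (fixed-effects-only) predictions, i.e. Xb without Zu.
fix2 <- predict(fm1, re.form = NA)
all.equal(as.numeric(fix), as.numeric(fix2))  # expected TRUE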
It is going to be something like this (although you really should have given us the results of str(Male.Data), because the model output does not tell us the factor levels for the baseline values):
#First look at the coefficients
fixef(Male.lme2)
#Then do the calculations
fixef(Male.lme2)["(Intercept)"] +
  c(0, fixef(Male.lme2)[2:4])[
    match(Male.Data$AgeFactor, c("1to3", "4to8", "9to13", "14to18"))] +
  c(0, fixef(Male.lme2)[5])[
    match(Male.Data$IntakeDay, c("Day1Intake", "Day2Intake"))]
You are basically running the original data through a match function to pick the correct coefficient(s) to add to the intercept ... which will be 0 if the data is the factor's base level (whose spelling I am guessing at.)
EDIT: I just noticed that you put a "-1" in the formula, so perhaps all of your AgeFactor terms are listed in the output and you can take out the 0 in the coefficient vector and the invented AgeFactor level in the match table vector.
