R: how to specify predictors in mboost model - r

I have the following dataset with 3 columns of covariates, and 1 outcome column:
data <- structure(list(V1 = c(0.368203440103238, 0.324519532540959, -0.267369607029419,
-0.551350850969297, 0.12599748535452), V2 = c(-0.685091020879978,
0.0302665318913346, 0.38152909685676, -0.741473194305708, 1.01476858643759
), V3 = c(-1.11459785962843, -0.012932271762972, 2.02715929057818,
0.118419126609398, -1.01804828579617), y = c(-1.95083653823476,
-0.50091658480941, 3.74423248124182, -0.0459478421882341, -1.24653151600941
)), class = "data.frame", row.names = c("X1", "X2", "X3", "X4",
"X5"))
> head(data)
V1 V2 V3 y
X1 0.3682034 -0.68509102 -1.11459786 -1.95083654
X2 0.3245195 0.03026653 -0.01293227 -0.50091658
X3 -0.2673696 0.38152910 2.02715929 3.74423248
X4 -0.5513509 -0.74147319 0.11841913 -0.04594784
X5 0.1259975 1.01476859 -1.01804829 -1.24653152
I want to fit the following model:
library(mboost)
model <- mboost(y ~ bols(V1, intercept = FALSE) +
bols(V2, intercept = FALSE) + bols(V3, intercept = FALSE),
data = data)
However, it is very tedious to type out bols(covariate, intercept = FALSE) for every single column in the model. Is there a way to automate this for an arbitrary number of covariates? For example, I currently have 3 covariates named V1, V2, V3. But what if I had 10 that are named V1-V10? I would like to avoid having to type out 10 bols() statements.

We can create a formula expression with paste
fmla <- as.formula(paste0('y ~ ', paste0('bols(', setdiff(names(data),
'y'), ', intercept = FALSE)', collapse= " + ")))
model <- mboost(fmla, data = data)
model$call[[2]] <- fmla
model
# Model-based Boosting
#Call:
#mboost(formula = y ~ bols(V1, intercept = FALSE) + bols(V2, intercept = FALSE) + bols(V3, intercept = FALSE), data = data)
# Squared Error (Regression)
#Loss function: (y - f)^2
#Number of boosting iterations: mstop = 100
#Step size: 0.1
#Offset: 1.157408e-15
#Number of baselearners: 3

Related

tbl_regression(), plm, and mice - Error: Tibble columns must have compatible sizes

I am trying to print a regression model using tbl_regression() on a plm object with multiply imputed data. I've found that I can print the regression table if the plm has one independent variable, but not if it has two or more independent variables.
I understand that the below error message is common, but I don't understand what it means in the context of tbl_regression and multiply imputed data. Is there a bug in gtsummary, or is something wrong with my code?
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit1 <- lapply(implist$imputations, function(x){ plm(inv ~ value, data = x, model = "within", index = c("firm", "year"))})
#tbl_regression(as.mira(fit1)) # works
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2)) # does not work
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`
#> Error: Tibble columns must have compatible sizes.
#> * Size 2: Existing data.
#> * Size 3: Column `variable`.
#> ℹ Only values of size one are recycled.
Thank you! This code now works under the current version of gtsummary.
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
mice::version(pkg = "gtsummary")
#> [1] "gtsummary 1.5.1.9001 /Library/Frameworks/R.framework/Versions/4.1/Resources/library"
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2))
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`

Repeated Single Regression w/ Factor and Continuous Variables

I have a dataset similar to the below structure that I'd like to run several single regressions with:
example <- tibble(
id = paste0('ID', runif(100,0,10)),
response = runif(100,0,10),
x1 = runif(100,0,10),
x2 = factor(rep(seq(1,5),20)))
regression_1 <- lm(response ~ x1, data = example)
regression_2 <- lm(response ~ x2, data = example)
And so on for n predictors. I've tried a variety of approaches but I keep running into an error and cannot figure out for the life of me any neat and easy way to do this.
Appreciate any help
Using reformulate.
lapply(c("x1", "x2", "xn"), function(x) lm(reformulate(x, response="response"), d))
# [[1]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x1
# 0.03567 0.03603
#
#
# [[2]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x2
# 0.03098 -0.14824
#
#
# [[3]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) xn
# 0.02961 0.08823
Data:
set.seed(42)
d <- data.frame(matrix(rnorm(400), 100, 4, dimnames=list(NULL, c("response", "x1", "x2", "xn"))))
We can subset the data and use ~ .
lapply(c("x1", "x2", "xn"), function(x) lm(response ~ ., df1[c('response', x)]))

Linear regression with ongoing data, in R

Modell
y ~ x1 + x2 + x3
about 1000 rows
What Iwant to do is to do an prediction "step-by-step"
Using Row 0:20 to predict y of 21:30 and then using 11:30 to predict y of 31:40 and so on.
You can use the predict function:
mod = lm(y ~ ., data=df[1:990,])
pred = predict(mod, newdata=df[991:1000,2:4])
Edit: to change the range of training data in a loop:
index = seq(10,990,10)
pred = matrix(nrow=10, ncol=length(index))
for(i in index){
mod = lm(y ~ ., data=df[1:i,])
pred[,i/10] = predict(mod, newdata=df[(i+1):(i+10),2:4])
MSE[i/10] = sum((df$y[(i+1):(i+10)]-pred[,i/10])^2)}
mean(MSE)
Are you looking for something like this?
# set up mock data
set.seed(1)
df <- data.frame(y = rnorm(1000),
x1 = rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000))
# for loop
prd <- list()
for(i in 1:970){
# training data
trn <- df[i:(i+20), ]
# test data
tst <- df[(i+21):(i+30), ]
# lm model
mdl <- lm(y ~ x1 + x2 + x3, trn)
# append a list of data.frame with both predicted and actual values
# for later confrontation
prd[[i]] <- data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
# your list
prd
You can also try something fancier with the package slider:
# define here your model and how you wanna handle the preditions
sliding_lm <- function(..., frm, n_trn, n_tst){
df <- data.frame(...)
trn <- df[1:n_trn, ]
tst <- df[n_trn+1:n_tst, ]
mdl <- lm(y ~ x1 + x2 + x3, trn)
data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
n_trn <- 20 # number of training obs
n_tst <- 10 # number of test obs
frm <- y ~ x1 + x2 + x3 # formula of your model
prd <- slider::pslide(df, sliding_lm,
frm = frm,
n_trn = n_trn,
n_tst = n_tst,
.after = n_trn + n_tst,
.complete = TRUE)
Note that the last 30 entries in the list are NULL, because you look only at complete windows [30 observations with training and test]

Drop each Regressor step by step

Is there any chance to specify the full model once and then just to drop regressors one after the other and producing a nice stargazer table with it without having to write every regression line again and again?
data <- datasets::airquality
# Treating Month and Day as crosssectional and time fixed effects
re1 <- plm(data = data, Ozone~Solar.R+Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # full model
# this is the only regression line I actually want to write.
# The other regressors should be automatically dropped one by one in subsequent regressions.
re2 <- plm(data = data, Ozone~Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # first regressor dropped
re3 <- plm(data = data, Ozone~Solar.R+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # second regressor dropped
re4 <- plm(data = data, Ozone~Solar.R+Wind,
index=c("Month", "Day"), model="within", effect="twoways") # third regressor dropped
stargazer(re1, re2, re3, re4,
type = "html",
title = "Dropped after one another",
out="HopeThisWorks.html")
I have looked into the step() function but this isn't quite helping as I am not aiming to drop according to significance or anything.
re1 <- plm(data = data, Ozone~Solar.R+Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # full model
A=lapply(paste0(".~.-",c("Solar.R","Wind","Temp")),function(x)update(re1,as.formula(x)))
[[1]]
Model Formula: Ozone ~ Wind + Temp
Coefficients:
Wind Temp
-2.6933 2.3735
[[2]]
Model Formula: Ozone ~ Solar.R + Temp
Coefficients:
Solar.R Temp
0.040986 2.782978
[[3]]
Model Formula: Ozone ~ Solar.R + Wind
Coefficients:
Solar.R Wind
0.096607 -4.841992
Now to be able to access this in the global environment: use
list2env(setNames(A,paste0("re",seq_along(A)+1)),.GlobalEnv)
stargazer(re1, re2, re3, re4,
type = "html",
title = "Dropped after one another",
out="HopeThisWorks.html")
This is a more flexible alternative:
bene <- function(data = datasets::airquality,
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R")){
funny <- function(id){
covs <- ifelse(is.na(id),paste0(covariates,collapse = " + "),paste0(covariates[-id],collapse = " + "))
model <- eval(parse(text=paste0("plm(data = data,",depvar," ~ ",covs,",index=c('Month', 'Day'), model='within', effect='twoways')")))
return(model)
}
xx <- capture.output(stargazer::stargazer(purrr::map(c(NA,1:length(covariates)),funny),
type = "html",
out = paste0("results/model.html"),
star.cutoffs = c(0.05,0.01,0.001),
title = paste0(depvar)))
models <- purrr::map(c(NA,1:length(covariates)),funny)
map(models,function(x)print(summary(x)))
}
bene(data = datasets::airquality,
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R"))
You could use (an adaption of) this (so far only stepwise-working) function, which shows you the stargazer output and also saves the regression table as a html file.
stepwise_model <- function(data = "datasets::airquality",
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R")){
data_df <- eval(parse(text = data))
models <- c()
for(q in 1:length(covariates)){
label <- paste0("mod_",depvar,"_",q)
models[q] <- label
cov <- paste0(covariates[1:q],collapse = " + ")
eval(parse(text = paste0(label," <<- plm::plm(",depvar," ~ ",cov,",data = data_df,index=c('Month', 'Day'), model='within', effect='twoways')")))
eval(parse(text = paste0("print(summary(",label,"))")))
}
modellist <- eval(parse(text = paste0("list(",paste0(models,collapse = ","),")")))
xx <- capture.output(stargazer::stargazer(modellist ,
type = "html",
out = paste0("results/paper/models/mod_",depvar,".html"),
star.cutoffs = c(0.05,0.01,0.001),
title = paste0(depvar)))
}
stepwise_model()

Error from JM package in R

JM is a package to fit a model with joint longitudinal and survival data. I can get it to run with their example data, but I get an error with my own data. Any idea what the issue with JMfit1 or JMfit2 is?
My Data:
https://1drv.ms/u/s!AkG9wyz5G1c1gR4Vs_xohO--4Rb5
install.packages('JM')
require(JM)
?jointModel
# Example from vignette
# linear mixed model fit (random intercepts + random slopes)
fitLME <- lme(log(serBilir) ~ drug * year, random = ~ year | id, data = pbc2)
summary(fitLME)
# survival regression fit
fitSURV <- survreg(Surv(years, status2) ~ drug, data = pbc2.id, x = TRUE)
summary(fitSURV)
# joint model fit, under the (default) Weibull model
fitJOINT <- jointModel(fitLME, fitSURV, timeVar = "year")
fitJOINT
summary(fitJOINT)
# we can also include an interaction term of log(serBilir) with drug
fitJOINT <- jointModel(fitLME, fitSURV, timeVar = "year",
# interFact = list(value = ~ drug, data = pbc2.id))
fitJOINT
summary(fitJOINT)
# With my data:
data = readRDS('data.list.d1.dk.RDS')
d1 = data$d1
dk = data$dk
dim(d1); names(d1)
dim(dk); names(dk)
slct.cov = c('ID','Yi','Ai','zi.1','zi.2','zi.3','xi_A','di')
fmla.fix = as.formula('Yi ~ Ai*(zi.1+zi.2+zi.3)')
fmla.rnd = as.formula(' ~ Ai|ID')
fit.Yi = lme(fixed= fmla.fix, random=reStruct(fmla.rnd),
method="ML", data = dk[,slct.cov] )
surv.model = survreg(Surv(xi_A, di) ~ zi.1+zi.2+zi.3, data = d1[,slct.cov], x = TRUE)
JMfit1 = jointModel(lmeObject = fit.Yi, survObject = surv.model, timeVar = 'Ai')
# Error in if (t1 || t2) { : missing value where TRUE/FALSE needed
dForm <- list(fixed = ~ 1 + zi.1 + zi.2 + zi.3, indFixed = c(2,6,7,8), random = ~ 1, indRandom = 2)
JMfit2 = jointModel(lmeObject = fit.Yi, survObject = surv.model, timeVar = 'Ai',
derivForm = dForm, parameterization = c("both")) #"both", "value", "slope"
# method = "weibull-PH-aGH",
# "weibull-PH-aGH", "weibull-PH-GH", "weibull-AFT-aGH","weibull-AFT-GH",
# "piecewise-PH-aGH", "piecewise-PH-GH", "Cox-PH-aGH", "Cox-PH-GH",
# "spline-PH-aGH", "spline-PH-GH", # "ch-Laplace"
# interFact = NULL, lag = 0, scaleWB = NULL,
# CompRisk = FALSE, init = NULL, control = list())
It looks to me that your fmla.fix model needs more investigation in its own right maybe.
Simplifying the interactions to Yi ~ Ai+zi.1+zi.3 + Ai*(zi.2) or even Yi ~ Ai+zi.1+zi.2+zi.3 seem to give a valid JMfit1 output.
I suspect you'll get a different error for JMfit2 (do you??), so that may be a subsequent SO question.

Resources