create a reusable part in R formula - r

That is not a duplicate of this (Edit and reuse the formula part of the call for a model in R) but rather an extension.
So let's say I have several regression models to test. I also have a set of socio demographic variables that I want to add to each model to control for them.
m1 <- lm(a~b)
m1_full <- lm(a~b+age+gender+income)
m2 <- lm(c~d)
m2_full <- lm(c~d+age+gender+income)
Is there a way to store (age+gender+income) and add it to some models?
Something like that (in pseudocode):
ses <- age+gender+income
m1 <- lm(a~b)
m1_full <- lm(a~b+... ses)
m2 <- lm(c~d)
m2_full <- lm(c~d+...ses)

I guess you can just paste everything together into a formula. Here's a function that would do the whole process:
set.seed(0)
df <- data.frame(age = rpois(200,45),
gender = rbinom(200,1,0.5),
income = rnorm(200,30000,10000),
a = rnorm(200),
b = rnorm(200),
c = rnorm(200),
d = rnorm(200))
ses <- c("age", "gender", "income")
get_lm_model <- function(df, outcome_var, pred_var, ses) {
fm <- as.formula(paste(outcome_var, "~", pred_var, "+", paste(ses, collapse = " + ")))
lm(fm, data = df)
}
get_lm_model(df, "a", "b", ses)
#>
#> Call:
#> lm(formula = fm, data = df)
#>
#> Coefficients:
#> (Intercept) b age gender income
#> 3.345e-01 -9.516e-02 -3.748e-03 -7.033e-02 -6.718e-06
get_lm_model(df, "c", "d", ses)
#>
#> Call:
#> lm(formula = fm, data = df)
#>
#> Coefficients:
#> (Intercept) d age gender income
#> 2.775e-01 5.570e-02 -4.564e-03 -5.359e-02 -5.898e-06
Created on 2021-10-26 by the reprex package (v2.0.1)

Related

Extract categorical coeffients and all p-values from a mixed model into a data table

Here is a reproduceable code and sample data
I want to achieve a final data table with 3 columns: 1. exposure quantile 2. OR/RR 3. PV
set.seed(42)
n <- 100
dat = data.frame(ID = rep(c(1:25),times=4 ) ,
Score = rnorm(n, mean=0.3, sd=0.8))
dat = dat %>%
group_by(ID)%>%
dplyr::mutate(exposure1 = rep(c(rnorm(1, mean=6, sd=1.8))),
exposure2 = rep(c(rnorm(1, mean=3, sd=0.6))),
age = rep(c(rnorm(1, mean=40, sd=15))))%>%
ungroup()%>%
dplyr::mutate(exposure1_quantile = cut(exposure1, breaks = 4, labels = c("Q1","Q2","Q3","Q4")),
exposure2_quantile = cut(exposure2, breaks = 4, labels = c("Q1","Q2","Q3","Q4")))
exposures_var = c("exposure1_quantile","exposure2_quantile")
exposure_var_labels("exposure1 Q1","exposure1 Q2 ", "exposure 1 Q3",
"exposure2 Q1","exposure2 Q2 ", "exposure2 Q3")
age="age"
outcome = "Score"
exposure_data_table = c()
for(i in 1:length(exposures_var)){
exp = exposures_var[i]
fixed_effects_formula = paste0(outcome, "~",exp,"+",age)
fixed_effects_formula = as.formula(fixed_effects_formula)
mixedmodel = lme(fixed =fixed_effects_formula, random = ~1|ID, data=dat, method = "ML")
for(m in 2:4){
v = mixedmodel$coefficients$fixed[m]
vector = c(exp , v)
#P=p value for every quantile (HOW TO ADD?)
#exposure_name = exposure_var_labels[?] (HOW TO ADD LABEL)
exposure_data_table = rbind(exposure_data_table, vector)
}
}
exposure_data_table = as.data.table(exposure_data_table)
colnames(exposure_data_table)=c("Exposure","RR")#,"pv")
view(exposure_data_table)
I first used anova to try and get the pvalue but it didnt work.
I think a tidymodels approach using lme would work well here:
library(nlme)
library(tidymodels)
library(multilevelmod)
library(data.table)
lme_spec <-
linear_reg() %>%
set_engine("lme", random = ~ 1 | ID)
Map(function(exp) {
fixed_effects_formula <- as.formula(paste0("Score~",exp,"+ age +", 0))
lme_spec %>%
fit(fixed_effects_formula, data = dat) %>%
broom.mixed::tidy() %>%
filter(effect == "fixed", grepl("exposure", term)) %>%
select(term, estimate, std.error, p.value)
}, exposures_var) %>%
bind_rows() %>%
as.data.table()
#> term estimate std.error p.value
#> 1: exposure1_quantileQ1 -0.16147364 0.3532834 0.6525497
#> 2: exposure1_quantileQ2 0.22318505 0.2719366 0.4214784
#> 3: exposure1_quantileQ3 0.24976757 0.3484126 0.4817411
#> 4: exposure1_quantileQ4 0.14177064 0.4020702 0.7280757
#> 5: exposure2_quantileQ1 0.28976458 0.4191198 0.4972840
#> 6: exposure2_quantileQ2 0.19907863 0.2699164 0.4693496
#> 7: exposure2_quantileQ3 0.35040767 0.2827229 0.2295436
#> 8: exposure2_quantileQ4 -0.09587234 0.3533819 0.7889412
Created on 2022-08-07 by the reprex package (v2.0.1)

crr output list- remove df$ from coefficients?

I am using the cmprsk package to create a series of regressions. In the real models I used, I specified my models in the same way that is shown in the example that produces mel2 below. My problem is, I want the Melanoma$ in front of the coefficients to go away, as happens if I had specified the model like in mel1. Is there a way to delete that data frame prefix out of the object without re-running it?
library(cmprsk)
data(Melanoma, package = "MASS")
head(Melanoma)
mel1 <- crr(ftime = Melanoma$time, fstatus = Melanoma$status, cov1 = Melanoma[, c("sex", "age")], cencode = 2)
covs2 <- model.matrix(~ Melanoma$sex + Melanoma$age)[, -1]
mel2 <- crr(ftime = Melanoma$time, fstatus = Melanoma$status, cov1 = covs2, cencode = 2)
What I want:
What I have:
You could use the data argument in model.matrix, and wrap the crr call in with(Melanoma, ...)
covs2 <- model.matrix(~ sex + age, data = Melanoma)[, -1]
mel2 <- with(Melanoma, crr(ftime = time, fstatus = status,
cov1 = covs2, cencode = 2))
mel2$coef
#> sex age
#> 0.58838573 0.01259388
If you are stuck with existing models like this:
covs2 <- model.matrix(~ Melanoma$sex + Melanoma$age)[, -1]
mel2 <- crr(ftime = Melanoma$time, fstatus = Melanoma$status,
cov1 = covs2, cencode = 2)
You could simply rename the coefficients like this
names(mel2$coef) <- c("sex", "age")
mel2
#> convergence: TRUE
#> coefficients:
#> sex age
#> 0.58840 0.01259
#> standard errors:
#> [1] 0.271800 0.009301
#> two-sided p-values:
#> sex age
#> 0.03 0.18

Repeated Single Regression w/ Factor and Continuous Variables

I have a dataset similar to the below structure that I'd like to run several single regressions with:
example <- tibble(
id = paste0('ID', runif(100,0,10)),
response = runif(100,0,10),
x1 = runif(100,0,10),
x2 = factor(rep(seq(1,5),20)))
regression_1 <- lm(response ~ x1, data = example)
regression_2 <- lm(response ~ x2, data = example)
And so on for n predictors. I've tried a variety of approaches but I keep running into an error and cannot figure out for the life of me any neat and easy way to do this.
Appreciate any help
Using reformulate.
lapply(c("x1", "x2", "xn"), function(x) lm(reformulate(x, response="response"), d))
# [[1]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x1
# 0.03567 0.03603
#
#
# [[2]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x2
# 0.03098 -0.14824
#
#
# [[3]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) xn
# 0.02961 0.08823
Data:
set.seed(42)
d <- data.frame(matrix(rnorm(400), 100, 4, dimnames=list(NULL, c("response", "x1", "x2", "xn"))))
We can subset the data and use ~ .
lapply(c("x1", "x2", "xn"), function(x) lm(response ~ ., df1[c('response', x)]))

Number of rows in predicted data frame does not equal number of rows in new data frame fed to predict function

I have a data frame that I split based on a vector of factors. I'm trying to create a model for each data set and then create a set of predicted values from them.
I'm trying to span the predicted values over a large number of values (e.g. length.out = 500), but when I feed the predict function with a new data set with 500 rows, it still spits out a predicted data frame that's the same length as the original data frame fed into the model.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, new.data = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
lapply(mtcars.split, nrow)
I tried running the linear model for the entire data set and it did the same thing.
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, new.data = newdata, se = T)))
Even attempting to subset the data set didn't make any difference.
mtcars.head <- head(mtcars, n = 16)
mtcars.head.lm <- lm(disp ~ wt, data = mtcars.head)
predict.mtcars <- as.data.frame(predict(mtcars.head.lm,
new.data = data.frame(wt = seq(min(mtcars.head$wt),
max(mtcars.head$wt),
length.out = 500)),
se = T))
nrow(predict.mtcars)
Am I missing something here? This used to work but it doesn't seem to work now. Even restarting the R session or my computer doesn't seem to make it work.
the argument in the predict function is not new.data but newdata.
Attached the desired result.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, newdata = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
#> $`4`
#> [1] 500
#>
#> $`6`
#> [1] 500
#>
#> $`8`
#> [1] 500
lapply(mtcars.split, nrow)
#> $`4`
#> [1] 11
#>
#> $`6`
#> [1] 7
#>
#> $`8`
#> [1] 14
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, newdata = newdata, se = T)))
#> [1] 500
Created on 2020-07-22 by the reprex package (v0.3.0)

Multiple Linear Models

I currently have two data tables one of which contains independent and control variables in columns while the other contains rows of dependent variables.
Can anybody help in creating a method to do linear models from the two tables that repeats for each row in the dependent value table?
You haven't provided nor a reproducible example, nor the desired output, so I'll have to guess
If this is your column names vector
vec <- LETTERS[1:3]
And this is your data set
set.seed(1)
df <- data.frame(A = sample(10, 10),
B = sample(20, 10),
C = sample(30, 10))
Then you can try something like
lapply(vec,
function(x) lm(as.formula(paste(x, "~",
paste(setdiff(names(df), x),
collapse = "+"))),
data = df))
Which will give
# [[1]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) B C
# 4.9687 0.2410 -0.1565
#
#
# [[2]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A C
# 2.7975 0.8182 0.2775
#
#
# [[3]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A B
# 13.200 -1.675 0.875

Resources