Related
My goal is to run linear regressions with my defined equation, and then store the model residuals to my original dataset.
library(tidyverse)
library(stringr)
set.seed(5)
df <- data.frame(
id = c(1:100),
age = sample(20:80, 100, replace = TRUE),
sex = sample(c("M", "F"), 100, replace = TRUE, prob = c(0.7, 0.3)),
type = sample(letters[1:4], 100, replace = TRUE),
bmi = sample(15:35, 100, replace = TRUE),
sbp = sample(75:160, 100, replace = TRUE),
cat_outcome1 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.68, 0.32)),
cat_outcome2 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.65, 0.35)),
cat_outcome3 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.60, 0.40)),
cat_outcome4 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.45, 0.55)),
dog_outcome1 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.68, 0.32)),
dog_outcome2 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.65, 0.35)),
dog_outcome3 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.60, 0.40)),
dog_outcome4 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.45, 0.55))
)
outcome = colnames(df)[str_detect(colnames(df), "outcome")]
test_function = function(vars_dep, vars_indep, input_data){
for (z in vars_dep) {
formula = as.formula(paste0(z, " ~ ", vars_indep))
model = lm(formula, data = input_data, na.action = na.exclude)
# Take the residual from each model, create a new col with the suffix '.res'
input_data[, paste0(z, ".res")] = residuals(model)
}
}
Like shown above, I would like to save the residuals and give them a suffix depending on which y I use in the model, and finally save these residuals as columns in my original dataframe df. So I expected to see cat_outcome1.res, cat_outcome2.res as new columns but they were not saved in df. Any suggestions are greatly appreciated!
This function gives you what you want:
test_function <- function(vars_dep, vars_indep, input_data){
for (z in vars_dep) {
formula = as.formula(paste0(z, " ~ ", vars_indep))
model = lm(formula, data = input_data, na.action = na.exclude)
# Take the residual from each model, create a new col with the suffix '.res'
input_data[[paste0(z, ".res")]] <- residuals(model)
}
return(input_data)
}
I have code to loop a logistic regression over several selected dependant variables (called outcome1-4). I would like to only run the model if a condition in an independent variable is met. Let's say I want at least two females for each outcome and type combination.
Dummy data:
set.seed(5)
df <- data.frame(
id = c(1:100),
age = sample(20:80, 100, replace = TRUE),
sex = sample(c("M", "F"), 100, replace = TRUE, prob = c(0.7, 0.3)),
type = sample(letters[1:4], 100, replace = TRUE),
outcome1 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.68, 0.32)),
outcome2 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.65, 0.35)),
outcome3 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.60, 0.40)),
outcome4 = sample(c(0L, 1L), 100, replace = TRUE, prob = c(0.45, 0.55)))
Code to loop GLM (cred to https://stats.idre.ucla.edu/r/codefragments/looping_strings/):
outcomelist <- names(df)[5:8]
modelall <- lapply(outcomelist, function(x) {
glm(substitute(i ~ type + sex, list(i = as.name(x))), family = "binomial", data = df)})
I have found lots of questions concerning the loop but not any with additional condition. I am thinking subset but not being a pro on lapply yet I don't know where to put it.
If this is not an additional question I would like each model to be named the name of the outcome variable in the list (instead of 1 to 4) since it otherwise will be difficult to keep track of the models when the condition is added.
Appreciate any help!
One possibility is to clean the data prior to running lapply():
df.new <- df
for(ii in 1:length(outcomelist)){
temp <- outcomelist[ii]
# check the condition for outcome variable ii
condition <- any(aggregate(df$sex=="F", by=list(df$type, df[,temp]), FUN="sum")$x < 2)
if(condition){
# if the condition is met, remove the variable from df and outcomelist
df.new[,temp] <- NULL
outcomelist[ii] <- NA
}
}
# lose irrelevant outcomes
outcomelist <- na.omit(outcomelist)
modelall <- lapply(outcomelist, function(x) {
glm(substitute(i ~ type + sex, list(i = as.name(x))), family = "binomial", data = df.new)})
# name the list
names(modelall) <- outcomelist
Let's assume I have this dataframe:
N <- 50
df <- data.frame(
LA1 = sample(1:10, size = N, replace = TRUE),
LA2 = sample(1:10, size = N, replace = TRUE),
LA3 = sample(1:10, size = N, replace = TRUE),
LA4 = sample(1:10, size = N, replace = TRUE),
LA5 = sample(1:10, size = N, replace = TRUE),
LA6 = sample(1:10, size = N, replace = TRUE),
LA7 = sample(1:10, size = N, replace = TRUE),
LA8 = sample(1:10, size = N, replace = TRUE),
LAY = sample(1:10, size = N, replace = TRUE),
UF1 = sample(1:10, size = N, replace = TRUE),
UF2 = sample(1:10, size = N, replace = TRUE),
UF3 = sample(1:10, size = N, replace = TRUE),
UF4 = sample(1:10, size = N, replace = TRUE),
UF5 = sample(1:10, size = N, replace = TRUE),
UF6 = sample(1:10, size = N, replace = TRUE),
UFY = sample(1:10, size = N, replace = TRUE),
EK1 = sample(1:10, size = N, replace = TRUE),
EK2 = sample(1:10, size = N, replace = TRUE),
EK3 = sample(1:10, size = N, replace = TRUE),
EK4 = sample(1:10, size = N, replace = TRUE),
EK5 = sample(1:10, size = N, replace = TRUE),
EK6 = sample(1:10, size = N, replace = TRUE),
EK7 = sample(1:10, size = N, replace = TRUE),
EK8 = sample(1:10, size = N, replace = TRUE),
EK9 = sample(1:10, size = N, replace = TRUE),
EK10 = sample(1:10, size = N, replace = TRUE),
EK11 = sample(1:10, size = N, replace = TRUE),
EK12 = sample(1:10, size = N, replace = TRUE),
EKY = sample(1:10, size = N, replace = TRUE),
Z1 = sample(1:10, size = N, replace = TRUE),
Z2 = sample(1:10, size = N, replace = TRUE),
Z3 = sample(1:10, size = N, replace = TRUE)
)
Where I want to compute this models:
m1=lm(formula = LAY ~ LA1+LA2+LA3+LA4+LA5+LA6+LA7+LA8, data = df)
m11=step(m1,direction="both")
m2=lm(formula = UFY ~ UF1+UF2+UF3+UF4+UF5+UF6,data = df)
m22=step(m2,direction="both")
m3=lm(formula = EKY ~ EK1+EK2+EK3+EK4+EK5+EK6+EK7+EK8+EK9+EK10+EK11+EK12, data = df)
m33=step(m3,direction="both")
m8=lm(formula = Z1 ~ LAY+UFY+EKY, data = df)
m88=step(m8,direction="both")
m9=lm(formula = Z2 ~ LAY+UFY+EKY, data = df)
m99=step(m9,direction="both")
m10=lm(formula = Z3 ~ LAY+UFY+EKY, data = df)
m100=step(m10,direction="both")
As you can see, if the dimensionality of the database increases (increasing the number of LA, UF, or EK independent variables) I will have to modify manually the input for the models). So, I'm looking for a way to:
Given a certain quantity of independent variables (could be 5, 10, 30 or more) for a given category (LA, UF, and EK), the input for the model changes automatically.
Even I have found different syntax to compute the models (like X*Z = [(X+Z)^3]), I can't find a way to make this computation more dynamic.
Considerations:
The number of independent variables (LA, UF, EK) can change.
The number of dependent variables (LAY, UFY, EKY) never changes.
From the output of this models is extracted the coefficient vector (just in case this one).
I'm trying to produce several aggregate statistics, and some of them need to be produced on a subset of each group. The data.table is quite large, 10 million rows, but using by without column subsetting is blazing fast (less than a second). Adding just one additional column which needs to be calculated on a subset of each group increases the running time by factor of 12.
Is the a faster way to do this? Below is my full code.
library(data.table)
library(microbenchmark)
N = 10^7
DT = data.table(id1 = sample(1:400, size = N, replace = TRUE),
id2 = sample(1:100, size = N, replace = TRUE),
id3 = sample(1:50, size = N, replace = TRUE),
filter_var = sample(1:10, size = N, replace = TRUE),
x1 = sample(1:1000, size = N, replace = TRUE),
x2 = sample(1:1000, size = N, replace = TRUE),
x3 = sample(1:1000, size = N, replace = TRUE),
x4 = sample(1:1000, size = N, replace = TRUE),
x5 = sample(1:1000, size = N, replace = TRUE) )
setkey(DT, id1,id2,id3)
microbenchmark(
DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5)
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
0.942013 0.9566891 1.004134 0.9884895 1.031334 1.165144 10
microbenchmark( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
12.24046 12.4123 12.83447 12.72026 13.49059 13.61248 10
GForce makes grouped operations run faster and will work on expressions like list(x = funx(X), y = funy(Y)), ...) where X and Y are column names and funx and funy belong to the set of optimized functions.
For a full description of what works, see ?GForce.
To test if an expression works, read the messages from DT[, expr, by=, verbose=TRUE].
In the OP's case, we have sum_x1_F1 = sum(x1[filter_var < 5]) which is not covered by GForce even though sum(v) is. In this special case, we can make a var v = x1*condition and sum that:
DT[, v := x1*(filter_var < 5)]
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(v)
) , by = c('id1','id2','id3')])
# user system elapsed
# 0.63 0.19 0.81
For comparison, timing the OP's code on my computer:
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')])
# user system elapsed
# 9.00 0.02 9.06
Using the survey package, I am having issues creating an imputationList that svydesign will accept. Here is a reproducible example:
library(tibble)
library(survey)
library(mitools)
# Data set 1
# Note that I am excluding the "income" variable from the "df"s and creating
# it separately so that it varies between the data sets. This simulates the
# variation with multiple imputation. Since I am using the same seed
# (i.e., 123), all the other variables will be the same, the only one that
# will vary will be "income."
set.seed(123)
df1 <- tibble(id = seq(1, 100, by = 1),
gender = as.factor(rbinom(n = 100, size = 1, prob = 0.50)),
working = as.factor(rbinom(n = 100, size = 1, prob = 0.40)),
pweight = sample(50:500, 100, replace = TRUE))
# Data set 2
set.seed(123)
df2 <- tibble(id = seq(1, 100, by = 1),
gender = as.factor(rbinom(n = 100, size = 1, prob = 0.50)),
working = as.factor(rbinom(n = 100, size = 1, prob = 0.40)),
pweight = sample(50:500, 100, replace = TRUE))
# Data set 3
set.seed(123)
df3 <- tibble(id = seq(1, 100, by = 1),
gender = as.factor(rbinom(n = 100, size = 1, prob = 0.50)),
working = as.factor(rbinom(n = 100, size = 1, prob = 0.40)),
pweight = sample(50:500, 100, replace = TRUE))
# Create list of imputed data sets
impList <- imputationList(df1,
df2,
df3)
# Apply NHIS weights
weights <- svydesign(id = ~id,
weight = ~pweight,
data = impList)
I get the following error:
Error in eval(predvars, data, env) :
numeric 'envir' arg not of length one
To get it to work, I needed to directly add imputationList to svydesign as follows:
weights <- svydesign(id = ~id,
weight = ~pweight,
data = imputationList(list(df1,
df2,
df3))
the step by step instructions available at http://asdfree.com/national-health-interview-survey-nhis.html walk through exactly how to create a multiply-imputed nhis design, and the analysis examples below that include svyglm calls. avoid using library(data.table) and library(dplyr) with library(survey)