Faster anova of regression models in R

I have the following toy data:
data <- data.frame(y = rnorm(1000), x1 = rnorm(1000), x2 = rnorm(1000), x3 = rnorm(1000), x4 = rnorm(1000))
On these data, I create two models:
fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)
Finally, I compare these models using anova:
anova(fit1, fit2)
Since I am running these tests on ~1 million separate datasets, I want to improve performance and would rather not use the lm and anova functions.
To speed up computation I am using RcppArmadillo::fastLm instead of lm, but there is no anova function available to compare such models. Is there a faster alternative to lm that also has a corresponding anova function?
Any suggestions to improve performance will be appreciated.
Here are the performance results of the various lm versions used below. For linear regression, fastLm is the obvious winner. The anova2 function by @Jay is much faster than anova and works with fastLm; the f function by @Donald is also faster than anova and works with fastLm.
microbenchmark::microbenchmark(
  base_lm_1  = fit1 <- lm(y ~ x1 + x3, data),
  base_lm_2  = fit2 <- lm(y ~ x2 + x3 + x4, data),
  lm.fit_1   = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
  lm.fit_2   = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
  lm.fit_3   = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
  lm.fit_4   = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
  fastLm1    = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
  fastLm2    = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
  anova_base = anova(fit1, fit2),
  Jay_fastLM = anova2(fastLM1, fastLM2),
  Jay_lm.fit = anova2(lm.fit3, lm.fit4),
  Donald     = f(fastLM1, fastLM2),
  times = 100L,
  control = list(warmup = 100L)
)
Unit: microseconds
       expr      min        lq      mean    median        uq       max neval  cld
  base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870   100    e
  base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435  4761.020   100    e
   lm.fit_1   94.212  106.9020  112.3093  111.2235  116.7010   147.192   100  a
   lm.fit_2  124.220  129.8080  134.4455  132.9830  138.2510   156.166   100  a
   lm.fit_3  853.906  873.9035  902.5856  889.9715  917.9375  1028.415   100   cd
   lm.fit_4  991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399   100    d
    fastLm1  368.289  390.7805  434.1467  422.0855  476.9085   584.761   100  a c
    fastLm2  416.645  441.8660  481.0027  462.8850  514.0755   617.619   100  a c
 anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093   100    f
 Jay_fastLM  202.026  218.2580  229.6244  226.3405  238.9490   303.964   100  ab
 Jay_lm.fit  200.028  216.0805  234.0143  229.7580  246.1870   292.268   100  ab
     Donald  549.425  582.8105  612.6990  605.4400  625.5340  1079.989   100   bc

We could hack stats:::anova.lmlist so that it works with the lists produced by .lm.fit (note the dot) and by RcppArmadillo::fastLm. The resulting anova2 computes the partial F statistic directly from the residual degrees of freedom and residual sums of squares, F = ((RSS_small - RSS_big) / (df_small - df_big)) / (RSS_big / df_big). Please check stats:::anova.lmlist to make sure I didn't delete anything you need!
anova2 <- function(object, ...) {
  objects <- list(object, ...)
  # all models must be fitted to the same number of observations
  ns <- vapply(objects, function(x) length(x$residuals), 0L)
  stopifnot(!any(ns != ns[1L]))
  resdf <- vapply(objects, df.residual, 0L)
  resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
  # the model with the fewest residual df is the larger model
  bigmodel <- order(resdf)[1L]
  dfs <- c(NA, -diff(resdf))[bigmodel]
  ssq <- c(NA, -diff(resdev))[bigmodel]
  df.scale <- resdf[bigmodel]
  scale <- resdev[bigmodel]/resdf[bigmodel]
  fstat <- (ssq/dfs)/scale
  p <- pf(fstat, abs(dfs), df.scale, lower.tail = FALSE)
  return(c(F = fstat, p = p))
}
These wrappers make .lm.fit and RcppArmadillo::fastLm a little more convenient:
lm_fun <- function(fo, dat) {
  X <- model.matrix(fo, dat)
  fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
  fit$df.residual <- dim(X)[1] - dim(X)[2]
  return(fit)
}
fastLm_fun <- function(fo, dat) {
  fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
  return(fit)
}
Use it
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
# F p
# 0.3609728 0.5481032
## Or use `fastLm_fun` here, but it's slower.
Compare
anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
# Analysis of Variance Table
#
# Model 1: y ~ x1 + x3
# Model 2: y ~ x2 + x3 + x4
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 997 1003.7
# 2 996 1003.4 1 0.36365 0.361 0.5481
lm_fun (which outperforms fastLm_fun) combined with anova2 appears to be around 60% faster than the conventional approach:
microbenchmark::microbenchmark(
  conventional = {
    fit1 <- lm(y ~ x1 + x3, data)
    fit2 <- lm(y ~ x2 + x3 + x4, data)
    anova(fit1, fit2)
  },
  anova2 = {  ## using `.lm.fit`
    fit1 <- lm_fun(y ~ x1 + x3, data)
    fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
    anova2(fit1, fit2)
  },
  anova2_Fast = {  ## using `RcppArmadillo::fastLm`
    fit1 <- fastLm_fun(y ~ x1 + x3, data)
    fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
    anova2(fit1, fit2)
  },
  anova_Donald = {
    fit1 <- lm_fun(y ~ x1 + x3, data)
    fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
    anova_Donald(fit1, fit2)
  },
  times = 1000L,
  control = list(warmup = 100L)
)
# Unit: milliseconds
#          expr      min       lq     mean   median       uq      max neval cld
#  conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720  1000   d
#        anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955  1000  a
#   anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279  1000  c
#  anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247  1000  b
Data:
set.seed(42)
data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
x4=rnorm(1000))
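Since the end goal is to run this comparison over ~1 million separate datasets, here is a minimal driver sketch; the list `datasets` is a hypothetical placeholder for however the datasets are stored, and `lm_fun` and `anova2` are the functions defined above.
## Sketch only: apply the fast fit + anova2 comparison to each dataset.
## `datasets` is assumed to be a list of data.frames with columns y, x1..x4.
compare_models <- function(d) {
  f1 <- lm_fun(y ~ x1 + x3, d)
  f2 <- lm_fun(y ~ x2 + x3 + x4, d)
  anova2(f1, f2)
}
res <- t(vapply(datasets, compare_models, c(F = 0, p = 0)))  # one row of F and p per dataset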

Related

I am getting an error in fitting model-based recursive partitioning

library("partykit")
library(data.table) ## optional, but what I'll use to coerce the list into a DT
library(future.apply) ## for parallel stuff
library(glmnet)
plan(multisession) ## use all available cores
#sample size
N <- 200
#coefficients
betas <- list()
betas$b0 <- 1
betas$b1_up <- 2.4
betas$b1_down <- 2
betas$b2_up <- 2.4
betas$b2_down <- 2
ols_formula <- y ~a| x1 + x2+ x3+ x4+ x5+ x6 +x7 +x8+ x9 + x10
#lmtee # function to fit mob tree
#""0 +"" ---> supress the 'double' interecept
ols <- function(y, a, start = NULL, weights = NULL, offset = NULL,..){
lm(y ~ 0 + a)
}
reg_simulation_mob <- function(...){
  # data
  data <- data.frame(
    a   = rbinom(N, 1, 0.5),
    x1  = rnorm(N),
    x2  = rnorm(N),
    x3  = rnorm(N),
    x4  = rnorm(N),
    x5  = rnorm(N),
    x6  = rnorm(N),
    x7  = rnorm(N),
    x8  = rnorm(N),
    x9  = rnorm(N),
    x10 = rnorm(N),
    e   = rnorm(N))
  # dependent variable
  data$y <- betas$b0 + with(data, ifelse(a > 0,
                                         betas$b1_up * x1 + betas$b2_up * x2,
                                         betas$b1_down * x1 + betas$b2_down * x2) + e)
  # estimate mob()-OLS
  ols_mob <- lm(ols_formula, data = data, fit = ols)
  # return(ols$coefficients)
  return(ols_mob)
}
# N repetitions
nreps <- 4

## Parallel version
results <- future_lapply(1:nreps, reg_simulation_mob, future.seed = NULL)
results
The trees are not splitting; moreover, I intend to regress y on the treatment a, with x1...x10 as covariates. The result I obtained is given below:
Call:
lm(formula = ols_formula, data = data, fit = ols)
Coefficients:
(Intercept)
0.8658
a | x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10TRUE
NA
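The printed call shows that lm() was used where a model-based tree was presumably intended. As a hedged sketch (not a verified fix), assuming the goal is partykit's model-based recursive partitioning of y on the treatment a with x1 to x10 as partitioning variables, the fit could instead look like this:
## Sketch: fit y ~ a within each node and partition on x1..x10
## (assumes a data.frame `data` simulated as in reg_simulation_mob above).
ols_mob <- lmtree(y ~ a | x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10,
                  data = data)
## or with mob() and a fit function matching its expected signature
## (y, x, start, weights, offset, ...), where x is the node-wise model matrix:
ols <- function(y, x, start = NULL, weights = NULL, offset = NULL, ...) {
  lm(y ~ 0 + x)  # "0 +" suppresses the double intercept
}
ols_mob <- mob(ols_formula, data = data, fit = ols)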

Linear regression with ongoing data, in R

Model:
y ~ x1 + x2 + x3
about 1000 rows
What I want to do is make a prediction step by step:
using rows 0:20 to predict y for rows 21:30, then rows 11:30 to predict y for rows 31:40, and so on.
You can use the predict function:
mod = lm(y ~ ., data=df[1:990,])
pred = predict(mod, newdata=df[991:1000,2:4])
Edit: to change the range of training data in a loop:
index = seq(10, 990, 10)
pred = matrix(nrow=10, ncol=length(index))
MSE = numeric(length(index))  # initialise before the loop
for(i in index){
  mod = lm(y ~ ., data=df[1:i,])
  pred[,i/10] = predict(mod, newdata=df[(i+1):(i+10),2:4])
  MSE[i/10] = sum((df$y[(i+1):(i+10)]-pred[,i/10])^2)
}
mean(MSE)
Are you looking for something like this?
# set up mock data
set.seed(1)
df <- data.frame(y  = rnorm(1000),
                 x1 = rnorm(1000),
                 x2 = rnorm(1000),
                 x3 = rnorm(1000))
# for loop
prd <- list()
for(i in 1:970){
  # training data
  trn <- df[i:(i+20), ]
  # test data
  tst <- df[(i+21):(i+30), ]
  # lm model
  mdl <- lm(y ~ x1 + x2 + x3, trn)
  # append a data.frame with both predicted and actual values
  # for later comparison
  prd[[i]] <- data.frame(prd = predict(mdl, tst[-1]),
                         act = tst[[1]])
}
# your list
prd
You can also try something fancier with the package slider:
# define here your model and how you want to handle the predictions
sliding_lm <- function(..., frm, n_trn, n_tst){
  df <- data.frame(...)
  trn <- df[1:n_trn, ]
  tst <- df[n_trn + (1:n_tst), ]
  mdl <- lm(frm, trn)
  data.frame(prd = predict(mdl, tst[-1]),
             act = tst[[1]])
}
n_trn <- 20              # number of training obs
n_tst <- 10              # number of test obs
frm <- y ~ x1 + x2 + x3  # formula of your model
prd <- slider::pslide(df, sliding_lm,
                      frm = frm,
                      n_trn = n_trn,
                      n_tst = n_tst,
                      .after = n_trn + n_tst,
                      .complete = TRUE)
Note that the last 30 entries in the list are NULL, because only complete windows (30 observations covering both training and test) are used.
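If you want a single data.frame of predictions afterwards, the incomplete (NULL) windows can be dropped and the rest stacked, for example:
# drop the NULL entries from the incomplete windows and stack the rest
prd_ok <- prd[!vapply(prd, is.null, logical(1))]
all_prd <- do.call(rbind, prd_ok)
head(all_prd)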

`nlme` with crossed random effects

I am trying to fit a crossed non-linear random effects model, analogous to the linear random effects models mentioned in this question and in this mailing list post, using the nlme package. However, I get an error regardless of what I try. Here is an example:
library(nlme)
#####
# simulate data
set.seed(18112003)
na <- 30
nb <- 30
sigma_a <- 1
sigma_b <- .5
sigma_res <- .33
n <- na*nb
a <- gl(na,1,n)
b <- gl(nb,na,n)
u <- gl(1,1,n)
x <- runif(n, -3, 3)
y_no_noise <- x + sin(2 * x)
y <- x + sin(2 * x) +
  rnorm(na, sd = sigma_a)[as.integer(a)] +
  rnorm(nb, sd = sigma_b)[as.integer(b)] +
  rnorm(n, sd = sigma_res)
#####
# works in the linear model where we know the true parameter
fit <- lme(
  # somehow we found the right values
  y ~ x + sin(2 * x),
  random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))))
vv <- VarCorr(fit)
vv2 <- vv[c("a1", "b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R a1 1.016 1.0082
#R b1 0.221 0.4701
#####
# now try to do the same with `nlme`
fit <- nlme(
  y ~ c0 + sin(c1),
  fixed = list(c0 ~ x, c1 ~ x - 1),
  random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))),
  start = c(0, 0.5, 1))
#R Error in nlme.formula(y ~ a * x + sin(b * x), fixed = list(a ~ 1, b ~ :
#R 'random' must be a formula or list of formulae
The lme example is similar to the one on pages 163-166 of "Mixed-Effects Models in S and S-PLUS", with only 2 random effects instead of 3.
I should have used a two-sided formula, as described in help("nlme"):
fit <- nlme(
  y ~ c0 + c1 + sin(c2),
  fixed = list(c0 ~ 1, c1 ~ x - 1, c2 ~ x - 1),
  random = list(u = pdBlocked(list(pdIdent(c0 ~ a - 1), pdIdent(c1 ~ b - 1)))),
  start = c(0, 0.5, 1))
# fixed effects estimates
fixef(fit)
#R c0.(Intercept) c1.x c2.x
#R -0.1788218 0.9956076 2.0022338
# covariance estimates
vv <- VarCorr(fit)
vv2 <- vv[c("c0.a1", "c1.b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R c0.a1 0.9884 0.9942
#R c1.b1 0.2197 0.4688
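As a quick additional sanity check, the estimated residual standard deviation can be compared with the simulated value (a small follow-up, assuming the fit object from the code above):
# residual standard deviation; should be close to the simulated sigma_res = 0.33
fit$sigma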

Estimating difference in lsmeans

How does one use lsmeans to estimate the difference between two pairwise contrasts? For instance, imagine a continuous DV and two factor predictors:
library(lsmeans)
library(tidyverse)
dat <- data.frame(
  y = runif(30),
  x1 = 1:2 %>%
    factor %>%
    sample(30, T),
  x2 = letters[1:3] %>%
    factor %>%
    sample(30, T)
)
lm1 <- lm(y ~ x1 * x2, data = dat)
This call gets me the estimates of the effect of x1 by x2
lsmeans(lm1, ~ x1 | x2) %>%
pairs
returns
x2 = a:
contrast estimate SE df t.ratio p.value
1 - 2 -0.150437681 0.2688707 24 -0.560 0.5810
x2 = b:
contrast estimate SE df t.ratio p.value
1 - 2 -0.048950972 0.1928172 24 -0.254 0.8018
x2 = c:
contrast estimate SE df t.ratio p.value
1 - 2 -0.006819473 0.2125610 24 -0.032 0.9747
This is fine, but I now want the difference of these contrasts, to see if these 1 - 2 differences are themselves different according to x2 levels.
Use:
lsm = lsmeans(lm1, ~ x1 * x2)
contrast(lsm, interaction = "pairwise")
If I understand your problem correctly, you can use this solution:
lm1 <- lm(y ~ x1 * x2, data = dat)
means.int <- lsmeans(lm1, ~ x1 + x2)
# the reference grid orders the cells with x1 varying fastest: (1,a), (2,a), (1,b), (2,b), (1,c), (2,c),
# so each weight vector below is a difference of two within-level "1 - 2" contrasts
dd <- contrast(means.int, list(`a--b` = c(1, -1, -1, 1, 0, 0),
                               `a--c` = c(1, -1, 0, 0, -1, 1),
                               `b--c` = c(0, 0, 1, -1, -1, 1)),
               adjust = 'mvt')

How to export all coefficients of a penalized regression model from package `penalized`? Need them for reporting rolling regression estimates

I have been able to run a regression with some coefficients constrained to be positive, but I'm doing a lot of rolling regressions, where I run into a problem. Here is my sample code:
library(penalized)
set.seed(1)
x1 = rnorm(100)*10
x2 = rnorm(100)*10
x3 = rnorm(100)*10
y = sin(x1) + cos(x2) - x3 + rnorm(100)
data <- data.frame(y, x1, x2, x3)
win <- 10
coefs <- matrix(NA, ncol=4, nrow=length(y))
for(i in 1:(length(y)-win)) {
  d <- data[(1+i):(win+i),]
  p <- win + i
  # Linear Regression
  coefs[p,] <- as.vector(coef(penalized(y, ~ x1 + x2 + x3, ~1,
                                        lambda1=0, lambda2=0, positive = c(F, F, T), data=data)))
}
This is how I usually populate the matrix with coefficients from a rolling regression, but now I receive this error:
Error in coefs[p, ] <- as.vector(coef(penalized(y, ~x1 + x2 + x3, ~1, :
number of items to replace is not a multiple of replacement length
I assume this error is produced because the penalized regression function does not always return the intercept plus 3 coefficients. Is there a way to get the penalized function to report zero coefficients as well, or another way to populate the matrix/data.frame?
Perhaps you are unaware of the which argument of coef for "penfit" objects. Have a look at:
getMethod(coef, "penfit")
# function (object, ...)
# {
#     .local <- function (object, which = c("nonzero", "all", "penalized",
#         "unpenalized"), standardize = FALSE)
#     {
#         coefficients(object, which, standardize)
#     }
#     .local(object, ...)
# }
# <environment: namespace:penalized>
We can set which = "all" to report all coefficients. The default is which = "nonzero", which is what causes the "number of items to replace is not a multiple of replacement length" error above.
The following works:
library(penalized)
set.seed(1)
x1 = rnorm(100)*10
x2 = rnorm(100)*10
x3 = rnorm(100)*10
y = sin(x1) + cos(x2) - x3 + rnorm(100)
data <- data.frame(y, x1, x2, x3)
win <- 10
coefs <- matrix(NA, ncol=4, nrow=length(y))
for(i in 1:(length(y)-win)) {
d <- data[(1+i):(win+i),]
p <- win + i
pen <- penalized(y, ~ x1 + x2 + x3, ~1, lambda1 = 0, lambda2 = 0,
positive = c(F, F, T), data = data)
beta <- coef(pen, which = "all")
coefs[p,] <- unname(beta)
}
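A small follow-up sketch: labelling the columns of coefs with the coefficient names (taken here from the last fitted model) makes the result easier to read.
# label the columns with the coefficient names from the last fit and inspect
colnames(coefs) <- names(coef(pen, which = "all"))
tail(coefs)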
