Repeated Single Regression w/ Factor and Continuous Variables - r

I have a dataset similar to the below structure that I'd like to run several single regressions with:
example <- tibble(
id = paste0('ID', runif(100,0,10)),
response = runif(100,0,10),
x1 = runif(100,0,10),
x2 = factor(rep(seq(1,5),20)))
regression_1 <- lm(response ~ x1, data = example)
regression_2 <- lm(response ~ x2, data = example)
And so on for n predictors. I've tried a variety of approaches but I keep running into an error and cannot figure out for the life of me any neat and easy way to do this.
Appreciate any help

Using reformulate.
lapply(c("x1", "x2", "xn"), function(x) lm(reformulate(x, response="response"), d))
# [[1]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x1
# 0.03567 0.03603
#
#
# [[2]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x2
# 0.03098 -0.14824
#
#
# [[3]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) xn
# 0.02961 0.08823
Data:
set.seed(42)
d <- data.frame(matrix(rnorm(400), 100, 4, dimnames=list(NULL, c("response", "x1", "x2", "xn"))))

We can subset the data and use ~ .
lapply(c("x1", "x2", "xn"), function(x) lm(response ~ ., df1[c('response', x)]))

Related

R: how to specify predictors in mboost model

I have the following dataset with 3 columns of covariates, and 1 outcome column:
data <- structure(list(V1 = c(0.368203440103238, 0.324519532540959, -0.267369607029419,
-0.551350850969297, 0.12599748535452), V2 = c(-0.685091020879978,
0.0302665318913346, 0.38152909685676, -0.741473194305708, 1.01476858643759
), V3 = c(-1.11459785962843, -0.012932271762972, 2.02715929057818,
0.118419126609398, -1.01804828579617), y = c(-1.95083653823476,
-0.50091658480941, 3.74423248124182, -0.0459478421882341, -1.24653151600941
)), class = "data.frame", row.names = c("X1", "X2", "X3", "X4",
"X5"))
> head(data)
V1 V2 V3 y
X1 0.3682034 -0.68509102 -1.11459786 -1.95083654
X2 0.3245195 0.03026653 -0.01293227 -0.50091658
X3 -0.2673696 0.38152910 2.02715929 3.74423248
X4 -0.5513509 -0.74147319 0.11841913 -0.04594784
X5 0.1259975 1.01476859 -1.01804829 -1.24653152
I want to fit the following model:
library(mboost)
model <- mboost(y ~ bols(V1, intercept = FALSE) +
bols(V2, intercept = FALSE) + bols(V3, intercept = FALSE),
data = data)
However, it is very tedious to type out bols(covariate, intercept = FALSE) for every single column in the model. Is there a way to automate this for an arbitrary number of covariates? For example, I currently have 3 covariates named V1, V2, V3. But what if I had 10 that are named V1-V10? I would like to avoid having to type out 10 bols() statements.
We can create a formula expression with paste
fmla <- as.formula(paste0('y ~ ', paste0('bols(', setdiff(names(data),
'y'), ', intercept = FALSE)', collapse= " + ")))
model <- mboost(fmla, data = data)
model$call[[2]] <- fmla
model
# Model-based Boosting
#Call:
#mboost(formula = y ~ bols(V1, intercept = FALSE) + bols(V2, intercept = FALSE) + bols(V3, intercept = FALSE), data = data)
# Squared Error (Regression)
#Loss function: (y - f)^2
#Number of boosting iterations: mstop = 100
#Step size: 0.1
#Offset: 1.157408e-15
#Number of baselearners: 3

Repeat a procedure with generated random data and save the results in data frames in R

I create random data in R like that:
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
and then i want to "play" with the correlations of those data and do something like the following
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
after that i perform the following code
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
library(glmnet)
library(coefplot)
A <- as.matrix(data)
set.seed(1)
results <- lapply(seq_len(ncol(A)), function(i) {
list(
cvfit = cv.glmnet(A[, -i] , A[, i] , standardize = TRUE , type.measure = "mse" , nfolds = 10 , alpha = 1)
)
})
lam <- as.data.frame(`names<-`(
lapply(results, function(x) (x$cvfit$lambda.min)),
paste0("X", seq_along(results))
))
sigma<- matrix(rnorm(1*5,mean=0,sd=1), 1, 5)
colnames(sigma) <- c("X1", "X2", "X3", "X4", "X5")
as.vector(sigma)
sub1.sigma <- subset(sigma, select = sigma <= sum(lam))
sub2.sigma <- subset(sigma, select = sigma <= 2*sum(lam))
sub3.sigma <- subset(sigma, select = sigma <= 3*sum(lam))
which results in a vector 1x5 called sigma and 3 vectors sub1.sigma, sub2.sigma, sub3.sigma like the following
> sigma
X1 X2 X3 X4 X5
38.64019 624.4896 0 0 0
> sub1.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
> sub2.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
> sub3.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
The generated data are random and i usually use a set.seed() to produce the same results. I want, if it's possible without modify the main code, to run my code 100 times (with different data each time) and save in 4 dataframes the correspanding results sigma sub1.sigma, sub2.sigma, sub3.sigma in order to compare the them. Is there any way to achieve that in R?
Based on comments i manage to create the following but still doesn't seem to give the desired results. FIrst of all code[1:10] display 10 vectors which represent what? the sigma? are those the sigma of each run? how can i make it calculate the sub.sigma also?
set.seed(2021)
code <- replicate(10,{
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
library(glmnet)
library(coefplot)
A <- as.matrix(data)
set.seed(1)
results <- lapply(seq_len(ncol(A)), function(i) {
list(
cvfit = cv.glmnet(A[, -i] , A[, i] , standardize = TRUE , type.measure = "mse" , nfolds = 10 , alpha = 1)
)
})
lam <- as.data.frame(`names<-`(
lapply(results, function(x) (x$cvfit$lambda.min)),
paste0("X", seq_along(results))
))
sigma<- matrix(rnorm(1*5,mean=0,sd=1), 1, 5)
colnames(sigma) <- c("X1", "X2", "X3", "X4", "X5")
as.vector(sigma)
sub1.sigma <- subset(sigma, select = sigma <= sum(lam))
sub2.sigma <- subset(sigma, select = sigma <= 2*sum(lam))
sub3.sigma <- subset(sigma, select = sigma <= 3*sum(lam))
}, simplify = FALSE)
code[1:10]
sigmas <- as.data.frame(do.call(rbind,lapply(code, sigma)))
I'm a fan of keeping models around for various reasons, so I'll start with a list of models-run. In this case, replicate(n, ..., simplify=FALSE) returns a list of whatever we need it to. (For the record, that is equivalent to lapply(seq_len(n), function(ign) ...).)
(Side note: I don't have glmnet installed, so I'll mimic that with a simple/absurd lm model.)
set.seed(2021)
models <- replicate(10, {
zany_numbers <- runif(32) # nrow(mtcars)
lm(zany_numbers ~ mpg + disp + cyl, data = mtcars)
}, simplify = FALSE)
models[1:2]
# [[1]]
# Call:
# lm(formula = zany_numbers ~ mpg + disp + cyl, data = mtcars)
# Coefficients:
# (Intercept) mpg disp cyl
# 8.85e-02 2.01e-02 -5.29e-05 2.41e-02
# [[2]]
# Call:
# lm(formula = zany_numbers ~ mpg + disp + cyl, data = mtcars)
# Coefficients:
# (Intercept) mpg disp cyl
# -0.52302 0.02485 -0.00122 0.13071
From here, we can make any sort of frame you want.
coefs <- as.data.frame(do.call(rbind, lapply(models, coef)))
coefs
# (Intercept) mpg disp cyl
# 1 0.0885 0.020114 -5.29e-05 0.0241
# 2 -0.5230 0.024847 -1.22e-03 0.1307
# 3 0.0856 0.014215 1.18e-03 -0.0225
# 4 0.4899 0.012013 1.08e-03 -0.0876
# 5 -0.6926 0.024653 -1.16e-03 0.1499
# 6 0.0862 0.010497 -5.02e-04 0.0389
# 7 0.8358 -0.008419 -6.64e-04 -0.0141
# 8 0.3679 -0.000198 1.44e-03 -0.0391
# 9 0.4360 0.011994 -9.59e-05 -0.0303
# 10 0.2276 0.003659 -1.14e-03 0.0651
(You might need to clean up names there.)
You can replace do.call(rbind, ...) with data.table::rbindlist(...) or dplyr::bind_rows(...) if you prefer.
From this models, and using the same list-of-frames do.call(rbind, ...) follow-up, you can generate companion frames, such as
otherstats <- as.data.frame(do.call(rbind, lapply(models, function(mdl) summary(mdl)[c("r.squared", "adj.r.squared")])))
otherstats
# r.squared adj.r.squared
# 1 0.104 0.00745
# 2 0.144 0.0523
# 3 0.044 -0.0584
# 4 0.202 0.117
# 5 0.149 0.0573
# 6 0.0639 -0.0364
# 7 0.0586 -0.0422
# 8 0.137 0.0446
# 9 0.241 0.16
# 10 0.0814 -0.017

Linear regression with ongoing data, in R

Modell
y ~ x1 + x2 + x3
about 1000 rows
What Iwant to do is to do an prediction "step-by-step"
Using Row 0:20 to predict y of 21:30 and then using 11:30 to predict y of 31:40 and so on.
You can use the predict function:
mod = lm(y ~ ., data=df[1:990,])
pred = predict(mod, newdata=df[991:1000,2:4])
Edit: to change the range of training data in a loop:
index = seq(10,990,10)
pred = matrix(nrow=10, ncol=length(index))
for(i in index){
mod = lm(y ~ ., data=df[1:i,])
pred[,i/10] = predict(mod, newdata=df[(i+1):(i+10),2:4])
MSE[i/10] = sum((df$y[(i+1):(i+10)]-pred[,i/10])^2)}
mean(MSE)
Are you looking for something like this?
# set up mock data
set.seed(1)
df <- data.frame(y = rnorm(1000),
x1 = rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000))
# for loop
prd <- list()
for(i in 1:970){
# training data
trn <- df[i:(i+20), ]
# test data
tst <- df[(i+21):(i+30), ]
# lm model
mdl <- lm(y ~ x1 + x2 + x3, trn)
# append a list of data.frame with both predicted and actual values
# for later confrontation
prd[[i]] <- data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
# your list
prd
You can also try something fancier with the package slider:
# define here your model and how you wanna handle the preditions
sliding_lm <- function(..., frm, n_trn, n_tst){
df <- data.frame(...)
trn <- df[1:n_trn, ]
tst <- df[n_trn+1:n_tst, ]
mdl <- lm(y ~ x1 + x2 + x3, trn)
data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
n_trn <- 20 # number of training obs
n_tst <- 10 # number of test obs
frm <- y ~ x1 + x2 + x3 # formula of your model
prd <- slider::pslide(df, sliding_lm,
frm = frm,
n_trn = n_trn,
n_tst = n_tst,
.after = n_trn + n_tst,
.complete = TRUE)
Note that the last 30 entries in the list are NULL, because you look only at complete windows [30 observations with training and test]

Number of rows in predicted data frame does not equal number of rows in new data frame fed to predict function

I have a data frame that I split based on a vector of factors. I'm trying to create a model for each data set and then create a set of predicted values from them.
I'm trying to span the predicted values over a large number of values (e.g. length.out = 500), but when I feed the predict function with a new data set with 500 rows, it still spits out a predicted data frame that's the same length as the original data frame fed into the model.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, new.data = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
lapply(mtcars.split, nrow)
I tried running the linear model for the entire data set and it did the same thing.
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, new.data = newdata, se = T)))
Even attempting to subset the data set didn't make any difference.
mtcars.head <- head(mtcars, n = 16)
mtcars.head.lm <- lm(disp ~ wt, data = mtcars.head)
predict.mtcars <- as.data.frame(predict(mtcars.head.lm,
new.data = data.frame(wt = seq(min(mtcars.head$wt),
max(mtcars.head$wt),
length.out = 500)),
se = T))
nrow(predict.mtcars)
Am I missing something here? This used to work but it doesn't seem to work now. Even restarting the R session or my computer doesn't seem to make it work.
the argument in the predict function is not new.data but newdata.
Attached the desired result.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, newdata = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
#> $`4`
#> [1] 500
#>
#> $`6`
#> [1] 500
#>
#> $`8`
#> [1] 500
lapply(mtcars.split, nrow)
#> $`4`
#> [1] 11
#>
#> $`6`
#> [1] 7
#>
#> $`8`
#> [1] 14
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, newdata = newdata, se = T)))
#> [1] 500
Created on 2020-07-22 by the reprex package (v0.3.0)

Multiple Linear Models

I currently have two data tables one of which contains independent and control variables in columns while the other contains rows of dependent variables.
Can anybody help in creating a method to do linear models from the two tables that repeats for each row in the dependent value table?
You haven't provided nor a reproducible example, nor the desired output, so I'll have to guess
If this is your column names vector
vec <- LETTERS[1:3]
And this is your data set
set.seed(1)
df <- data.frame(A = sample(10, 10),
B = sample(20, 10),
C = sample(30, 10))
Then you can try something like
lapply(vec,
function(x) lm(as.formula(paste(x, "~",
paste(setdiff(names(df), x),
collapse = "+"))),
data = df))
Which will give
# [[1]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) B C
# 4.9687 0.2410 -0.1565
#
#
# [[2]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A C
# 2.7975 0.8182 0.2775
#
#
# [[3]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A B
# 13.200 -1.675 0.875

Resources