I have an example dataset below.
train <- data.frame(x1 = c(4,5,6,4,3,5), x2 = c(4,2,4,0,5,4), x3 = c(1,1,1,0,0,1),
                    x4 = c(1,0,1,1,0,0), x5 = c(0,0,0,1,1,1))
Suppose I want to create separate models for columns x3, x4, and x5 based on columns x1 and x2. For example:
lm1 <- lm(x3 ~ x1 + x2, data = train)
lm2 <- lm(x4 ~ x1 + x2, data = train)
lm3 <- lm(x5 ~ x1 + x2, data = train)
I then want to apply these models to a test set using predict, and create a matrix that has each model's predictions as a column.
test <- data.frame(x1 = c(4,3,2,1,5,6), x2 = c(4,2,1,6,8,5))
p1 <- predict(lm1, newdata = test)
p2 <- predict(lm2, newdata = test)
p3 <- predict(lm3, newdata = test)
final <- cbind(p1, p2, p3)
This is a simplified example that can be done step by step; the actual data set is far too large for that. Is there a way to create a function or use a for loop to combine this into one or two steps?
I was inclined to close your question as a duplicate of Fitting a linear model with multiple LHS, but sadly the prediction issue is not addressed over there. On the other hand, Prediction of 'mlm' linear model object from lm() talks about prediction, but is a little far from your situation, as you work with the formula interface instead of the matrix interface.
I did not manage to locate a perfect duplicate target in the "mlm" tag, so I think it is a good idea to contribute another answer for this tag. As I said in the linked questions, predict.mlm does not support se.fit, and at the moment this is also a missing piece in the "mlm" tag. So I will take this chance to fill that gap.
Here is a function to get the standard error of prediction. It relies on the fact that for the QR factorization X = QR stored in the fitted object, X'X = R'R, so the unscaled variance x0' (X'X)^{-1} x0 for each new observation x0 can be computed with a single forwardsolve against t(R):
f <- function (mlmObject, newdata) {
  ## model formula
  form <- formula(mlmObject)
  ## drop response (LHS)
  form[[2]] <- NULL
  ## prediction matrix
  X <- model.matrix(form, newdata)
  Q <- forwardsolve(t(qr.R(mlmObject$qr)), t(X))
  ## unscaled prediction standard error
  unscaled.se <- sqrt(colSums(Q ^ 2))
  ## residual standard error for each response
  sigma <- sqrt(colSums(residuals(mlmObject) ^ 2) / mlmObject$df.residual)
  ## scaled prediction standard error (rows: new observations; columns: responses)
  tcrossprod(unscaled.se, sigma)
}
For your given example, you can do
## fit an `mlm`
fit <- lm(cbind(x3, x4, x5) ~ x1 + x2, data = train)
## prediction (mean only)
pred <- predict(fit, newdata = test)
# x3 x4 x5
#1 0.555956679 0.38628159 0.60649819
#2 0.003610108 0.47653430 0.95848375
#3 -0.458483755 0.48014440 1.27256318
#4 -0.379061372 -0.03610108 1.35920578
#5 1.288808664 0.12274368 0.17870036
#6 1.389891697 0.46570397 0.01624549
## prediction error
pred.se <- f(fit, newdata = test)
# [,1] [,2] [,3]
#[1,] 0.1974039 0.3321300 0.2976205
#[2,] 0.3254108 0.5475000 0.4906129
#[3,] 0.5071956 0.8533510 0.7646849
#[4,] 0.6583707 1.1077014 0.9926075
#[5,] 0.5049637 0.8495959 0.7613200
#[6,] 0.3552794 0.5977537 0.5356451
We can verify that f is correct:
## `lm1`, `lm2` and `lm3` are defined in your question
predict(lm1, test, se.fit = TRUE)$se.fit
# 1 2 3 4 5 6
#0.1974039 0.3254108 0.5071956 0.6583707 0.5049637 0.3552794
predict(lm2, test, se.fit = TRUE)$se.fit
# 1 2 3 4 5 6
#0.3321300 0.5475000 0.8533510 1.1077014 0.8495959 0.5977537
predict(lm3, test, se.fit = TRUE)$se.fit
# 1 2 3 4 5 6
#0.2976205 0.4906129 0.7646849 0.9926075 0.7613200 0.5356451
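If you also need confidence intervals for the predicted mean, pred and pred.se combine in the usual way. A minimal sketch, using the t quantile at the model's residual degrees of freedom:
## 95% confidence band for the fitted mean, combining `pred` and `pred.se`
tq <- qt(0.975, fit$df.residual)
lwr <- pred - tq * pred.se
upr <- pred + tq * pred.se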
Related
I need to run a linear regression. Since the predictors are added to the model one at a time, I use a for loop to fit a sequence of models.
set.seed(98274) # Creating example data
y <- rnorm(1000)
x1 <- rnorm(1000) + 0.2 * y
x2 <- rnorm(1000) + 0.2 * x1 + 0.1 * y
x3 <- rnorm(1000) - 0.1 * x1 + 0.3 * x2 - 0.3 * y
data <- data.frame(y, x1, x2, x3)
head(data) # Head of data
mod_summaries <- list() # Create empty list
for(i in 2:ncol(data)) {                # Head of for-loop
  predictors_i <- colnames(data)[2:i]   # Create vector of predictor names
  mod_summaries[[i - 1]] <- summary(    # Store regression model summary in list
    lm(y ~ ., data[ , c("y", predictors_i)]))
}
Then I tried to predict the test data using those models in another for loop. My code is below.
## Test
set.seed(44) # Creating test data
y <- rnorm(1000)
x1 <- rnorm(1000) + 0.19 * y
x2 <- rnorm(1000) + 0.2 * x1 + 0.11 * y
x3 <- rnorm(1000) - 0.12 * x1 + 0.28 * x2 - 0.33 * y
test <- data.frame(y, x1, x2, x3)
predict_models <- matrix(nrow = nrow(test), ncol = 3)
for(i in 2:ncol(data)) {                # Head of for-loop
  predictors_i <- colnames(data)[2:i]   # Create vector of predictor names
  predict_models[, i-1] <- predict.lm(mod_summaries[[i-1]], test[, 2:i])
}
predict_models
but it throws the following error:
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
'data' must be a data.frame, environment, or list
In addition: Warning message:
In predict.lm(mod_summaries[[i - 1]], test[, 2:i]) :
calling predict.lm(<fake-lm-object>) ...
First, you want to store just the models, not the summaries.
mod_summaries <- vector('list', ncol(data) - 1L) ## preallocate list of known length, it's way more efficient
for (i in seq_len(ncol(data))[-1]) {
  predictors_i <- colnames(data)[2:i]
  mod_summaries[[i - 1]] <- lm(y ~ ., data[, c("y", predictors_i)])
}
Then, the data passed to predict doesn't actually need to change: only the columns used in each model are looked up, so passing the whole of test is sufficient.
predict_models <- matrix(nrow = nrow(test), ncol = ncol(test) - 1L)
for (i in seq_len(ncol(data))[-1]) {
  predict_models[, i - 1] <- predict.lm(mod_summaries[[i - 1]], test)
}
That's actually it.
head(predict_models)
# [,1] [,2] [,3]
# [1,] -0.115690784 -0.19149611 -0.4815419
# [2,] -0.004721430 0.03814865 0.1894562
# [3,] -0.110812904 0.02312155 0.2579051
# [4,] 0.004264032 -0.06147035 -0.2328833
# [5,] 0.320110168 -0.04145044 -0.3229186
# [6,] -0.040603638 0.01977484 -0.1090088
Alternatively, and more R-ish, you could do the same in just two lines of code, without any for loops.
ms <- lapply(seq_along(data)[-1], \(i) lm(reformulate(names(data)[2:i], 'y'), data))
pm <- sapply(ms, predict, test)
head(pm)
# [,1] [,2] [,3]
# 1 -0.115690784 -0.19149611 -0.4815419
# 2 -0.004721430 0.03814865 0.1894562
# 3 -0.110812904 0.02312155 0.2579051
# 4 0.004264032 -0.06147035 -0.2328833
# 5 0.320110168 -0.04145044 -0.3229186
# 6 -0.040603638 0.01977484 -0.1090088
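Since the original loop stored model summaries in order to inspect them, note the same information is still available from the list of fitted models. For instance, a one-liner to recover each model's R-squared:
## R-squared of each model, computed from the stored model objects
sapply(ms, function(m) summary(m)$r.squared)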
Let's say I have a data frame df with variables y, x1, and x2, where x1 is a continuous variable and x2 is a factor.
Let's say I have a model:
model <- glm(y ~ x1 + x2, data = df, family = binomial)
This will result in an object where I can extract the coefficients using the command model$coefficients.
However, for use in another program I would like to export the data frame df, but I'd also like to be able to display the results of the model beyond simply adding the fitted values to the data frame.
Therefore I would like to have coeff1*x1 and coeff2*x2 in the same data frame as well, so that I could use them together with the original data to display their effects. The problem is that one of the variables is a multi-level factor, so it is not straightforward to simply use a for loop to extract the coefficients and multiply the variables by them.
Is there another way to add two new variables to the data frame df that are derived by combining the original variables x1 and x2 with their respective coefficients?
Try:
set.seed(123)
N <- 10
df <- data.frame(x1 = rnorm(N, 10, 1),
                 x2 = sample(1:3, N, TRUE),
                 y = as.integer(50 - x2 * 0.4 + x1 * 1.2 + rnorm(N, 0, 0.5) > 52))
model <- glm(y ~ x1 + x2, data = df, family = binomial)
# add column for intercept
df <- cbind(x0 = rep(1, N), df)
df$intercept <- df$x0 * model$coefficients["(Intercept)"]
df[["coeff1*x1"]] <- df$x1 * model$coefficients["x1"]
df[["coeff2*x2"]] <- df$x2 * model$coefficients["x2"]
# x0 x1 x2 y intercept coeff1*x1 coeff2*x2
# 1 1 9.439524 1 1 24.56607 -3.361333e-06 -4.281056e-07
# 2 1 9.769823 1 1 24.56607 -3.478949e-06 -4.281056e-07
# 3 1 11.558708 1 1 24.56607 -4.115956e-06 -4.281056e-07
Alternatively, starting again from the original df (the intercept column x0 must only be added once):
# add column for intercept
df <- cbind(x0 = rep(1, N), df)
tmp <- as.data.frame(Map(function(x, y) x * y, subset(df, select = -y), model$coefficients))
names(tmp) <- paste0("coeff*", names(model$coefficients))
cbind(df, tmp)
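If x2 really is a multi-level factor, as in your actual data, one way around looping over coefficients is to let model.matrix do the dummy coding, so that every coefficient lines up with exactly one design-matrix column. A minimal sketch, assuming the fitted model object from above:
## expand factors into dummy columns, then multiply each column by its coefficient
mm <- model.matrix(model)                  # design matrix incl. (Intercept) and dummies
contrib <- sweep(mm, 2, coef(model), `*`)  # columnwise coefficient * value
colnames(contrib) <- paste0("coeff*", colnames(mm))
df <- cbind(df, contrib)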
The function stats::model.matrix can be used to convert an input data frame into a matrix suitable for raw algebraic manipulation in solving regression equations etc. For example:
set.seed(0)
df <- data.frame(a = rnorm(5), n = rnorm(5))
model.matrix(~., data = df)
produces:
(Intercept) a n
1 1 1.2629543 -1.539950042
2 1 -0.3262334 -0.928567035
3 1 1.3297993 -0.294720447
4 1 1.2724293 -0.005767173
5 1 0.4146414 2.404653389
One of the features is that it controls how the intercept is added to the design matrix. Compare: model.matrix(~ 0 + ., data = df) vs model.matrix(~ 1 + ., data = df)
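For instance, with the df above, the two calls differ only in the intercept column (a quick sketch):
model.matrix(~ 0 + ., data = df)  # no intercept: columns a and n only
model.matrix(~ 1 + ., data = df)  # with intercept: (Intercept), a, n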
Now, the . in the formula specifies that all variables of df should be included. This leads to a problem when I wish to pass an empty data frame and thus create just the intercepts, e.g.:
df <- data.frame(matrix(, nrow=5, ncol=0))
model.matrix(~ ., data = df)
leads to:
Error in terms.formula(object, data = data) :
'.' in formula and no 'data' argument
Does anyone know how to get around this? To be specific, I want to produce the result given by model.matrix(~ 1, data = df), except using the . shorthand.
You could build the formula from the column names; appending "+1" keeps the intercept and degrades gracefully to the intercept-only formula "~+1" when df has no columns:
model.matrix(as.formula(paste0("~", paste(colnames(df), collapse = "+"), "+1")), data = df)
However, I think you are better off using an if statement. Formulas in R are more a convenience than anything fundamental.
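A minimal sketch of that if-based fallback (assuming df is the data frame in question):
## build the design matrix, special-casing the empty data frame
design <- if (ncol(df) == 0) {
  matrix(1, nrow = nrow(df), ncol = 1, dimnames = list(NULL, "(Intercept)"))
} else {
  model.matrix(~ ., data = df)
}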
I have an lm model in R that I have trained and serialized. Inside a function that takes as input the model and a feature vector (a single array), I have:
CREATE OR REPLACE FUNCTION lm_predict(
  feat_vec float[],
  model bytea
)
RETURNS float
AS
$$
  # R code goes here.
  mdl <- unserialize(model)
  # class(feat_vec) outputs "array"
  y_hat <- predict.lm(mdl, newdata = as.data.frame.list(feat_vec))
  return(y_hat)
$$ LANGUAGE 'plr';
This returns the wrong y_hat! I know this because the following alternative works (the inputs are still the model (a byte array) and one feat_vec (an array)):
CREATE OR REPLACE FUNCTION lm_predict(
  feat_vec float[],
  model bytea
)
RETURNS float
AS
$$
  # R code goes here.
  mdl <- unserialize(model)
  coef <- mdl$coefficients
  y_hat <- coef[1] + as.numeric(coef[-1] %*% feat_vec)
  return(y_hat)
$$ LANGUAGE 'plr';
What am I doing wrong? It is the same unserialized model, so the first version should give the right answer as well...
The problem seems to be the use of newdata = as.data.frame.list(feat_vec). As discussed in your previous question, this returns ugly column names, while predict requires newdata to have column names consistent with the covariate names in your model formula. You should see a warning message when you call predict.
## example data
set.seed(0)
x1 <- runif(20)
x2 <- rnorm(20)
y <- 0.3 * x1 + 0.7 * x2 + rnorm(20, sd = 0.1)
## linear model
model <- lm(y ~ x1 + x2)
## new data
feat_vec <- c(0.4, 0.6)
newdat <- as.data.frame.list(feat_vec)
# X0.4 X0.6
#1 0.4 0.6
## prediction
y_hat <- predict.lm(model, newdata = newdat)
#Warning message:
#'newdata' had 1 row but variables found have 20 rows
What you need is
newdat <- as.data.frame.list(feat_vec,
                             col.names = attr(model$terms, "term.labels"))
# x1 x2
#1 0.4 0.6
y_hat <- predict.lm(model, newdata = newdat)
# 1
#0.5192413
This is the same as what you can compute manually:
coef = model$coefficients
unname(coef[1] + sum(coef[-1] * feat_vec))
#[1] 0.5192413
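Applied back to the PL/R function, the only change needed is to build newdat with the proper column names before predicting. A sketch of the relevant R lines inside the function body:
mdl <- unserialize(model)
newdat <- as.data.frame.list(feat_vec,
                             col.names = attr(mdl$terms, "term.labels"))
y_hat <- predict.lm(mdl, newdata = newdat)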
I currently have the following code with two functions that calculate the model fit for two distinct models. The only difference is in the lm call, where + log(v2) has been added in model 2.
R code
dat <- data.frame(clicks = runif(30, 1, 100), v1 = runif(30, 1, 100), v2 = runif(30, 1, 100))
p0 <- 1 # number of parameters in lm()
p1 <- 2 # number of parameters in lm()
n <- nrow(dat) - 1
## Model 1 Loop
model1 <- function(x) {
  fit <- lm(log(clicks) ~ log(v1), data = dat, subset = 1:x, model = FALSE)
  pred <- predict(fit, newdata = dat[x + 1, ])
  c(summary(fit)$r.squared)
}
## Model 1 Regression
result_m1 <- t(sapply(p0:n, model1))
data.frame(result_m1)
## Model 2 Loop
model2 <- function(x) {
  fit <- lm(log(clicks) ~ log(v1) + log(v2), data = dat, subset = 1:x, model = FALSE)
  pred <- predict(fit, newdata = dat[x + 1, ])
  c(summary(fit)$r.squared)
}
## Model 2 Regression
result_m2 <- t(sapply(p1:n, model2))
data.frame(result_m2)
Question: Can I somehow create a function that loops over the different models, instead of repeating the calculation for every model?
I had something like this in mind but wasn't able to implement it: http://www.ats.ucla.edu/stat/r/pages/looping_strings.htm
I don't see the point in hand-rolling a function for something that can easily be done with the model-selection functions in available packages.
library(leaps)
library(dplyr)
b <- regsubsets(clicks ~ ., data=dat, nbest=10, nvmax=2) # carries out exhaustive model selection (10 best models; 2 maximum predictors)
coef(b, 1:3) # returns coefficients for the 3 models in this case
[[1]]
(Intercept) v1
60.8067570 -0.2665699
[[2]]
(Intercept) v2
49.96974177 -0.05227489
[[3]]
(Intercept) v1 v2
62.02323816 -0.26422966 -0.02676747
summary(b)$rsq # provides the r.squared values for the 3 models
[1] 0.067952759 0.002366681 0.068568059
Running predictions is a tad more complicated.
all.mods <- summary(b)$which[,-1] # logical matrix of predictor combinations
all.mods
v1 v2
1 TRUE FALSE
1 FALSE TRUE
2 TRUE TRUE
RHS <- lapply(seq(nrow(all.mods)), function(m)
  summary(b)$which[m, -1] %>% which %>% names %>% paste(., collapse = "+"))
RHS
[[1]]
[1] "v1"
[[2]]
[1] "v2"
[[3]]
[1] "v1+v2"
lm.form <- lapply(RHS, function(m) parse(text = paste("lm(clicks ~", m, ", data=dat)")))
lm.mods <- lapply(lm.form, eval) # evaluate each expression, returning the fitted lm models
The list of fitted models in lm.mods can subsequently be used with predict on new data.
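A minimal sketch of that last step, where new.data is hypothetical test data with the same predictor columns as dat:
## predict every selected model on new data, one column per model
new.data <- data.frame(v1 = runif(5, 1, 100), v2 = runif(5, 1, 100))
preds <- sapply(lm.mods, predict, newdata = new.data)
preds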