Using MAE as the error function for a linear model in R

I'd like to perform linear regression; however, instead of using RMSE as my error function, I'd like to use MAE (mean absolute error).
Is there a package that would allow me to do this?

You may use the caret and Metrics packages.
library(caret)
data("mtcars")
maeSummary <- function(data,
                       lev = NULL,
                       model = NULL) {
  require(Metrics)
  out <- mae(data$obs, data$pred)
  names(out) <- "MAE"
  out
}
mControl <- trainControl(summaryFunction = maeSummary)
set.seed(123)
lm_model <- train(mpg ~ wt,
                  data = mtcars,
                  method = "lm",
                  metric = "MAE",
                  maximize = FALSE,
                  trControl = mControl)
> lm_model$metric
[1] "MAE"

Probably late to the party, but here is a solution using the CVXR package for optimisation.
library(CVXR)
# defining variables to be tuned during optimisation
coefficient <- Variable(1)
intercept <- Variable(1)
# defining the objective, i.e. minimizing the sum of absolute differences (equivalent to minimizing the MAE)
objective <- Minimize(sum(abs(mtcars$disp - (mtcars$hp * coefficient) - intercept)))
# optimisation
problem <- Problem(objective)
result <- solve(problem)
# result
result$status
mae_coefficient <- result$getValue(coefficient)
mae_intercept <- result$getValue(intercept)
lm_coeff_intrc <- lm(formula = disp ~ hp, data = mtcars)$coefficients
library(tidyverse)
ggplot(mtcars, aes(hp, disp)) +
  geom_point() +
  geom_abline(
    slope = lm_coeff_intrc["hp"],
    intercept = lm_coeff_intrc["(Intercept)"],
    color = "red"
  ) +
  geom_abline(
    slope = mae_coefficient,
    intercept = mae_intercept,
    color = "blue"
  )
df <- mtcars %>%
  select(disp, hp) %>%
  rownames_to_column() %>%
  mutate(
    mae = disp - hp * mae_coefficient - mae_intercept,
    lm = disp - hp * lm_coeff_intrc["hp"] - lm_coeff_intrc["(Intercept)"]
  )
df %>%
  select(mae, lm) %>%
  pivot_longer(cols = 1:2) %>%
  group_by(name) %>%
  summarise(
    mae = sum(abs(value))
  )
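Note that the objective above minimizes the sum of absolute differences, which gives the same coefficients as minimizing the mean; if you want to report the actual MAE of each fit, a small variation of the summary above (reusing df) is:
df %>%
  select(mae, lm) %>%
  pivot_longer(cols = 1:2) %>%
  group_by(name) %>%
  summarise(MAE = mean(abs(value)))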

Related

How to create a loop over a different set of variables and models in R

The code below first prints the lm for mpg ~ disp and then for mpg ~ disp + wt. I would like to create another loop over the models (note that the second lm is my personalized model; for simplicity, we can assume it is lm). How can I loop over different models?
data("mtcars")
formulas <- list(
  mpg ~ disp,
  mpg ~ disp + wt
)
models <- list(lm, lm)
res <- vector("list", length = length(formulas))
for (i in seq_along(formulas)) {
  res[[i]] <- lm(formulas[[i]], data = mtcars)
}
res
or
lapply(formulas, lm, data = mtcars)
You may use a nested lapply:
lapply(models, function(x) lapply(formulas, function(y) x(y, data = mtcars)))
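The nested lapply returns a list of lists (one inner list per model); if a single flat list of fits is more convenient, one possible follow-up (an addition, not part of the original answer) is:
res <- lapply(models, function(x) lapply(formulas, function(y) x(y, data = mtcars)))
# drop one level of nesting: four fits in a single list
flat <- unlist(res, recursive = FALSE)
length(flat)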
I like to use tidyverse's purrr for such multi-model approaches:
pacman::p_load(dplyr, tidyr, purrr)
data("mtcars")
d <- crossing(formula = c(mpg ~ disp, mpg ~ disp + wt),
              model = list("lm", "glm")) %>%
  mutate(result = pmap(.l = list(model, formula),
                       .f = function(m, f) do.call(m, args = list(formula = f, data = substitute(mtcars)))))
We could use outer in base R, which should be fast:
out <- c(outer(models, formulas, Vectorize(function(x, y) list(x(y, data = mtcars)))))
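Because c() flattens the 2 x 2 outer result column-major, the models vary fastest within each formula; for example (an illustration, not part of the original answer), the third element is the first model (lm) fitted with the second formula:
out[[3]]  # lm fit of mpg ~ disp + wt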

Cannot use non-standard evaluation in a self-defined function in R

I want to write a function that extracts some information from a gam model.
I can do this without a self-defined function (df is what I want):
library(mgcv)
library(tidyverse)
model = gam(mpg ~ cyl, data = mtcars)
result = summary(model)$p.table
estimate = result[2,1]
se = result[2,2]
df = data.frame(estimate = estimate, se = se)
df
Then I wrapped it in a self-defined function:
my_gam <- function(y, x, data){
  model = gam(y ~ x, data = data)
  result = summary(model)$p.table
  estimate = result[2,1]
  se = result[2,2]
  df = data.frame(estimate = estimate, se = se)
  df
}
But I cannot call my function correctly:
my_gam(y = mpg, x = cyl, data = mtcars)
Error in eval(predvars, data, env) : object 'cyl' not found
my_gam(y = 'mpg', x = 'cyl', data = mtcars)
Error in gam(y ~ x, data = data) :
Not enough (non-NA) data to do anything meaningful
Is there a way I can get df just as in the first code block when I run my_gam(y = mpg, x = cyl, data = mtcars)?
Any help will be highly appreciated!
You can use reformulate/as.formula to construct the formula.
library(mgcv)
my_gam <- function(y, x, data){
  model = gam(reformulate(x, y), data = data)
  result = summary(model)$p.table
  estimate = result[2,1]
  se = result[2,2]
  df = data.frame(estimate = estimate, se = se)
  df
}
my_gam(y = 'mpg', x = 'cyl', data = mtcars)
# estimate se
#1 -2.876 0.3224
We can construct the formula with paste, which is fast:
my_gam <- function(y, x, data){
  model <- gam(as.formula(paste(y, "~", x)), data = data)
  result <- summary(model)$p.table
  estimate <- result[2,1]
  se <- result[2,2]
  df <- data.frame(estimate = estimate, se = se)
  df
}
my_gam(y = 'mpg', x = 'cyl', data = mtcars)
# estimate se
#1 -2.87579 0.3224089
Another option is to pass a formula as an argument:
my_gam <- function(fmla, data){
  model <- gam(fmla, data = data)
  result <- summary(model)$p.table
  estimate <- result[2,1]
  se <- result[2,2]
  df <- data.frame(estimate = estimate, se = se)
  df
}
my_gam(mpg ~ cyl, data = mtcars)
# estimate se
# 1 -2.87579 0.3224089
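If you would rather keep the bare-name call from the question, my_gam(y = mpg, x = cyl, data = mtcars), a minimal sketch of the same idea captures the arguments with substitute() before building the formula:
library(mgcv)
my_gam <- function(y, x, data){
  # turn the unquoted arguments into character names and build the formula
  fmla <- reformulate(deparse(substitute(x)), deparse(substitute(y)))
  model <- gam(fmla, data = data)
  result <- summary(model)$p.table
  data.frame(estimate = result[2, 1], se = result[2, 2])
}
my_gam(y = mpg, x = cyl, data = mtcars)
# should reproduce the estimate and se shown above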

Fit models with robust standard errors

I am using the following R code to run several linear regression models and extract the results to a data frame:
library(tidyverse)
library(broom)
data <- mtcars
outcomes <- c("wt", "mpg", "hp", "disp")
exposures <- c("gear", "vs", "am")
models <- expand.grid(outcomes, exposures) %>%
  group_by(Var1) %>% rowwise() %>%
  summarise(frm = paste0(Var1, "~factor(", Var2, ")")) %>%
  group_by(model_id = row_number(), frm) %>%
  do(tidy(lm(.$frm, data = data))) %>%
  mutate(lci = estimate - (1.96 * std.error),
         uci = estimate + (1.96 * std.error))
How can I modify my code to use robust standard errors similar to Stata?
* example of using robust standard errors in Stata
regress y x, robust
There is a comprehensive discussion about robust standard errors in lm models on Stack Exchange.
You can update your code in the following way:
library(sandwich)
models <- expand.grid(outcomes, exposures) %>%
  group_by(Var1) %>% rowwise() %>%
  summarise(frm = paste0(Var1, "~factor(", Var2, ")")) %>%
  group_by(model_id = row_number(), frm) %>%
  do(cbind(
    tidy(lm(.$frm, data = data)),
    robSE = sqrt(diag(vcovHC(lm(.$frm, data = data), type = "HC1")))
  )) %>%
  mutate(
    lci = estimate - (1.96 * std.error),
    uci = estimate + (1.96 * std.error),
    lciR = estimate - (1.96 * robSE),
    uciR = estimate + (1.96 * robSE)
  )
The important line is this:
sqrt(diag(vcovHC(lm(.$frm, data = data), type = "HC1")))
The vcovHC function returns the covariance matrix. You extract the variances on the diagonal with diag and compute their square root with sqrt.
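For a single model, a minimal sketch of the same idea that mirrors Stata's regress y x, robust (assuming the lmtest package is available in addition to sandwich):
library(sandwich)
library(lmtest)
fit <- lm(wt ~ factor(gear), data = mtcars)
# HC1 is the small-sample correction that Stata's "robust" option uses
coeftest(fit, vcov. = vcovHC(fit, type = "HC1"))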

Histogram of AIC for each model

Hello, how can I create a histogram of the difference between the AIC of each model and the AIC of the full model?
#AIC of the full model
Y <- modelTT$aic
#AICs for each of the n models.
X <- lapply(listOfModels,function(xx) xx$aic)
So basically I want to compute X - Y first. Then I need to create a histogram of the difference values, ordered from largest to smallest.
Another alternative using broom:
df = data.frame(a = sample(1:10, replace = TRUE, 24),
                b = sample(25:40, replace = TRUE, 24),
                c = sample(0:1, replace = TRUE, 24))
model1 = lm(a ~ b + c, df)
model2 = lm(b ~ c, df )
model3 = lm(a ~ c, df)
library(broom)
library(ggplot2)
library(dplyr)
mod1 = glance(model1) %>% mutate(model = "m1")
mod2 = glance(model2) %>% mutate(model = "m2")
mod3 = glance(model3) %>% mutate(model = "m3")
models = bind_rows(mod1, mod2, mod3)
models %>% ggplot(aes(model,AIC)) + geom_bar(stat = "identity")
This gives a bar chart of the AIC for each model.
A generic data.frame
db<-data.frame(y=c(1,2,3,4,5,6,7,8,9),x1=c(9,8,7,6,5,4,3,2,1),x2=c(9,9,7,7,5,5,3,3,1))
A list of lm models
LM_models <- NULL
LM_models[[1]] <- lm(y ~ x1 + x2, data = db)
LM_models[[2]] <- lm(y ~ x1, data = db)
LM_models[[3]] <- lm(y ~ x2, data = db)
AIC calculation
AIC <- lapply(LM_models, AIC)
Decreasing plot
plot(sort(unlist(AIC), decreasing = TRUE), type = "h")
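To get the differences to the full model that the question asks for, a short sketch (assuming, for illustration, that the first fit y ~ x1 + x2 plays the role of the full model) is:
# AIC of the assumed full model
aic_full <- unlist(AIC)[1]
# differences of each model's AIC to the full model, largest first
delta_aic <- sort(unlist(AIC) - aic_full, decreasing = TRUE)
hist(delta_aic, main = "AIC difference to the full model", xlab = "delta AIC")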

Prediction of single bagged tree models dependent on pre-processing using caret

I'm using the caret package to predict a time series with the method treebag. caret estimates bagged regression trees with 25 bootstrap replications.
What I'm struggling to understand is how the final prediction of that 'treebag' model relates to the predictions made by each of the 25 trees, depending on whether I use caret::preProcess or not.
I am aware of this question and the linked resources therein. (But could not draw the right conclusions from it.)
Here is an example using the economics data. Let's say I want to predict unemploy_rate, which has to be created first.
# packages
library(caret)
library(tidyverse)
# data
data("economics")
economics$unemploy_rate <- economics$unemploy / economics$pop * 100
x <- economics[, -c(1, 7)]
y <- economics[["unemploy_rate"]]
I wrote a function that extracts the 25 individual trees from the train object, makes a prediction for each tree, averages these 25 predictions, and compares this average with the prediction from the train object. It returns a plot.
predict_from_treebag <- function(model) {
  # extract 25 trees from train object
  bagged_trees <- map(.x = model$finalModel$mtrees, .f = pluck, "btree")
  # make a prediction for each tree
  pred_trees <- map(bagged_trees, .f = predict, newdata = x)
  names(pred_trees) <- paste0("tree_", seq_along(pred_trees))
  # aggregate predictions
  pred_trees <- as.data.frame(pred_trees) %>%
    add_column(date = economics$date, .before = 1) %>%
    gather(tree, value, matches("^tree")) %>%
    group_by(date) %>%
    mutate(mean_pred_from_trees = mean(value)) %>%
    ungroup()
  # add prediction from train object
  pred_trees$bagging_model_prediction <- predict(model, x)
  pred_trees <- pred_trees %>%
    gather(model, pred_value, 4:5)
  # plot
  p <- ggplot(data = pred_trees, aes(date)) +
    geom_line(aes(y = value, group = tree), alpha = .2) +
    geom_line(aes(y = pred_value, col = model)) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.position = "bottom"
    )
  p
}
Now I estimate two models: the first will be unscaled, the second will be centered and scaled.
preproc_opts <- list(unscaled = NULL,
                     scaled = c("center", "scale"))
# estimate the models
models <- map(preproc_opts, function(preproc)
  train(
    x = x,
    y = y,
    trControl = trainControl(method = "none"), # since there are no tuning parameters for this model
    metric = "RMSE",
    method = "treebag",
    preProcess = preproc
  ))
# apply predict_from_treebag to each model
imap(.x = models,
     .f = ~{predict_from_treebag(.x) + labs(title = .y)})
The results are shown below. The unscaled model's prediction is the average of the 25 trees, but why is each of the 25 tree predictions a constant when I use preProcess?
Thank you for any advice where I might be wrong.
The problem is in this part of the code:
pred_trees <- map(bagged_trees, .f = predict, newdata = x)
in the function predict_from_treebag. This predict call in fact dispatches to predict.rpart, as class(bagged_trees[[1]]) shows. predict.rpart does not know that you pre-processed the data in caret: the trees were grown on centered and scaled predictors, so when they are handed the raw x, the split points no longer match the data, (almost) every observation lands in the same terminal node, and each tree returns a constant prediction.
Here is a quick fix:
predict_from_treebag <- function(model) {
  # extract 25 trees from train object
  bagged_trees <- map(.x = model$finalModel$mtrees, .f = pluck, "btree")
  x <- economics[, -c(1, 7)]
  # make a prediction for each tree, applying the model's pre-processing (if any) first
  newdata <- if (is.null(model$preProcess)) x else predict(model$preProcess, x)
  pred_trees <- map(bagged_trees, .f = predict, newdata = newdata)
  names(pred_trees) <- paste0("tree_", seq_along(pred_trees))
  # aggregate predictions
  pred_trees <- as.data.frame(pred_trees) %>%
    add_column(date = economics$date, .before = 1) %>%
    gather(tree, value, matches("^tree")) %>%
    group_by(date) %>%
    mutate(mean_pred_from_trees = mean(value)) %>%
    ungroup()
  # add prediction from train object
  pred_trees$bagging_model_prediction <- predict(model, x)
  pred_trees <- pred_trees %>%
    gather(model, pred_value, 4:5)
  # plot
  p <- ggplot(data = pred_trees, aes(date)) +
    geom_line(aes(y = value, group = tree), alpha = .2) +
    geom_line(aes(y = pred_value, col = model)) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.position = "bottom"
    )
  p
}
Now after running:
preproc_opts <- list(unscaled = NULL,
                     scaled = c("center", "scale"))
models <- map(preproc_opts, function(preproc)
  train(
    x = x,
    y = y,
    trControl = trainControl(method = "none"), # since there are no tuning parameters for this model
    metric = "RMSE",
    method = "treebag",
    preProcess = preproc
  ))
map2(.x = models,
     .y = names(models),
     .f = ~{predict_from_treebag(.x) + labs(title = .y)})
the result is in line with what is expected.
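As a numerical check (a sketch reusing the objects defined above), you can confirm that the treebag prediction is simply the average of the 25 per-tree predictions once the same pre-processing is applied:
m <- models$scaled
trees <- map(m$finalModel$mtrees, pluck, "btree")
newdata <- predict(m$preProcess, x)
# 25 columns of per-tree predictions
tree_preds <- sapply(trees, predict, newdata = newdata)
# average of the trees vs. the prediction from the train object
all.equal(unname(rowMeans(tree_preds)), unname(predict(m, x)))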

Resources