Histogram of AIC for each model - R

Hello, how can I create a histogram of the difference between the AIC of each model and the AIC of the full model?
#AIC of the full model
Y <- modelTT$aic
#AICs for each of the n models.
X <- lapply(listOfModels,function(xx) xx$aic)
So basically I want to compute X - Y first, then create a histogram of the difference values, ordered from largest to smallest.
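A minimal sketch of that computation, assuming listOfModels and modelTT are glm-style fits that expose an $aic element:
# AIC of the full model
Y <- modelTT$aic
# AIC of each of the n candidate models, as a numeric vector
X <- sapply(listOfModels, function(xx) xx$aic)
# differences to the full model, ordered from largest to smallest
diffs <- sort(X - Y, decreasing = TRUE)
# histogram of the differences, or one bar per model in decreasing order
hist(diffs, main = "AIC difference to full model", xlab = "AIC difference")
barplot(diffs)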

Another alternative using broom:
df = data.frame(a = sample(1:10, replace = TRUE, 24),
b = sample(25:40, replace = TRUE, 24),
c = sample(0:1, replace = TRUE, 24))
model1 = lm(a ~ b + c, df)
model2 = lm(b ~ c, df )
model3 = lm(a ~ c, df)
library(broom)
library(ggplot2)
library(dplyr)
mod1 = glance(model1) %>% mutate(model = "m1")
mod2 = glance(model2) %>% mutate(model = "m2")
mod3 = glance(model3) %>% mutate(model = "m3")
models = bind_rows(mod1, mod2, mod3)
models %>% ggplot(aes(model,AIC)) + geom_bar(stat = "identity")
This gives a bar chart of the AIC for each model.

A generic data.frame
db<-data.frame(y=c(1,2,3,4,5,6,7,8,9),x1=c(9,8,7,6,5,4,3,2,1),x2=c(9,9,7,7,5,5,3,3,1))
A list of lm models
LM_models <- list()
LM_models[[1]] <- lm(y ~ x1 + x2, data = db)
LM_models[[2]] <- lm(y ~ x1, data = db)
LM_models[[3]] <- lm(y ~ x2, data = db)
AIC calculation
aics <- lapply(LM_models, AIC)
Decreasing plot
plot(sort(unlist(aics), decreasing = TRUE), type = "h")

Related

How can you force predictions of a linear model to start at the same point as new data

I would like my regression model's predictions to start at the same point as the data.
Data:
library(tidyverse)

df <- tibble(
x = c(0:20, 0:20),
y = c(log(10:30 + 2), log(10:30 + 10)),
init = c(rep(log(10 + 2), 21), rep(log(10 + 10), 21)),
group = c(rep('A', 21), rep('B', 21))
)
Model:
lm_fit <- lm(y ~ log(x + 1) + init, data = df)
Example of model fitted to data:
newdata <- df %>%
  filter(group == 'A') %>%
  mutate(pred_y = predict(lm_fit, newdata = ., type = 'response')) %>%
  pivot_longer(c(y, pred_y), names_to = 'pred_type', values_to = 'value')
ggplot(newdata, aes(x, value, colour = pred_type)) +
  geom_point() +
  geom_line()
How can I change my model so the red line (model) starts at the same value as the blue line (data)? i.e. when x=0, pred_y = y.
Using your init variable, you have to treat it as an offset (its coefficient will be 1) and disable the intercept (-1 in model formula).
lm_fit <- lm(y ~ log(x + 1) + offset(init) - 1, data = df)
If the model formula is changed to log(y) ~ log(x + 1), a possible approach is to transform the y variable and use its new value at x = 0 for the offset (init) variable (I would actually recommend always deriving the offset from the y variable rather than computing it independently). This way only the data is modified and the rest stays the same.
df <- df %>%
group_by(group) %>%
mutate(y = log(y),
init = y[x==0])
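As a quick check of the offset approach (a sketch, assuming the transformed df above): at x = 0 the term log(x + 1) is zero, so the prediction equals the offset, which is exactly y at x = 0.
lm_fit <- lm(y ~ log(x + 1) + offset(init) - 1, data = df)
df$pred_y <- predict(lm_fit, newdata = df)
df %>%
  filter(x == 0) %>%
  select(group, y, pred_y)   # pred_y should equal y at x = 0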

Fitting decays in R

I have data like the following and I want to fit a decay curve to it.
library(tidyverse)
library(broom)
t = 1:100
x=1:80
y1=sample(seq(from = 20, to = 50), size = 100, replace = TRUE)
y1<-y1 %>% jitter()
y2 = 24 + (60 - 24) * -0.01 * x %>% jitter(10)
df1 <- tibble(t = t, y = y1, sensor = 'sensor1') %>%
rbind(. , data.frame(t = x, y = y2, sensor = 'sensor2'))
fit <- nls(y ~ SSasymp(t, yf, y0, log_alpha), data = filter(df1, sensor == 'sensor1'))
fit
# Fit the data
fitted <- df1 %>%
  nest(data = -sensor) %>%
mutate(
fit = map(data, ~nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .)),
tidied = map(fit, tidy),
augmented = map(fit, augment),
)
And I got :
Error in mutate():
! Problem while computing fit = map(data, ~nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .)).
Caused by error in nls():
! singular gradient
Could you please let me know where the issue is?
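For what it's worth, a minimal sketch of the same nest/map workflow on simulated data that actually decays toward an asymptote (the values below are hypothetical): SSasymp tends to hit a singular gradient when the series has no asymptotic shape, as with the uniform noise used for sensor1 above.
library(tidyverse)
library(broom)
# hypothetical exponential decays for two sensors
decay_df <- tibble(t = rep(1:100, 2),
                   sensor = rep(c('sensor1', 'sensor2'), each = 100)) %>%
  mutate(y = ifelse(sensor == 'sensor1',
                    20 + 30 * exp(-0.05 * t),
                    24 + 36 * exp(-0.03 * t)) + rnorm(n(), sd = 0.5))
fits <- decay_df %>%
  nest(data = -sensor) %>%
  mutate(fit = map(data, ~ nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .x)),
         tidied = map(fit, tidy))
fits %>% unnest(tidied)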

Overlay decision boundary for random forests and boosting

I generate some random data and am trying to overlay a decision boundary based on fits from random forests and boosting. I can recreate the problem below. I generate the data, and with a regression tree I can easily overlay the decision boundary using the following code:
library(tidyverse)
library(rsample)       # initial_split()
library(tree)
library(randomForest)
library(gbm)
# set seed and generate some random data
set.seed(123)
Dat <- tibble(
x1 = rnorm(100),
x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))
circlepts <- tibble(theta = seq(0, 2*pi, length = 100)) %>%
mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))
# graph the data and draw the boundary
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)
# convert character to binary inputs making classification easier
binVec = as.vector(Dat$y)
binVec[which(binVec =="A")] = 1
binVec[which(binVec == "B")] = 0
binVec = as.numeric(binVec)
Dat$y = binVec
# split the data up
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))
tree_fit <- tree(y~ ., training_set)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x2, 50)) %>%
modelr::add_predictions(tree_fit)
# plot the data with the decision overlay of the tree fit
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
Now if I try doing so with random forests or gradient boosting, add_predictions doesn't cooperate that well...
rf_fit <- randomForest(y ~ ., data=training_set, mtry = 2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x2, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
##ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero
And for gradient boosting:
fitBoost <- gbm(y ~ ., data= Dat, distribution = "gaussian",
n.trees = 1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x2, 50)) %>%
modelr::add_predictions(fitBoost)
### ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default
It seems to be a very simple problem. Could someone help me out?
The following code works with your random forest:
training_set$y <- factor(training_set$y)
rf_fit <- randomForest(y ~ ., data=training_set, mtry=2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x2, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
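An optional sanity check (not part of the original answer), relying on the fact that randomForest does classification when the response is a factor: add_predictions then fills pred with the predicted class labels, so as.numeric(pred) yields 1s and 2s and the contour with binwidth = 1 falls between the two classes.
str(grid$pred)                 # factor with levels "0" and "1"
table(as.numeric(grid$pred))   # counts of 1 and 2 across the grid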
And here is the code for the gradient boosting machine:
fitBoost <- gbm(y ~ ., data=Dat, distribution="gaussian", n.trees=1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
add_predictions2 <- function (data, model, var = "pred", type = NULL)
{
data[[var]] <- predict2(model, data, type = type)
data
}
predict2 <- function (model, data, type = NULL)
{
if (is.null(type)) {
stats::predict(model, data, n.trees=1000)
} else {
stats::predict(model, data, type = type, n.trees=1000)
}
}
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x2, 50)) %>%
add_predictions2(fitBoost)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
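If you would rather not redefine modelr's helpers, an equivalent sketch is to compute the gbm predictions directly and attach them with mutate; the grid then feeds geom_contour exactly as before:
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  mutate(pred = predict(fitBoost, newdata = ., n.trees = 1000))
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)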

Using MAE as the error function for a linear model

I'd like to perform linear regression, however instead of using RMSE as my error function, I'd like to use MAE (Mean Absolute Error).
Is there a package that would allow me to do this?
You may use the caret and Metrics packages.
library(caret)
data("mtcars")
maeSummary <- function (data,
lev = NULL,
model = NULL) {
require(Metrics)
out <- mae(data$obs, data$pred)
names(out) <- "MAE"
out
}
mControl <- trainControl(summaryFunction = maeSummary)
set.seed(123)
lm_model <- train(mpg ~ wt,
data = mtcars,
method = "lm",
metric = "MAE",
maximize = FALSE,
trControl = mControl)
> lm_model$metric
[1] "MAE"
Probably late to the party, but here is a solution using the CVXR package for optimisation.
library(CVXR)
# defining variables to be tuned during optimisation
coefficient <- Variable(1)
intercept <- Variable(1)
# defining the objective, i.e. minimizing the sum of absolute differences (MAE)
objective <- Minimize(sum(abs(mtcars$disp - (mtcars$hp * coefficient) - intercept)))
# optimisation
problem <- Problem(objective)
result <- solve(problem)
# result
result$status
mae_coefficient <- result$getValue(coefficient)
mae_intercept <- result$getValue(intercept)
lm_coeff_intrc <- lm(formula = disp ~ hp, data = mtcars)$coefficients
library(tidyverse)
ggplot(mtcars, aes(hp, disp)) +
geom_point() +
geom_abline(
slope = lm_coeff_intrc["hp"],
intercept = lm_coeff_intrc["(Intercept)"],
color = "red"
) +
geom_abline(
slope = mae_coefficient,
intercept = mae_intercept,
color = "blue"
)
df <- mtcars %>%
select(disp, hp) %>%
rownames_to_column() %>%
mutate(
mae = disp - hp * mae_coefficient - mae_intercept,
lm = disp - hp * lm_coeff_intrc["hp"] - lm_coeff_intrc["(Intercept)"]
)
df %>%
select(mae, lm) %>%
pivot_longer(cols = 1:2) %>%
group_by(name) %>%
summarise(
mae = sum(abs(value))
)
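For completeness, the same L1 fit can be sketched with quantile regression, since median regression (tau = 0.5) also minimizes the sum of absolute residuals; this assumes the quantreg package is acceptable for your use case:
library(quantreg)
rq_fit <- rq(disp ~ hp, data = mtcars, tau = 0.5)   # L1 / median regression
coef(rq_fit)   # should land close to mae_intercept and mae_coefficient above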

Visualize Multilevel Growth Model with nlme/ggplot2 vs lme4/ggplot2

I am trying to visualize the results of an nlme object without success. When I do so with an lmer object, the correct plot is created. My goal is to use nlme and visualize a fitted growth curve for each individual with ggplot2. The predict() function seems to work differently with nlme and lmer objects.
model:
#AR1 with REML
autoregressive <- lme(NPI ~ time,
data = data,
random = ~time|patient,
method = "REML",
na.action = "na.omit",
control = list(maxlter=5000, opt="optim"),
correlation = corAR1())
nlme visualization attempt:
data <- na.omit(data)
data$patient <- factor(data$patient,
levels = 1:23)
ggplot(data, aes(x=time, y=NPI, colour=factor(patient))) +
geom_point(size=1) +
#facet_wrap(~patient) +
geom_line(aes(y = predict(autoregressive,
level = 1)), size = 1)
when I use:
data$fit<-fitted(autoregressive, level = 1)
geom_line(aes(y = fitted(autoregressive), group = patient))
it returns the same fitted values for each individual, and so ggplot produces the same growth curve for each. Running test <- data.frame(ranef(autoregressive, level = 1)) returns varying intercepts and slopes by patient id. Interestingly, when I fit the model with lmer and run the code below, it returns the correct plot. Why does predict() work differently with nlme and lmer objects?
timeREML <- lmer(NPI ~ time + (time | patient),
data = data,
REML=T, na.action=na.omit)
ggplot(data, aes(x = time, y = NPI, colour = factor(patient))) +
geom_point(size=3) +
#facet_wrap(~patient) +
geom_line(aes(y = predict(timeREML)))
In creating a reproducible example, I found that the error was not occurring in predict() nor in ggplot() but instead in the lme model.
Data:
###libraries
library(nlme)
library(tidyr)
library(ggplot2)
###example data
df <- data.frame(replicate(78, sample(seq(from = 0,
to = 100, by = 2), size = 25,
replace = F)))
##add id
df$id <- 1:nrow(df)
##rearrange cols
df <- df[c(79, 1:78)]
##sort columns
df[,2:79] <- lapply(df[,2:79], sort)
##long format
df <- gather(df, time, value, 2:79)
##convert time to numeric
df$time <- factor(df$time)
df$time <- as.numeric(df$time)
##order by id, time, value
df <- df[order(df$id, df$time),]
##order value
df$value <- sort(df$value)
Model 1 with no NA values fits successfully.
###model1
model1 <- lme(value ~ time,
data = df,
random = ~time|id,
method = "ML",
na.action = "na.omit",
control = list(maxlter=5000, opt="optim"),
correlation = corAR1(0, form=~time|id,
fixed=F))
Introducing NAs causes a non-invertible coefficient matrix error in model 1.
###model 1 with one NA value
df[3,3] <- NA
model1 <- lme(value ~ time,
data = df,
random = ~time|id,
method = "ML",
na.action = "na.omit",
control = list(maxlter=2000, opt="optim"),
correlation = corAR1(0, form=~time|id,
fixed=F))
But not in model 2, which has a simpler within-group AR(1) correlation structure.
###but not in model2
model2 <- lme(value ~ time,
data = df,
random = ~time|id,
method = "ML",
na.action = "na.omit",
control = list(maxlter=2000, opt="optim"),
correlation = corAR1(0, form = ~1 | id))
However, changing opt="optim" to opt="nlminb" fits model 1 successfully.
###however changing the opt to "nlminb", model 1 runs
model3 <- lme(value ~ time,
data = df,
random = ~time|id,
method = "ML",
na.action = "na.omit",
control = list(maxlter=2000, opt="nlminb"),
correlation = corAR1(0, form=~time|id,
fixed=F))
The code below visualizes model 3 (formerly model 1) successfully.
df <- na.omit(df)
ggplot(df, aes(x=time, y=value)) +
geom_point(aes(colour = factor(id))) +
#facet_wrap(~id) +
geom_line(aes(y = predict(model3, level = 0)), size = 1.3, colour = "black") +
geom_line(aes(y = predict(model3, level=1, group=id), colour = factor(id)), size = 1)
Note that I am not exactly sure what changing the optimizer from "optim" to "nlminb" does and why it works.
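For reference, a sketch of the model 3 call written with nlme's lmeControl() helper (the documented argument name for the iteration limit is maxIter):
model3 <- lme(value ~ time,
              data = df,
              random = ~ time | id,
              method = "ML",
              na.action = na.omit,
              control = lmeControl(maxIter = 2000, opt = "nlminb"),
              correlation = corAR1(0, form = ~ time | id, fixed = FALSE))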

Resources