Changing Reference Level of Categorical Variables Changes Confusion Matrix & Prediction Probabilities

I am trying to understand why changing the reference level of a factor changes the results of a model. Consider this example:
library(liver)
library(caret)
library(glmnet)
library(dplyr)
data(churn)
head(churn)
# set reference levels
churn$state <- relevel(churn$state, ref = "NE")
churn$area.code <- relevel(churn$area.code, ref = "area_code_408")
churn$intl.plan <- relevel(churn$intl.plan, ref = "yes")
churn$voice.plan <- relevel(churn$voice.plan, ref = "no")
# split into train and test
set.seed(1)
train.index <- createDataPartition(churn$churn, p = 0.8, list = FALSE)
train_churn <- churn[train.index,]
test_churn <- churn[-train.index,]
# add class weights
my_weights = train_churn %>%
  select(churn) %>%
  group_by(churn) %>%
  count()
weight_for_yes = (1 / my_weights$n[1]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_yes
weight_for_no = (1 / my_weights$n[2]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_no
model_weights <- ifelse(train_churn$churn == "yes", weight_for_yes, weight_for_no)
# tuning grid
myGrid <- expand.grid(
  alpha = 0,
  lambda = seq(0, 1, 0.01)
)
set.seed(1)
mod_1 <- train(churn ~
                 state +
                 area.code +
                 intl.plan +
                 voice.plan,
               data = train_churn,
               method = "glmnet",
               tuneGrid = myGrid,
               weights = model_weights)
mod_1
Tuning parameter 'alpha' was held constant at a value of 0
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were alpha = 0 and lambda = 0.8
prediction <- predict(mod_1, newdata = test_churn)
confusionMatrix(prediction, test_churn$churn)
I also looked at a single new prediction:
new_data = data.frame(state = c("CA"),
                      area.code = c("area_code_510"),
                      intl.plan = c("yes"),
                      voice.plan = c("no"))
predict(mod_1, newdata = new_data, type = "prob")
Now I restart R, set new reference levels, and rerun all the code. This is the output:
# set new reference levels
churn$state <- relevel(churn$state, ref = "OR")
churn$area.code <- relevel(churn$area.code, ref = "area_code_415")
churn$intl.plan <- relevel(churn$intl.plan, ref = "no")
churn$voice.plan <- relevel(churn$voice.plan, ref = "yes")
Tuning parameter 'alpha' was held constant at a value of 0
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were alpha = 0 and lambda = 0.62.
I expected the lambdas to change, but I did not expect the confusion matrix or classification probabilities to change. Is this normal?
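For intuition, here is a minimal sketch (assuming only base R) of why a penalized fit is sensitive to the reference level: relevel() changes which dummy columns the model matrix contains, and ridge shrinks each of those columns' coefficients toward zero, so, unlike an unpenalized GLM, the fit is not invariant to that reparameterization.
f <- factor(c("a", "b", "c"))
model.matrix(~ f)   # dummy columns relative to reference level "a"
f2 <- relevel(f, ref = "c")
model.matrix(~ f2)  # different columns encoding the same information
# An unpenalized glm() gives identical predictions under either coding; a
# penalized fit at a fixed lambda generally does not, so the selected lambda,
# the confusion matrix, and the predicted probabilities can all shift.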


Error in checkMeasures(measures, learner) : object 'fbeta' not found

I am doing an imbalanced classification task, so I want to use F-beta as the performance measure. I used library(mlr) and set measures = fbeta, as follows:
library(mlr)
#create tasks
## Create combined training data
train_data <- cbind(x_train, y_train)
valid_data <- cbind(x_valid,y_valid)
train_task_data <- rbind(train_data, valid_data)
size <- nrow(train_task_data)
train_ind <- seq_len(nrow(train_data))
validation_ind <- seq.int(max(train_ind) + 1, size)
## Create training task
train_task <- makeClassifTask(data = train_task_data, target = "DEFAULT", positive = 1)
testtask <- makeClassifTask(data = cbind(x_test,y_test),target = "DEFAULT")
#create learner
lrn <- makeLearner("classif.xgboost",predict.type = "response") ##predict.type = "prob"
lrn$par.vals <- list( objective="binary:logistic", eval_metric="logloss", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet(makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
                       makeIntegerParam("max_depth", lower = 9L, upper = 10L),
                       makeNumericParam("min_child_weight", lower = 9L, upper = 10L),
                       makeNumericParam("subsample", lower = 0.9, upper = 1),
                       makeNumericParam("colsample_bytree", lower = 0.9, upper = 1))
#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)
#set parallel backend
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = TRUE)
#parameter tuning
#set hyperparameters
lrn_tune <- setHyperPars(lrn,par.vals = mytune$x)
#train model
xgmodel <- train(learner = lrn_tune,task = train_task)
#predict model
xgpred <- predict(xgmodel,testtask)
confusionMatrix(xgpred$data$response,xgpred$data$truth)
However, this error is reported:
Error in checkMeasures(measures, learner) : object 'fbeta' not found
Also, my dataset contains 150,000 instances, but the counts in the computed confusion matrix sum to fewer than 150,000.
> confusionMatrix(xgpred$data$response, xgpred$data$truth)
     [,1]   [,2]
[1,]    0      0
[2,]    0 149887
Update: My function to calculate the F-score is as follows, but I am not sure about it.
fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
  properties = c("classif", "req.pred", "req.truth"),
  name = "Fbeta measure",
  note = "Defined as: (1+beta^2) * tp / (beta^2 * sum(truth == positive) + sum(response == positive))",
  fun = function(task, model, pred, feats, extra.args) {
    beta = 1
    beta = beta^2
    truth = pred$data$truth
    response = pred$data$response
    positive = pred$task.desc$positive
    (1 + beta) * measureTP(truth, response, positive) /
      (beta * sum(truth == positive) + sum(response == positive))
  }
)
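The error itself just means that no object named fbeta existed in the session when tuneParams() was called, so the custom measure above must be defined before that call. As a sanity check (a sketch, assuming the objects from the question are in scope), the measure can be evaluated on an existing prediction object with mlr's performance():
performance(xgpred, measures = fbeta)
# compare against mlr's built-in f1, since the definition above uses beta = 1
performance(xgpred, measures = f1)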

Formula Versus Non-Formula Interface for Categorical Variables with train() and glmnet

I am comparing the confusion matrix between the formula interface and the non-formula interface using caret's train() for elastic net. I am trying to understand why the two interfaces produce different confusion matrices. I understand that the formula interface will decompose the categorical variables into dummies and the model will have more coefficients.
First consider the formula interface model:
library(liver)
library(caret)
library(glmnet)
library(dplyr)
data(churn)
head(churn)
set.seed(1)
train.index <- createDataPartition(churn$churn, p = 0.8, list = FALSE)
train_churn <- churn[train.index,]
test_churn <- churn[-train.index,]
# add class weights
my_weights = train_churn %>%
  select(churn) %>%
  group_by(churn) %>%
  count()
weight_for_yes = (1 / my_weights$n[1]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_yes
weight_for_no = (1 / my_weights$n[2]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_no
model_weights <- ifelse(train_churn$churn == "yes", weight_for_yes, weight_for_no)
myGrid <- expand.grid(
  alpha = 0,
  lambda = 0.1
)
#----------------- formula interface
set.seed(1)
mod_1 <- train(churn ~
                 state +
                 area.code +
                 intl.plan,
               data = train_churn,
               method = "glmnet",
               tuneGrid = myGrid,
               weights = model_weights)
prediction <- predict(mod_1, newdata = test_churn[,-20])
confusionMatrix(prediction, test_churn$churn)
Now consider the non-formula interface model:
predictors <- train_churn %>%
  select(state,
         area.code,
         intl.plan) %>%
  data.matrix()
response <- train_churn$churn
set.seed(1)
mod_2 <- train(x = predictors,
               y = response,
               method = "glmnet",
               tuneGrid = myGrid,
               weights = model_weights)
Is the disparity due to formula versus non-formula, or is this an artifact of elastic net?
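One way to see where the disparity comes from, using the objects defined above: data.matrix() does not build dummy variables at all; it replaces each factor with its internal integer codes, so each categorical predictor enters the model as a single numeric column with an arbitrary ordering of the levels. A quick illustration:
head(model.matrix(~ state + area.code + intl.plan, data = train_churn))  # one dummy per non-reference level
head(data.matrix(train_churn[, c("state", "area.code", "intl.plan")]))   # one integer-coded column per factor
So the two interfaces feed glmnet genuinely different design matrices, independent of the elastic-net penalty.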

Tuning ridge regression with tidymodels using nested resampling

I want to tune a ridge regression using tidymodels. I have looked at this nested resampling tutorial, but I am not sure how to extend the tuning from one to two hyperparameters. Please see the example below:
Example data:
library("mlbench")
sim_data <- function(n) {
  tmp <- mlbench.friedman1(n, sd = 1)
  tmp <- cbind(tmp$x, tmp$y)
  tmp <- as.data.frame(tmp)
  names(tmp)[ncol(tmp)] <- "y"
  tmp
}
set.seed(9815)
train_dat <- sim_data(50)
Setting inner and outer folds:
library(tidymodels)
results_nested_resampling <- rsample::nested_cv(train_dat,
                                                outside = vfold_cv(v = 10, repeats = 1),
                                                inside = vfold_cv(v = 10, repeats = 1))
The function to fit the model and compute the RMSE works:
svm_rmse <- function(object, penalty = 1, mixture = 1) {
  y_col <- ncol(object$data)
  mod <-
    parsnip::linear_reg(penalty = penalty, mixture = mixture) %>% # tune() uses the grid
    parsnip::set_engine("glmnet") %>%
    fit(y ~ ., data = analysis(object))
  holdout_pred <-
    predict(mod, assessment(object) %>% dplyr::select(-y)) %>%
    bind_cols(assessment(object) %>% dplyr::select(y))
  rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}
# In some cases, we want to parameterize the function over the tuning parameters:
rmse_wrapper <- function(penalty, mixture, object) svm_rmse(object, penalty, mixture)
# testing rmse_wrapper
rmse_wrapper(penalty=0.1, mixture=0.1, object=results_nested_resampling$inner_resamples[[5]]$splits[[1]])
But the function to tune over the two hyperparameters does not work:
tune_over_cost <- function(object) {
  glmn_grid <- base::expand.grid(
    penalty = 10^seq(-3, -1, length = 20),
    mixture = (0:5) / 5)
  df3_glmn_grid %>%
    mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
}
tune_over_cost(object=results_nested_resampling$inner_resamples[[5]])
Thanks in advance.
Try using map2_dbl instead of map_dbl: map_dbl() iterates over a single vector, while map2_dbl() iterates over two vectors in parallel, which is what tuning over two hyperparameters requires.
That is, change this line of code:
mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
to this line:
mutate(RMSE = map2_dbl(penalty, mixture, rmse_wrapper, object = object))
(Inside mutate(), the bare names penalty and mixture refer to the columns of the data frame being piped in. Note also that df3_glmn_grid in the question appears to be a typo for glmn_grid.)
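Putting the pieces together, a corrected version of the function might look like this; as in the earlier test of rmse_wrapper, object should be a single split rather than a whole set of inner resamples:
tune_over_cost <- function(object) {
  glmn_grid <- base::expand.grid(
    penalty = 10^seq(-3, -1, length = 20),
    mixture = (0:5) / 5)
  glmn_grid %>%
    mutate(RMSE = map2_dbl(penalty, mixture, rmse_wrapper, object = object))
}
tune_over_cost(object = results_nested_resampling$inner_resamples[[5]]$splits[[1]])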

How can I train a glmnet model (Poisson family) with an offset term using the caret package in R?

I want to model insurance claim count using a Poisson glmnet. The data I have at hand contains the number of claims for each policy (which is the response variable), some features about the policy (gender, region, etc.) as well as the duration of the policy (in years). I want to include the log-duration as an offset term, as we usually do in actuarial science. With the cv.glmnet function of the glmnet package, it is straightforward:
library(tidyverse)
library(glmnet)
n <- 100
dat <- tibble(
  nb_claims = rpois(n, lambda = 0.5),
  duration = runif(n),
  x1 = runif(n),
  x2 = runif(n),
  x3 = runif(n)
)
fit <- cv.glmnet(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  family = "poisson",
  offset = dat %>% pull(duration) %>% log()
)
fit
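Note that predicting from this fit likewise requires the offset, supplied through the newoffset argument; this is exactly the argument that caret later fails to pass along:
predict(fit,
        newx = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
        newoffset = dat %>% pull(duration) %>% log(),
        type = "response")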
However, my goal is to train this model using the train function of the caret package, because of the many advantages it offers: validation, preprocessing, and feature selection are much more convenient with this package. It is straightforward to train a basic glmnet (without an offset term) with caret:
library(caret)
fit <- caret::train(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  method = "glmnet",
  family = "poisson"
)
fit
Naively, we could try to add the offset argument to the train function:
fit <- caret::train(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  method = "glmnet",
  family = "poisson",
  offset = dat %>% pull(duration) %>% log()
)
fit
Unfortunately, this code throws the error Error : No newoffset provided for prediction, yet offset used in fit of glmnet. This error occurs because caret::train does not supply a value for the newoffset argument when it calls predict.glmnet.
In this book, they show how to add an offset term to a GLM model by modifying the source code of the caret::train function. It works perfectly. However, the predict.glm function is quite different from the predict.glmnet function, because predict.glm does not have a newoffset argument. I tried to modify the source code of caret::train, but I am having some trouble because I do not know well enough how this function works.
A simple way to perform this is to pass the offset column as part of x: in each fit and predict call, the columns of x that are not the offset are passed as x, while the offset column itself is passed as offset/newoffset.
In the following example the offset column of x needs to be named "offset" too. This can be changed relatively easily.
To create the function we will just use lots of parts from: https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet is peculiar since it needs a loop; the rest is just rinse and repeat from https://topepo.github.io/caret/using-your-own-model-in-train.html#illustrative-example-1-svms-with-laplacian-kernels
family = "poisson" will be specified throughout; to change this, adapt the code from https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet_offset <- list(type = "Regression",
                      library = c("glmnet", "Matrix"),
                      loop = function(grid) {
                        ## glmnet fits the whole lambda path at once, so for each
                        ## alpha keep only the largest lambda as the "main" model
                        ## and score the remaining lambdas as submodels
                        alph <- unique(grid$alpha)
                        loop <- data.frame(alpha = alph)
                        loop$lambda <- NA
                        submodels <- vector(mode = "list", length = length(alph))
                        for(i in seq(along = alph)) {
                          np <- grid[grid$alpha == alph[i], "lambda"]
                          loop$lambda[loop$alpha == alph[i]] <- np[which.max(np)]
                          submodels[[i]] <- data.frame(lambda = np[-which.max(np)])
                        }
                        list(loop = loop, submodels = submodels)
                      })
glmnet_offset$parameters <- data.frame(parameter = c('alpha', 'lambda'),
                                       class = c("numeric", "numeric"),
                                       label = c('Mixing Percentage', 'Regularization Parameter'))
glmnet_offset$grid <- function(x, y, len = NULL, search = "grid") {
  if(search == "grid") {
    init <- glmnet::glmnet(Matrix::as.matrix(x[, colnames(x) != "offset"]), y,
                           family = "poisson",
                           nlambda = len + 2,
                           alpha = .5,
                           offset = x[, colnames(x) == "offset"])
    lambda <- unique(init$lambda)
    lambda <- lambda[-c(1, length(lambda))]
    lambda <- lambda[1:min(length(lambda), len)]
    out <- expand.grid(alpha = seq(0.1, 1, length = len),
                       lambda = lambda)
  } else {
    out <- data.frame(alpha = runif(len, min = 0, 1),
                      lambda = 2^runif(len, min = -10, 3))
  }
  out
}
So x[, colnames(x) != "offset"] plays the role of x, while x[, colnames(x) == "offset"] is the offset.
glmnet_offset$fit <- function(x, y, wts, param, last, ...) {
  theDots <- list(...)
  ## pass in any model weights
  if(!is.null(wts)) theDots$weights <- wts
  if(!(class(x)[1] %in% c("matrix", "sparseMatrix")))
    x <- Matrix::as.matrix(x)
  modelArgs <- c(list(x = x[, colnames(x) != "offset"],
                      y = y,
                      alpha = param$alpha,
                      family = "poisson",
                      offset = x[, colnames(x) == "offset"]),
                 theDots)
  out <- do.call(glmnet::glmnet, modelArgs)
  if(!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
  out
}
glmnet_offset$predict <- function(modelFit, newdata, submodels = NULL) {
  if(!is.matrix(newdata)) newdata <- Matrix::as.matrix(newdata)
  out <- predict(modelFit,
                 newdata[, colnames(newdata) != "offset"],
                 s = modelFit$lambdaOpt,
                 newoffset = newdata[, colnames(newdata) == "offset"],
                 type = "response") # important for measures to be appropriate
  if(is.matrix(out)) out <- out[, 1]
  if(!is.null(submodels)) {
    tmp <- as.list(as.data.frame(predict(modelFit,
                                         newdata[, colnames(newdata) != "offset"],
                                         s = submodels$lambda,
                                         newoffset = newdata[, colnames(newdata) == "offset"],
                                         type = "response"),
                                 stringsAsFactors = TRUE))
    out <- c(list(out), tmp)
  }
  out
}
For some reason, which I don't understand yet, it does not work without the prob slot:
glmnet_offset$prob <- glmnet_offset$predict
glmnet_offset$tags = c("Generalized Linear Model", "Implicit Feature Selection",
                       "L1 Regularization", "L2 Regularization", "Linear Classifier",
                       "Linear Regression")
glmnet_offset$sort = function(x) x[order(-x$lambda, x$alpha),]
glmnet_offset$trim = function(x) {
  x$call <- NULL
  x$df <- NULL
  x$dev.ratio <- NULL
  x
}
library(tidyverse)
library(caret)
library(glmnet)
n <- 100
set.seed(123)
dat <- tibble(
  nb_claims = rpois(n, lambda = 0.5),
  duration = runif(n),
  x1 = runif(n),
  x2 = runif(n),
  x3 = runif(n)
)
x = dat %>%
  dplyr::select(-nb_claims) %>%
  mutate(offset = log(duration)) %>%
  dplyr::select(-duration) %>%
  as.matrix
fit <- caret::train(
  x = x,
  y = dat %>% pull(nb_claims),
  method = glmnet_offset
)
fit
100 samples
4 predictor
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 100, 100, 100, 100, 100, 100, ...
Resampling results across tuning parameters:
alpha lambda RMSE Rsquared MAE
0.10 0.0001640335 0.7152018 0.01805762 0.5814200
0.10 0.0016403346 0.7152013 0.01805684 0.5814193
0.10 0.0164033456 0.7130390 0.01798125 0.5803747
0.55 0.0001640335 0.7151988 0.01804917 0.5814020
0.55 0.0016403346 0.7150312 0.01802689 0.5812936
0.55 0.0164033456 0.7095996 0.01764947 0.5783706
1.00 0.0001640335 0.7152033 0.01804795 0.5813997
1.00 0.0016403346 0.7146528 0.01798979 0.5810811
1.00 0.0164033456 0.7063482 0.01732168 0.5763653
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.01640335.
predict(fit$finalModel, x[,1:3], newoffset = x[,4]) #works
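Prediction through caret's predict.train interface should also work here (an assumption based on the predict slot above, which strips the offset column from newdata itself):
predict(fit, newdata = x)  # the slot separates the "offset" column internally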
This will not work with preprocessing in caret, since we pass the offset as one of the features. However, it will work with recipes, since you can define the columns on which preprocessing functions are performed via selections. See this article for details: https://tidymodels.github.io/recipes/articles/Selecting_Variables.html
I haven't had time to error-check my code. If any problems occur or if there is a mistake somewhere, please comment. Thanks.
You can also post an issue on the caret GitHub asking for this feature (offset/newoffset) to be added to the model.
I tried to change the model info in a lot of ways, but it was failing miserably. Below I propose one solution; it may not be the best, but it will get you somewhere if your data is sensible.
In Poisson / negative binomial regression, the offset enters the linear predictor with a fixed coefficient of 1; you can read more here and here:
log(E[Y | x]) = beta_0 + beta_1 x_1 + ... + beta_p x_p + log(t_x)
where t_x is the offset (the exposure). In glmnet, there is a penalty factor you can introduce for each term, and if you set that factor to 0 for a term, you are not penalizing it and it is always included. We can use that for the offset, and you can see this effect only if you use a dataset that makes some sense (note that in your example dataset, the offsets are numbers that make no sense).
Below I use the insurance claims dataset from MASS:
library(tidyverse)
library(glmnet)
library(MASS)
dat <- Insurance
X = model.matrix(Claims ~ District + Group + Age,data=dat)
Y = dat$Claims
OFF = log(dat$Holders)
fit_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "poisson",
  offset = OFF
)
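To see the trick in isolation before wiring it into caret (a sketch of the same idea in plain glmnet): append the log-exposure as an ordinary column and give it a penalty factor of 0, so it is never penalized and always included. Note that, unlike a true offset, this estimates a coefficient for log(Holders) rather than fixing it at 1.
fit_pf <- glmnet(x = cbind(X, OFF), y = Y, family = "poisson",
                 penalty.factor = c(rep(1, ncol(X)), 0))  # 0 = leave OFF unpenalized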
Now, using caret, I will fit it without any tuning, using the same lambda obtained from the cv.glmnet fit. One thing you should note too is that cv.glmnet by default uses lambda.1se rather than lambda.min for prediction:
fit_c <- caret::train(
  x = cbind(X, OFF),
  y = Y,
  method = "glmnet",
  family = "poisson",
  tuneGrid = data.frame(lambda = fit_cv$lambda.1se, alpha = 1),
  penalty.factor = c(rep(1, ncol(X)), 0),  # passed through to glmnet; 0 leaves OFF unpenalized
  trControl = trainControl(method = "none")
)
We can see how different the predictions are:
p1 = predict(fit_cv, newx = X, newoffset = OFF)
p2 = predict(fit_c, newdata = cbind(X, OFF))
plot(p1, p2)

How to bootstrap Mixed-Effects Model in R

I have a data set (df) in this format:
index <- runif(n = 100,min = 0, max = 1)
type1 <- rep("low", 50)
type2 <- rep("high", 50)
type <- c(type1,type2)
level1 <- rep("single", 25)
level2 <- rep("multiple", 25)
level3 <- rep("single", 25)
level4 <- rep("multiple", 25)
level <- c(level1,level2,level3,level4)
block <- rep(1:5, 10)
set <- rep(1:5, 10)
df <- data.frame("index" = index,"type" = type, "level" = level, "block" = block, "set" = set)
df$block <- as.factor(df$block)
df$set <- as.factor(df$set)
I want to create a model that looks like this:
library(lme4)
model <- lmer(index ~ type * level + (1|block) + (1|set), data = df)
However, in my original data the fit is bad because the response is bounded between 0 and 1. I want to bootstrap this mixed-effects model. Any ideas on how to achieve bootstrapping for such a model? I want to compare this full model with sub-models, e.g. without the interaction, or with level or type alone. I also want confidence intervals for the final model.
The confint() function has a method for merMod objects. The following should work:
confint(model, method = "boot", nsim = 1000)
And with multiple CPUs:
confint(model, method = "boot", nsim = 1000,
        parallel = "multicore", ncpus = 8)
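For the model comparisons mentioned in the question, a parametric bootstrap is one option. A sketch, assuming the pbkrtest package is installed:
library(lme4)
library(pbkrtest)
full   <- lmer(index ~ type * level + (1|block) + (1|set), data = df)
no_int <- lmer(index ~ type + level + (1|block) + (1|set), data = df)
# bootstrap distribution of any statistic of the fitted model,
# here the fixed effects of the full model
b <- bootMer(full, FUN = fixef, nsim = 1000)
# parametric-bootstrap comparison of the nested models (tests the interaction)
PBmodcomp(full, no_int, nsim = 1000)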
