I'm trying to create a custom GBM model that tunes the classification threshold for a binary classification problem. There is a nice example provided on the caret website here, but when I try to apply something similar to GBM I receive the following error:
Error in { : task 1 failed - "argument 1 is not a vector"
Unfortunately, I have no idea where the error is and the error isn't very helpful.
Here's an example, with the code that I've used for defining the custom GBM
library(caret)
library(gbm)
library(pROC)
#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####
## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")
## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
"n.minobsinnode", "threshold"),
class = rep("numeric", 5),
label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
"Min. Terminal Node Size", "Probability Cutoff"))
## Customise the tuning grid:
## Some paramters are fixed. Will give a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
if (search == "grid") {
grid <- expand.grid(n.trees = seq(50, 250, 50),
interaction.depth = 2, ### fix interaction depth at 2
shrinkage = 0.0001, ### fix learning rate at 0.0001
n.minobsinnode = seq(2, 10, 2),
threshold = seq(.01, .99, length = len))
} else {
grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
interaction.depth = sample(1:10, replace = TRUE, size = len),
shrinkage = runif(len, min = .001, max = .6),
n.minobsinnode = sample(5:25, replace = TRUE, size = len),
threshold = runif(1, 0, size = len))
grid <- grid[!duplicated(grid),] ### remove any duplicated rows in the training grid
}
grid
}
## Here we fit a single gbm model and loop over the threshold values to get predictions from the
## same gbm model.
customGBM$loop = function(grid) {
library(plyr)
loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
function(x) c(threshold = max(x$threshold)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$threshold)) {
index <- which(grid$n.trees == loop$n.trees[i] &
grid$interaction.depth == loop$interaction.depth[i] &
grid$shrinkage == loop$shrinkage[i] &
grid$n.minobsinnode == loop$n.minobsinnode[i])
cuts <- grid[index, "threshold"]
submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
}
list(loop = loop, submodels = submodels)
}
## Fit the model independent of the threshold parameter
customGBM$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
if (any(names(theDots) == "distribution")) {
modDist <- theDots$distribution
theDots$distribution <- NULL
} else {
if (is.numeric(y)) {
stop("This works only for 2-class classification problems")
} else modDist <- if (length(lev) == 2) "bernoulli" else
stop("This works only for 2-class classification problems")
}
# if (length(levels(y)) != 2)
# stop("This works only for 2-class problems")
## check to see if weights were passed in (and availible)
if (!is.null(wts)) theDots$w <- wts
if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
modArgs <- list(x = x,
y = y,
interaction.depth = param$interaction.depth,
n.trees = param$n.trees,
shrinkage = param$shrinkage,
n.minobsinnode = param$n.minobsinnode,
distribution = modDist)
do.call("gbm.fit", modArgs)
}
## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
type = "response")#[, modelFit$obsLevels[1]]
out[is.nan(out)] <- NA
class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
## Raise the threshold for class #1 and a higher level of
## evidence is needed to call it class 1 so it should
## decrease sensitivity and increase specificity
out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
if (!is.null(submodels)) {
tmp2 <- out
out <- vector(mode = "list", length = length(submodels$threshold))
out[[1]] <- tmp2
for (i in seq(along = submodels$threshold)) {
out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
modelFit$obsLevels[1],
modelFit$obsLevels[2])
}
}
out
}
## The probabilities are always the same but we have to create
## mulitple versions of the probs to evaluate the data across
## thresholds
customGBM$prob = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata, type = "response",
n.trees = modelFit$tuneValue$n.trees)
out[is.nan(out)] <- NA
out <- cbind(out, 1 - out)
colnames(out) <- modelFit$obsLevels
if (!is.null(submodels)) {
tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
tmp <- as.list(as.data.frame(tmp))
lapply(tmp, function(x, lvl) {
x <- cbind(x, 1 - x)
colnames(x) <- lvl
x}, lvl = modelFit$obsLevels)
out <- c(list(out), tmp)
}
out
}
fourStats <- function (data, lev = levels(data$obs), model = NULL) {
## This code will get use the area under the ROC curve and the
## sensitivity and specificity values using the current candidate
## value of the probability threshold.
out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
## The best possible model has sensitivity of 1 and specificity of 1.
## How far are we from that value?
coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
ncol = 2,
byrow = TRUE)
colnames(coords) <- c("Spec", "Sens")
rownames(coords) <- c("Best", "Current")
c(out, Dist = dist(coords)[1])
}
And then some code showing how to use the custom model
set.seed(949)
trainingSet <- twoClassSim(500, -9)
mod1 <- train(Class ~ ., data = trainingSet,
method = customGBM, metric = "Dist",
maximize = FALSE, tuneLength = 10,
trControl = trainControl(method = "cv", number = 5,
classProbs = TRUE,
summaryFunction = fourStats))
The model appears to run, but finishes with the error from above. If someone could please help me with customising the GBM model to tune the GBM parameters, and the probability threshold for the classes that would be great.
Related
I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models
by Max Kuhn and Kjell Johnson. I copied the code from here https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R
I keep getting this error, "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set. Also, I reduced the iterations from 15 to 1 to increase the speed.
library(caret)
library(liver)
data(churn)
head(churn)
set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
list = FALSE,
times = 1)
train <- churn[ trainIndex,]
test <- churn[-trainIndex,]
# ------------------------------------------------------------------------------
many_stats <-
function(data, lev = levels(data$obs), model = NULL) {
c(
twoClassSummary(data = data, lev = levels(data$obs), model),
prSummary(data = data, lev = levels(data$obs), model),
mnLogLoss(data = data, lev = levels(data$obs), model),
defaultSummary(data = data, lev = levels(data$obs), model)
)
}
# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats
ga_funcs$initial <- function(vars, popSize, ...) {
x <- matrix(NA, nrow = popSize, ncol = vars)
probs <- seq(0.1, 0.90, length = popSize)
for (i in 1:popSize) {
x[i, ] <-
sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
}
var_count <- apply(x, 1, sum)
if (any(var_count == 0)) {
for (i in which(var_count == 0)) {
p <- sample(1:length(vars), size = 2)
x[i, p] <- 1
}
}
x
}
ctrl_rs <- trainControl(
method = "LGOCV",
p = 0.90,
number = 1,
summaryFunction = many_stats,
classProbs = TRUE,
allowParallel = FALSE
)
ga_ctrl <- gafsControl(
method = "cv",
metric = c(internal = "ROC", external = "ROC"),
maximize = c(internal = TRUE, external = TRUE),
functions = ga_funcs,
returnResamp = "all",
verbose = TRUE
)
options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)
set.seed(325)
gen_algo <- gafs(
x = train[,-20],
y = train$churn,
data = train,
iters = 1,
gafsControl = ga_ctrl,
method = "nb",
tuneGrid = nb_grid,
trControl = ctrl_rs,
metric = "ROC"
)
The code specifies, "replace = TRUE", but clearly I am missing something. Any help is greatly appreciated!
Thanks!
The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine.
The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
# make consistent factor levels
if(any(sapply(x, is.factor))){
fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])],
levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry, ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),
data = dat,
mtry = mtry), # change controls to mtry?
theDots)
out <- do.call(partykit::cforest, modelArgs)
out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit#responses#levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type="prob) ? id?
probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
levels = function(x) levels(x#data#get("response")[,1]),
sort = function(x) x[order(x[,1]),],
oob = function(x) {
obs <- x#data#get("response")[,1]
pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable
model_train.design.data <- cbind(y_column = data$y_column, model_train.design.matrix)
I am trying to get the below piece of code to work - which performs multiple regression with different stepwise methods i.e. backward, forward, exhaustive, seqrep, but it is returning an error at the method = stepwise line.
# cross-validation
predict.regsubsets <- function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefs = coef(object, id = id)
xvars = names(coefs)
mat[, xvars] %*% coefs
}
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted) ^ 2))
}
cv <- function(stepwise, folds) {
num_folds = folds
set.seed(1)
folds = caret::createFolds(mtcars$mpg, k = num_folds)
fold_error = matrix(0, nrow = num_folds, ncol = num_vars,
dimnames = list(paste(1:num_folds), paste(1:num_vars)))
for(j in 1:num_folds) {
train_fold = mtcars[-folds[[j]], ]
validate_fold = mtcars[ folds[[j]], ]
best_fit = regsubsets(mpg ~ ., data = mtcars, nvmax = 10, method = stepwise)
for (i in 1:num_vars) {
pred = predict(best_fit, validate_fold, id = i)
fold_error[j, i] = rmse(actual = validate_fold$mpg,
predicted = pred)
}
}
cv_error = apply(fold_error, 2, mean)
par(mfrow=c(1,1))
plot(cv_error, type = 'l')
cv_min = which.min(cv_error)
points(cv_min, cv_error[cv_min], col = "red", cex = 2, pch = 20)
}
cv("backward",5)
What have I done wrong?
I'm trying to perform recursive feature elimination using the rfe function but I'm having a bit of trouble trying to change the performance measure to output the ROC:
newFunc <- caretFuncs
newFunc$summary <- twoClassSummary
ctrl <- rfeControl(functions = newFunc,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(predictors, response,
sizes = c(1),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'ROC',
rfeControl = ctrl)
Trying to run this code is giving me the following error:
Error in { : task 1 failed - "undefined columns selected"
If I remove the custom newFunc, set the functions parameter inside rfeControl to use caretFuncs and remove the metric parameter from rfe, the model works fine. This makes me think there's something wrong with the summary.
caretFuncs$summary:
function (data, lev = NULL, model = NULL)
{
if (is.character(data$obs))
data$obs <- factor(data$obs, levels = lev)
postResample(data[, "pred"], data[, "obs"])
}
twoClassSummary
function (data, lev = NULL, model = NULL)
{
lvls <- levels(data$obs)
if (length(lvls) > 2)
stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate."))
requireNamespaceQuietStop("ModelMetrics")
if (!all(levels(data[, "pred"]) == lvls))
stop("levels of observed and predicted data do not match")
data$y = as.numeric(data$obs == lvls[2])
rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0,
1), data[, lvls[1]])
out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"],
lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out) <- c("ROC", "Sens", "Spec")
out
}
The output to postResample and twoClassSummary are identical in their structures so I'm a little lost as to what this problem is. Am I doing something inherently wrong here or is this a bug that I need to flag to the devs?
I'm actually interested in obtaining the logLoss so I could write my own function:
logLoss = function(data, lev = NULL, model = NULL) {
-1*mean(log(data[, 'pred'][model.matrix(~ as.numeric(data[, 'obs'], levels = lev) + 0) - data[, 'pred'] > 0]))
}
But, I'm a little unsure how to convert the factor levels into the correct [0,1] from my [yes, no] factor?
First of all here is a viable logloss function for use with caret:
LogLoss <- function (data, lev = NULL, model = NULL)
{
obs <- data[, "obs"]
cls <- levels(obs) #find class names
probs <- data[, cls[2]] #use second class name
probs <- pmax(pmin(as.numeric(probs), 1 - 1e-15), 1e-15) #bound probability
logPreds <- log(probs)
log1Preds <- log(1 - probs)
real <- (as.numeric(data$obs) - 1)
out <- c(mean(real * logPreds + (1 - real) * log1Preds)) * -1
names(out) <- c("LogLoss")
out
}
to answer the question how to convert the factor levels into the correct [0,1] from my [yes, no] factor:
real <- (as.numeric(data$obs) - 1)
to get rfe to work you can use rfFuncs instead of caretFuncs. Example:
rfFuncs$summary <- twoClassSummary
ctrl <- rfeControl(functions = rfFuncs,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(Sonar[,1:60], Sonar$Class,
sizes = c(1, 5, 20, 40, 60),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'ROC',
rfeControl = ctrl)
profiler$results
Variables ROC Sens Spec ROCSD SensSD SpecSD
1 1 0.6460027 0.6387987 0.5155187 0.08735968 0.132008571 0.007516016
2 5 0.7563971 0.6847403 0.7013180 0.03751483 0.008724045 0.039383924
3 20 0.8633511 0.8462662 0.7017432 0.08460677 0.091143309 0.097708207
4 40 0.8841540 0.8642857 0.7429847 0.08096697 0.090913729 0.098309489
5 60 0.8945351 0.9004870 0.7431973 0.05707867 0.064971175 0.127471631
or with the LogLoss function I provided:
rfFuncs$summary <- LogLoss
ctrl <- rfeControl(functions = rfFuncs,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(Sonar[,1:60], Sonar$Class,
sizes = c(1, 5, 20, 40, 60),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'LogLoss',
rfeControl = ctrl,
maximize = FALSE) #this was edited after the answer of Дмитрий Пасько)
profiler$results
Variables LogLoss LogLossSD
1 1 1.8237372 1.030120134
2 5 0.5548774 0.128704686
3 20 0.4226522 0.021547998
4 40 0.4167819 0.013587892
5 60 0.4328718 0.008000892
EDIT: Дмитрий Пасько raises a valid concern in his answer - LogLoss should be minimized. One way to achieve this is to provide the logical argument maximize telling caret should the metric be minimized or maximized.
but u should minimize logLoss, thus use this code (example with logistic regression https://www.kaggle.com/demetrypascal/rfe-logreg-with-pca-and-feature-importance):
LogLoss <- function (data, lev = NULL, model = NULL)
{
obs <- data[, "obs"]
cls <- levels(obs) #find class names
probs <- data[, cls[2]] #use second class name
probs <- pmax(pmin(as.numeric(probs), 1 - 1e-15), 1e-15) #bound probability
logPreds <- log(probs)
log1Preds <- log(1 - probs)
real <- (as.numeric(data$obs) - 1)
out <- c(mean(real * logPreds + (1 - real) * log1Preds)) * -1
names(out) <- c("LogLossNegative")
-out
}
lrFuncs$summary <- LogLoss
rfec = rfeControl(method = "cv",
number = 2,
functions = lrFuncs)
I want to mimic a multinomial logit model using separate logistic regressions and cross validating them via caret. In the non-CV world, I want to achieve the following:
# Create Data-Set
library(data.table)
library(dplyr)
N <- 1000
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N, 30, 8)
X3 <- rnorm(N,0,1)
length <- sample(0:5,N,T)
Ycont <- 0.5*X1 - 0.3*X2 +0.01*X3 + 10 + rnorm(N, 0, 6)
# create 3 categories
Ycateg <- ntile(Ycont,3)
df <- data.frame(id=1:N,length,X1, X2,X3, Ycateg)
df=setDT(df)[,.SD[rep(1L,length)],by = id]
df=df[ , time := 1:.N , by=id]
df=df[,-c("length")]
df$Ycateg=ifelse(df$Ycateg==1,"type1",ifelse(df$Ycateg==2,"type2","type0"))
head(df)
# aim of the model without CV - combine logit regressions
y_1=ifelse(df$Ycateg=="type1",1,0)
y_2=ifelse(df$Ycateg=="type2",1,0)
#drop the ID column
dat_model=df[,-1]
# fit the models
fit_1=glm(formula=y_1~. ,family=binomial(link='logit'),data=dat_model,control = list(maxit = 50))
fit_2=glm(formula=y_2~. ,family=binomial(link='logit'),data=dat_model,control = list(maxit = 50))
# predict
p_1=predict(fit_1,type = "response")
p_2=predict(fit_2,type = "response")
p_0=1-p_1-p_2
head(cbind(p_0,p_1,p_2))
p_0 p_1 p_2
1 1.000000e+00 2.220446e-16 2.220446e-16
2 0.000000e+00 2.220446e-16 1.000000e+00
3 4.930381e-32 1.000000e+00 2.220446e-16
4 4.930381e-32 1.000000e+00 2.220446e-16
So what I need, is to build my own model in the caret-framework to replicate the model above. What I have done so far is:
#Extend Caret
customLogit <- list(type = "Classification", library = "stats", loop = NULL)
customLogit$parameters =data.frame(parameter = c("decay"), class = c("numeric"), label = c("decay"))
customLogit$grid = function(x, y, len = NULL, search = "grid") { }
customLogit$fit <- function(x,y, ...) {
y_1=ifelse(df$Ycateg=="type1",1,0)
y_2=ifelse(df$Ycateg=="type2",1,0)
fit_1=glm(formula=y_1~. ,family=binomial(link='logit'),control = list(maxit = 50),...)
fit_2=glm(formula=y_2~. ,family=binomial(link='logit'),control = list(maxit = 50),...)
out = vector("list",2)
out[[1]]=fit_1
out[[2]]=fit_2
return(out)
}
customLogit$predict <- function(modelFit_all, newdata, preProc = NULL, submodels = NULL,...) {
p_1=predict(modelFit_all[[1]],newdata=newdata,...)
p_2=predict(modelFit_all[[2]],newdata=newdata,...)
p_0=ifelse(p_1==0 & p_2==0,1,0)
out=cbind(p_0,p_1,p_2)
return(out)
}
customLogit$prob <- function(modelFit_all, newdata, preProc = NULL, submodels = NULL) {
p_1=predict(modelFit_all[[1]],newdata=newdata,type="response",...)
p_2=predict(modelFit_all[[2]],newdata=newdata,type="response",...)
p_0=1- p_1-p_0
out=cbind(p_cur,p_def,p_pre)
return(out)
}
customLogit$sort <- NULL
customLogit$levels <- function(x) x$classes
# which type of cross validation to do
fitControl <- trainControl(method = 'cv',number=5,classProbs=TRUE,summaryFunction=defaultSummary, selectionFunction = "best", savePredictions = TRUE)
# tuning parameters
grid <- expand.grid(decay = 0 )
cv=train(as.factor(Ycateg)~.,
data = dat_model,
method = customLogit,
trControl = fitControl,
tuneGrid = grid,
)
Sadely, I could not bring the code to work and it throws me the error:
Error in train.default(x, y, weights = w, ...) :
argument is missing, with no default
I assume that the problem is the decay parameter, but as I understood, one cannot "tune" the logistic regression model using glm, such that I don't want to introduce any "tuning" parameters.
Many thanks in advance!