Multiple regression function does not work in R - r

I am trying to get the below piece of code to work - which performs multiple regression with different stepwise methods i.e. backward, forward, exhaustive, seqrep, but it is returning an error at the method = stepwise line.
# cross-validation
predict.regsubsets <- function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefs = coef(object, id = id)
xvars = names(coefs)
mat[, xvars] %*% coefs
}
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted) ^ 2))
}
cv <- function(stepwise, folds) {
num_folds = folds
set.seed(1)
folds = caret::createFolds(mtcars$mpg, k = num_folds)
fold_error = matrix(0, nrow = num_folds, ncol = num_vars,
dimnames = list(paste(1:num_folds), paste(1:num_vars)))
for(j in 1:num_folds) {
train_fold = mtcars[-folds[[j]], ]
validate_fold = mtcars[ folds[[j]], ]
best_fit = regsubsets(mpg ~ ., data = mtcars, nvmax = 10, method = stepwise)
for (i in 1:num_vars) {
pred = predict(best_fit, validate_fold, id = i)
fold_error[j, i] = rmse(actual = validate_fold$mpg,
predicted = pred)
}
}
cv_error = apply(fold_error, 2, mean)
par(mfrow=c(1,1))
plot(cv_error, type = 'l')
cv_min = which.min(cv_error)
points(cv_min, cv_error[cv_min], col = "red", cex = 2, pch = 20)
}
cv("backward",5)
What have I done wrong?

Related

R: Genetic Algorithm Feature Selection 'replace = FALSE' error

I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models
by Max Kuhn and Kjell Johnson. I copied the code from here https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R
I keep getting this error, "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set. Also, I reduced the iterations from 15 to 1 to increase the speed.
library(caret)
library(liver)
data(churn)
head(churn)
set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
list = FALSE,
times = 1)
train <- churn[ trainIndex,]
test <- churn[-trainIndex,]
# ------------------------------------------------------------------------------
many_stats <-
function(data, lev = levels(data$obs), model = NULL) {
c(
twoClassSummary(data = data, lev = levels(data$obs), model),
prSummary(data = data, lev = levels(data$obs), model),
mnLogLoss(data = data, lev = levels(data$obs), model),
defaultSummary(data = data, lev = levels(data$obs), model)
)
}
# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats
ga_funcs$initial <- function(vars, popSize, ...) {
x <- matrix(NA, nrow = popSize, ncol = vars)
probs <- seq(0.1, 0.90, length = popSize)
for (i in 1:popSize) {
x[i, ] <-
sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
}
var_count <- apply(x, 1, sum)
if (any(var_count == 0)) {
for (i in which(var_count == 0)) {
p <- sample(1:length(vars), size = 2)
x[i, p] <- 1
}
}
x
}
ctrl_rs <- trainControl(
method = "LGOCV",
p = 0.90,
number = 1,
summaryFunction = many_stats,
classProbs = TRUE,
allowParallel = FALSE
)
ga_ctrl <- gafsControl(
method = "cv",
metric = c(internal = "ROC", external = "ROC"),
maximize = c(internal = TRUE, external = TRUE),
functions = ga_funcs,
returnResamp = "all",
verbose = TRUE
)
options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)
set.seed(325)
gen_algo <- gafs(
x = train[,-20],
y = train$churn,
data = train,
iters = 1,
gafsControl = ga_ctrl,
method = "nb",
tuneGrid = nb_grid,
trControl = ctrl_rs,
metric = "ROC"
)
The code specifies, "replace = TRUE", but clearly I am missing something. Any help is greatly appreciated!
Thanks!

R caret: train() failed for repeatedcv with factor predictors

The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine.
The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
# make consistent factor levels
if(any(sapply(x, is.factor))){
fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])],
levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry, ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),
data = dat,
mtry = mtry), # change controls to mtry?
theDots)
out <- do.call(partykit::cforest, modelArgs)
out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit#responses#levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type="prob) ? id?
probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
levels = function(x) levels(x#data#get("response")[,1]),
sort = function(x) x[order(x[,1]),],
oob = function(x) {
obs <- x#data#get("response")[,1]
pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable
model_train.design.data <- cbind(y_column = data$y_column, model_train.design.matrix)

Intialized vector in AUC function out of bounds

I am trying to use cross validation with a decision tree using AUC. These are the functions that I am using:
.cvFolds <- function(Y, V) {
Y0 <- split(sample(which(Y == 0)), rep(1:V, length = length(which(Y == 0))))
Y1 <- split(sample(which(Y == 1)), rep(1:V, length = length(which(Y == 1))))
folds <- vector("list", length = V)
for (v in seq(V)) folds[[v]] <- c(Y0[[v]], Y1[[v]])
return(folds)
}
.doFit <- function(V, folds, train) {
set.seed(v)
ycol <- which(names(train) == y)
params <- list(x = train[-folds[[V]], -ycol],
y = as.factor(train[-folds[[V]], ycol]),
xtest = train[folds[[V]], -ycol])
fit <- do.call(randomForest, params)
pred <- fit$test$votes[, 2]
return(pred)
}
This is the function to calculate probabilities:
iid_example <- function(train, y = "V1", V = 10, seed = 1) {
set.seed(seed)
folds <- .cvFolds(Y = train[, c(y)], V = V)
# Generate CV predicted values
cl <- makeCluster(detectCores())
registerDoParallel(cl)
predictions <- foreach(v = 1:V, .combine = "c",
.packages = c("randomForest")) %dopar% .doFit(v, folds, train)
stopCluster(cl)
predictions[unlist(folds)] <- predictions
# Get CV AUC
runtime <- system.time(res <- ci.cvAUC(predictions = predictions,
labels = train[, c(y)],
folds = folds,
confidence = 0.95))
print(runtime)
return(res)
}
The actual function call:
res <- iid_example(train = datos, y = "V1", V = 10, seed = 1)
When I try to run it, I get the following error:
Y0[[v]] out of bounds
I am trying to adjust the parameterization of the function, but I do not understand why it is out of boundaries. Thanks for your help

Custom classification threshold for GBM

I'm trying to create a custom GBM model that tunes the classification threshold for a binary classification problem. There is a nice example provided on the caret website here, but when I try to apply something similar to GBM I receive the following error:
Error in { : task 1 failed - "argument 1 is not a vector"
Unfortunately, I have no idea where the error is and the error isn't very helpful.
Here's an example, with the code that I've used for defining the custom GBM
library(caret)
library(gbm)
library(pROC)
#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####
## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")
## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
"n.minobsinnode", "threshold"),
class = rep("numeric", 5),
label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
"Min. Terminal Node Size", "Probability Cutoff"))
## Customise the tuning grid:
## Some paramters are fixed. Will give a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
if (search == "grid") {
grid <- expand.grid(n.trees = seq(50, 250, 50),
interaction.depth = 2, ### fix interaction depth at 2
shrinkage = 0.0001, ### fix learning rate at 0.0001
n.minobsinnode = seq(2, 10, 2),
threshold = seq(.01, .99, length = len))
} else {
grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
interaction.depth = sample(1:10, replace = TRUE, size = len),
shrinkage = runif(len, min = .001, max = .6),
n.minobsinnode = sample(5:25, replace = TRUE, size = len),
threshold = runif(1, 0, size = len))
grid <- grid[!duplicated(grid),] ### remove any duplicated rows in the training grid
}
grid
}
## Here we fit a single gbm model and loop over the threshold values to get predictions from the
## same gbm model.
customGBM$loop = function(grid) {
library(plyr)
loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
function(x) c(threshold = max(x$threshold)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$threshold)) {
index <- which(grid$n.trees == loop$n.trees[i] &
grid$interaction.depth == loop$interaction.depth[i] &
grid$shrinkage == loop$shrinkage[i] &
grid$n.minobsinnode == loop$n.minobsinnode[i])
cuts <- grid[index, "threshold"]
submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
}
list(loop = loop, submodels = submodels)
}
## Fit the model independent of the threshold parameter
customGBM$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
if (any(names(theDots) == "distribution")) {
modDist <- theDots$distribution
theDots$distribution <- NULL
} else {
if (is.numeric(y)) {
stop("This works only for 2-class classification problems")
} else modDist <- if (length(lev) == 2) "bernoulli" else
stop("This works only for 2-class classification problems")
}
# if (length(levels(y)) != 2)
# stop("This works only for 2-class problems")
## check to see if weights were passed in (and availible)
if (!is.null(wts)) theDots$w <- wts
if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
modArgs <- list(x = x,
y = y,
interaction.depth = param$interaction.depth,
n.trees = param$n.trees,
shrinkage = param$shrinkage,
n.minobsinnode = param$n.minobsinnode,
distribution = modDist)
do.call("gbm.fit", modArgs)
}
## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
type = "response")#[, modelFit$obsLevels[1]]
out[is.nan(out)] <- NA
class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
## Raise the threshold for class #1 and a higher level of
## evidence is needed to call it class 1 so it should
## decrease sensitivity and increase specificity
out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
if (!is.null(submodels)) {
tmp2 <- out
out <- vector(mode = "list", length = length(submodels$threshold))
out[[1]] <- tmp2
for (i in seq(along = submodels$threshold)) {
out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
modelFit$obsLevels[1],
modelFit$obsLevels[2])
}
}
out
}
## The probabilities are always the same but we have to create
## mulitple versions of the probs to evaluate the data across
## thresholds
customGBM$prob = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata, type = "response",
n.trees = modelFit$tuneValue$n.trees)
out[is.nan(out)] <- NA
out <- cbind(out, 1 - out)
colnames(out) <- modelFit$obsLevels
if (!is.null(submodels)) {
tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
tmp <- as.list(as.data.frame(tmp))
lapply(tmp, function(x, lvl) {
x <- cbind(x, 1 - x)
colnames(x) <- lvl
x}, lvl = modelFit$obsLevels)
out <- c(list(out), tmp)
}
out
}
fourStats <- function (data, lev = levels(data$obs), model = NULL) {
## This code will get use the area under the ROC curve and the
## sensitivity and specificity values using the current candidate
## value of the probability threshold.
out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
## The best possible model has sensitivity of 1 and specificity of 1.
## How far are we from that value?
coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
ncol = 2,
byrow = TRUE)
colnames(coords) <- c("Spec", "Sens")
rownames(coords) <- c("Best", "Current")
c(out, Dist = dist(coords)[1])
}
And then some code showing how to use the custom model
set.seed(949)
trainingSet <- twoClassSim(500, -9)
mod1 <- train(Class ~ ., data = trainingSet,
method = customGBM, metric = "Dist",
maximize = FALSE, tuneLength = 10,
trControl = trainControl(method = "cv", number = 5,
classProbs = TRUE,
summaryFunction = fourStats))
The model appears to run, but finishes with the error from above. If someone could please help me with customising the GBM model to tune the GBM parameters, and the probability threshold for the classes that would be great.

Caret package Custom metric

I'm using the caret function "train()" in one of my project and I'd like to add
a "custom metric" F1-score. I looked at this url caret package
But I cannot understand how I can build this score with the parameter available.
There is an example of custom metric which is the following:
## Example with a custom metric
madSummary <- function (data,
lev = NULL,
model = NULL) {
out <- mad(data$obs - data$pred,
na.rm = TRUE)
names(out) <- "MAD"
out
}
robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(degree = 1, nprune = (1:10) * 2)
earthFit <- train(medv ~ .,
data = BostonHousing,
method = "earth",
tuneGrid = marsGrid,
metric = "MAD",
maximize = FALSE,
trControl = robustControl)
Update:
I tried your code but the problem is that it doesn't work with multiple classes like with the code below (The F1 score is displayed, but it is weird) I'm not sure but I think the function F1_score works only on binary classes
library(caret)
library(MLmetrics)
set.seed(346)
dat <- iris
## See http://topepo.github.io/caret/training.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
print(data)
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
c(F1 = f1_val)
}
# Split the Data into .75 input
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train,]
testClass <- dat[-in_train,]
set.seed(35)
mod <- train(Species ~ ., data = trainClass ,
method = "rpart",
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
print(mod)
I coded a manual F1 score as well, with one input the confusion matrix: (I'm not sure if we can have a confusion matrix in "summaryFunction"
F1_score <- function(mat, algoName){
##
## Compute F1-score
##
# Remark: left column = prediction // top = real values
recall <- matrix(1:nrow(mat), ncol = nrow(mat))
precision <- matrix(1:nrow(mat), ncol = nrow(mat))
F1_score <- matrix(1:nrow(mat), ncol = nrow(mat))
for(i in 1:nrow(mat)){
recall[i] <- mat[i,i]/rowSums(mat)[i]
precision[i] <- mat[i,i]/colSums(mat)[i]
}
for(i in 1:ncol(recall)){
F1_score[i] <- 2 * ( precision[i] * recall[i] ) / ( precision[i] + recall[i])
}
# We display the matrix labels
colnames(F1_score) <- colnames(mat)
rownames(F1_score) <- algoName
# Display the F1_score for each class
F1_score
# Display the average F1_score
mean(F1_score[1,])
}
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)
set.seed(346)
dat <- twoClassSim(200)
## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
c(F1 = f1_val)
}
set.seed(35)
mod <- train(Class ~ ., data = dat,
method = "rpart",
tuneLength = 5,
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
For the two-class case, you can try the following:
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = prSummary,
classProbs = TRUE))
or define a custom summary function that combines both twoClassSummary and prSummary current favorite which provides the following possible evaluation metrics - AUROC, Spec, Sens, AUPRC, Precision, Recall, F - any of which can be used as the metric argument. This also includes the special case I mentioned in my comment on the accepted answer (F is NA).
comboSummary <- function(data, lev = NULL, model = NULL) {
out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
# special case missing value for F
out$F <- ifelse(is.na(out$F), 0, out$F)
names(out) <- gsub("AUC", "AUPRC", names(out))
names(out) <- gsub("ROC", "AUROC", names(out))
return(out)
}
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = comboSummary,
classProbs = TRUE))

Resources