Using caret with recipes is leading to difficulties with resample - r

I've been using recipes to pipe into caret::train, which has been going well, but now I've tried some step_transforms, I'm getting the error:
Error in resamples.default(model_list) :
There are different numbers of resamples in each model
when I compare models with and without the transformations. The same code with step_center and step_scale works fine.
library(caret)
library(tidyverse)
library(tidymodels)
formula <- price ~ carat
model_recipe <- recipe(formula, data = diamonds)
quadratic_model_recipe <- recipe(formula, data = diamonds) %>%
step_poly(all_predictors())
model_list <- list(
linear_model = NULL,
quadratic = NULL
)
model_list$linear_model <-
model_recipe %>% train(
data = diamonds,
method = "lm",
trControl = trainControl(method = "cv"))
model_list$quadratic_model <-
quadratic_model_recipe %>% train(
data = diamonds,
method = "lm",
trControl = trainControl(method = "cv"))
resamp <- resamples(model_list)

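quadratic = NULL should have been quadratic_model = NULL. Because the list is created with an element named quadratic but the fitted model is assigned to model_list$quadratic_model, the list ends up with three elements, one of which (quadratic) is still NULL, so resamples() cannot find matching resamples across all of its entries.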

Related

How to use tensor function of gam model in train function of caret package in r?

I am using gam method with both spline and tensor interaction functions(s and ti) inside the train function (for test and train).
I know for spline functions in gam we can use method = "gam" in train function. for example:
fit <- gam(Y ~ s(x1) + s(x2) + s(x3) , data=df)
Prediction_gam <- as.numeric(predict(fit , data=df , type = "response"))
can be changed as follows for training and testing with the caret package:
fit_train <- train(Y ~ x1 + x2 + x3 , data = train_df, method = "gam", trControl = train.control)
but I don't know how to add tensor interaction function of gam in train function for example:
fit <- gam(Y ~ s(x1) + s(x2) + s(x3) + ti(x1,x2) , data=df)
any suggestion would be appreciated.
The full code is as follows:
library(caret)

# Simulated data: one response (Y) and three predictors, all standard normal.
df <- data.frame(Y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100))

# Seed the RNG before partitioning. Note that the data above are generated
# before the seed is set, so the simulated values still differ between runs.
set.seed(1)
# Call createDataPartition() directly: this script loads only caret, so this
# avoids depending on the `%>%` pipe being attached by some other package.
training.samples <- createDataPartition(df$x1, p = 0.8, list = FALSE)
train_df <- df[training.samples, ]
test_df <- df[-training.samples, ]

# 10-fold cross-validation, repeated twice.
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 2)
fit_train <- train(Y ~ x1 + x2 + x3, data = train_df, method = "gam", trControl = train.control)

# predict() on a train object takes `newdata`; an argument named `data` is not
# matched to it, so the data set must be passed as `newdata` explicitly.
Prediction_train <- as.numeric(predict(fit_train, newdata = train_df, type = "raw"))
Prediction_test <- as.numeric(predict(fit_train, newdata = test_df, type = "raw"))

Error building partial dependence plots for RF using FinalModel output from caret's train() function

I am using the following code to fit and test a random forest classification model:
> control <- trainControl(method='repeatedcv',
+ number=5,repeats = 3,
+ search='grid')
> tunegrid <- expand.grid(.mtry = (1:12))
> rf_gridsearch <- train(y = river$stat_bino,
+ x = river[,colnames(river) != "stat_bino"],
+ data = river,
+ method = 'rf',
+ metric = 'Accuracy',
+ ntree = 600,
+ importance = TRUE,
+ tuneGrid = tunegrid, trControl = control)
Note, I am using
train(y = river$stat_bino, x = river[,colnames(river) != "stat_bino"],...
rather than: train(stat_bino ~ .,...
so that my categorical variables will not be turned into dummy variables.
(solution here: variable encoding in K-fold validation of random forest using package 'caret')
I would like to extract the FinalModel and use it to make partial dependence plots for my variables (using code below), but I get an error message and don't know how to fix it.
> model1 <- rf_gridsearch$finalModel
> library(pdp)
> partial(model1, pred.var = "MAXCL", type = "classification", which.class = "1", plot =TRUE)
Error in eval(stats::getCall(object)$data) :
..1 used in an incorrect context, no ... to look in
Thanks for any solutions here!

R | How to get accuracy from cv.glmnet

I've been using the cv.glmnet function to fit a lasso logistic regression model. I'm using R
Here's my code. I'm using the iris dataset.
df = iris %>%
mutate(Species = as.character(Species)) %>%
filter(!(Species =="setosa")) %>%
mutate(Species = as.factor(Species))
X = data.matrix(df %>% select(-Species))
y = df$Species
Model = cv.glmnet(X, y, alpha = 1, family = "binomial")
How do I get the model accuracy from the cv.glmnet object (Model).
If I had been using caret on a normal logistic regression model, accuracy is already in the output.
train_control = trainControl(method = "cv", number = 10)
M2 = train(Species ~., data = df, trControl = train_control,
method = "glm", family = "binomial")
M2$results
but a cv.glmnet object doesn't seem to contain this information.
You want to add type.measure='class' as in Model 2 below, otherwise the default for family='binomial' is 'deviance'.
df = iris %>%
mutate(Species = as.character(Species)) %>%
filter(!(Species =="setosa")) %>%
mutate(Species = as.factor(Species))
X = data.matrix(df %>% select(-Species))
y = df$Species
Model = cv.glmnet(X, y, alpha = 1, family = "binomial")
Model2 = cv.glmnet(X, y, alpha = 1, family = "binomial", type.measure = 'class')
Then cvm gives the misclassification rate, so the cross-validated accuracy is 1 - cvm.
Model2$lambda ## lambdas used in CV
Model2$cvm ## mean cross-validated error for each of those lambdas
If you want results for the best lambda, you can use lambda.min
Model2$lambda.min ## lambda with the lowest cvm
Model2$cvm[Model2$lambda==Model2$lambda.min] ## cvm for lambda.min

r caret: train ONE model once the hyper-parameters are already known

I am using caret to train a ridge regression:
library(ISLR)
Hitters = na.omit(Hitters)
x = model.matrix(Salary ~ ., Hitters)[, -1] #Dropping the intercept column.
y = Hitters$Salary
set.seed(0)
train = sample(1:nrow(x), 7*nrow(x)/10)
library(caret)
set.seed(0)
# Values of lambda over which to check:
grid = 10 ^ seq(5, -2, length = 100)
train_control = trainControl(method = 'cv', number = 10)
tune.grid = expand.grid(lambda = grid, alpha = 0)
ridge.caret = train(x[train, ], y[train],
method = 'glmnet',
trControl = train_control,
tuneGrid = tune.grid)
ridge.caret$bestTune
# alpha is 0 and best lambda is 242.0128
So, I found my optimal lambda and alpha. In fact, it's not really important for my question, what they are.
Now, how could I now run just ONE ridge regression (using caret) with alpha = 0 and lambda = 242.0128 for the whole data set?
I discovered that I can specify trainControl method as 'none'. See the code below. Did I correctly specify the tuneGrid (with just one line). Is this how it should be done?
Thank you very much!
set.seed(12345)
ridge_full <- train(x, y,
method = 'glmnet',
trControl = trainControl(method = 'none'),
tuneGrid = expand.grid(lambda = ridge.caret$bestTune$lambda, alpha = 0))
coef(ridge_full$finalModel, s = ridge_full$bestTune$lambda)

Plotting ROC curve from two different algorithms using lift in caret

I have two models like the following:
library(mlbench)
data(Sonar)
library(caret)
set.seed(998)
my_data <- Sonar
fitControl <-
trainControl(
method = "boot632",
number = 10,
classProbs = T,
savePredictions = "final",
summaryFunction = twoClassSummary
)
modelxgb <- train(
Class ~ .,
data = my_data,
method = "xgbTree",
trControl = fitControl,
metric = "ROC"
)
library(mlbench)
data(Sonar)
library(caret)
set.seed(998)
my_data <- Sonar
fitControl <-
trainControl(
method = "boot632",
number = 10,
classProbs = T,
savePredictions = "final",
summaryFunction = twoClassSummary
)
modelsvm <- train(
Class ~ .,
data = my_data,
method = "svmLinear2",
trControl = fitControl,
metric = "ROC"
)
I want to plot the ROC curves for both models on one ggplot.
I am doing the following to generate the points for the curve:
for_lift_xgb = data.frame(Class = modelxgb$pred$obs, xgbTree = modelxgb$pred$R)
for_lift_svm = data.frame(Class = modelsvm$pred$obs, svmLinear2 = modelsvm$pred$R)
lift_obj_xgb = lift(Class ~ xgbTree, data = for_lift_xgb, class = "R")
lift_obj_svm = lift(Class ~ svmLinear2, data = for_lift_svm, class = "R")
What would be the easiest way to plot both of these curves on a single plot, and have them in different colors. I would also like to annotate the individual AUC values on the plot.
After building the models you can combine the predictions in a single data frame:
for_lift = data.frame(Class = modelxgb$pred$obs,
xgbTree = modelxgb$pred$R,
svmLinear2 = modelsvm$pred$R)
use it to build the lift object using the following:
lift = lift(Class ~ xgbTree + svmLinear2, data = for_lift, class = "R")
and plot with ggplot:
# ggplot() is provided by the ggplot2 package; there is no package called "ggplot".
library(ggplot2)
# One ROC curve per model: x is 1 - Sp, y is Sn, coloured by the model column
# (liftModelVar) of the lift object's data.
ggplot(lift$data) +
  geom_line(aes(1 - Sp, Sn, color = liftModelVar)) +
  scale_color_discrete(guide = guide_legend(title = "method"))
You can combine and compare many models this way.
To add auc to the plot you can create a data frame with the models names, the corresponding auc and the coordinates for plotting:
auc_ano <- data.frame(model = c("xgbTree","svmLinear2"),
auc = c(pROC::roc(response = for_lift$Class,
predictor = for_lift$xgbTree,
levels=c("M", "R"))$auc,
pROC::roc(response = for_lift$Class,
predictor = for_lift$svmLinear2,
levels=c("M", "R"))$auc),
y = c(0.95, 0.9))
auc_ano
#output
model auc y
1 xgbTree 0.9000756 0.95
2 svmLinear2 0.5041086 0.90
and pass it to geom_text:
ggplot(lift$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_text(data = auc_ano, aes(label = round(auc, 4), color = model, y = y), x = 0.1)

Resources