Training different ML models using Caret - r

I am trying to train a couple of ML models using the trainControl and train functions in Caret but I am always getting the same error which just says:
Error: Stopping
without giving any more details.
The problem is the same for gbm and ranger, so I am wondering if it has something to do with a conflict of the packages I am also using in my code.
library(ggplot2)
library(lattice)
library(caret)
library(rlang)
library(tidyverse)
library(Matrix)
library(glmnet)
library(iterators)
library(parallel)
library(doParallel) # parallel processing.
registerDoParallel(cores=16)
library(randomForest)
library(gbm)
library(ranger)
library(data.table)
library(smooth)
# Toy daily series: a date column A plus two numeric columns drawn from
# uniform(50, 150).
data <- data.frame(
  A = seq(as.Date("2019-01-01"), by = 1, length.out = 100),
  B = runif(100, 50, 150),
  C = runif(100, 50, 150)
)

# Rows 1-60 are used for training, the remaining rows for testing.
data_training <- data[seq_len(60), ]
data_test <- data[(60 + 1):nrow(data), ]

# Seeds handed to trainControl(): a list with one integer vector per slot.
set.seed(123)
n <- nrow(data_training)
tuneLength.num <- 5
seeds <- c(
  # slots 1 .. n-1: tuneLength.num draws each from 1..1000
  lapply(seq_len(n - 1), function(slot) sample.int(1000, tuneLength.num)),
  # slot n ("the last model"): 10 draws from 1..1000
  list(sample.int(1000, 10))
)

# Time-slice resampling: a fixed window of 25 rows, forecasting 1 row ahead.
trainingTimeControl <- trainControl(
  method = "timeslice",
  initialWindow = 25,
  horizon = 1,
  fixedWindow = TRUE,
  returnResamp = "all",
  savePredictions = TRUE,
  allowParallel = TRUE,
  seeds = seeds
)

# Model B on every column except the date A, tuned on RMSE.
# NOTE(review): each resample fits on only 25 rows (initialWindow). gbm's
# default minimum-observations-per-node / bagging settings may reject a
# training set that small, which would make every fit fail -- confirm by
# supplying a tuneGrid with a smaller n.minobsinnode or a larger window.
gbm.mod <- caret::train(
  B ~ . - A,
  data = data_training,
  method = "gbm",
  distribution = "gaussian",
  metric = "RMSE",
  tuneLength = tuneLength.num,
  trControl = trainingTimeControl
)
EDIT: The following code works just fine:
# Direct gbm fit on all training rows (no caret resampling involved):
# B is modelled on every column except the date A.
gbm <- gbm(
  B ~ . - A,
  data = data_training,
  distribution = "gaussian",
  keep.data = TRUE
)
It would be great if anyone has an idea what is going on here. The code is working fine with svmRadial.

Related

Cross-validation with OneR using the R language

I'm trying to cross-validate the OneR algorithm and I don't quite know how to do it. With the example code below I get the error "Error in x[0, , drop = FALSE] : incorrect number of dimensions".
# Load the UCI glass data. The raw file has no header row, so header = FALSE
# is required: with read.csv's default (header = TRUE) the first observation
# would be consumed as column names and silently dropped.
glass <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
  header = FALSE,
  col.names = c("", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type")
)
str(glass)
head(glass)

# Standardise the nine measurement columns (2-10) and re-attach the class
# label (column 11) as a factor.
standard.features <- scale(glass[, 2:10])
data <- cbind(standard.features, glass[11])
data$Type <- factor(data$Type)
anyNA(data)

# Seed before the random split so the partition itself is reproducible.
set.seed(12345)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times = 1)
training <- data[inTraining, ]
testing <- data[-inTraining, ]

# 5-fold cross-validation
fitControl <- trainControl(
  method = "cv",
  number = 5
)

# This is the call that raises "incorrect number of dimensions": train() is
# given an already-fitted OneR model object as its first argument instead of
# a formula (or x/y) plus a `method` name.
model <- OneR(Type ~ ., data = training)
oneRFit1 <- train(model,
                  trControl = fitControl)
It is fairly easy to write your own loop to carry out cross-validation. However, it looks like you want to use the caret package to manage it. If so, just use the method argument inside caret's train function to specify that you want to use OneR:
# Let caret run the cross-validation itself: the learner is selected by name
# through `method`, and the resampling scheme comes from fitControl.
oneRFit1 <- train(
  Type ~ .,
  data = training,
  trControl = fitControl,
  method = "OneR"
)
# Same workflow on the built-in iris data.
str(iris)
head(iris)

# Put 70% of the rows (partitioned on Species) into the training set.
set.seed(123)
inTraining <- createDataPartition(
  iris$Species,
  p = 0.7,
  list = FALSE,
  times = 1
)
training <- iris[inTraining, ]
testing <- iris[-inTraining, ]

# 2-fold cross-validation, then fit OneR through caret.
set.seed(123)
train.control <- trainControl(method = "cv", number = 2)
oneRFit <- train(
  Species ~ .,
  data = training,
  method = "OneR",
  trControl = train.control
)

R: Multiclass Matrices

I am working with the R programming language. I am trying to learn how to make a "confusion matrix" for multiclass variables (e.g. How to construct the confusion matrix for a multi class variable).
Suppose I generate some data and fit a decision tree model :
#load libraries
library(rpart)
library(caret)
# Generate 1000 rows: three normally distributed predictors and a label drawn
# from A/B/C with probabilities 0.33 / 0.33 / 0.34.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
d <- rnorm(1000, 5, 10)
group_1 <- sample(LETTERS[1:3], 1000, replace = TRUE, prob = c(0.33, 0.33, 0.34))
e <- data.frame(a, b, d, group_1)
# The label lives in the data frame `e`; `d` is a plain numeric vector, so
# `d$group_1` would fail with "$ operator is invalid for atomic vectors".
e$group_1 <- as.factor(e$group_1)

# Split data into train and test set: 80% of the rows go to training.
trainIndex <- createDataPartition(e$group_1,
                                  p = .8,
                                  list = FALSE,
                                  times = 1)
training <- e[trainIndex, ]
test <- e[-trainIndex, ]

# Resampling scheme: 5-fold CV, run once (repeats = 1).
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 1
)

# Fit decision tree model
TreeFit <- train(group_1 ~ .,
                 data = training,
                 method = "rpart2",
                 trControl = fitControl)
From here, I am able to store the results into a "confusion matrix":
# Predicted classes for the held-out rows, cross-tabulated against the
# observed labels (rows = predictions, columns = test$group_1).
pred <- predict(TreeFit, newdata = test)
table_example <- table(pred, test$group_1)
This satisfies my requirements - but this "table" requires me to manually calculate the different accuracy metrics of "A", "B" and "C" (as well as the total accuracy).
My question: Is it possible to use the caret::confusionMatrix() command for this problem?
e.g.
# Ask for class probabilities instead of class labels.
pred <- predict(TreeFit, newdata = test, type = "prob")
# Threshold the second probability column at 0.5 to obtain a "1"/"0" label.
# NOTE(review): these labels ("0"/"1") do not share levels with
# test$group_1, whose values were sampled from "A", "B", "C".
labels_example <- factor(ifelse(pred[, 2] > 0.5, "1", "0"))
con <- confusionMatrix(labels_example, test$group_1)
This way, I would be able to directly access the accuracy measurements from the confusion matrix. E.g. metric = con$overall[1]
Thanks
Is this what you're looking for?
# Predicted class labels (not probabilities) for the held-out rows.
pred <- predict(TreeFit, newdata = test)

# confusionMatrix() expects the predictions as `data` and the observed
# classes as `reference`. Passing them the other way round leaves the overall
# accuracy unchanged but swaps the per-class statistics (e.g. sensitivity
# with positive predictive value), so the arguments are named explicitly.
# The printed table has predictions in rows and observed classes in columns,
# i.e. it is the transpose of table(test$group_1, pred).
con <- confusionMatrix(data = pred, reference = test$group_1)
con

# Overall accuracy on the test set.
con$overall["Accuracy"]
Same output as in:
table(test$group_1, pred)
Plus accuracy metrics.

Metric Accuracy not applicable for regression models

I am trying to evaluate a machine-learning model in R. In general, training the model does not work well.
# # Logistic regression multiclass
# 30 rounds; each round draws a new split, fits glmnet and appends the
# confusion-matrix summary of that round to glmnetstatsPhy.
for (i in 1:30) {
  # New train/test split: 10/17 of the rows go to training.
  trainPhyIndex <- createDataPartition(
    subs_phy$Methane,
    p = 10 / 17,
    list = FALSE
  )
  trainingPhy <- subs_phy[trainPhyIndex, ]
  testingPhy <- subs_phy[-trainPhyIndex, ]

  # Centering/scaling estimated on every training column except the outcome.
  # (The resulting object is not used again inside this loop.)
  trainXphy <- trainingPhy[, names(trainingPhy) != "Methane"]
  preProcValuesPhy <- preProcess(
    x = trainXphy,
    method = c("center", "scale")
  )

  # Resampling: 10-fold cross-validation repeated 4 times, keeping the
  # predictions of the final model and requesting class probabilities.
  fitControlPhyGLMNET <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 4,
    savePredictions = "final",
    classProbs = TRUE
  )

  # Single-point tuning grid (alpha = 0.1, lambda = 0.00023).
  # NOTE(review): the reported message "Metric Accuracy not applicable for
  # regression models" suggests Methane is not a factor at this point --
  # confirm with str(subs_phy$Methane).
  fit_glmnet_phy <- train(
    Methane ~ .,
    trainingPhy,
    method = "glmnet",
    tuneGrid = expand.grid(
      .alpha = 0.1,
      .lambda = 0.00023
    ),
    metric = "Accuracy",
    trControl = fitControlPhyGLMNET
  )
  pred_glmnet_phy <- predict(fit_glmnet_phy, testingPhy)

  # Build the confusion matrix over the union of predicted and observed
  # values so both factors share one level set.
  u <- union(pred_glmnet_phy, testingPhy$Methane)
  t <- table(factor(pred_glmnet_phy, u), factor(testingPhy$Methane, u))
  accu_glmnet_phy <- confusionMatrix(t)
  # accu_glmnet_phy<-confusionMatrix(pred_glmnet_phy,testingPhy$Methane)

  # Append this round's overall statistics as a new row.
  glmnetstatsPhy[nrow(glmnetstatsPhy) + 1, ] <- accu_glmnet_phy$overall
}
glmnetstatsPhy
The program always stops at the fit_glmnet_phy <- train(Methane ~ ., ...)
call and shows
Metric Accuracy not applicable for regression models
I have no idea what causes this error.
I have also attached the type of Methane:
enter image description here
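Try normalizing the input columns and converting the output column to a factor, e.g. subs_phy$Methane <- factor(subs_phy$Methane). caret treats a numeric outcome as a regression problem, and Accuracy is only defined for classification, which is exactly what the error message says. This helped me resolve a similar issue.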

Meaning: improvement in RMSE during crossvalidation, although not on testset?

In the code below I train a NN with crossvalidation on the first 20000 records in the dataset. The dataset contains 8 predictors.
First I have split my data in 2 parts:
the first 20.000 rows (trainset)
and the last 4003 rows (out of sample test set)
I have done 2 runs:
run 1) a run with 3 predictors
run 2) a run with all 8 predictors (see code below).
Based on crossvalidation within the 20.000 rows from the trainset, the RMSE (for the optimal parametersetting) improves from 2.30 (run 1) to 2.11 (run 2).
However, when I test both models on the 4003 rows of the out-of-sample test set, the RMSE improves only negligibly, from 2.64 (run 1) to 2.63 (run 2).
What can be concluded from this contradiction in the results?
Thanks!
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Chapter 7: Non-Linear Regression Models
### Required packages: AppliedPredictiveModeling, caret, doMC (optional),
### earth, kernlab, lattice, nnet
################################################################################
library(caret)
### Read the full table: columns 1-8 are the predictors, column 9 the response
mydata <- read.csv("data.csv", header = TRUE, sep = ",")

### Rows 20001-24003 are held back as the out-of-sample set
validatiex <- mydata[20001:24003, 1:8]
validatiey <- mydata[20001:24003, 9]

### Everything below works on the first 20000 rows only
mydata <- mydata[1:20000, ]
x <- mydata[, 1:8]
y <- mydata[, 9]

### 80/20 split of those rows into a training and a test part
parti <- createDataPartition(y, times = 1, p = 0.8, list = FALSE)
x_train <- x[parti, ]
y_train <- y[parti]
x_test <- x[-parti, ]
y_test <- y[-parti]

### Cross-validation folds, built once so they can be passed in via `index`
set.seed(100)
indx <- createFolds(y_train, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

## Tuning grid for the averaged neural net: fixed decay, three hidden-layer
## sizes, no bagging
nnetGrid <- expand.grid(
  decay = c(.1),
  size = c(5, 15, 30),
  bag = FALSE
)

## Train on centred and scaled predictors
set.seed(100)
nnetTune <- train(
  x = x_train,
  y = y_train,
  method = "avNNet",
  tuneGrid = nnetGrid,
  trControl = ctrl,
  preProc = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  # weight budget sized for the largest grid entry (30 hidden units)
  MaxNWts = 30 * (ncol(x_train) + 1) + 30 + 1,
  maxit = 1000,
  repeats = 25,
  allowParallel = FALSE
)
nnetTune
plot(nnetTune)

## Score the held-back rows; despite its name, `mse` ends up holding the
## root mean squared error
predictions <- predict(nnetTune, validatiex, type = "raw")
mse <- sqrt(mean((validatiey - predictions)^2))
print(mse)

How to create Random Forest from scratch in R (without the randomforest package)

This is the way I want to use Random Forest by using the RandomForest Package:
library (randomForest)
# Forest of 100 trees predicting CLA from every other column of `dat`.
rf1 <- randomForest(
  CLA ~ .,
  data = dat,
  ntree = 100,
  norm.votes = FALSE
)

# Predicted classes for the test rows.
p1 <- predict(rf1, newdata = testing, type = "response")

# Confusion table and overall accuracy (share of counts on the diagonal).
# NOTE(review): predictions come from `testing` while the observed labels are
# read from `testing_CLA` -- confirm both refer to the same rows.
confMat_rf1 <- table(p1, testing_CLA$CLA)
accuracy_rf1 <- sum(diag(confMat_rf1)) / sum(confMat_rf1)
I don't want to use the RandomForest Package at all. Given a dataset (dat) and using rpart and default values of randomforest package, how can I get the same results? For instance, for the 100 decision trees, I need to run the following:
# One rpart tree per resampled dataset, each grown with cp = -1 and stored
# at the matching position of cart.models.
for (i in seq_len(100)) {
  cart.models[[i]] <- rpart(
    CLA ~ .,
    data = random_dataset[[i]],
    cp = -1
  )
}
Here each random_dataset[[i]] would contain the default number of attributes and rows, chosen at random. In addition, is rpart used internally by randomForest?
It is possible to simulate training a random forest by training multiple trees using rpart and bootstrap samples on the training set and the features of the training set.
The following code snippet trains 10 trees to classify the iris species and returns a list of trees with the out of bag accuracy on each tree.
library(rpart)
library(Metrics)
library(doParallel)
library(foreach)
library(ggplot2)
#' Train a random-forest-style ensemble of rpart classification trees
#'
#' Every tree is fitted on a bootstrap sample of the rows of `train_data`
#' and on a random subset of the predictor terms of `train_formula`. The
#' rows that were not drawn into a tree's bootstrap sample are used to
#' compute that tree's out-of-bag accuracy. Trees are grown in parallel on a
#' local cluster that is always shut down when the function exits.
#'
#' @param train_data Data frame holding the response and the predictors.
#' @param train_formula Model formula with the response on the left-hand
#'   side; a `.` on the right-hand side is expanded against `train_data`.
#' @param method Passed on to `rpart()`. Out-of-bag predictions are made
#'   with `type = "class"`, so only `"class"` is meaningful here.
#' @param feature_per Fraction of the predictor terms sampled for each tree
#'   (rounded down, but never fewer than one term).
#' @param cp,min_split,min_bucket,max_depth Forwarded to `rpart.control()`
#'   as `cp`, `minsplit`, `minbucket` and `maxdepth`.
#' @param ntrees Number of trees to grow.
#' @return A list with one element per tree; each element is a list with
#'   `tree` (the fitted rpart object) and `oob_perf` (out-of-bag accuracy).
random_forest <- function(train_data, train_formula, method = "class",
                          feature_per = 0.7, cp = 0.01, min_split = 20,
                          min_bucket = round(min_split / 3), max_depth = 30,
                          ntrees = 10) {
  target_variable <- as.character(train_formula)[[2]]

  # Candidate predictors are the terms of the formula itself (with `.`
  # expanded against the data), so an explicit right-hand side is respected.
  features <- attr(
    stats::terms(train_formula, data = train_data),
    "term.labels"
  )
  if (length(features) == 0L) {
    stop("`train_formula` must contain at least one predictor term.",
         call. = FALSE)
  }
  n_bagged <- max(1L, floor(length(features) * feature_per))

  # Loop-invariant: the same growth limits apply to every tree.
  tree_control <- rpart.control(
    minsplit = min_split,
    minbucket = min_bucket,
    cp = cp,
    maxdepth = max_depth
  )

  # detectCores() may return NA; fall back to one worker, and never start
  # more workers than there are trees to grow.
  ncores <- detectCores(logical = FALSE)
  if (is.na(ncores) || ncores < 1L) {
    ncores <- 1L
  }
  cl <- makeCluster(max(1L, min(ncores, ntrees)))
  # Registered right after creation so the workers are released even when a
  # tree fit fails part-way through.
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)

  rf_model <- foreach(
    icount(ntrees),
    .packages = c("rpart", "Metrics")
  ) %dopar% {
    # Feature bagging: this tree only sees a random subset of the terms.
    bagged_features <- sample(features, n_bagged, replace = FALSE)
    tree_formula <- stats::reformulate(
      bagged_features,
      response = target_variable
    )

    # Row bagging: bootstrap sample in, everything never drawn is out-of-bag.
    index_bag <- sample.int(nrow(train_data), replace = TRUE)
    in_train_bag <- train_data[index_bag, , drop = FALSE]
    out_train_bag <- train_data[-index_bag, , drop = FALSE]

    tree <- rpart(
      formula = tree_formula,
      data = in_train_bag,
      method = method,
      control = tree_control
    )

    oob_pred <- predict(tree, newdata = out_train_bag, type = "class")
    oob_acc <- accuracy(
      actual = out_train_bag[[target_variable]],
      predicted = oob_pred
    )
    list(tree = tree, oob_perf = oob_acc)
  }

  rf_model
}
# Grow the ensemble on iris, predicting Species from every other column;
# all remaining arguments keep their defaults.
train_formula <- Species ~ .
forest <- random_forest(iris, train_formula)

Resources