I am trying to apply machine learning to the iris dataset using the "knn" and "rpart" algorithms. This is my code:
library(tidyverse)
library(caret)
# Work on a copy of the built-in iris data.
dataset <- iris
# Rows selected by createDataPartition() (p = 0.9, partitioned on
# Sepal.Length) form the training set; the remaining rows form the test set.
tt_index <- createDataPartition(dataset$Sepal.Length, times = 1, p = 0.9, list = FALSE)
train_set <- dataset[tt_index, ]
test_set <- dataset[-tt_index, ]
# caret method names to fit, one model per entry.
models <- c("knn","rpart")
# Fit every model in turn; `fits` is meant to hold one train() result per model.
fits <- lapply(models, function(model){
# Show which model is currently being fitted.
print(model)
train(Species ~ .,
data = train_set,
# Intended to choose the tuning grid by model name: k = 3..50 for knn,
# 50 evenly spaced cp values in [0, 0.1] for rpart. Each right-hand side
# is a whole data frame and the condition is a single string comparison.
tuneGrid = case_when(model == "knn" ~ data.frame(k = seq(3,50,1)),
model == "rpart" ~ data.frame(cp = seq(0,0.1,len = 50))),
method = model)
})
I want to set the tuneGrid parameter depending on the model inside lapply, but I receive this error:
Error in `[.data.frame`(value[[1]], rep(NA_integer_, m)) :
undefined columns selected
Any help will be greatly appreciated.
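We could use if/else instead: `model` is a single string, so a scalar condition is enough, whereas case_when() is designed for vectorised conditions.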
library(caret)
# Fit one caret model per entry of `models`, each with its own tuning grid.
#
# `switch()` dispatches on the scalar model name and fails loudly for a model
# that has no grid defined, instead of silently handing every non-"knn" model
# the rpart `cp` grid. `length.out` is spelled out rather than relying on
# partial matching of `len`.
#
# Result: `out` is a list with one train() fit per element of `models`.
out <- lapply(models, function(model) {
  tune_grid <- switch(model,
    knn = data.frame(k = seq(3, 50, 1)),
    rpart = data.frame(cp = seq(0, 0.1, length.out = 50)),
    stop("No tuning grid defined for model: ", model, call. = FALSE)
  )
  train(Species ~ ., data = train_set, tuneGrid = tune_grid, method = model)
})
According to ?case_when, the function returns:
A vector of length 1 or n, matching the length of the logical input or output vectors, with the type (and attributes) of the first RHS. Inconsistent lengths or types will generate an error.
Related
I would like to implement the weighted kNN algorithm, but I don't know how to do it. Even though I could use kknn, I suppose it can also be done with knn. The train() function in caret has a "weights" option, but I can't work out how to use it. Any suggestions?
I use the following code in R :
library(caret)
library(corrplot)
# Download the glass data and assign column names explicitly.
# NOTE(review): read.csv() defaults to header = TRUE; if the remote file has
# no header row, its first record is consumed as one -- confirm.
glass <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
col.names=c("","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Type"))
str(glass)
head(glass)
# Drop column 7 twice in a row: the second call removes what was originally
# column 8, so two adjacent columns are discarded in total.
glass_1<- glass[,-7]
glass_2<- glass_1[,-7]
head(glass_2)
glass<- glass_2
# Standardise columns 2..8 and re-attach column 9 (used as `Type` below).
standard.features <- scale(glass[,2:8])
data <- cbind(standard.features,glass[9])
anyNA(data)
head(data)
# Correlation plot; Type is still in its original (non-factor) form here.
corrplot(cor(data))
# Treat the response as categorical from here on.
data$Type<-factor(data$Type)
# Rows selected by createDataPartition() (p = .7, on Type) become `training`;
# the remaining rows become `testing`.
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times =1 )
training <- data[ inTraining,]
testing <- data[-inTraining,]
# Compare the class proportions in both partitions.
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training); dim(testing);
summary(data)
# Resampling setup: method "cv" with number = 5; `repeats` is commented out.
fitControl <- trainControl(## 5-fold CV
method = "cv",
number = 5,
## repeated ten times
#repeats = 5)
)
# Commented-out alternative grid using kmax/distance/kernel columns.
#k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
# Single-row tuning grid: k fixed at 3.
k_value <- expand.grid(k = 3)
set.seed(825)
# `????` is the placeholder the question asks about; it is not valid R, so
# this call cannot be run as written.
knn_Fit <- train(Type ~ ., data = training, weights = ????,
method = "knn", tuneGrid = k_value,
trControl = fitControl)
## This last option is actually one
## for gbm() that passes through
#verbose = FALSE)
# Inspect the fitted train object and its final model.
knn_Fit
knn_Fit$finalModel
I am using caret's train() function to find an optimal cp value for a CART decision tree, using the F1 score as the metric through a custom summary function. The train() function returns an error that I cannot understand. Perhaps the problem lies in the way I defined the reproducible example?
> library(data.table)
> library(ROSE)
> data(hacide)
> train <- hacide.train
> test <- hacide.test
> numFolds = trainControl(method = "cv" , number = 10)
> cpGrid = expand.grid(.cp = seq(0.01, 0.5, 0.01))
> f1 <- function(data, lev = NULL, model = NULL) {
+ f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
+ c(F1 = f1_val)
+ }
> set.seed(12)
> train(cls ~ ., data = train,
+ method = "rpart",
+ tuneLength = 5,
+ metric = "F1",
+ trControl = trainControl(summaryFunction = f1,
+ classProbs = TRUE))
Error in train.default(x, y, weights = w, ...) :
At least one of the class levels is not a valid R variable name; This will cause errors when class probabilities are generated because the variables names will be converted to X0, X1 . Please use factor levels that can be used as valid R variable names (see ?make.names for help).
> levels(train$cls)
[1] "0" "1"
> class(train$cls)
[1] "factor"
You can try this:
# Replace the factor levels of train$cls with their make.names() versions so
# that every level is a syntactically valid R name.
levels(train$cls) <- make.names(levels(train$cls))
Then run your model; this should fix your problem. Unfortunately, your example is not reproducible because you left the F1_Score function definition out of your question. See if this works.
The below is working for me:
# Make the class labels valid R names before class probabilities are
# requested from caret.
levels(train$cls) <- make.names(levels(train$cls))

set.seed(12)

# Tune rpart on ROC; twoClassSummary is used as the summary function and
# class probabilities are switched on in the control object.
train(
  cls ~ .,
  data = train,
  method = "rpart",
  metric = "ROC",
  tuneLength = 5,
  trControl = trainControl(
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
)
I have been trying to get the code below to run in caret, but I get the following error. Can anyone tell me how to troubleshoot it?
Error in [.data.frame(data, , lvls[1]) : undefined columns selected
library(tidyverse)
library(caret)
# Start from the built-in iris data.
mydf <- iris
# Build a two-level target `tgt` ("Y" for setosa, "N" otherwise) and drop the
# original Species column.
mydf <- mydf %>%
mutate(tgt = as.factor(ifelse(Species == 'setosa','Y','N'))) %>%
select(everything(), -Species)
# Rows selected by createDataPartition() (p = 0.75, on tgt) become `train`;
# the remaining rows become `test`.
trainIndex <- createDataPartition(mydf$tgt, p = 0.75, times = 1, list = FALSE)
train <- mydf[trainIndex,]
test <- mydf[-trainIndex,]
# Resampling setup: method 'repeatedcv' with number = 10 and repeats = 10,
# summarised by twoClassSummary. No classProbs argument is passed here.
fitControl <- trainControl(method = 'repeatedcv',
number = 10,
repeats = 10,
allowParallel = TRUE,
summaryFunction = twoClassSummary)
# Fit method "glm" on all predictors; family = "binomial" is passed along
# with the other train() arguments.
fit_log <- train(tgt~.,
data = train,
method = "glm",
trControl = fitControl,
family = "binomial")
You need to use classProbs = TRUE in your control function. The ROC curve is based on the class probabilities, and the error comes from the summary function not finding those columns.
Use data = data.frame(xxxxx), as in the example below:
# Fit an rpart model; the training data is wrapped in data.frame() before it
# is handed to train().
fit.cart <- train(
  Condition ~ .,
  data = data.frame(trainset),
  method = "rpart",
  trControl = control,
  metric = metric
)
I'm using the caret function "train()" in one of my projects and I'd like to add
a custom metric, the F1 score. I looked at the caret package documentation,
but I cannot understand how to build this score with the parameters available.
There is an example of custom metric which is the following:
## Example with a custom metric
# Custom caret summary function: median absolute deviation (stats::mad) of
# the residuals `obs - pred`, ignoring missing values.
#
# data:  data frame with `obs` and `pred` columns.
# lev:   accepted for the summary-function signature; not used here.
# model: accepted for the summary-function signature; not used here.
#
# Returns a length-one numeric vector named "MAD".
madSummary <- function(data, lev = NULL, model = NULL) {
  residual <- data$obs - data$pred
  setNames(mad(residual, na.rm = TRUE), "MAD")
}
# Resampling control that scores each resample with madSummary.
robustControl <- trainControl(summaryFunction = madSummary)

# Tuning grid: degree fixed at 1, nprune over 2, 4, ..., 20.
marsGrid <- expand.grid(degree = 1, nprune = seq(2, 20, by = 2))

# Fit method "earth" and select on the "MAD" metric; maximize = FALSE because
# the metric is to be minimised.
earthFit <- train(
  medv ~ .,
  data = BostonHousing,
  method = "earth",
  trControl = robustControl,
  tuneGrid = marsGrid,
  metric = "MAD",
  maximize = FALSE
)
Update:
I tried your code, but the problem is that it doesn't work with multiple classes, as in the code below (the F1 score is displayed, but it looks wrong). I'm not sure, but I think the F1_Score function works only for binary classes.
library(caret)
library(MLmetrics)
set.seed(346)
# Three-class data: the built-in iris set.
dat <- iris
## See http://topepo.github.io/caret/training.html#metrics
# Custom summary function: prints the data it receives, then reports the
# value of F1_Score() for pred vs. obs under the name "F1". No `positive`
# class is passed to F1_Score() here.
f1 <- function(data, lev = NULL, model = NULL) {
# Debug output: show what the summary function was handed.
print(data)
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
c(F1 = f1_val)
}
# Partition the data on Species with p = .70
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train,]
testClass <- dat[-in_train,]
set.seed(35)
# Tune rpart, selecting on the custom "F1" metric.
mod <- train(Species ~ ., data = trainClass ,
method = "rpart",
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
print(mod)
I also coded a manual F1 score that takes the confusion matrix as its input (I'm not sure whether a confusion matrix is available inside "summaryFunction"):
# Compute the macro-averaged F1 score from a square confusion matrix.
#
# mat:      square confusion matrix with predictions in rows and true classes
#           in columns (left column = prediction // top = real values); both
#           dimensions must list the classes in the same order.
# algoName: single label for the model, used as the row name when the
#           per-class scores are displayed.
#
# Prints the per-class F1 scores (labelled with `algoName` and the column
# names of `mat`) and returns their unweighted mean. A class whose ratio is
# 0/0 yields NaN, which propagates to the mean.
F1_score <- function(mat, algoName) {
  mat <- as.matrix(mat)
  stopifnot(nrow(mat) == ncol(mat))

  true_pos <- diag(mat)
  # Rows hold predictions, so a row total is everything predicted as class i.
  precision <- true_pos / rowSums(mat)
  # Columns hold the truth, so a column total is everything that is class i.
  recall <- true_pos / colSums(mat)

  per_class <- matrix(
    2 * (precision * recall) / (precision + recall),
    nrow = 1,
    dimnames = list(algoName, colnames(mat))
  )

  # A bare expression inside a function is not auto-printed, so the per-class
  # scores have to be printed explicitly to be displayed.
  print(per_class)

  # Average F1 over the classes.
  mean(per_class[1, ])
}
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)
# Fix the RNG seed before generating the data.
set.seed(346)
# Example data produced by twoClassSim(), called with 200.
dat <- twoClassSim(200)
## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
# caret summary function reporting the F1 score of the first class level.
#
# data:  data frame with `obs` (observed) and `pred` (predicted) columns.
# lev:   class levels; `lev[1]` is passed to F1_Score() as the positive class.
# model: accepted for the summary-function signature; not used here.
#
# Returns a named numeric vector `c(F1 = ...)`.
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
  # F1 can come back NA/NaN (for example from a 0/0 ratio on a resample);
  # report 0 in that case so a missing value does not leak into the
  # resampled metric used for model selection.
  if (length(f1_val) == 1L && is.na(f1_val)) {
    f1_val <- 0
  }
  c(F1 = f1_val)
}
set.seed(35)

# Tune rpart over 5 candidate settings, selecting on the custom "F1" metric
# produced by the f1 summary function.
mod <- train(
  Class ~ .,
  data = dat,
  method = "rpart",
  metric = "F1",
  tuneLength = 5,
  trControl = trainControl(classProbs = TRUE, summaryFunction = f1)
)
For the two-class case, you can try the following:
# Tune rpart over 5 candidate settings, selecting on the "F" metric with
# prSummary as the summary function.
mod <- train(
  Class ~ .,
  data = dat,
  method = "rpart",
  metric = "F",
  tuneLength = 5,
  trControl = trainControl(classProbs = TRUE, summaryFunction = prSummary)
)
or define a custom summary function that combines both twoClassSummary and prSummary (my current favorite), which provides the following evaluation metrics: AUROC, Spec, Sens, AUPRC, Precision, Recall and F. Any of these can be used as the metric argument. It also handles the special case I mentioned in my comment on the accepted answer (F is NA).
# Summary function combining twoClassSummary() and prSummary().
#
# data, lev, model: passed unchanged to both underlying summary functions.
#
# Returns the concatenated metrics with "ROC" renamed to "AUROC" and "AUC"
# renamed to "AUPRC"; a missing F value is replaced by 0.
comboSummary <- function(data, lev = NULL, model = NULL) {
  out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
  # `out` is a named vector, so index it by name: `$` is not valid on atomic
  # vectors. Special case: a missing value for F is reported as 0.
  if (is.na(out[["F"]])) {
    out[["F"]] <- 0
  }
  # Rename AUC first: "AUPRC" does not contain "ROC", so the second
  # substitution only touches the ROC entry.
  names(out) <- gsub("AUC", "AUPRC", names(out), fixed = TRUE)
  names(out) <- gsub("ROC", "AUROC", names(out), fixed = TRUE)
  out
}
# Tune rpart over 5 candidate settings, selecting on the "F" metric with
# comboSummary as the summary function.
mod <- train(
  Class ~ .,
  data = dat,
  method = "rpart",
  metric = "F",
  tuneLength = 5,
  trControl = trainControl(classProbs = TRUE, summaryFunction = comboSummary)
)
I would like to run an unbiased cforest using the caret package. Is this possible?
# Resampling control: method "cv" with `f` folds and explicit resampling
# indices from `indexList` (both defined outside this snippet); predictions
# are saved, and class probabilities plus twoClassSummary are requested.
tc <- trainControl(method="cv",
number=f,
index=indexList,
savePredictions=T,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Grid builder intended to supply the cforest tuning grid.
# The first assignment to `g` is overwritten by the second, so only the
# expand.grid() result built from cforest_unbiased() is returned.
createCfGrid <- function(len, data) {
g = createGrid("cforest", len, data)
g = expand.grid(.controls = cforest_unbiased(mtry = 5, ntree = 1000))
return(g)
}
set.seed(1)
# Fit method "cforest", selecting on ROC. `tuneGrid` receives the function
# createCfGrid itself, not the result of calling it. The outer parentheses
# print the fitted object after assignment. `df` is defined outside this
# snippet.
(cfMatFit <- train(as.factor(f1win) ~ .,
data=df,
method="cforest",
metric="ROC",
trControl=tc,
tuneGrid = createCfGrid))
The error is Error in as.character.default(<S4 object of class "ForestControl">) :
no method for coercing this S4 class to a vector
This is because cforest_control() can not be coerced into a data frame. The function does work if I use:
...
g = expand.grid(.mtry = 5)
...
However if I want to change ntree, this has no effect:
...
g = expand.grid(.mtry = 5, .ntree = 1000)
...
This does not error like randomForest does.
The grid should be a simple data frame with a column called .mtry. The code
g = createGrid("cforest", len, data)
will generate that for you. If you want to specify ntree you just pass a controls object in as another argument to train but leave out mtry:
# Fit method "cforest" on iris; the number of trees is set through a controls
# object passed as an extra argument to train(), with mtry left out.
mod <- train(
  Species ~ .,
  data = iris,
  method = "cforest",
  controls = cforest_unbiased(ntree = 10)
)
caret takes care of changing mtry for you.
Max