Caret: undefined columns selected - r

I have been trying to get the code below to run in caret, but I get the following error. Can anyone tell me how to troubleshoot it?
Error in `[.data.frame`(data, , lvls[1]) : undefined columns selected
library(tidyverse)
library(caret)
# Two-class version of iris: `tgt` is "Y" for setosa and "N" otherwise; the
# original Species column is dropped.
mydf <- iris %>%
  mutate(tgt = as.factor(ifelse(Species == "setosa", "Y", "N"))) %>%
  select(-Species)

# 75/25 train/test split on the target.
trainIndex <- createDataPartition(mydf$tgt, p = 0.75, times = 1, list = FALSE)
train <- mydf[trainIndex, ]
test <- mydf[-trainIndex, ]

# 10-fold cross-validation repeated 10 times, summarised with twoClassSummary.
# Class probabilities are not requested here.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary
)

# Logistic regression on all remaining columns.
fit_log <- train(
  tgt ~ .,
  data = train,
  method = "glm",
  trControl = fitControl,
  family = "binomial"
)

You need to use classProbs = TRUE in your control function (trainControl). The ROC curve is based on the class probabilities, and the error comes from the summary function not finding those columns.

Use data = data.frame(xxxxx), as in the example below:
# Wrap the training set in data.frame() before handing it to train().
fit.cart <- train(
  Condition ~ .,
  data = data.frame(trainset),
  method = "rpart",
  metric = metric,
  trControl = control
)

Related

How to implement knn based on weights

I would like to implement the weighted knn algorithm, but I don't know how to do it. Even though I can use kknn, I suppose that it can also be done with knn. In the train function (caret) there is a "weights" option, but I can't find the solution. Any suggestions?
I use the following code in R :
library(caret)
library(corrplot)
# Load the UCI glass identification data.
# The raw file has no header row, so header = FALSE is required: with the
# read.csv() default (header = TRUE) the first observation is consumed as a
# header line and lost, even though col.names is supplied.
glass <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
  header = FALSE,
  col.names = c("", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type")
)
str(glass)
head(glass)

# Drop column 7 twice: first K, then Ca (which moves into position 7 once K
# has been removed).
glass_1 <- glass[, -7]
glass_2 <- glass_1[, -7]
head(glass_2)
glass <- glass_2

# Standardise the predictors in columns 2-8 and re-attach column 9 (Type).
standard.features <- scale(glass[, 2:8])
data <- cbind(standard.features, glass[9])
anyNA(data)
head(data)
# Type is still numeric at this point, so it is included in the correlations.
corrplot(cor(data))

# Turn the class label into a factor, then make a 70/30 train/test split.
data$Type <- factor(data$Type)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times = 1)
training <- data[inTraining, ]
testing <- data[-inTraining, ]

# Compare class proportions and sizes of the two partitions.
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training)
dim(testing)
summary(data)
# Resampling scheme and tuning grid for a k-nearest-neighbour fit, followed by
# the train() call itself.
# NOTE(review): `weights = ????` below is a placeholder left by the author, so
# this snippet does not parse until it is replaced or removed.
fitControl <- trainControl(## 5-fold CV
method = "cv",
number = 5,
## repeats is commented out, so the cross-validation is not repeated
#repeats = 5)
)
# Alternative grid kept for reference; kmax/distance/kernel are presumably the
# tuning parameters of method = "kknn" - confirm before enabling.
#k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
# Single candidate value k = 3 for method = "knn".
k_value <- expand.grid(k = 3)
# Fix the RNG seed before resampling.
set.seed(825)
knn_Fit <- train(Type ~ ., data = training, weights = ????,
method = "knn", tuneGrid = k_value,
trControl = fitControl)
## This last option is actually one
## for gbm() that passes through
#verbose = FALSE)
# Print the resampling summary and the final fitted model.
knn_Fit
knn_Fit$finalModel

How to get test data ROC plot from MLeval

I am trying to return the ROC curves for a test dataset using the MLeval package.
# Load data
train <- readRDS("Data/train.rds")
test <- readRDS("Data/test.rds")

# Recode the training class column as "yes" (where class == 1) / "no" labels.
train$class <- ifelse(train$class == 1, "yes", "no")

# Set up control function for training.
# summaryFunction must be the function object itself: writing
# twoClassSummary() would call it with no arguments and fail before any model
# is trained.
ctrl <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "none",
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = TRUE,
  verboseIter = FALSE
)

# Single-row tuning grid: one fixed gbm configuration.
gbmGrid <- expand.grid(
  interaction.depth = 10,
  n.trees = 18000,
  shrinkage = 0.01,
  n.minobsinnode = 4
)

# Build using a gradient boosted machine
set.seed(5627)
gbm <- train(
  class ~ .,
  data = train,
  method = "gbm",
  metric = "ROC",
  tuneGrid = gbmGrid,
  verbose = FALSE,
  trControl = ctrl
)

# Predict results: keep only the "yes" probability column and pair it with
# the observed test labels.
pred <- predict(gbm, newdata = test, type = "prob")[, "yes"]
roc <- evalm(data.frame(pred, test$class))
I have used the following post, ROC curve for the testing set using Caret package,
to try and plot the ROC from test data using MLeval and yet I get the following error message:
MLeval: Machine Learning Model Evaluation
Input: data frame of probabilities of observed labels
Error in names(x) <- value :
'names' attribute [3] must be the same length as the vector [2]
Can anyone please help? Thanks.
Please provide a reproducible example with sample data so we can replicate the error and test for solutions (i.e., we cannot access train.rds or test.rds).
Nevertheless, the below may fix your issue.
# Keep the full probability table (one column per class) rather than a single
# column, and pass it to evalm() together with the observed test labels.
pred <- predict(
  gbm,
  newdata = test,
  type = "prob"
)
roc <- evalm(
  data.frame(pred, test$class)
)

error using train command in caret package in r

I am trying to make a glm model using caret in R with healthcare data from the CDC. However, whenever I try to train the model using the train() command in caret, I keep getting the following error:
Error in `[.default`(y, , "time") : incorrect number of dimensions
Below is my code:
#download data
# Download the zipped 2014 NHAMCS emergency-department Stata file from the CDC
# FTP server and unpack it into the working directory.
download.file(url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/dataset_documentation/nhamcs/stata/ed2014-stata.zip",destfile = "ed2014-stata.zip")
unzip("ed2014-stata.zip")
library(haven)
# Read the Stata file with haven.
nhamcs2014 <- read_dta("ed2014-stata.dta")
dim(nhamcs2014)
# Keep only the variables of interest: the response (IMMEDR) plus SEX, SEEN72
# and the remaining indicator columns listed below.
keep2014<- c("SEX","IMMEDR","SEEN72","CANCER","ETOHAB","ALZHD","ASTHMA","CEBVD","CKD","COPD","CHF","CAD","DEPRN",
"DIABTYP1","DIABTYP2","DIABTYP0","ESRD","HPE","EDHIV","HYPLIPID","HTN","OBESITY","OSA","OSTPRSIS",
"SUBSTAB")
new.nhamcs2014 <- nhamcs2014[keep2014]
# Remove rows with missing data, one filter per step: rows where IMMEDR or
# SEEN72 equals -9 or -8 are dropped.
e=new.nhamcs2014$IMMEDR==-9
e.clean.nhamcs2014<- new.nhamcs2014[!e,]
f=e.clean.nhamcs2014$IMMEDR==-8
f.clean.nhamcs2014<- e.clean.nhamcs2014[!f,]
g=f.clean.nhamcs2014$SEEN72==-9
g.clean.nhamcs2014 <- f.clean.nhamcs2014[!g,]
h=g.clean.nhamcs2014$SEEN72==-8
h.clean.nhamcs2014 <- g.clean.nhamcs2014[!h,]
# Rows with IMMEDR == 7 are dropped as well.
# NOTE(review): presumably a not-applicable code - confirm against the codebook.
i <- h.clean.nhamcs2014$IMMEDR==7
i.clean.nhamcs2014 <- h.clean.nhamcs2014[!i,]
# Convert the response variable (IMMEDR) to a binary 0/1 variable:
# 1, 2, 3 -> 0 and 4, 5 -> 1. The order matters: the original code 1 must be
# mapped to 0 before 4 and 5 are mapped to 1.
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==3] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==2] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==1] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==5] <- 1
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==4] <- 1
# Recode SEX and SEEN72 from 1/2 to 0/1. Again order-dependent: 1 -> 0 has to
# run before 2 -> 1, otherwise every value would end up as 0.
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==1] <- 0
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==2] <- 1
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==1] <- 0
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==2] <- 1
# Inspect the cleaned data and count the remaining NA values.
View(i.clean.nhamcs2014)
sum(is.na(i.clean.nhamcs2014))
#create glm model using caret
library(caret)
# 75/25 train/test split on the response, seeded for reproducibility.
set.seed(1)
inTrain <- createDataPartition(i.clean.nhamcs2014$IMMEDR, p = .75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain, ]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain, ]

# 5-fold cross-validation with class probabilities, scored by twoClassSummary.
control <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  returnResamp = "final"
)

# Binomial GLM on all predictors, selected by maximising ROC.
model.glm <- train(
  IMMEDR ~ .,
  method = "glm",
  family = binomial(),
  metric = "ROC",
  maximize = TRUE,
  data = train.nhamcs2014,
  trControl = control
)
Error in `[.default`(y, , "time") : incorrect number of dimensions
Any input would be greatly appreciated!
The problem is in the response variable: it is stored in an awkward format (a labelled double). When you convert it to a factor just before training, it runs without issue:
Run after sum(is.na(i.clean.nhamcs2014)):
# Turn the 0/1 response into a two-level factor with the levels "zero" and
# "one" (in that order), going through character so that the labelled-double
# representation is dropped first.
i.clean.nhamcs2014$IMMEDR <- factor(
  as.character(i.clean.nhamcs2014$IMMEDR),
  levels = c("0", "1"),
  labels = c("zero", "one")
)
and then
# 75/25 train/test split on the factor response, seeded for reproducibility.
set.seed(1)
inTrain <- createDataPartition(i.clean.nhamcs2014$IMMEDR, p = .75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain, ]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain, ]

# 5-fold cross-validation with class probabilities, scored by twoClassSummary.
control <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  returnResamp = "final"
)

# Binomial GLM on all predictors, selected by maximising ROC.
model.glm <- train(
  IMMEDR ~ .,
  method = "glm",
  family = binomial(),
  metric = "ROC",
  maximize = TRUE,
  data = train.nhamcs2014,
  trControl = control
)
> model.glm
Generalized Linear Model
12194 samples
24 predictor
2 classes: 'zero', 'one'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 9756, 9755, 9755, 9755, 9755
Resampling results:
ROC Sens Spec
0.632222 0.8814675 0.1774027

error in linear regression while using the train function in caret package

I have a data set called value that has four variables (ER is the dependent variable) and 400 observations (after removing N/A). I tried to divide the dataset into training and test sets and train the model using linear regression in the caret package. But I always get the following error:
In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ... :
extra argument ‘trcontrol’ is disregarded.
Below is my code:
# 5-fold cross-validation settings.
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)

# Drop incomplete rows, then make an 80/20 split based on the first column.
value_rm <- na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit, ]
test.value <- value_rm[-datasplit, ]

# Linear model of ER on all other columns.
# NOTE: the argument name `trcontrol` is reproduced exactly as posted.
lmCVFit <- train(
  ER ~ .,
  data = train.value,
  method = "lm",
  trcontrol = ctrl_lm,
  metric = "Rsquared"
)

# Score the held-out rows and summarise observed vs. predicted values.
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
lmcv.out <- defaultSummary(modelvalues)
The correct syntax is trControl, not trcontrol. Try this:
library(caret)
# Simulated example data: a normal response ER and a 3-column matrix of
# normal predictors.
set.seed(1)
n <- 100
value <- data.frame(ER = rnorm(n), X = matrix(rnorm(3 * n), ncol = 3))

# 5-fold cross-validation settings.
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)

# Drop incomplete rows, then make an 80/20 split based on the first column.
value_rm <- na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit, ]
test.value <- value_rm[-datasplit, ]

# Linear model of ER on all other columns, with the control object passed
# through trControl.
lmCVFit <- train(
  ER ~ .,
  data = train.value,
  method = "lm",
  trControl = ctrl_lm,
  metric = "Rsquared"
)

# Score the held-out rows; the outer parentheses print the summary.
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
(lmcv.out <- defaultSummary(modelvalues))
# RMSE Rsquared MAE
# 1.2351006 0.1190862 1.0371477

How to apply lasso logistic regression with caret and glmnet?

I am trying to repeat the following lines of code:
# Reference fit done directly with glmnet: 10-fold cross-validated binomial
# model with alpha = 1 on the predictor matrix.
x.mat <- as.matrix(train.df[, predictors])
y.class <- train.df$Response
cv.lasso.fit <- cv.glmnet(
  x = x.mat,
  y = y.class,
  family = "binomial",
  alpha = 1,
  nfolds = 10
)
... with the caret package, but it doesn't work:
# 10-fold cross-validation scored with prSummary (recall, precision,
# F-measure); prSummary needs class probabilities, hence classProbs.
trainControl <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = prSummary,
  classProbs = TRUE
)

# glmnet fit of Response on every column except Id, optimised by F-measure.
# alpha and family are passed straight to train(), exactly as posted.
modelFit <- train(
  Response ~ . - Id,
  data = train.df,
  method = "glmnet",
  trControl = trainControl,
  metric = "F",
  alpha = 1,
  family = "binomial"
)
The parameter "alpha" is not recognized, and "the model fit fails in every fold".
What am I doing wrong? Help would be much appreciated. Thanks.
Try to use tuneGrid. For example as follows:
# Candidate tuning grid: alpha is fixed at 1 and lambda runs from 0 to 100 in
# steps of 0.1. Presumably meant to be passed as the tuneGrid argument of the
# train() call - confirm when pasting it in.
tuneGrid=expand.grid(
.alpha=1,
.lambda=seq(0, 100, by = 0.1))

Resources