R caretEnsemble building models on different feature subsets - r

I wonder if there is some way to combine prediction of two different models are built on two different input feature set . For example , first on features 1:10 and second on 11:20 and combine with caretEnssemble of caretStack function.
I am trying :
data("mtcars")
head(mtcars)
library(caret)
library(caretEnsemble)
library(glmnet)
library(gbm)
ma_control <- trainControl(method = "cv",
number = 2,
summaryFunction = RMSE,
verboseIter = TRUE,
savePredictions = TRUE)
subset1 <- mtcars[,c(2:3,1)]
subset2 <- mtcars[,c(4:5,1)]
classification_formula1 <- as.formula(paste("mpg" ,"~",
paste(names(subset1)[!names(subset1)=='mpg'],collapse="+")))
classification_formula2 <- as.formula(paste("mpg" ,"~",
paste(names(subset2)[!names(subset2)=='mpg'],collapse="+")))
emf_tuneGrid_list <- NULL;
emf_tuneGrid_list$glmnet1_tuneGrid <- expand.grid(alpha = 1.0 ,lambda = 1)
emf_tuneGrid_list$gbm2_tuneGrid <- expand.grid(interaction.depth = 1, n.trees = 101 ,
shrinkage = 0.5 , n.minobsinnode = 5)
emf_model_list <- caretList (
trControl=ma_control, metric = "RMSE",
tuneList=list(
glmnet1= caretModelSpec(method='glmnet', classification_formula = classification_formula1 , data = subset1 , tuneGrid=emf_tuneGrid_list$glmnet1_tuneGrid),
gbm2 = caretModelSpec(method='gbm', classification_formula = classification_formula2, data = subset2 , tuneGrid=emf_tuneGrid_list$gbm2_tuneGrid , verbose = FALSE)
)
)
But get Error in extractCaretTarget.default(...) :
argument "y" is missing, with no default

Related

How to implement knn based on weights

I would like to implement the weighted knn algorithm but I don't know how to do it. Everything and that I can use kknn, I suppose that it can also be done with knn. In the function train(caret) there is an option "weights" but I can't find the solution, any suggestion?
I use the following code in R :
library(caret)
library(corrplot)
glass <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
col.names=c("","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Type"))
str(glass)
head(glass)
glass_1<- glass[,-7]
glass_2<- glass_1[,-7]
head(glass_2)
glass<- glass_2
standard.features <- scale(glass[,2:8])
data <- cbind(standard.features,glass[9])
anyNA(data)
head(data)
corrplot(cor(data))
data$Type<-factor(data$Type)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times =1 )
training <- data[ inTraining,]
testing <- data[-inTraining,]
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training); dim(testing);
summary(data)
fitControl <- trainControl(## 5-fold CV
method = "cv",
number = 5,
## repeated ten times
#repeats = 5)
)
#k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
k_value <- expand.grid(k = 3)
set.seed(825)
knn_Fit <- train(Type ~ ., data = training, weights = ????,
method = "knn", tuneGrid = k_value,
trControl = fitControl)
## This last option is actually one
## for gbm() that passes through
#verbose = FALSE)
knn_Fit
knn_Fit$finalModel

error using train command in caret package in r

I am trying to make a glm model using caret in r using healthcare data from the CDC. However, whenever i try to train the model using the train() command in caret, i keep on getting the following error:
Error in `[.default`(y, , "time") : incorrect number of dimensions
Below is my code:
#download data
download.file(url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/dataset_documentation/nhamcs/stata/ed2014-stata.zip",destfile = "ed2014-stata.zip")
unzip("ed2014-stata.zip")
library(haven)
nhamcs2014 <- read_dta("ed2014-stata.dta")
dim(nhamcs2014)
#isolate variables of interest
keep2014<- c("SEX","IMMEDR","SEEN72","CANCER","ETOHAB","ALZHD","ASTHMA","CEBVD","CKD","COPD","CHF","CAD","DEPRN",
"DIABTYP1","DIABTYP2","DIABTYP0","ESRD","HPE","EDHIV","HYPLIPID","HTN","OBESITY","OSA","OSTPRSIS",
"SUBSTAB")
new.nhamcs2014 <- nhamcs2014[keep2014]
#remove missing data
e=new.nhamcs2014$IMMEDR==-9
e.clean.nhamcs2014<- new.nhamcs2014[!e,]
f=e.clean.nhamcs2014$IMMEDR==-8
f.clean.nhamcs2014<- e.clean.nhamcs2014[!f,]
g=f.clean.nhamcs2014$SEEN72==-9
g.clean.nhamcs2014 <- f.clean.nhamcs2014[!g,]
h=g.clean.nhamcs2014$SEEN72==-8
h.clean.nhamcs2014 <- g.clean.nhamcs2014[!h,]
i <- h.clean.nhamcs2014$IMMEDR==7
i.clean.nhamcs2014 <- h.clean.nhamcs2014[!i,]
#Convert response variable (IMMEDR) to binomial variable
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==3] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==2] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==1] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==5] <- 1
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==4] <- 1
#clean data
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==1] <- 0
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==2] <- 1
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==1] <- 0
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==2] <- 1
View(i.clean.nhamcs2014)
sum(is.na(i.clean.nhamcs2014))
#create glm model using caret
library(caret)
set.seed(1)
inTrain<-createDataPartition(i.clean.nhamcs2014$IMMEDR, p=.75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain,]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain,]
control <- trainControl(method = "cv", number = 5, summaryFunction = twoClassSummary,
classProbs = TRUE, verboseIter = TRUE, returnResamp = "final")
model.glm <- train(IMMEDR~.,method = "glm", family = binomial(), metric = "ROC",
maximize = TRUE, data = train.nhamcs2014, trControl = control)
Error in `[.default`(y, , "time") : incorrect number of dimensions
Any input would be greatly appreciated!
The problem is in the input label, it is in an awkward format Labelled double. When you convert it to factor just before training it runs without issue:
Run after sum(is.na(i.clean.nhamcs2014)):
i.clean.nhamcs2014$IMMEDR <- as.character(i.clean.nhamcs2014$IMMEDR)
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR == "0"] <- "zero"
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR == "1"] <- "one"
i.clean.nhamcs2014$IMMEDR <- factor(i.clean.nhamcs2014$IMMEDR, levels = c("zero", "one"))
and then
set.seed(1)
inTrain<-createDataPartition(i.clean.nhamcs2014$IMMEDR, p=.75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain,]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain,]
control <- trainControl(method = "cv", number = 5, summaryFunction = twoClassSummary,
classProbs = TRUE, verboseIter = TRUE, returnResamp = "final")
model.glm <- train(IMMEDR~.,method = "glm", family = binomial(), metric = "ROC",
maximize = TRUE, data = train.nhamcs2014, trControl = control)
> model.glm
Generalized Linear Model
12194 samples
24 predictor
2 classes: 'zero', 'one'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 9756, 9755, 9755, 9755, 9755
Resampling results:
ROC Sens Spec
0.632222 0.8814675 0.1774027

error in linear regression while using the train function in caret package

I have a data set called value that have four variables (ER is the dependent variable) and 400 observations (after removing N/A). I tried to divide the dataset into training and test sets and train the model using linear regression in the caret package. But I always get the errors:
In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ... :
extra argument ‘trcontrol’ is disregarded.
Below is my code:
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER~., data = train.value, method = "lm",
trcontrol = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
lmcv.out = defaultSummary(modelvalues)
The right sintax is trControl, not trcontrol. Try this:
library(caret)
set.seed(1)
n <- 100
value <- data.frame(ER=rnorm(n), X=matrix(rnorm(3*n),ncol=3))
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER~., data = train.value, method = "lm",
trControl = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
( lmcv.out <- defaultSummary(modelvalues) )
# RMSE Rsquared MAE
# 1.2351006 0.1190862 1.0371477

Caret: undefined columns selected

I have been trying to get the below code to run in caret but get the error. Can anyone tell me how to trouble shoot it.
Error in [.data.frame(data, , lvls[1]) : undefined columns selected
library(tidyverse)
library(caret)
mydf <- iris
mydf <- mydf %>%
mutate(tgt = as.factor(ifelse(Species == 'setosa','Y','N'))) %>%
select(everything(), -Species)
trainIndex <- createDataPartition(mydf$tgt, p = 0.75, times = 1, list = FALSE)
train <- mydf[trainIndex,]
test <- mydf[-trainIndex,]
fitControl <- trainControl(method = 'repeatedcv',
number = 10,
repeats = 10,
allowParallel = TRUE,
summaryFunction = twoClassSummary)
fit_log <- train(tgt~.,
data = train,
method = "glm",
trControl = fitControl,
family = "binomial")
You need to used classProbs = TRUE in your control function. The ROC curve is based on the class probabilities and the error is the summary function not finding those columns.
Use data = data.frame(xxxxx). As in the example below
fit.cart <- train(Condition~., data = data.frame(trainset), method="rpart", metric=metric, trControl=control)

R not valid variable name for caret function

I want to use train caret function to investigate xgboost results
#open file with train data
trainy <- read.csv('')
# open file with test data
test <- read.csv('')
# we dont need ID column
##### Removing IDs
trainy$ID <- NULL
test.id <- test$ID
test$ID <- NULL
##### Extracting TARGET
trainy.y <- trainy$TARGET
trainy$TARGET <- NULL
# set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
nrounds = 1000,
eta = c(0.01, 0.001, 0.0001),
max_depth = c(2, 4, 6, 8, 10),
gamma = 1
)
# pack the training control parameters
xgb_trcontrol_1 = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
returnData = FALSE,
returnResamp = "all", # save losses across all models
classProbs = TRUE, # set to TRUE for AUC to be computed
summaryFunction = twoClassSummary,
allowParallel = TRUE
)
# train the model for each parameter combination in the grid,
# using CV to evaluate
xgb_train_1 = train(
x = as.matrix(trainy),
y = as.factor(trainy.y),
trControl = xgb_trcontrol_1,
tuneGrid = xgb_grid_1,
method = "xgbTree"
)
I see this error
Error in train.default(x = as.matrix(trainy), y = as.factor(trainy.y), trControl = xgb_trcontrol_1, :
At least one of the class levels is not a valid R variable name;
I have looked at other cases but still cant understand what I should change? R is quite different from Python for me for now
As I can see I should do something with y classes variable, but what and how exactly ? Why didnt as.factor function work?
I solved this issue, hope it will help to all novices
I needed to transofm all data to factor type in the way like
trainy[] <- lapply(trainy, factor)

Resources