The data below is just an example; it is the operations on this (or any) data that I am confused about:
library(caret)
library(AppliedPredictiveModeling)  # provides data(AlzheimerDisease)
set.seed(3433)
data(AlzheimerDisease)
complete <- data.frame(diagnosis, predictors)
in_train <- createDataPartition(complete$diagnosis, p = 0.75)[[1]]
training <- complete[in_train,]
testing <- complete[-in_train,]
predIL <- grep("^IL", names(training))
smalltrain <- training[, c(1, predIL)]
fit_noPCA <- train(diagnosis ~ ., method = "glm", data = smalltrain)
pre_proc_obj <- preProcess(smalltrain[,-1], method = "pca", thresh = 0.8)
smalltrainsPCs <- predict(pre_proc_obj, smalltrain[,-1])
fit_PCA <- train(x = smalltrainsPCs, y = smalltrain$diagnosis, method = "glm")
fit_noPCA$results$Accuracy
fit_PCA$results$Accuracy
When running this code, I get an accuracy of 0.689539 for fit_noPCA and 0.682951 for fit_PCA. But when I rerun the last portion of the code:
fit_noPCA <- train(diagnosis ~ ., method = "glm", data = smalltrain)
pre_proc_obj <- preProcess(smalltrain[,-1], method = "pca", thresh = 0.8)
smalltrainsPCs <- predict(pre_proc_obj, smalltrain[,-1])
fit_PCA <- train(x = smalltrainsPCs, y = smalltrain$diagnosis, method = "glm")
fit_noPCA$results$Accuracy
fit_PCA$results$Accuracy
Then each time I rerun these six lines I get different accuracy values. Why is this so? Is it because I am not resetting the seed? Even so, where does the inherent randomness in this process come from?
By default, the model is trained using bootstrap resampling; you can see it here:
library(caret)
library(AppliedPredictiveModeling)
> fit_noPCA
Generalized Linear Model
251 samples
12 predictor
2 classes: 'Impaired', 'Control'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 251, 251, 251, 251, 251, 251, ...
Resampling results:
Accuracy Kappa
0.6870006 0.04107016
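The randomness lives in the row indices drawn for each bootstrap resample. As a quick sketch (reusing smalltrain from above; createResample is, to my understanding, what train uses to draw bootstrap indices when no index is supplied):
# Each call draws a fresh set of row indices, sampled with replacement
head(createResample(smalltrain$diagnosis, times = 25)[[1]], 10)
head(createResample(smalltrain$diagnosis, times = 25)[[1]], 10)  # different indices this time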
So with every call to train, the bootstrapped samples will be different. To get the same result back, you can set the seed before running train:
set.seed(111)
fit_PCA <- train(x = smalltrainsPCs, y = smalltrain$diagnosis, method = "glm",
                 trControl = trainControl(method = "boot", number = 100))
fit_PCA$results$Accuracy
[1] 0.6983512
set.seed(112)
fit_PCA <- train(x = smalltrainsPCs, y = smalltrain$diagnosis, method = "glm",
                 trControl = trainControl(method = "boot", number = 100))
fit_PCA$results$Accuracy
[1] 0.6991537
set.seed(111)
fit_PCA <- train(x = smalltrainsPCs, y = smalltrain$diagnosis, method = "glm",
                 trControl = trainControl(method = "boot", number = 100))
fit_PCA$results$Accuracy
[1] 0.6983512
Or use, for example, cross-validation ("cv"), where you can define the folds yourself via index = in trainControl.
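A minimal sketch (reusing smalltrain and smalltrainsPCs from above; the 10-fold split and the fold seed are my own choices): because the folds are fixed up front through index, repeated calls to train resample identically and return the same accuracy.
# Define the folds once, then reuse them in trainControl via index
set.seed(111)
folds <- createFolds(smalltrain$diagnosis, k = 10, returnTrain = TRUE)
cv_ctrl <- trainControl(method = "cv", index = folds)
fit_PCA_cv <- train(x = smalltrainsPCs, y = smalltrain$diagnosis,
                    method = "glm", trControl = cv_ctrl)
fit_PCA_cv$results$Accuracy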
Related
I am comparing the confusion matrices between the formula interface and the non-formula interface using caret's train() for elastic net. I am trying to understand why the two interfaces produce different confusion matrices. I understand that the formula interface will decompose the categorical variables into dummies and that the model will have more coefficients.
First consider the formula interface model:
library(liver)
library(caret)
library(glmnet)
library(dplyr)
data(churn)
head(churn)
set.seed(1)
train.index <- createDataPartition(churn$churn, p = 0.8, list = FALSE)
train_churn <- churn[train.index,]
test_churn <- churn[-train.index,]
# add class weights
my_weights = train_churn %>%
select(churn) %>%
group_by(churn) %>%
count()
weight_for_yes = (1 / my_weights$n[1]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_yes
weight_for_no = (1 / my_weights$n[2]) * ((my_weights$n[1] + my_weights$n[2]) / 2.0)
weight_for_no
model_weights <- ifelse(train_churn$churn == "yes", weight_for_yes, weight_for_no)
myGrid <- expand.grid(
alpha = 0,
lambda = 0.1
)
#----------------- formula interface
set.seed(1)
mod_1 <- train(churn ~
state +
area.code +
intl.plan,
data = train_churn,
method = "glmnet",
tuneGrid = myGrid,
weights = model_weights)
prediction <- predict(mod_1, newdata = test_churn[,-20])
confusionMatrix(prediction, test_churn$churn)
Now consider the non-formula interface model:
predictors <- train_churn %>%
select(state,
area.code,
intl.plan) %>%
data.matrix()
response <- train_churn$churn
set.seed(1)
mod_2 <- train(x = predictors,
y = response,
method = "glmnet",
tuneGrid = myGrid,
weights = model_weights)
Is the disparity due to formula versus non-formula, or is this an artifact of elastic net?
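For what it's worth, here is a sketch (reusing train_churn from above) of the difference described in the question: the formula interface hands glmnet a dummy-coded model matrix, while data.matrix() collapses each factor to a single integer-coded column, so the two calls are fit on different design matrices.
# Compare the design matrices the two interfaces actually see
X_formula <- model.matrix(churn ~ state + area.code + intl.plan,
                          data = train_churn)[, -1]  # one dummy column per extra factor level
X_nonformula <- train_churn %>%
  select(state, area.code, intl.plan) %>%
  data.matrix()                                      # one integer-coded column per variable
ncol(X_formula)      # many dummy columns
ncol(X_nonformula)   # just 3 columns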
I would like to implement the weighted kNN algorithm but I don't know how to do it. Everything I find says that I can use kknn, and I suppose it can also be done with knn. In caret's train() there is a "weights" option, but I can't find the solution. Any suggestions?
I use the following code in R :
library(caret)
library(corrplot)
glass <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
col.names=c("","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Type"))
str(glass)
head(glass)
glass_1<- glass[,-7]
glass_2<- glass_1[,-7]
head(glass_2)
glass<- glass_2
standard.features <- scale(glass[,2:8])
data <- cbind(standard.features,glass[9])
anyNA(data)
head(data)
corrplot(cor(data))
data$Type<-factor(data$Type)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times =1 )
training <- data[ inTraining,]
testing <- data[-inTraining,]
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training); dim(testing);
summary(data)
fitControl <- trainControl(## 5-fold CV
method = "cv",
number = 5,
## repeated ten times
#repeats = 5)
)
#k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
k_value <- expand.grid(k = 3)
set.seed(825)
knn_Fit <- train(Type ~ ., data = training, weights = ????,
method = "knn", tuneGrid = k_value,
trControl = fitControl)
## This last option is actually one
## for gbm() that passes through
#verbose = FALSE)
knn_Fit
knn_Fit$finalModel
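One possible route (a sketch, not part of the original question): the weights argument to train() is for case weights on the training rows, not for weighting neighbours by distance. Distance-weighted kNN is what the kknn package provides, and caret exposes it as method = "kknn" with the kmax/distance/kernel grid hinted at in the commented-out line above.
# Sketch: distance-weighted kNN via caret's "kknn" method (requires the kknn package)
kknn_grid <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
set.seed(825)
kknn_Fit <- train(Type ~ ., data = training,
                  method = "kknn", tuneGrid = kknn_grid,
                  trControl = fitControl)
kknn_Fit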
I am using the Hitters dataset in R. Currently I fit a linear regression predicting Salary from all other covariates, with sample sizes varying from 20 to 75, and I calculate the average test/training errors:
data("Hitters", package = 'ISLR')
Hitters = na.omit(Hitters)
set.seed(1)
train.idx = sample(1:nrow(Hitters), 75,replace=FALSE)
train = Hitters[train.idx,-20]
test = Hitters[-train.idx,-20]
errs <- rep(NA,56)
for (ii in 20:75){
train.idx = sample(1:nrow(Hitters), ii,replace=FALSE)
train = Hitters[train.idx,-20]
test = Hitters[-train.idx,-20]
train.lm <- lm(Salary ~., - Salary, data = train)
train.pred <- predict(train.lm, train)
test.pred <- predict(train.lm, data = test)
errs[ii-19] <- mean((test.pred - train$Salary)^2)
}
errs
Now I am trying to do the same with ridge regression, using the samples I created before, with a regularization parameter of 20. I tried:
x_train = model.matrix(Salary~., train)[,-1]
x_test = model.matrix(Salary~., test)[,-1]
y_train = train$Salary
y_test = test$Salary
#cv.out = cv.glmnet(x_train,y_train, alpha = 0)
#lam = cv.out$lambda.min
errs.train <- rep(NA, 56)
for (ii in 20:75){
ridge_mod = glmnet(x_train, y_train, alpha=0, lambda = 20)
ridge_pred = predict(ridge_mod, newx = x_test)
#errs.test[ii] <- mean((ridge_pred - y_test)^2)
errs.train[ii-19] <- mean((ridge_pred - y_train)^2)
}
errs.train
But all the errors are coming out the same. How can I fix this?
There are a few bugs in the first part of the code for lm. It should be predict(train.lm, newdata = test) instead of predict(train.lm, data = test); see ?predict.lm if you are not sure of the arguments. Second, if you are interested in the error on the test set, you should be subtracting the test-set predictions from test$Salary, not from the values in train. Something like the below should work:
data("Hitters", package = 'ISLR')
Hitters = na.omit(Hitters)
set.seed(1)
sample_size = 20:75
errs = vector("numeric",length(sample_size))
for (ii in seq_along(sample_size)){
train.idx = sample(1:nrow(Hitters), sample_size[ii],replace=FALSE)
train = Hitters[train.idx,-20]
test = Hitters[-train.idx,-20]
train.lm <- lm(Salary ~., data = train)
test.pred <- predict(train.lm, newdata = test)
errs[ii] <- mean((test.pred - test$Salary)^2)
}
Now for ridge: in your loop, x_train, y_train, and x_test never change inside the loop, so every iteration fits and scores exactly the same model, which is why all the errors come out identical. The only difference from the lm case is that you build the model matrix once and subset it within each iteration:
library(glmnet)
errs.test = vector("numeric", length(sample_size))
x_data = model.matrix(Salary ~ ., Hitters)[, -1]
y_data = Hitters$Salary
for (ii in seq_along(sample_size)){
train.idx = sample(1:nrow(x_data), sample_size[ii],replace=FALSE)
x_train = x_data[train.idx,]
x_test = x_data[-train.idx,]
y_train = y_data[train.idx]
y_test = y_data[-train.idx]
ridge_mod = glmnet(x_train, y_train, alpha=0, lambda = 20)
ridge_pred = predict(ridge_mod, newx = x_test)
errs.test[ii] <- mean((ridge_pred - y_test)^2)
}
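If you then want to see how the test error varies with sample size (a small sketch using the objects created above; the plot itself is my own addition):
# Test MSE as a function of training-set size
plot(sample_size, errs.test, type = "b",
     xlab = "training sample size", ylab = "test MSE (ridge, lambda = 20)")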
I am getting this error from caret's train():
Error: nrow(x) == n is not TRUE
I am not sure what "n" refers to in this case. Here is the code throwing it:
# BUILD MODEL
set.seed(9353)
elastic_net_model <- train(x = predictors, y = y,
method = "glmnet",
family = "binomial",
preProcess = c("scale"),
tuneLength = 10,
metric = "ROC",
# metric = "Spec",
trControl = train_control)
The main problem that others were running into with this error is that their y variable was not a factor or numeric. They were often passing it as a matrix or dataframe. I explicitly make my y a factor, shown here:
# Make sure that the outcome variable is a two-level factor
dfBlocksAll$trophout1 = as.factor(dfBlocksAll$trophout1)
# Set levels for dfBlocksAll$trophout1
levels(dfBlocksAll$trophout1) <- c("NoTrophy", "Trophy")
# Split the data into training and test set, 70/30 split
set.seed(1934)
index <- createDataPartition(y = dfBlocksAll$trophout1, p = 0.70, list = FALSE)
training <- dfBlocksAll[index, ]
testing <- dfBlocksAll[-index, ]
# This step is the heart of the process
y <- dfBlocksAll$trophout1 # outcome variable - did they get a trophy or not?
predictors <- training[,which(colnames(training) != "trophout1")]
The only other potentially relevant code that comes before the block throwing the error is this:
train_control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 10,
# sampling = "down",
classProbs = TRUE,
summaryFunction = twoClassSummary,
allowParallel = TRUE,
savePredictions = "final",
verboseIter = FALSE)
Since my y is already a factor, I assume the error has something to do with the x, not the y. As you can see from the code, my x is a data frame called "predictors". This data frame contains 768 obs. of 67 vars and is filled with characters and numerics.
Your response variable has to come from the training set, not from the full data: in your code, y has all the rows of dfBlocksAll while predictors only has the training rows, which is what trips nrow(x) == n is not TRUE. Here I use an example dataset:
dfBlocksAll = data.frame(matrix(runif(1000),ncol=10))
dfBlocksAll$trophout1 = factor(sample(c("NoTrophy", "Trophy"),100,replace=TRUE))
index <- createDataPartition(y = dfBlocksAll$trophout1, p = 0.70, list = FALSE)
training <- dfBlocksAll[index, ]
testing <- dfBlocksAll[-index, ]
And this part should be changed:
y <- training$trophout1
predictors <- training[,which(colnames(training) != "trophout1")]
And the rest runs fine:
elastic_net_model <- train(x = predictors, y = y,
method = "glmnet",
family = "binomial",
preProcess = c("scale"),
tuneLength = 10,
metric = "ROC",
trControl = train_control)
elastic_net_model
glmnet
71 samples
10 predictors
2 classes: 'NoTrophy', 'Trophy'
Pre-processing: scaled (10)
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 65, 64, 64, 63, 64, 64, ...
Resampling results across tuning parameters:
alpha lambda ROC Sens Spec
0.1 0.0003090198 0.5620833 0.5908333 0.51666667
0.1 0.0007138758 0.5620833 0.5908333 0.51666667
0.1 0.0016491457 0.5614583 0.5908333 0.51083333
0.1 0.0038097407 0.5594444 0.5933333 0.51083333
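As a follow-up (a sketch of my own, using the testing split created above), the fitted model can then be evaluated on the held-out rows:
# Predict on the held-out split and compare against the true labels
pred <- predict(elastic_net_model, newdata = testing[, colnames(testing) != "trophout1"])
confusionMatrix(pred, testing$trophout1)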
I would like to plot a kNN regression. Are there any functions or best ways to plot machine learning regressions? Once I have chosen the best model, what should I plot? Many thanks for your help!
df <- mtcars
library(caret)
set.seed(123)
trainRowNumbers <- createDataPartition(df$mpg, p=0.8, list=FALSE)
trainData <- df[trainRowNumbers,]
testData <- df[-trainRowNumbers,]
y = trainData$mpg
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$mpg <- y
set.seed(123)
options(warn=-1)
subsets <- c(2:5, 8, 9, 12)
ctrl <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(x=trainData[, 2:11], y=trainData$mpg,
sizes = subsets,
rfeControl = ctrl)
lmProfile
control <- trainControl(method = "cv", number = 15)
set.seed(123)
model_lm = train(mpg ~ wt+hp+disp+cyl, data=trainData, method='lm', trControl = control)
model_lm
linear.predict <- predict(model_lm, testData)
linear.predict
postResample(linear.predict, testData$mpg)
model_knn = train(mpg ~ wt+hp+disp+cyl, data=trainData, method='knn', trControl = control)
model_knn
knn.predict <- predict(model_knn, testData)
knn.predict
postResample(knn.predict, testData$mpg)
You can plot two things, as follows:
#To show the changes in RMSE with changing tuning parameters
plot(model_knn)
#The observed vs. predicted plot
library(lattice)
library(mosaic)  # provides panel.lmbands
# plot the kNN predictions computed above against the observed values
df1 <- data.frame(Observed = testData$mpg, Predicted = knn.predict)
xyplot(Predicted ~ Observed, data = df1, pch = 19, panel = panel.lmbands,
       band.lty = c(conf = 2, pred = 1))