I'm running an experiment in which I want to fit several GLM models in R using the same variables but different training samples.
Here is some simulated data:
resp <- sample(0:1,100,TRUE)
x1 <- c(rep(5,20),rep(0,15), rep(2.5,40),rep(17,25))
x2 <- c(rep(23,10),rep(5,10), rep(15,40),rep(1,25), rep(2, 15))
dat <- data.frame(resp,x1, x2)
This is the loop I'm trying to use:
n <- 5
for (i in 1:n)
{
### Create training and testing data
## 80% of the sample size
# Note that I didn't set a seed, so a new random split is performed on every iteration.
smp_sizelogis <- floor(0.8 * nrow(dat))
train_indlogis <- sample(seq_len(nrow(dat)), size = smp_sizelogis)
trainlogis <- dat[train_indlogis, ]
testlogis <- dat[-train_indlogis, ]
InitLOogModel[i] <- glm(resp ~ ., data = trainlogis, family = binomial)
}
But unfortunately, I'm getting this error:
Error in InitLOogModel[i] <- glm(resp ~ ., data = trainlogis, family = binomial) :
object 'InitLOogModel' not found
Any thoughts?
I'd suggest using caret for what you're trying to do. It takes some time to learn, but incorporates many 'best practices'. Once you've learned the basics you'll be able to quickly try models other than a glm, and easily compare the models to each other. Here's modified code from your example to get you started.
## caret
library(caret)
# your data
resp <- sample(0:1,100,TRUE)
x1 <- c(rep(5,20),rep(0,15), rep(2.5,40),rep(17,25))
x2 <- c(rep(23,10),rep(5,10), rep(15,40),rep(1,25), rep(2, 15))
dat <- data.frame(resp,x1, x2)
# so caret knows you're doing classification; otherwise train() will give you an error
dat$resp <- as.factor(dat$resp)
# create a hold-out set to use after your model fitting
# not really necessary for your example, but showing for completeness
train_index <- createDataPartition(dat$resp, p = 0.8,
list = FALSE,
times = 1)
# create your train and test data
train_dat <- dat[train_index, ]
test_dat <- dat[-train_index, ]
# repeated cross validation, repeated 5 times
# this is like your 5 loops, taking 80% of the data each time
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 5)
# fit the glm!
glm_fit <- train(resp ~ ., data = train_dat,
method = "glm",
family = "binomial",
trControl = fitControl)
# summary
glm_fit
# best model
glm_fit$finalModel
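As an aside, the error in your original loop happens because InitLOogModel is never created before the loop assigns into it. If you do want to keep the plain-loop approach, a minimal sketch (reusing your n and dat from above) would be to pre-allocate a list and assign with [[i]]:
InitLOogModel <- vector("list", n)   # create the container before the loop
for (i in 1:n) {
  smp_sizelogis <- floor(0.8 * nrow(dat))
  train_indlogis <- sample(seq_len(nrow(dat)), size = smp_sizelogis)
  trainlogis <- dat[train_indlogis, ]
  InitLOogModel[[i]] <- glm(resp ~ ., data = trainlogis, family = binomial)
}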
Related
I am trying to calibrate the probabilities that I get from the predict function in R (using the caret package).
In my case I have two classes and multiple predictors. I used the iris dataset as a reproducible example so you can try it and help me out.
library(caret)   # for createDataPartition, train, calibration
library(dplyr)   # for %>%, filter, select
my_data <- iris %>%   # reducing the data to have two classes only
  dplyr::filter(Species == "virginica" | Species == "versicolor") %>%
  dplyr::select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species)
my_data <- droplevels(my_data)
index <- createDataPartition(y=my_data$Species,p=0.6,list=FALSE)
#creating train and test set for machine learning
Train <- my_data[index,]
Test <- my_data[-index,]
#machine learning based on Train data partition with glmnet method
classCtrl <- trainControl(method = "repeatedcv", number=10,repeats=5,classProbs = TRUE,savePredictions = "final")
set.seed(355)
glmnet_ML <- train(Species~., Train, method= "glmnet", trControl=classCtrl)
glmnet_ML
#probabilities to assign each row of data to one class or the other on Test
predTestprob <- predict(glmnet_ML,Test,type="prob")
predTestprob
#trying out calibration following "Applied predictive modeling" book from Max Kuhn p266-273
predTrainprob <- predict(glmnet_ML,Train,type="prob")
predTest <- predict(glmnet_ML,Test)
predTestprob <- predict(glmnet_ML,Test,type="prob")
Test$PredProb <- predTestprob[,"versicolor"]
Test$Pred <- predTest
Train$PredProb <- predTrainprob[,"versicolor"]
#logistic regression to calibrate
sigmoidalCal <- glm(relevel(Species, ref= "virginica") ~ PredProb,data = Train,family = binomial)
coef(summary(sigmoidalCal))
#predicting calibrated scores
sigmoidProbs <- predict(sigmoidalCal,newdata = Test[,"PredProb", drop = FALSE],type = "response")
Test$CalProb <- sigmoidProbs
#plotting to see if it works
calCurve2 <- calibration(Species ~ PredProb + CalProb, data = Test)
xyplot(calCurve2,auto.key = list(columns = 2))
As far as I can tell, the result shown by the plot is not good, which indicates a mistake in the calibration: the CalProb curve should follow the diagonal, but it does not.
Has anyone done anything similar?
I have multiple classification machine learning models, all with different accuracies. When I run my xgBoost model (using library(caret)) in the console, I get an accuracy of 0.7586, but when I knit my R Markdown document, the accuracy of the same model is 0.8621. I have no idea why they differ.
I followed the suggestions in this link, but nothing worked: https://community.rstudio.com/t/console-and-rmd-output-differ-same-program-used-but-the-calculation-gives-a-different-result/67873/3
I also followed the suggestions in this question, but nothing worked: Statistics Result in R Markdown is different from the Knit Output (All Format: Word, HTML, PDF)
Finally, I tried this, but it also did not work: sample function gives different result in console and in knitted document when seed is set
Here is my code, which I run identically in the console and in R Markdown but which gives different accuracies:
# Data
data <- data[!is.na(data$var1),]
# Change levels of var1
levels(data$var1)=c("No","Yes")
#Data Preparation and Preprocessing
# Create the training and test datasets
set.seed(100)
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(data$var1, p=0.8, list=FALSE)
# Step 2: Create the training dataset
trainset <- data[trainRowNumbers,]
# Step 3: Create the test dataset
testset <- data[-trainRowNumbers,]
# Store Y for later use.
y = trainset$var1
# Create the knn imputation model on the training data
preProcess_missingdata_model <- preProcess(as.data.frame(trainset), method= c("knnImpute"))
preProcess_missingdata_model
# Create the knn imputation model on the testset data
preProcess_missingdata_model_test <- preProcess(as.data.frame(testset), method = c("knnImpute"))
preProcess_missingdata_model_test
# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
trainset <- predict(preProcess_missingdata_model, newdata = trainset)
anyNA(trainset)
# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
testset <- predict(preProcess_missingdata_model_test, newdata = testset)
anyNA(testset)
# Append the Y variable
trainset$var1 <- y
# Run algorithms using 5-fold cross validation
control <- trainControl(method="cv",
number=5,
repeats = 5,
savePredictions = "final",
search = "grid",
classProbs = TRUE)
metric <- "Accuracy"
# Make Valid Column Names
colnames(trainset) <- make.names(colnames(trainset))
colnames(testset) <- make.names(colnames(testset))
# xgBOOST
set.seed(7)
fit.xgbDART <- train(var1~., data = trainset, method = "xgbTree", metric = metric, trControl = control, verbose = FALSE, tuneLength = 7, nthread = 1)
# estimate skill of xgBOOST on the testset dataset
predictions <- predict(fit.xgbDART, testset)
cm <- caret::confusionMatrix(predictions, testset$var1, mode='everything')
cm
My RNGkind() is:
RNGkind()
[1] "L'Ecuyer-CMRG" "Inversion" "Rejection"
Always add the function:
set.seed(544)
This function sets the starting number used to generate a sequence of random numbers; it ensures that you get the same result each time you run the same process with that same seed. For example, if I use the sample() function immediately after setting a seed, I will always get the same sample.
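A tiny sketch of that idea:
set.seed(544)
sample(1:10, 3)   # some fixed draw; the same three numbers every run
set.seed(544)
sample(1:10, 3)   # identical to the draw above, because the seed was reset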
Here is my suggestion on where to use set.seed():
# Data
data <- data[!is.na(data$var1),]
# Change levels of var1
levels(data$var1)=c("No","Yes")
#Data Preparation and Preprocessing
# Create the training and test datasets
# Step 1: Get row numbers for the training data
set.seed(100)
trainRowNumbers <- createDataPartition(data$var1, p=0.8, list=FALSE)
# Step 2: Create the training dataset
trainset <- data[trainRowNumbers,]
# Step 3: Create the test dataset
testset <- data[-trainRowNumbers,]
# Store Y for later use.
y = trainset$var1
# Create the knn imputation model on the training data
set.seed(100)
preProcess_missingdata_model <- preProcess(as.data.frame(trainset), method= c("knnImpute"))
preProcess_missingdata_model
# Create the knn imputation model on the testset data
set.seed(100)
preProcess_missingdata_model_test <- preProcess(as.data.frame(testset), method = c("knnImpute"))
preProcess_missingdata_model_test
# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
trainset <- predict(preProcess_missingdata_model, newdata = trainset)
anyNA(trainset)
# Use the imputation model to predict the values of missing data points
library(RANN) # required for knnImpute
testset <- predict(preProcess_missingdata_model_test, newdata = testset)
anyNA(testset)
# Append the Y variable
trainset$var1 <- y
# Run algorithms using 5-fold cross validation
set.seed(100)
control <- trainControl(method="cv",
number=5,
repeats = 5,
savePredictions = "final",
search = "grid",
classProbs = TRUE)
metric <- "Accuracy"
# Make Valid Column Names
colnames(trainset) <- make.names(colnames(trainset))
colnames(testset) <- make.names(colnames(testset))
# xgBOOST
set.seed(7)
fit.xgbDART <-
train(
var1 ~ .,
data = trainset,
method = "xgbTree",
metric = metric,
trControl = control,
verbose = FALSE,
tuneLength = 7,
nthread = 1
)
# estimate skill of xgBOOST on the testset dataset
predictions <- predict(fit.xgbDART, testset)
cm <- caret::confusionMatrix(predictions, testset$var1, mode='everything')
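One additional thing worth checking, since your RNGkind() shows a non-default generator: if the console session and the knit session use different generator kinds, the same seed will still produce different random streams. A sketch of making the generator explicit before seeding (these are R's current defaults):
RNGkind()                                              # see what is currently in use
RNGkind("Mersenne-Twister", "Inversion", "Rejection")  # set R's default kinds explicitly
set.seed(100)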
I have a dataset with both continuous and categorical variables. I am running regression to predict one of the variables based on the other variables in the dataset. After comparing the results of ridge, lasso and elastic-net regression, the lasso regression is the best model to proceed with.
I used the coef function to extract the model's coefficients; however, the result is a very long list with over 800 variables (as some of my categorical variables have many levels). Is there a way I can quickly rank the coefficients from largest to smallest? This is the output of a glmnet model.
Reproducible problem with example code:
# Libraries Needed
library(caret)
library(glmnet)
library(mlbench)
library(psych)
# Data
data("BostonHousing")
data <- BostonHousing
str(data)
# Data Partition
set.seed(222)
ind <- sample(2, nrow(data), replace = T, prob = c(0.7, 0.3))
train <- data[ind==1,]
test <- data[ind==2,]
# Custom Control Parameters
custom <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
verboseIter = T)
# Linear Model
set.seed(1234)
lm <- train(medv ~.,
train,
method='lm',
trControl = custom)
# Results
lm$results
lm
summary(lm)
plot(lm$finalModel)
# Ridge Regression
set.seed(1234)
ridge <- train(medv ~.,
train,
method = 'glmnet',
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0.0001, 1, length=5)),#try 5 values for lambda between 0.0001 and 1
trControl=custom)
#increasing lambda = increasing penalty and vice versa
#increase lambda therefore will cause coefs to shrink
# Plot Results
plot(ridge)
plot(ridge$finalModel, xvar = "lambda", label = T)
plot(ridge$finalModel, xvar = 'dev', label=T)
plot(varImp(ridge, scale=T))
# Lasso Regression
set.seed(1234)
lasso <- train(medv ~.,
train,
method = 'glmnet',
tuneGrid = expand.grid(alpha=1,
lambda = seq(0.0001,1, length=5)),
trControl = custom)
# Plot Results
plot(lasso)
lasso
plot(lasso$finalModel, xvar = 'lambda', label=T)
plot(lasso$finalModel, xvar = 'dev', label=T)
plot(varImp(lasso, scale=T))
# Elastic Net Regression
set.seed(1234)
en <- train(medv ~.,
train,
method = 'glmnet',
tuneGrid = expand.grid(alpha = seq(0,1,length=10),
lambda = seq(0.0001,1,length=5)),
trControl = custom)
# Plot Results
plot(en)
plot(en$finalModel, xvar = 'lambda', label=T)
plot(en$finalModel, xvar = 'dev', label=T)
plot(varImp(en))
# Compare Models
model_list <- list(LinearModel = lm, Ridge = ridge, Lasso = lasso, ElasticNet=en)
res <- resamples(model_list)
summary(res)
bwplot(res)
xyplot(res, metric = 'RMSE')
# Best Model
en$bestTune
best <- en$finalModel
coef(best, s = en$bestTune$lambda)
For most models all you'd have to do would be:
sort(coef(model), decreasing=TRUE)
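For example, with an ordinary lm fit (purely illustrative, not part of your code):
fit <- lm(mpg ~ ., data = mtcars)
sort(coef(fit), decreasing = TRUE)   # named coefficients, largest first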
Since you're using glmnet, it's a little more complicated. I'm going to replicate a minimal version of your example here (the other models, plots, etc. are not necessary to reproduce your problem ...)
## Packages
library(caret)
library(glmnet)
library(mlbench) ## for BostonHousing data
# Data
data("BostonHousing")
data <- BostonHousing
# Data Partition
set.seed(222)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind==1,]
test <- data[ind==2,]
# Custom Control Parameters
custom <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
verboseIter = TRUE)
# Elastic Net Regression
set.seed(1234)
en <- train(medv ~.,
train,
method = 'glmnet',
tuneGrid = expand.grid(alpha = seq(0,1,length=10),
lambda = seq(0.0001,1,length=5)),
trControl = custom)
# Best Model
best <- en$finalModel
coefs <- coef(best, s = en$bestTune$lambda)
(This could probably be made simpler: for example, do you really need the custom control parameters to show us the example? It would be even simpler without caret, using glmnet directly, but I was afraid I might leave something out.)
Once you've got the coefficients, sorting does appear to work, albeit with a message about possible inefficiency:
sort(coefs, decreasing=TRUE)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
## [1] 25.191049410 5.078589706 1.389548822 0.244605193 0.045600250
## [6] 0.008840485 0.004372752 -0.012701593 -0.028337745 -0.162794401
## [11] -0.335062819 -0.901475516 -1.395091095 -12.632336419
sort(as.numeric(coefs)) also appears to work fine.
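If you want to keep the variable names attached while ranking (useful with 800+ dummy-coded levels), one sketch is to turn the sparse one-column matrix into a named vector first:
# coef() on a glmnet model returns a sparse one-column matrix with row names
coefs_vec <- setNames(as.numeric(coefs), rownames(coefs))
head(sort(coefs_vec, decreasing = TRUE))        # most positive coefficients first
head(sort(abs(coefs_vec), decreasing = TRUE))   # or rank by absolute magnitude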
If you want to sort the entire matrix (i.e. keeping the values for all penalization levels), you can take advantage of the fact that the penalization doesn't change the rank-order of the parameters:
coeftab <- coef(best)
lastvals <- coeftab[, ncol(coeftab)]
coeftab_s <- coeftab[order(lastvals, decreasing = TRUE), ]
## plot, leaving out the intercept
matplot(t(coeftab_s)[,-1],type="l")
I prefer to use caret when fitting models because of its relative speed and preprocessing capabilities. However, I'm slightly confused about how it makes predictions. When comparing predictions made directly from the train object with predictions made from the extracted final model, I'm seeing very different numbers. The predictions from the train object appear to be more accurate.
library(caret)
library(ranger)
x1 <- rnorm(100)
x2 <- rbeta(100, 1, 1)
y <- 2*x1 + x2 + 5*x1*x2
data <- data.frame(x1, x2, y)
fitRanger <- train(y ~ x1 + x2, data = data,
method = 'ranger',
tuneLength = 1,
preProcess = c('knnImpute', 'center', 'scale'))
predict.data <- data.frame(x1 = rnorm(10), x2 = rbeta(10, 1, 1))
prediction1 <- predict(fitRanger, newdata = predict.data)
prediction2 <- predict(fitRanger$finalModel, data = predict.data)$prediction
results <- data.frame(prediction1, prediction2)
results
I'm positive it has something to do with how I preprocess the data in the train object, but even when I preprocess the test data and use the ranger model to make predictions, the values are different:
predict.data.processed <- predict.data %>%
preProcess(method = c('knnImpute',
'center',
'scale')) %>% .$data
results3 <- predict(fitRanger$finalModel, data = predict.data.processed)$prediction
results <- cbind(results, results3)
results
I want to extract the predictions from each individual tree in the ranger model, which I can't do in caret. Any thoughts?
In order to get the same predictions from the final model as with caret's train, you should pre-process the data in the same way. Using your example with set.seed(1):
caret predict:
prediction1 <- predict(fitRanger,
newdata = predict.data)
ranger predict on the final model, with caret's pre-processing applied to predict.data first:
prediction2 <- predict(fitRanger$finalModel,
data = predict(fitRanger$preProcess,
predict.data))$prediction
all.equal(prediction1,
prediction2)
#output
TRUE
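As for the last part of your question, the per-tree predictions: ranger's predict() method accepts predict.all = TRUE, so a sketch (again applying caret's pre-processing first) would be:
per_tree <- predict(fitRanger$finalModel,
                    data = predict(fitRanger$preProcess, predict.data),
                    predict.all = TRUE)$predictions
dim(per_tree)   # one row per observation, one column per tree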
I'm working in R and exploring the use of caret for variable selection and weighting with several methods. Here I'm looking at forward stepwise regression and least angle regression (LARS), with tuning parameters for each. In the code below, I've arbitrarily chosen a dependent variable (y) and a subset of predictors (x's) and have run them through the training algorithms on a 70% subset of the data. To do so, I'm applying repeated 10-fold cross-validation.
What I’m struggling with is locating a command to identify the final model parameters (e.g., intercept, beta weights) derived from the train function. I’m not readily seeing it when I call object$finalModel. Is there a way to recover these in R using the methods listed (forward stepwise regression and LARS)? I feel like this would have to exist....
Thanks!
library(caret)
library(AppliedPredictiveModeling)
data(abalone)
str(abalone)
set.seed(18)
inTrain <- sample(1:(round(nrow(abalone) * .7)), replace = FALSE)
train_df <- abalone[inTrain, ]
test_df <- abalone[-inTrain, ]
# predicting Diameter using several of the dataset's variables
train_df_x <- train_df[, 4:8]
test_df_x <- test_df[, 4:8]
y_train <- train_df[, 3]
y_test <- test_df[, 3]
set.seed(18)
fold.ids <- createMultiFolds(y_train,k=10,times=3)
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
returnResamp = "final",
index = fold.ids,
summaryFunction = defaultSummary,
selectionFunction = "oneSE")
### Forward regression ###
library(leaps)
forwardLmGrid <- expand.grid(.nvmax = seq(2, 5))
set.seed(18)
F_OLS_fit <- train(train_df_x, y_train,"leapForward",trControl = fitControl,metric="RMSE", tuneGrid=forwardLmGrid)
### LARS ###
larGrid <- expand.grid(.fraction=seq(.01,.99,length=50))
library(lars)
Lar_fit <- train(train_df_x, y_train,"lars",trControl = fitControl,metric="RMSE", tuneGrid=larGrid)
I'll show you how I do it with an example:
library(data.table)
n <- 1000
x1 <- runif(n,min=-10,max=10)
x2 <- runif(n,min=-10,max=10)
x3 <- runif(n,min=-10,max=10)
x4 <- runif(n,min=-10,max=10)
x5 <- runif(n,min=-10,max=10)
y1 <- 30 + x1 + 4*x2 + x3
synthetic <- data.table(x1=x1,x2=x2,x3=x3,x4=x4,x5=x5,y=y1)
library(caret)
library(lars)
ctrl <- trainControl(method = "cv", savePred=T, number=3)
fractionGrid <- expand.grid(fraction = seq(0, 1, 1/(ncol(synthetic) - 1)))  # tuning grid for the lars 'fraction' parameter
cvresult <- train(y~.,
data=synthetic,
method = "lars",
trControl = ctrl,
metric="RMSE",
tuneGrid=fractionGrid,
use.Gram=FALSE)
coeffs <- predict.lars(cvresult$finalModel,type="coefficients")
models <- as.data.table(coeffs$coefficients)
winnermodelscoeffs <- models[which(coeffs$fraction==cvresult$bestTune$fraction)]
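For the forward stepwise fit, the same idea goes through the leaps object stored in finalModel; a sketch, assuming the F_OLS_fit object from your question:
# coef() on a regsubsets object takes the model size (id); use the tuned nvmax
coef(F_OLS_fit$finalModel, id = F_OLS_fit$bestTune$nvmax)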