xgboost problem with parameter 'eta' = 0.01 - r

I am using the "churn" dataset with the xgboost algorithm. Y has two levels, Yes and No.
I have a problem with the parameter 'eta' in xgboost. When I run with eta = 0.1, the confusion matrix is computed without error. But when I run with eta = 0.01, I get this error: "Error in confusionMatrix.default(Y_test, pred_y) :
the data cannot have more levels than the reference".
Why?
Here is the code:
# Read the churn data, put the target in column 1, split 70/30, fit xgboost,
# then turn the numeric predictions back into class labels for confusionMatrix().
my_data <- read.csv("churn.csv", sep=",")
# NOTE(review): the file is read into `my_data`, but the following lines use
# `data`, which this snippet never assigns -- presumably `my_data` was meant; confirm.
data<-data[,-1] # drop customerid
y<-data$Churn
# `:` binds tighter than `-`, so this index is (1:ncol(data)) - 1, i.e.
# 0:(ncol(data) - 1); the 0 is ignored, leaving every column except the last.
x <- data[,1:ncol(data)-1]
data <-cbind(y,x)
head(data, 3)
set.seed(3)
train_index <- createDataPartition(data$y, p = .7, # proportion of rows placed in the training set
list = FALSE,
times = 1) # create a single partition
train <- data[ train_index,]
test <- data[ -train_index,]
# Column 1 is y (from the cbind above); the remaining columns are the predictors.
X_train <- data.matrix(train[,-1])
Y_train <- train[,1]
X_test <- data.matrix(test[,-1])
Y_test <- test[,1]
xgboost_train = xgb.DMatrix(data=X_train, label=Y_train)
xgboost_test = xgb.DMatrix(data=X_test, label=Y_test)
# NOTE(review): no `objective` is passed, so xgboost falls back to its default
# objective -- presumably a regression one, which would make the predictions
# continuous values rather than class labels; confirm for the installed version.
model <- xgboost(data = xgboost_train, # the data
max.depth=5, # max depth
eta= 0.01,
nrounds=50)
summary(model)
pred_test = predict(model, xgboost_test)
pred_test
# Predictions above 3 are overwritten with 2.
pred_test[(pred_test>3)] = 2
# The rounded predictions are used as positions into levels(Y_test): a position
# of 0 is silently dropped and a position past the last level yields NA, so
# pred_y can end up with a different length or set of levels than Y_test.
# NOTE(review): assumes Y_test is a factor (levels() is NULL otherwise) -- confirm.
pred_y = as.factor((levels(Y_test))[round(pred_test)])
print(pred_y)
conf_mat = confusionMatrix(Y_test, pred_y)
print(conf_mat)
```[enter image description here][1]
[1]: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

Related

Trying to find test and training errors for ridge regression as a function of sample size

I am using the Hitters dataset in R. Currently I fit a linear regression predicting Salary from all other covariates with varying sample sizes from 20 to 75 and I calculated the average test/training errors :
# Fit lm() on random training subsets of 20..75 rows and record one error per size.
data("Hitters", package = 'ISLR')
Hitters = na.omit(Hitters)
set.seed(1)
# Initial 75-row split; train/test are overwritten on every loop iteration below.
train.idx = sample(1:nrow(Hitters), 75,replace=FALSE)
train = Hitters[train.idx,-20]
test = Hitters[-train.idx,-20]
errs <- rep(NA,56) # one slot per sample size in 20:75
for (ii in 20:75){
train.idx = sample(1:nrow(Hitters), ii,replace=FALSE)
train = Hitters[train.idx,-20]
test = Hitters[-train.idx,-20]
# NOTE(review): the unnamed `- Salary` is matched positionally to the next free
# lm() argument (presumably `subset`), which looks unintended -- confirm.
train.lm <- lm(Salary ~., - Salary, data = train)
# train.pred is computed here but not used again in this snippet.
train.pred <- predict(train.lm, train)
# NOTE(review): predict.lm() takes `newdata`, not `data`; presumably `data = test`
# is ignored here and the training-set fitted values are returned -- confirm.
test.pred <- predict(train.lm, data = test)
# The stored value compares test.pred against train$Salary, not test$Salary.
errs[ii-19] <- mean((test.pred - train$Salary)^2)
}
errs
Now I am trying to do the same with Ridge regression using those samples I created from before with a regularization parameter of 20. I tried:
# Ridge attempt: the design matrices are built once, from whatever `train` and
# `test` currently hold, before the loop starts.
x_train = model.matrix(Salary~., train)[,-1]
x_test = model.matrix(Salary~., test)[,-1]
y_train = train$Salary
y_test = test$Salary
#cv.out = cv.glmnet(x_train,y_train, alpha = 0)
#lam = cv.out$lambda.min
errs.train <- rep(NA, 56)
for (ii in 20:75){
# Nothing in the loop body depends on ii except the slot being written, so
# every iteration fits the same data and stores the same value.
ridge_mod = glmnet(x_train, y_train, alpha=0, lambda = 20)
ridge_pred = predict(ridge_mod, newx = x_test)
#errs.test[ii] <- mean((ridge_pred - y_test)^2)
# NOTE(review): ridge_pred is predicted from x_test but compared with y_train.
errs.train[ii-19] <- mean((ridge_pred - y_train)^2)
}
errs.train
But all the errors are coming out the same. How can I fix this?
There are a few bugs in the first part of the code for lm. It should be predict(train.lm, newdata = test) instead of predict(train.lm, data = test). See ?predict.lm if you are not sure of the input. Second, if you are interested in the error on the test set, you should be comparing the test predictions with test$Salary and not with the values from train. Something like below should work:
# Test-set mean squared error of an lm() fit, for training sizes 20 to 75,
# on the Hitters data with incomplete rows removed.
data("Hitters", package = "ISLR")
Hitters <- na.omit(Hitters)
set.seed(1)

sample_size <- 20:75
errs <- numeric(length(sample_size))

for (ii in seq_along(sample_size)) {
  # Draw a fresh training subset of the requested size; the remaining rows
  # form the test set. Column 20 is dropped from both.
  train.idx <- sample(1:nrow(Hitters), sample_size[ii], replace = FALSE)
  train <- Hitters[train.idx, -20]
  test <- Hitters[-train.idx, -20]

  # Fit on the training rows, predict on the held-out rows.
  train.lm <- lm(Salary ~ ., data = train)
  test.pred <- predict(train.lm, newdata = test)

  # Mean squared error against the held-out salaries.
  errs[ii] <- mean((test.pred - test$Salary)^2)
}
Now for ridge, only difference is that you create the model matrix and subset with each iteration :
# Test-set mean squared error of a ridge fit (alpha = 0, lambda = 20) for each
# training size in sample_size. The design matrix is built once from the full
# data and subset by row on every iteration.
errs.test <- numeric(length(sample_size))
x_data <- model.matrix(Salary ~ ., Hitters)[, -1]
y_data <- Hitters$Salary

for (ii in seq_along(sample_size)) {
  # Rows drawn for training; every other row is used for testing.
  train.idx <- sample(1:nrow(x_data), sample_size[ii], replace = FALSE)

  x_train <- x_data[train.idx, ]
  y_train <- y_data[train.idx]
  x_test <- x_data[-train.idx, ]
  y_test <- y_data[-train.idx]

  ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = 20)
  ridge_pred <- predict(ridge_mod, newx = x_test)

  errs.test[ii] <- mean((ridge_pred - y_test)^2)
}

Plotting training and test error rates of knn cross-validation in R

I have performed the following cross-validation knn (using the caret package) on the iris dataset. I am now trying to plot the training and test error rates for the result. Here is my attempt but I cannot get the error rates. Can anyone help me please?
# Split iris into training/test parts, tune k for knn with 5-fold CV via caret,
# then try to collect and plot training/test error rates per k.
library(caret)
data(iris)
# Random assignment of each row to group 1 or 2 with probabilities 0.80 / 0.20.
# The result is stored in a variable named `sample`, the same name as the function.
sample <- sample(2, nrow(iris), replace=TRUE, prob=c(0.80, 0.20))
iris.training <- iris[sample == 1, 1:4]
iris.test <- iris[sample == 2, 1:4]
iris.trainLabels <- iris[sample == 1, 5]
iris.testLabels <- iris[sample == 2, 5]
# Combine training data and combine test data.
iris_train <- cbind(iris.trainLabels, iris.training)
iris_test <- cbind(iris.testLabels, iris.test)
trControl <- trainControl(method = "cv", number = 5)
# K values 1 3 5 7 9
k_values <- seq(from=1, to=10, by=2)
fit <- train(iris.trainLabels ~ ., method = "knn", tuneGrid = expand.grid(k = k_values), trControl = trControl, data = iris_train)
# Plot
# bestK: intended to return a data frame of training/test error rates per k.
# It reads k_values and trControl from the enclosing environment rather than
# from its arguments.
bestK <- function(iris_train, iris.trainLabels,
iris.testLabels) {
ctr <- c(); cts <- c()
# NOTE(review): length(k_values) is a single number, so this loop body runs
# once, with k set to that number; k is not used inside the body.
for (k in length(k_values)) {
fit <- train(iris.trainLabels ~ ., method = "knn", tuneGrid = expand.grid(k = k_values), trControl = trControl, data = iris_train)
# NOTE(review): `fit` is the object returned by train(), not a vector of
# predicted classes -- confirm that table() can cross-tabulate it with the labels.
trTable <- prop.table(table(fit, iris.trainLabels))
tsTable <- prop.table(table(fit, iris.testLabels))
# Only the [1,2] and [2,1] cells of each table are summed.
erTr <- trTable[1,2] + trTable[2,1]
erTs <- tsTable[1,2] + tsTable[2,1]
ctr <- c(ctr,erTr)
cts <- c(cts,erTs)
}
err <- data.frame(k=k_values, trER=ctr, tsER=cts)
return(err)
}
err <- bestK(iris_train, iris.trainLabels, iris.testLabels)
# Training error in blue, test error in red, both against k.
plot(err$k,err$trER,type='o',ylim=c(0,.5),xlab="k",ylab="Error rate",col="blue")
lines(err$k,err$tsER,type='o',col="red")
Update:
Would like to obtain a visual plot something similar to this...

How to get the accuracy of xgboost in R

How do I get the accuracy of an xgboost model in R?
I have the same problem: I would like to get the accuracy of a model fitted with the xgboost method.
# Train a multi-class xgboost model on raster values sampled at shapefile
# points, then predict a class for every raster cell.
library(xgboost)
library(RStoolbox)
library("caret", lib.loc = "~/R/win-library/3.5")
setwd("D:/NEW DATA/kurt/tugas")

# Read the sample points and the raster stack, then extract the raster values
# at each point to use as the training matrix.
shp <- shapefile("jajal/samplepoint(2).shp")
ras <- stack("cigudegc21.tif")
vals <- extract(ras, shp)
train <- data.matrix(vals)

# The attribute table is reached with the slot operator `@`. The previous
# `shp#data$id` turned the rest of the line into a comment and left the call
# unclosed. Subtracting 1 makes the class ids start at 0.
classes <- as.numeric(as.factor(shp@data$id)) - 1

xgb <- xgboost(
  data = train,
  label = classes,
  eta = 0.1,
  max_depth = 4,
  nround = 100,
  objective = "multi:softmax",
  num_class = length(unique(classes)),
  nthread = 3
)

# Predict over all raster cells and write the classes (shifted back by 1)
# into a raster with the same geometry as `ras`.
result <- predict(xgb, ras[1:(nrow(ras) * ncol(ras))], reshape = TRUE)
res <- raster(ras)
res <- setValues(res, result + 1)

Error running LightGBM in R "does not support constructing from ‘data.frame’"

I finally installed LightGBM in R, but I can't quite get it to work.
Below is a simple reproducible example using the iris dataset.
I'm using R version 3.5.2 on windows 10 and lightgbm_2.2.4.
# Binary classification example on iris (setosa vs. the rest) with LightGBM.
library(lightgbm)
library(caTools)
set.seed(42)
# Prepare the dataset for binary classification
# label is 1 for setosa rows and 0 otherwise; Species is then dropped.
iris$label <- 0
iris$label[iris$Species=="setosa"] <- 1
iris <- iris[,!(names(iris) %in% "Species")]
# Split into train & validation set
sample <- sample.split(iris$label, SplitRatio = .75)
train <- subset(iris, sample == TRUE)
valid <- subset(iris, sample == FALSE)
# X_train / X_valid keep every column except "label"; they are still data frames.
X_train <- train[,!(names(train) %in% "label")]
X_valid <- valid[,!(names(valid) %in% "label")]
y_train <- train$label
y_valid <- valid$label
# Train LightGBM
# The data frames above are passed to lgb.Dataset() as-is.
dtrain <- lgb.Dataset(data = X_train, label = y_train)
dvalid <- lgb.Dataset(data = X_valid, label = y_valid)
params <- list(objective = "binary",
verbose = 1,
seed = 42)
lgb_model <- lgb.train(params, dtrain, 200, list(eval = dvalid),
verbose_eval=200, early_stopping_round=10)
When running lgb.train, I get the following error message:
"Error in data$construct() : lgb.Dataset.construct: does not support constructing from ‘data.frame’".
I can't figure out what is causing this error and how to fix it. Any help is greatly appreciated.
lgb.Dataset accepts matrices only, so
# Convert the predictor data frames to matrices before building the datasets.
dtrain <- lgb.Dataset(data = as.matrix(X_train), label = y_train)
dvalid <- lgb.Dataset(data = as.matrix(X_valid), label = y_valid)
should work

Error with prediction - ROCR package (using probabilities)

I have used "rfe" function with svm to create a model with reduced features. Then I use "predict" on test data which outputs class labels (binary), 0 class probabilities, 1 class probabilities. I then tried using prediction function, in ROCR package, on predicted probabilities and true class labels but get the following error and am not sure why as the lengths of the 2 arrays are equal:
> pred_svm <- prediction(pred_svm_2class[,2], as.numeric(as.character(y)))
Error in prediction(pred_svm_2class[, 2], as.numeric(as.character(y))) :
Number of predictions in each run must be equal to the number of labels for each run.
I have the code below and the input is here (click me). It is a small dataset with binary classification, so the code runs fast.
# Recursive feature elimination with caret, followed by a ROCR prediction object.
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("/home/sensei/clustering/svm_2labels.csv")
# Convert the Class column to a factor.
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
# 75% of the rows go to training, the rest to testing.
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
# Columns 1:20 of the training rows are the predictors, column 21 the labels.
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
# NOTE(review): rfeControl is given rfFuncs while rfe() is also passed
# method="svmRadial" -- confirm which model is actually fitted.
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
# Predictions are made on the test rows ...
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
# ... but `y` holds the labels of the training rows.
pred_svm <- prediction(pred_svm_2class[,2], y)
Thanks and appreciate your help.
This is because in the line
pred_svm <- prediction(pred_svm_2class[,2], y)
pred_svm_2class[,2] is the predictions on test data and y is the labels for training data. Just generate the labels for test in a separate variable like this
y_test <- testing_svm_2class[,21]
And now if you do
pred_svm <- prediction(pred_svm_2class[,2], y_test)
There will be no error. Full code below -
# Packages needed to run this script (uncomment to install):
# install.packages("caret")
# install.packages("ROCR")
# install.packages("e1071")
# install.packages("randomForest")
library("caret")
library("ROCR")

# Load the data and make the class column a factor.
sensor6data_2class <- read.csv("svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))

# 75/25 split into training and testing rows.
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(
  y = sensor6data_2class$Class,
  p = .75,
  list = FALSE
)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class, ]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class, ]

# Predictors are columns 1:20; column 21 holds the labels. The test labels
# are kept in their own variable so they can be paired with test predictions.
trainX <- training_svm_2class[, 1:20]
y <- training_svm_2class[, 21]
y_test <- testing_svm_2class[, 21]

# Recursive feature elimination with repeated cross-validation.
ctrl_svm_2class <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  allowParallel = TRUE
)
model_train_svm_2class <- rfe(
  x = trainX,
  y = y,
  data = training_svm_2class,
  sizes = c(1:20),
  metric = "Accuracy",
  rfeControl = ctrl_svm_2class,
  method = "svmRadial"
)

# Predict on the test rows and build the ROCR object against the test labels.
pred_svm_2class <- predict(model_train_svm_2class, newdata = testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[, 2], y_test)

Resources