How to get the accuracy of xgboost in R

How do I get the accuracy of xgboost in R?
I have the same problem: I want to compute the accuracy of a classification fitted with the xgboost method.
library(raster)      # shapefile(), stack(), extract(), setValues()
library(RStoolbox)
library(xgboost)
library("caret", lib.loc="~/R/win-library/3.5")

setwd("D:/NEW DATA/kurt/tugas")
shp <- shapefile("jajal/samplepoint(2).shp")   # training points
ras <- stack("cigudegc21.tif")                 # raster bands

vals    <- extract(ras, shp)                   # band values at the sample points
train   <- data.matrix(vals)
classes <- as.numeric(as.factor(shp@data$id)) - 1   # xgboost expects 0-based class labels

xgb <- xgboost(data      = train,
               label     = classes,
               eta       = 0.1,
               max_depth = 4,
               nrounds   = 100,
               objective = "multi:softmax",
               num_class = length(unique(classes)),
               nthread   = 3)

result <- predict(xgb, ras[1:(nrow(ras) * ncol(ras))], reshape = TRUE)   # all cell values
res <- raster(ras)
res <- setValues(res, result + 1)
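
The question never computes an accuracy. One common way to get one, sketched here under the assumption that the extracted sample points are the only reference data available, is to hold out part of the extracted pixels, refit on the rest, and compare the hold-out predictions against the known classes with caret's confusionMatrix():

# Sketch only: overall accuracy from a hold-out split of the extracted samples
set.seed(42)
idx <- createDataPartition(factor(classes), p = 0.7, list = FALSE)

xgb_cv <- xgboost(data = train[idx, ], label = classes[idx],
                  eta = 0.1, max_depth = 4, nrounds = 100,
                  objective = "multi:softmax",
                  num_class = length(unique(classes)), nthread = 3)

pred <- predict(xgb_cv, train[-idx, ])            # predicted classes, 0-based
lvls <- sort(unique(classes))
cm   <- confusionMatrix(factor(pred, levels = lvls),
                        factor(classes[-idx], levels = lvls))
cm$overall["Accuracy"]                            # overall accuracy on the hold-out set

The same cm object also reports per-class statistics and the kappa coefficient, which are commonly quoted for land-cover classifications.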

Related

xgboost problem with parameter 'eta' = 0.01

I used the "churn" dataset with the xgboost algorithm. Y has two levels, Yes and No.
I have a problem with the parameter 'eta' in xgboost. When I run with eta = 0.1, the confusion matrix is computed without error. But when I run with eta = 0.01, I get this error: "Error in confusionMatrix.default(Y_test, pred_y) :
the data cannot have more levels than the reference".
Why?
Here is the code:
my_data <- read.csv("churn.csv", sep = ",")
data <- my_data[, -1]               # drop customerID
y    <- data$Churn
x    <- data[, 1:(ncol(data) - 1)]  # all predictors
data <- cbind(y, x)
head(data, 3)

set.seed(3)
train_index <- createDataPartition(data$y, p = .7,   # size of the training set
                                   list = FALSE,
                                   times = 1)        # no replacement
train <- data[train_index, ]
test  <- data[-train_index, ]

X_train <- data.matrix(train[, -1])
Y_train <- train[, 1]
X_test  <- data.matrix(test[, -1])
Y_test  <- test[, 1]

xgboost_train <- xgb.DMatrix(data = X_train, label = Y_train)
xgboost_test  <- xgb.DMatrix(data = X_test,  label = Y_test)

model <- xgboost(data      = xgboost_train,
                 max.depth = 5,
                 eta       = 0.01,
                 nrounds   = 50)
summary(model)

pred_test <- predict(model, xgboost_test)
pred_test
pred_test[pred_test > 3] <- 2
pred_y <- as.factor((levels(Y_test))[round(pred_test)])
print(pred_y)

conf_mat <- confusionMatrix(Y_test, pred_y)
print(conf_mat)
Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
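
No answer is included for this question in the thread, so the following is only a guess at the cause: the xgboost() call above uses the default regression objective, so with eta = 0.01 the predictions barely move from their starting value in 50 rounds, round(pred_test) maps every observation to the same index, and pred_y ends up with fewer levels than Y_test; confusionMatrix(Y_test, pred_y) then stops because its first argument (the "data") has more levels than its second (the "reference"). A minimal sketch of one way to avoid the mismatch, assuming Y_train and Y_test are No/Yes factors: train on 0/1 labels with an explicit binary objective, threshold the predicted probabilities, and build the prediction factor with the same levels as the reference.

# Sketch only: assumes Y_train / Y_test are factors with levels "No", "Yes"
xgboost_train <- xgb.DMatrix(data = X_train, label = as.numeric(Y_train) - 1)
xgboost_test  <- xgb.DMatrix(data = X_test,  label = as.numeric(Y_test) - 1)

model <- xgboost(data = xgboost_train, max.depth = 5, eta = 0.01, nrounds = 50,
                 objective = "binary:logistic")   # predictions stay in [0, 1] for any eta

prob   <- predict(model, xgboost_test)
pred_y <- factor(ifelse(prob > 0.5, "Yes", "No"), levels = levels(Y_test))

confusionMatrix(pred_y, Y_test)   # both factors now share the same levels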

confusionMatrix cannot provide accuracy

I'm trying to print out the accuracy, but I don't know why my confusionMatrix can't print it out.
library(mice)      # complete()
library(caret)     # createDataPartition(), confusionMatrix()
library(xgboost)

x1       <- complete(imputed_Data, 1)      # 'imputed_Data' is created earlier (not shown)
x.matrix <- model.matrix(people ~ . - 1, x1)

trainIndex <- createDataPartition(x1$people, p = .75, list = FALSE, times = 1)
train <- list(x = x.matrix[trainIndex, ],  y = x1$people[trainIndex])
test  <- list(x = x.matrix[-trainIndex, ], y = x1$people[-trainIndex])

dtrain <- xgb.DMatrix(data = train$x, label = train$y)
dtest  <- xgb.DMatrix(data = test$x,  label = test$y)
watchlist <- list(train = dtrain, test = dtest)

bst <- xgb.train(data = dtrain, max.depth = 3, nrounds = 100,
                 eta = 0.01, nthread = 3,
                 watchlist = watchlist,
                 objective = "binary:logistic",
                 eval_metric = "auc")

pred  <- predict(bst, dtest)
pred1 <- predict(bst, dtrain)
xgbpred  <- ifelse(pred  > 0.5, 1, 0)
xgbpred1 <- ifelse(pred1 > 0.5, 1, 0)

confusionMatrix(as.factor(xgbpred),  as.factor(test$y))
confusionMatrix(as.factor(xgbpred1), as.factor(train$y))
My output looks like this (screenshot not included), and I don't know why the accuracy is not shown.
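
No answer or output is shown above, so this is only a guess at the cause: if people is a factor, the labels stored in the DMatrix and the 0/1 classes produced by the ifelse() threshold can end up as different sets of values, so the two factors handed to confusionMatrix() do not share the same levels and caret cannot build the table. A minimal sketch, assuming people has exactly two levels, that encodes the label as 0/1 once and reuses it everywhere:

# Sketch only: assumes x1$people is a two-level factor
y01    <- as.numeric(x1$people) - 1
dtrain <- xgb.DMatrix(data = x.matrix[trainIndex, ],  label = y01[trainIndex])
dtest  <- xgb.DMatrix(data = x.matrix[-trainIndex, ], label = y01[-trainIndex])

bst <- xgb.train(data = dtrain, max.depth = 3, nrounds = 100, eta = 0.01,
                 objective = "binary:logistic", eval_metric = "auc",
                 watchlist = list(train = dtrain, test = dtest))

pred_class <- factor(ifelse(predict(bst, dtest) > 0.5, 1, 0), levels = c(0, 1))
ref_class  <- factor(y01[-trainIndex],                        levels = c(0, 1))

cm <- confusionMatrix(pred_class, ref_class)
cm$overall["Accuracy"]    # the accuracy is stored in cm$overall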

How to implement knn based on weights

I would like to implement the weighted kNN algorithm but I don't know how to do it. I know that I can use kknn, and I suppose it can also be done with knn. In caret's train() there is a "weights" argument, but I can't find the solution. Any suggestion? (One possible approach is sketched after the code.)
I use the following code in R:
library(caret)
library(corrplot)

glass <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
                  col.names = c("", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type"))
str(glass)
head(glass)

glass_1 <- glass[, -7]      # drop K
glass_2 <- glass_1[, -7]    # drop Ca
head(glass_2)
glass <- glass_2

standard.features <- scale(glass[, 2:8])
data <- cbind(standard.features, glass[9])
anyNA(data)
head(data)
corrplot(cor(data))

data$Type <- factor(data$Type)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times = 1)
training <- data[ inTraining, ]
testing  <- data[-inTraining, ]
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training); dim(testing)
summary(data)

fitControl <- trainControl(## 5-fold CV
                           method = "cv",
                           number = 5
                           ## repeated ten times
                           # , repeats = 5
                           )

# k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
k_value <- expand.grid(k = 3)

set.seed(825)
knn_Fit <- train(Type ~ ., data = training, weights = ????,
                 method = "knn", tuneGrid = k_value,
                 trControl = fitControl)
                 ## ("verbose = FALSE" from the caret example is a gbm() option and is omitted here)
knn_Fit
knn_Fit$finalModel
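
One possible direction, sketched here rather than given as a definitive answer: the weights argument of train() passes case weights to the underlying fitting function, and the plain "knn" model does not use them for distance weighting. Neighbor weighting is instead available through caret's "kknn" method, whose kernel tuning parameter controls how closer neighbors are up-weighted, exactly along the lines of the commented-out expand.grid(kmax = 3, distance = 2, kernel = "optimal") line above:

# Sketch only: distance-weighted kNN via the kknn package through caret.
# kernel = "rectangular" is ordinary unweighted kNN; "triangular", "gaussian",
# "optimal", ... give more weight to closer neighbours.
library(caret)
library(kknn)

kknn_grid <- expand.grid(kmax     = c(3, 5, 7),
                         distance = 2,                                   # Euclidean
                         kernel   = c("rectangular", "triangular", "optimal"))

set.seed(825)
kknn_Fit <- train(Type ~ ., data = training,
                  method    = "kknn",
                  tuneGrid  = kknn_grid,
                  trControl = fitControl)
kknn_Fit
confusionMatrix(predict(kknn_Fit, newdata = testing), testing$Type)

Cross-validation then picks the kmax and kernel combination with the best accuracy, which is usually what "weighted kNN" is after.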

Using ParBayesianOptimization for regression problem in R (minimizing rmse)

I am trying to use the ParBayesianOptimization package for tuning the parameters of my model. The original GitHub repository demonstrates how to use the package for parameter tuning in a classification problem (maximizing AUC). In my case, however, I want to apply it to a regression problem and minimize RMSE.
The main problem I am having is understanding why the final parameters returned by getBestPars(optObj) are chosen according to the highest value in the Score column of optObj$scoreSummary. As I understand it, the Score column holds the RMSE for a given iteration, so the function should return the parameters with the lowest score.
Example code to reproduce:
# install.packages("mlbench")
library(mlbench)
library(ParBayesianOptimization)
library(xgboost)
library(data.table)
library(doParallel)

#------------------------------------------------------------------------------#
#### Get data
#------------------------------------------------------------------------------#
set.seed(123)
data(BostonHousing)
BostonHousing <- data.frame(apply(BostonHousing, 2, as.numeric))
setDT(BostonHousing)
train_x <- BostonHousing[, .SD, .SDcols = setdiff(names(BostonHousing), "medv")]
train_y <- BostonHousing[, .SD, .SDcols = "medv"]

#------------------------------------------------------------------------------#
#### Create folds
#------------------------------------------------------------------------------#
Folds <- list(
    Fold1 = as.integer(seq(1, nrow(BostonHousing), by = 3))
  , Fold2 = as.integer(seq(2, nrow(BostonHousing), by = 3))
  , Fold3 = as.integer(seq(3, nrow(BostonHousing), by = 3))
)

#------------------------------------------------------------------------------#
#### Define the scoring function
#------------------------------------------------------------------------------#
scoringFunction <- function(max_depth, min_child_weight, subsample, eta, gamma,
                            colsample_bytree) {

  dtrain <- xgboost::xgb.DMatrix(as.matrix(train_x), label = as.matrix(train_y))

  Pars <- list(
      booster = "gbtree"
    , gamma = gamma
    , colsample_bytree = colsample_bytree
    , eta = eta
    , max_depth = max_depth
    , min_child_weight = min_child_weight
    , subsample = subsample
    , objective = "reg:linear"          # deprecated alias of "reg:squarederror"
    , eval_metric = "rmse"
  )

  xgbcv <- xgb.cv(
      params = Pars
    , data = dtrain
    , nrounds = 100
    , folds = Folds
    , early_stopping_rounds = 100
    , maximize = TRUE
    , verbose = 1
  )

  return(
    list(Score = min(xgbcv$evaluation_log$test_rmse_mean)
         , nrounds = xgbcv$best_iteration
    )
  )
}

#------------------------------------------------------------------------------#
#### Bounds
#------------------------------------------------------------------------------#
bounds <- list(
    gamma = c(0.1, 50L)
  , colsample_bytree = c(0.5, 1L)
  , eta = c(0.01, 0.1)
  , max_depth = c(1L, 5L)
  , min_child_weight = c(0, 25)
  , subsample = c(0.1, 1)
)

#------------------------------------------------------------------------------#
#### Run in parallel
#------------------------------------------------------------------------------#
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
clusterExport(cl, c("Folds", "train_x", "train_y"))
clusterEvalQ(cl, expr = {
  library(xgboost)
})

tWithPar <- system.time(
  optObj <- bayesOpt(
      FUN = scoringFunction
    , bounds = bounds
    , initPoints = 7
    , iters.n = (parallel::detectCores() - 1) * 2
    , iters.k = (parallel::detectCores() - 1) * 2
    , parallel = TRUE
    , verbose = 1
  )
)
stopCluster(cl)
registerDoSEQ()

#------------------------------------------------------------------------------#
#### Print results
#------------------------------------------------------------------------------#
optObj$scoreSummary
getBestPars(optObj)
I would appreciate any help in better understanding the function and how to correctly implement it in a regression problem.
bayesOpt() always maximizes the Score returned by the scoring function, and minimizing the RMSE is equivalent to maximizing -1*RMSE, so try redefining your Score:
Score = -1*min(xgbcv$evaluation_log$test_rmse_mean)
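
In context, the end of scoringFunction() then becomes (same code as above, with only the sign of the Score flipped):

return(
  list(Score   = -1 * min(xgbcv$evaluation_log$test_rmse_mean),  # negated RMSE: larger is better
       nrounds = xgbcv$best_iteration)
)

Since eval_metric is "rmse", you may also want maximize = FALSE in the xgb.cv() call so that early stopping looks for a decreasing metric.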

Error with prediction - ROCR package (using probabilities)

I have used the rfe() function with SVM to create a model with reduced features. I then use predict() on the test data, which outputs the class labels (binary), the class-0 probabilities and the class-1 probabilities. Next I tried the prediction() function from the ROCR package on the predicted probabilities and the true class labels, but I get the following error and am not sure why, since the lengths of the two arrays looked equal to me:
> pred_svm <- prediction(pred_svm_2class[,2], as.numeric(as.character(y)))
Error in prediction(pred_svm_2class[, 2], as.numeric(as.character(y))) :
Number of predictions in each run must be equal to the number of labels for each run.
The code is below; the input data set was attached to the original post. It is a small dataset with binary classification, so the code runs fast.
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("/home/sensei/clustering/svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y)
Thanks and appreciate your help.
This is because, in the line
pred_svm <- prediction(pred_svm_2class[,2], y)
pred_svm_2class[,2] contains the predictions on the test data while y holds the labels for the training data, so the two vectors have different lengths. Just generate the labels for the test set in a separate variable:
y_test <- testing_svm_2class[,21]
Now, if you run
pred_svm <- prediction(pred_svm_2class[,2], y_test)
there will be no error. Full code below:
# install.packages("caret")
# install.packages("ROCR")
# install.packages("e1071")
# install.packages("randomForest")
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
y_test <- testing_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y_test)
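
As a possible next step (not part of the original answer), the ROCR prediction object can be handed to performance() to plot the ROC curve and extract the AUC; a short sketch:

# Sketch: ROC curve and AUC from the ROCR prediction object built above
perf_roc <- performance(pred_svm, measure = "tpr", x.measure = "fpr")
plot(perf_roc, main = "ROC curve on the test set")

perf_auc <- performance(pred_svm, measure = "auc")
unlist(perf_auc@y.values)   # area under the ROC curve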
