confusionMatrix cannot provide accuracy - r

I'm trying to print out the accuracy but dont know why my confusionMatrix cannot print them out
x1 <- complete(imputed_Data,1)
x.matrix <- model.matrix(people~.-1,x1)
trainIndex <- createDataPartition(x1$people, p = .75, list = FALSE, times = 1)
train <- list(x = x.matrix[trainIndex,], y=x1$people[trainIndex])
test <- list(x = x.matrix[-trainIndex,], y=x1$people[-trainIndex])
dtrain <- xgb.DMatrix(data = train$x, label=train$y)
dtest <- xgb.DMatrix(data = test$x, label=test$y)
watchlist <- list(train=dtrain, test=dtest)
bst <- xgb.train(data=dtrain, max.depth=3, nround=100,
eta = 0.01, nthread = 3,
watchlist = watchlist,
objective = "binary:logistic",
eval_metric = "auc")
pred <- predict(bst,dtest)
pred1 <- predict(bst,dtrain)
xgbpred1 <- ifelse (pred1 > 0.5,1,0)
xgbpred <- ifelse (pred > 0.5,1,0)
confusionMatrix(as.factor(xgbpred), as.factor(test$y))
confusionMatrix(as.factor(xgbpred1), as.factor(train$y))
my output is like this, don't know why

Related

Intialized vector in AUC function out of bounds

I am trying to use cross validation with a decision tree using AUC. These are the functions that I am using:
.cvFolds <- function(Y, V) {
Y0 <- split(sample(which(Y == 0)), rep(1:V, length = length(which(Y == 0))))
Y1 <- split(sample(which(Y == 1)), rep(1:V, length = length(which(Y == 1))))
folds <- vector("list", length = V)
for (v in seq(V)) folds[[v]] <- c(Y0[[v]], Y1[[v]])
return(folds)
}
.doFit <- function(V, folds, train) {
set.seed(v)
ycol <- which(names(train) == y)
params <- list(x = train[-folds[[V]], -ycol],
y = as.factor(train[-folds[[V]], ycol]),
xtest = train[folds[[V]], -ycol])
fit <- do.call(randomForest, params)
pred <- fit$test$votes[, 2]
return(pred)
}
This is the function to calculate probabilities:
iid_example <- function(train, y = "V1", V = 10, seed = 1) {
set.seed(seed)
folds <- .cvFolds(Y = train[, c(y)], V = V)
# Generate CV predicted values
cl <- makeCluster(detectCores())
registerDoParallel(cl)
predictions <- foreach(v = 1:V, .combine = "c",
.packages = c("randomForest")) %dopar% .doFit(v, folds, train)
stopCluster(cl)
predictions[unlist(folds)] <- predictions
# Get CV AUC
runtime <- system.time(res <- ci.cvAUC(predictions = predictions,
labels = train[, c(y)],
folds = folds,
confidence = 0.95))
print(runtime)
return(res)
}
The actual function call:
res <- iid_example(train = datos, y = "V1", V = 10, seed = 1)
When I try to run it, I get the following error:
Y0[[v]] out of bounds
I am trying to adjust the parameterization of the function, but I do not understand why it is out of boundaries. Thanks for your help

R: Caret rfe with a customized variable importance permimp

I want to run a recursive feature elimination with caret rfe() with the alternative variable importance algorithm permimp. The permimp permutation importance uses cforest with cforest_unbiased(). Which other caret functions do I need to customize in order run rfe with permimp() and cforest?
This is my code so far:
library(caret)
permimpRFE <- list(summary = defaultSummary,
fit = function(x, y, first, last, ...){
library(party)
tmp <- as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
party::cforest(y ~ ., data = tmp,
control = party::cforest_unbiased(ntree = 50))
},
pred = function(object, x) predict(object, x),
rank = function(object, x, y) {
library(permimp)
vimp <- permimp::permimp(object, conditional = TRUE, threshold = .95, do_check = FALSE)
vimp <- as.data.frame(vimp$values)
colnames(vimp) <- "Overall"
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
if (ncol(x) == 1) {
vimp$var <- colnames(x)
} else vimp$var <- rownames(vimp)
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars)
# specify rfeControl
contr <- caret::rfeControl(functions=permimpRFE, method="repeatedcv", number=3, repeats=2,
saveDetails = TRUE)
dat <- as.data.frame(ChickWeight)[1:50,]
preds <- dat[,2:4]
response <- dat[,1]
# recursive feature elimination caret (Algorithm 2)
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
results <- caret::rfe(x = preds,
y = response,
sizes=c(1:3),
metric= "RMSE",
rfeControl=contr)
I get the error Error in { : task 1 failed - "invalid 'x' type in 'x && y'"
How can I get the rfe running with permimp and cforest?
When trying to customize the variable importance function for rfe(), it is important to implement the exact syntax of the desired function as well as its dependent functions.
In my case, I had to change predict(object, x) to predict(object, newdata = x).
Now the code snippet is working:
library(caret)
permimpRFE <- list(summary = defaultSummary,
fit = function(x, y, first, last, ...){
library(party)
tmp <- as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
party::cforest(y ~ ., data = tmp,
control = party::cforest_unbiased(ntree = 50))
},
pred = function(object, x) {
x <- as.data.frame(x, stringsAsFactors = TRUE)
predict(object, newdata = x)
},
rank = function(object, x, y) {
library(permimp)
vimp <- permimp::permimp(object, conditional = TRUE, threshold = .95, do_check = FALSE)
vimp <- as.data.frame(vimp$values)
colnames(vimp) <- "Overall"
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
if (ncol(x) == 1) {
vimp$var <- colnames(x)
} else vimp$var <- rownames(vimp)
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars)
# specify rfeControl
contr <- caret::rfeControl(functions=permimpRFE, method="repeatedcv", number=3, repeats=2,
saveDetails = TRUE)
dat <- as.data.frame(ChickWeight)[1:50,]
preds <- dat[,2:4]
response <- dat[,1]
# recursive feature elimination caret (Algorithm 2)
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
results <- caret::rfe(x = preds,
y = response,
sizes=c(1:3),
metric= "RMSE",
rfeControl=contr)

how to get accuration xgboost in r

how i get accuracy xgboost in r?
i have same problem, i will get a accuracy with method xgboost
library(xgboost)
library(RStoolbox)
library("caret", lib.loc="~/R/win-library/3.5")
setwd("D:/NEW DATA/kurt/tugas")
shp <- shapefile("jajal/samplepoint(2).shp")
ras <- stack("cigudegc21.tif")
vals <- extract(ras,shp)
train<-data.matrix(vals)
classes <- as.numeric(as.factor(shp#data$id)) - 1
xgb <- xgboost(data = train,
label = classes,
eta = 0.1,
max_depth = 4,
nround=100,
objective = "multi:softmax",
num_class = length(unique(classes)),
nthread = 3)
result <- predict(xgb, ras[1:(nrow(ras)*ncol(ras))],reshape=TRUE)
res <- raster(ras)
res <- setValues(res,result+1)```

R doParallel: couldn't find function

I have set up the following function:
cv_model <- function(dat, targets, predictors_name){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
MAE <- mean(abs(data$obs - data$pred))
BIAS <- mean(data$obs - data$pred)
Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
}
for (k in 1:length(dat)) {
a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
ab <- list(a, b)
for (i in 1:length(targets)) {
for (j in 1:length(ab)) {
# specifiy trainControl
control <- trainControl(method="repeatedcv", number=10, repeats=10, search="grid", savePred =T,
summaryFunction = sumfct)
tunegrid <- expand.grid(mtry=c(1:length(predictors_name)))
set.seed(42)
model <- train(formula(paste0(targets[i],
" ~ ",
paste(predictors_name, sep = '', collapse = ' + '))),
data = ab[[j]],
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=tunegrid,
trControl=control)
}
}
}
}
According to this tutorial (https://topepo.github.io/caret/parallel-processing.html) I can parallelize my code just by calling library(doParallel); cl <- makePSOCKcluster(2); registerDoParallel(cl).
When I then use the function with doParallel
predictors_name <- c("Time", "Chick")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:500,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)
the error message couldn't find function "MAPE" occurs.
How can I fix this without using the foreach syntax?
If I specify the package while calling the function like package::function, then it is working. Maybe there is a more elegant solution, but this is how I made the code running without an error:
cv_model <- function(dat, targets, predictors_name){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
MAE <- mean(abs(data$obs - data$pred))
BIAS <- mean(data$obs - data$pred)
Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
}
for (k in 1:length(dat)) {
a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
ab <- list(a, b)
for (i in 1:length(targets)) {
for (j in 1:length(ab)) {
# specifiy trainControl
control <- caret::trainControl(method="repeatedcv", number=10, repeats=10, search="grid", savePred =T,
summaryFunction = sumfct)
tunegrid <- expand.grid(mtry=c(1:length(predictors_name)))
set.seed(42)
model <- caret::train(formula(paste0(targets[i],
" ~ ",
paste(predictors_name, sep = '',
collapse = ' + '))),
data = ab[[j]],
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=tunegrid,
trControl=control)
}
}
}
}
predictors_name <- c("Time", "Chick", "Diet")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:578,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)

Error with prediction - ROCR package (using probabilities)

I have used "rfe" function with svm to create a model with reduced features. Then I use "predict" on test data which outputs class labels (binary), 0 class probabilities, 1 class probabilities. I then tried using prediction function, in ROCR package, on predicted probabilities and true class labels but get the following error and am not sure why as the lengths of the 2 arrays are equal:
> pred_svm <- prediction(pred_svm_2class[,2], as.numeric(as.character(y)))
Error in prediction(pred_svm_2class[, 2], as.numeric(as.character(y))) :
Number of predictions in each run must be equal to the number of labels for each run.
I have the code below and the input is here click me.It is a small dataset with binary classification, so code runs fast.
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("/home/sensei/clustering/svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y)
Thanks and appreciate your help.
This is because in the line
pred_svm <- prediction(pred_svm_2class[,2], y)
pred_svm_2class[,2] is the predictions on test data and y is the labels for training data. Just generate the labels for test in a separate variable like this
y_test <- testing_svm_2class[,21]
And now if you do
pred_svm <- prediction(pred_svm_2class[,2], y_test)
There will be no error. Full code below -
# install.packages("caret")
# install.packages("ROCR")
# install.packages("e1071")
# install.packages("randomForest")
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
y_test <- testing_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y_test)

Resources