Consider the following example:
rm(list = ls(all=T))
library(ISLR)
library(glmnet)
Hitters=na.omit(Hitters)
# Binary problem - logistic regression
Hitters$Salary <- ifelse(Hitters$Salary > 1000, 1, 0)
Hitters$Salary <- as.factor(Hitters$Salary)
# the class is unbalanced
# > table(Hitters$Salary)
# 0 1
# 233 30
# cls <- sapply(Hitters, class)
# for(j in names(cls[cls == 'integer'])) Hitters[,j] <- as.double(Hitters[,j])
x = model.matrix(~ . -1, Hitters[,names(Hitters)[!names(Hitters) %in% c('Salary')]] )
inx_train <- 1:200
inx_test <- 201:dim(Hitters)[1]
x_train <- x[inx_train, ]
x_test <- x[inx_test, ]
y_train <- Hitters[inx_train, c('Salary')]
y_test <- Hitters[inx_test, 'Salary']
fit = cv.glmnet(x=x_train, y=y_train, alpha=1, type.measure='auc', family = "binomial")
plot(fit)
pred = predict(fit, s='lambda.min', newx=x_test)
quantile(pred)
# 0% 25% 50% 75% 100%
# -5.200853 -3.704760 -2.883836 -1.937052 1.386215
Given the above predicted values, which function or parameter in predict should I use or modify to transform them into probabilities between 0 and 1?
In your predict call you need to set the type="response" argument. As per the documentation, it returns the fitted probabilities.
pred = predict(fit, s='lambda.min', newx=x_test, type="response")
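For reference, the default output of predict for a binomial glmnet fit is on the link (log-odds) scale, which is why the quantiles shown above fall outside [0, 1]; passing those values through the inverse logit should give the same probabilities as type="response":
# inverse logit of the default link-scale predictions (matches type="response")
prob_manual = plogis(predict(fit, s='lambda.min', newx=x_test))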
Also, if you just want the classification labels you can use type="class", for example:
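# predicted class labels at the same lambda
pred_class = predict(fit, s='lambda.min', newx=x_test, type="class")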
I am trying to plot my SVM classification using the e1071 library. However, the classification plot shows only a single value for my parameters. Even when I change the selected parameters to create a 2D classification plot, it is wrong.
require(caTools)
library(caret)
dataset <-read.csv("income_evaluation.csv")
# fnlwgt column removed since it is not necessary
df_income <- subset(dataset,select=-c(fnlwgt))
# turn binary attribute into 0 and 1
df_income$income <-ifelse(df_income$income==" >50K",1,0)
# count the missing values (coded as ' ?') per column
apply(X=df_income,2,FUN=function(x) length(which(x==' ?')))
# handling missing values
#define function to calculate mode (returns all modal values in case of ties)
find_mode <- function(x) {
u <- unique(x)
tab <- tabulate(match(x, u))
u[tab == max(tab)]
}
mod_workclass_df = find_mode(df_income$workclass)
mod_occupation_df = find_mode(df_income$occupation)
mod_country_df = find_mode(df_income$native.country)
# replace the missing values with the mode values
df_income$workclass[df_income$workclass == ' ?'] <- mod_workclass_df
df_income$occupation[df_income$occupation == ' ?'] <- mod_occupation_df
df_income$native.country[df_income$native.country == ' ?'] <- mod_country_df
# dummy encoding of the factors (fullRank = TRUE drops one level per factor)
dmy <- dummyVars(" ~ .", data = df_income, fullRank = T)
df_income <- data.frame(predict(dmy, newdata = df_income))
# sampling
set.seed(101)
sample = sample.split(df_income$income, SplitRatio = .75)
trainingSet = subset(df_income, sample == TRUE)
testSet = subset(df_income, sample == FALSE)
# isolate y variable
Y_train <- trainingSet$income
Y_test <- testSet$income
#isolate x variable
X_test <- subset(testSet,select=-c(income))
# evaluation of svm
library(e1071)
svm_classifier = svm(formula=income ~ .,data=trainingSet,type="C-classification",kernel="radial",scale=TRUE,cost=10)
Y_pred = predict(svm_classifier,newdata= X_test)
confusionMatrix(table(Y_test,Y_pred))
# cross validation
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(trainingSet$income, k = 10)
# in cv we apply a function to each of our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we separate the training set into its 10 pieces
training_fold = trainingSet[-x, ] # training fold = training set minus (-) its sub test fold
test_fold = trainingSet[x, ] # here we describe the test fold individually
# now train the classifier on the training_fold
classifier = svm(formula = income ~ ., data = training_fold, type = "C-classification", kernel = "radial", scale = TRUE, cost = 10)
Y_pred = predict(classifier, newdata = test_fold[-97])
cm = table(test_fold[, 97], Y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
trainingSet$income <-as.factor(trainingSet$income)
# Visualising the Training set results
plot(svm_classifier,trainingSet,education.num~age)
library(ggplot2)
svm_classifier
table(predicted=svm_classifier$fitted,actual=trainingSet$income)
Here is my code above and the plot below. I cannot figure out why the background shows only a single color and why there is no red in it.
I'm using an R package called logistf to fit a logistic regression, and I saw that there's no predict function for new data in this package (and the generic predict does not work with it), so I found code that shows how to do this with new data:
fit<-logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data=trainData)
betas <- coef(fit)
X <- model.matrix(fit, data=testData)
probs <- 1 / (1 + exp(-X %*% betas))
I want to make a cross-validation version of this, using fit$predict and the probabilities that probs generates for me. Has anyone ever done something like this before?
Another thing I want to know about is fit$predict: I'm fitting a binary logistic regression, and this function returns many values. Are these values for class 0 or for class 1, and how can I tell? Thanks
While the code that you wrote works perfectly, there seems to be a more concise way of getting the same results, using the brglm package (which, like logistf, fits a bias-reduced logistic regression):
brglm_model <- brglm(formula = response ~ predictor , family = "binomial", data = train )
brglm_pred <- predict(object = brglm_model, newdata = test , type = "response")
About the CV, you have to write a few lines of code I guess:
#Setting the number of folds, and number of instances in each fold
n_folds <- 5
fold_size <- nrow(dataset) %/% n_folds
residual <- nrow(dataset) %% n_folds
#label the instances based on the number of folds
cv_labels <- c(rep(1:n_folds, each = fold_size), rep(n_folds, residual))
# the error term would differ based on each threshold value
t_seq <- seq(0.1,0.9,by = 0.1)
index_mat <- matrix(ncol = (n_folds+1) , nrow = length(t_seq))
index_mat[,1] <- t_seq
# the main loop for calculating the CV error on each fold (filter comes from dplyr)
library(dplyr)
for (i in 1:n_folds){
train <- dataset %>% filter(cv_labels != i)
test <- dataset %>% filter(cv_labels == i )
brglm_cv_model <- brglm(formula = response_var ~ . , family = "binomial", data = train )
brglm_cv_pred <- predict(object = brglm_cv_model, newdata = test , type = "response")
# error formula that you want, e.g. misclassification
counter <- 0
for (threshold in t_seq) {
counter <- counter + 1
conf_mat <- table(factor(test$response_var), factor(brglm_cv_pred > threshold, levels = c("FALSE","TRUE")))
sen <- conf_mat[2,2]/sum(conf_mat[2,])
# other indices can be computed as follows
#spec <- conf_mat[1,1]/sum(conf_mat[1,])
#prec <- conf_mat[2,2]/sum(conf_mat[,2])
#F1 <- (2*prec * sen)/(prec+sen)
#accuracy <- (conf_mat[1,1]+conf_mat[2,2])/sum(conf_mat)
#here I am only interested in sensitivity
index_mat[counter,(i+1)] <- sen
}
}
# final data.frame would be the mean of sensitivity over each threshold value
final_mat <- matrix(nrow = length(t_seq), ncol = 2 )
final_mat[,1] <- t_seq
final_mat[,2] <- apply(X = index_mat[,-1] , MARGIN = 1 , FUN = mean)
final_mat <- data.frame(final_mat)
colnames(final_mat) <- c("threshold","sensitivity")
#why not have a look at the CV sensitivity of the model over threshold values?
library(ggplot2)
ggplot(data = final_mat) +
  geom_line(aes(x = threshold, y = sensitivity), color = "blue")
The following code is used to produce the probability output of a binary classification with Random Forest.
library(randomForest)
rf <- randomForest(train, train_label,importance=TRUE,proximity=TRUE)
prediction<-predict(rf, test, type="prob")
The result of prediction is then a matrix of predicted class probabilities, with one row per test case and one column per class.
The true labels of the test data are known (named test_label). Now I want to compute the logarithmic loss for the probability output of the binary classification. The LogLoss function is as follows.
LogLoss=function(actual, predicted)
{
result=-1/length(actual)*(sum((actual*log(predicted)+(1-actual)*log(1-predicted))))
return(result)
}
How can I compute the logarithmic loss from the probability output of a binary classification? Thank you.
library(randomForest)
rf <- randomForest(Species~., data = iris, importance=TRUE, proximity=TRUE)
prediction <- predict(rf, iris, type="prob")
#bound the results, otherwise you might get infinite values
prediction <- apply(prediction, c(1,2), function(x) min(max(x, 1E-15), 1-1E-15))
#model.matrix generates a matrix of true class indicators, where an element is either 1 or 0
#we subtract the predictions; wherever the result is positive, that entry is the correct class
logLoss = function(pred, actual){
-1*mean(log(pred[model.matrix(~ actual + 0) - pred > 0]))
}
logLoss(prediction, iris$Species)
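To see what the indexing trick does, here is a small illustration of model.matrix(~ actual + 0) on a toy factor (printed attributes omitted):
actual <- factor(c("setosa", "versicolor", "setosa"))
model.matrix(~ actual + 0)
#   actualsetosa actualversicolor
# 1            1                0
# 2            0                1
# 3            1                0
Subtracting the (bounded) predictions from this indicator matrix leaves a positive entry only at each observation's true class, which is what the pred[... > 0] subsetting picks out.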
I think the logLoss formula is a little bit wrong.
model <- glm(vs ~ mpg, data = mtcars, family = "binomial")
### OP's formula (Wrong)
logLoss1 <- function(pred, actual){
-1*mean(log(pred[model.matrix(~ actual + 0) - pred > 0]))
}
logLoss1(actual = model$y, pred = model$fitted.values)
# [1] 0.4466049
### Correct formula in native R
logLoss2 <- function(pred, actual){
-mean(actual * log(pred) + (1 - actual) * log(1 - pred))
}
logLoss2(actual = model$y, pred = model$fitted.values)
# [1] 0.3989584
## Results from various packages to verify the correct answer
### From ModelMetrics package
ModelMetrics::logLoss(actual = model$y, pred = model$fitted.values)
# [1] 0.3989584
### From MLmetrics package
MLmetrics::LogLoss(y_pred = model$fitted.values, y_true = model$y)
# [1] 0.3989584
### From reticulate package (calling Python's scikit-learn)
library(reticulate)
sklearn.metrics <- import("sklearn.metrics")
sklearn.metrics$log_loss(y_true = model$y, y_pred = model$fitted.values)
# [1] 0.3989584
I used R version 4.1.0 (2021-05-18).
I have constructed a logistic regression and now I want to calculate the predictive accuracy for various cutoff values ranging from 0 to 1. This is the for loop I have been using, but I am getting:
subscript out of bounds
Here predtrain contains 300 predicted output probabilities, each ranging from 0 to 1. Each value is compared to the cutoff eff. Finally, a table/confusion matrix has to be generated comparing the original values (train$CAN) with f1, something like this:
tab
#    pred2
#       0   1
#   0   1  93
#   1   0 206
The code I have written is this:
predtrain <- predict(logreg1, newdata = train, type = 'response')
eff<-seq(0,1,by = 0.05)
for (i in 1:length(eff)) {
f1 <- ifelse(predtrain > eff[i], 1, 0)
t1 <- table(train$CAN, f1)
effy <- (t1[1,1]+t1[2,2])/(t1[1,1]+t1[1,2]+t1[2,2]+t1[2,1])
eff[[i]] <-effy
}
The reason you're getting subscript out of bounds errors is that you're trying to create confusion matrices with cutoffs like 0 and 1 -- these create a confusion matrix with a single column (all predictions are either positive or negative), so an access like t1[2,2] throws your error.
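To see this concretely, here is a toy illustration (values made up): at a cutoff of 0 every prediction is classified as 1, so the table has a single column and t1[2,2] does not exist.
f1 <- ifelse(c(0.2, 0.7, 0.9) > 0, 1, 0)  # every prediction exceeds a cutoff of 0
t1 <- table(c(0, 1, 1), f1)               # a 2x1 table, so t1[2,2] is out of bounds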
In reality all you're trying to do is to compute the predictive accuracy at different cutoffs, which can be accomplished without creating tables at all with something like:
cutoffs <- seq(0, 1, by=0.05)
eff <- sapply(cutoffs, function(cutoff) {
sum((predtrain > cutoff) == train$CAN) / length(predtrain)
})
To see this in action, let's consider a small example model:
set.seed(144)
x <- runif(100)
train <- data.frame(x, CAN=as.numeric(runif(100)+x >= 1))
logreg1 <- glm(CAN~x, data=train, family="binomial")
predtrain <- predict(logreg1, newdata = train, type = 'response')
Now we can get the predictive accuracy at each cutoff:
eff <- sapply(cutoffs, function(cutoff) {
sum((predtrain > cutoff) == train$CAN) / length(predtrain)
})
plot(cutoffs, eff)
You could alternatively use a package like ROCR to grab metrics. For instance, here is how you could grab the sensitivity at each cutoff:
library(ROCR)
pred <- prediction(predtrain, train$CAN)
perf <- performance(pred, "sens")
eff <- sapply(cutoffs, function(cutoff) max(perf@y.values[[1]][perf@x.values[[1]] >= cutoff]))
plot(cutoffs, eff)
But doesn't it become more difficult to calculate something like specificity and sensitivity? I have written it using two for loops; I know it is not very efficient, but I do get the table from which I can calculate the performance measures. Can this method be improved?
z <- seq(0,1,by = 0.05)
t1 <- vector(mode = "list", length = length(z))
for(i in 1:length(z)) {
predtrain <- predict(logreg1, newdata = train, type = 'response')
for(j in 1:length(predtrain)){
predtrain[j] <- ifelse(predtrain[j]>z[i], 1, 0)
}
t1[[i]] <- table(train$CAN, predtrain)
}
t1
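For what it's worth, the inner loop can be avoided entirely. Here is a minimal vectorized sketch (assuming logreg1 and train are as above, with CAN coded 0/1) that computes sensitivity and specificity at each cutoff without building tables:
predtrain <- predict(logreg1, newdata = train, type = 'response')
z <- seq(0, 1, by = 0.05)
# sensitivity: fraction of predicted positives among actual positives
sens <- sapply(z, function(cutoff) sum(predtrain > cutoff & train$CAN == 1) / sum(train$CAN == 1))
# specificity: fraction of predicted negatives among actual negatives
spec <- sapply(z, function(cutoff) sum(predtrain <= cutoff & train$CAN == 0) / sum(train$CAN == 0))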
I can't for the life of me figure out how to compute a confusion matrix on rpart.
Here is what I have done:
set.seed(12345)
UBank_rand <- UBank[order(runif(1000)), ]
UBank_train <- UBank_rand[1:900, ]
UBank_test <- UBank_rand[901:1000, ]
dim(UBank_train)
dim(UBank_test)
#Build the formula for the Decision Tree
UB_tree <- Personal.Loan ~ Experience + Age + Income + ZIP.Code + Family + CCAvg + Education
#Building the Decision Tree from the training data
library(rpart)
UB_rpart <- rpart(UB_tree, data=UBank_train)
Now, I would think I would do something like
table(predict(UB_rpart, UBank_test, UBank_Test$Default))
But that is not giving me a confusion matrix.
You didn't provide a reproducible example, so I'll create a synthetic dataset:
set.seed(144)
df = data.frame(outcome = as.factor(sample(c(0, 1), 100, replace=T)),
x = rnorm(100))
The predict function for an rpart model with type="class" will return the predicted class for each observation.
library(rpart)
mod = rpart(outcome ~ x, data=df)
pred = predict(mod, type="class")
table(pred)
# pred
# 0 1
# 51 49
Lastly, you can build the confusion matrix by running table between the prediction and true outcome:
table(pred, df$outcome)
# pred 0 1
# 0 36 15
# 1 14 35
You can try the following (confusionMatrix comes from the caret package; assuming Personal.Loan is a factor, predict needs type = "class" to return class labels):
library(caret)
pred <- predict(UB_rpart, UBank_test, type = "class")
confusionMatrix(pred, UBank_test$Personal.Loan)