I'm trying to use the naiveBayes() function from the e1071 package. When I add a non-zero laplace argument, my resulting probability estimates do not change, and I don't understand why.
Example:
library(e1071)
# Generate data
train.x <- data.frame(x1=c(1,1,0,0), x2=c(1,0,1,0))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=c(1), x2=c(1))
# without laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=0)
predict(classifier, test.x, type="raw") # returns (1, 0.00002507)
# with laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=1)
predict(classifier, test.x, type="raw") # returns (1, 0.00002507)
I would expect the probabilities to change in this case since all the training instances for the "dog" class have 0 for x1. To check this, here's the same thing using Python
Python example:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
train_x = pd.DataFrame({'x1':[1,1,0,0], 'x2':[1,0,1,0]})
train_y = np.array(["cat", "cat", "dog", "dog"])
test_x = pd.DataFrame({'x1':[1,], 'x2':[1,]})
# alpha near 0 (i.e. laplace = 0)
classifier = BernoulliNB(alpha=.00000001)
classifier.fit(X=train_x, y=train_y)
classifier.predict_proba(X=test_x) # returns (1, 0)
# alpha = 1 (i.e. laplace = 1)
classifier = BernoulliNB(alpha=1)
classifier.fit(X=train_x, y=train_y)
classifier.predict_proba(X=test_x) # returns (.75, .25)
Why am I getting this unexpected result using e1071?
Laplace smoothing applies only to categorical features, not numeric ones. You can see in the source code:
## estimation-function
est <- function(var)
    if (is.numeric(var)) {
        cbind(tapply(var, y, mean, na.rm = TRUE),
              tapply(var, y, sd, na.rm = TRUE))
    } else {
        tab <- table(y, var)
        (tab + laplace) / (rowSums(tab) + laplace * nlevels(var))
    }
that Gaussian estimates are used for numeric variables, while the Laplace-smoothed frequency table is used only for factors. So convert your data to factors and you are good to go.
train.x <- data.frame(x1=c("1","1","0","0"), x2=c("1","0","1","0"))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=c("1"), x2=c("1"))
# without laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=0)
predict(classifier, test.x, type="raw") # returns (100% for cat)
# with laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=1)
predict(classifier, test.x, type="raw") # returns (75% cat, 25% dog)
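As a cross-check, the smoothed output can be reproduced by hand from the (tab + laplace) / (rowSums(tab) + laplace * nlevels(var)) formula in the source above:
# P(x1="1"|cat) = (2+1)/(2+2) = 0.75   P(x2="1"|cat) = (1+1)/(2+2) = 0.50
# P(x1="1"|dog) = (0+1)/(2+2) = 0.25   P(x2="1"|dog) = (1+1)/(2+2) = 0.50
p.cat <- 0.5 * 0.75 * 0.50   # prior * likelihoods = 0.1875
p.dog <- 0.5 * 0.25 * 0.50   # = 0.0625
p.cat / (p.cat + p.dog)      # 0.75, matching the sklearn result above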
Major facepalm on this one. The naiveBayes() method was interpreting x1 and x2 as numeric variables and thus fitting Gaussian conditional distributions internally (I think). Encoding my variables as factors solved the problem.
train.x <- data.frame(x1=factor(c(1,1,0,0)), x2=factor(c(1,0,1,0)))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=factor(c(1)), x2=factor(c(1)))
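To double-check which distribution naiveBayes() picked for each predictor, you can inspect the fitted object's conditional tables: factor predictors get a Laplace-smoothed probability table, while numeric ones get a two-column matrix of per-class mean and standard deviation. The smoothed table should look something like this:
classifier <- naiveBayes(x=train.x, y=train.y, laplace=1)
classifier$tables$x1  # conditional probability table for the factor x1
#      x1
# y        0    1
#   cat 0.25 0.75
#   dog 0.75 0.25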
Related
I have this code for a multiclass classification problem:
data$Class = as.factor(data$Class)
levels(data$Class) <- make.names(levels(factor(data$Class)))
trainIndex <- createDataPartition(data$Class, p = 0.6, list = FALSE, times=1)
trainingSet <- data[ trainIndex,]
testingSet <- data[-trainIndex,]
train_x <- trainingSet[, -ncol(trainingSet)]
train_y <- trainingSet$Class
testing_x <- testingSet[, -ncol(testingSet)]
testing_y <- testingSet$Class
oneRM <- OneR(trainingSet, verbose = TRUE)
oneRM
summary(oneRM)
plot(oneRM)
oneRM_pred <- predict(oneRM, testing_x)
oneRM_pred
eval_model(oneRM_pred, testing_y)
AUC_oneRM_pred <- auc(roc(oneRM_pred,testing_y))
cat ("AUC=", oneRM_pred)
# Recall-Precision curve
oneRM_prediction <- prediction(oneRM_pred, testing_y)
RP.perf <- performance(oneRM_prediction, "tpr", "fpr")
plot (RP.perf)
plot(roc(oneRM_pred,testing_y))
But the code does not work. After this line:
oneRM_prediction <- prediction(oneRM_pred, testing_y)
I get this error:
Error in prediction(oneRM_pred, testing_y) : Format of predictions is
invalid.
In addition, I don't know how I can easily get the F1-measure.
Finally, a question, does it make sense to calculate AUC in a multi-class classification problem?
Let's start with F1.
Assuming that you are using the iris dataset, first, we need to load everything, train the model and perform the predictions as you did.
library(datasets)
library(caret)
library(OneR)
library(pROC)
trainIndex <- createDataPartition(iris$Species, p = 0.6, list = FALSE, times=1)
trainingSet <- iris[ trainIndex,]
testingSet <- iris[-trainIndex,]
train_x <- trainingSet[, -ncol(trainingSet)]
train_y <- trainingSet$Species
testing_x <- testingSet[, -ncol(testingSet)]
testing_y <- testingSet$Species
oneRM <- OneR(trainingSet, verbose = TRUE)
oneRM_pred <- predict(oneRM, testing_x)
Then, you should calculate the precision, recall, and F1 for each class.
cm <- as.matrix(confusionMatrix(oneRM_pred, testing_y))
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
rowsums = apply(cm, 1, sum) # number of predictions per class (rows of cm are predictions)
colsums = apply(cm, 2, sum) # number of actual instances per class (columns are the reference)
diag = diag(cm) # number of correctly classified instances per class
precision = diag / rowsums
recall = diag / colsums
f1 = 2 * precision * recall / (precision + recall)
print(" ************ Confusion Matrix ************")
print(cm)
print(" ************ Diag ************")
print(diag)
print(" ************ Precision/Recall/F1 ************")
print(data.frame(precision, recall, f1))
After that, you are able to find the macro F1.
macroPrecision = mean(precision)
macroRecall = mean(recall)
macroF1 = mean(f1)
print(" ************ Macro Precision/Recall/F1 ************")
print(data.frame(macroPrecision, macroRecall, macroF1))
To get the ROC (or more precisely the AUC), it is best to use the pROC library.
print(" ************ AUC ************")
roc.multi <- multiclass.roc(testing_y, as.numeric(oneRM_pred))
print(auc(roc.multi))
Hope that helps. Find details at this link for F1 and this one for AUC.
If I use levels(oneRM_pred) <- levels(testing_y) in this way:
...
oneRM <- OneR(trainingSet, verbose = TRUE)
oneRM
summary(oneRM)
plot(oneRM)
oneRM_pred <- predict(oneRM, testing_x)
levels(oneRM_pred) <- levels(testing_y)
...
The accuracy is much lower than before, so I am not sure that forcing the same levels is a good solution.
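For what it's worth, that drop is expected: levels<- renames levels by position, so unless the two factors already list their levels in the same order, it silently relabels the predictions. A safer remap (a sketch, assuming every predicted label also occurs in testing_y) matches by value instead:
# levels<- is positional and can scramble labels:
f <- factor(c("b", "a"))   # levels are "a" "b"
levels(f) <- c("x", "y")   # now "a" -> "x" and "b" -> "y", regardless of the data order
# safer: rebuild the factor so labels are matched by value, not position
oneRM_pred <- factor(as.character(oneRM_pred), levels = levels(testing_y))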
I have an array of outputs from hundreds of segmented linear models (made using the segmented package in R). I want to be able to use these outputs on new data, using the predict function. To be clear, I do not have the segmented linear model objects in my workspace; I just saved and reimported the relevant outputs (e.g. the coefficients and breakpoints). For this reason I can't simply use the predict.segmented function from the segmented package.
Below is a toy example based on this link that seems promising, but does not match the output of the predict.segmented function.
library(segmented)
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) +
15*pmax(zz-0.5,0) + rnorm(100,0,2)
dati <- data.frame(x=xx,y=yy,z=zz)
out.lm <- lm(y~x, data=dati)
## S3 method for class 'lm':
o <- segmented(out.lm, seg.Z=~x, psi=list(x=c(30,60)),
               control=seg.control(display=FALSE))
# Note that coefficients with U in the name are differences in slopes, not slopes.
# Compare:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]
# prediction
pred <- data.frame(x = 1:100)
pred$dummy1 <- pmax(pred$x - o$psi[1,2], 0)
pred$dummy2 <- pmax(pred$x - o$psi[2,2], 0)
pred$dummy3 <- I(pred$x > o$psi[1,2]) * (coef(o)[2] + coef(o)[3])
pred$dummy4 <- I(pred$x > o$psi[2,2]) * (coef(o)[2] + coef(o)[3] + coef(o)[4])
names(pred)[-1]<- names(model.frame(o))[-c(1,2)]
# compute the prediction, using standard predict function
# computing confidence intervals further
# suppose that the breakpoints are fixed
pred <- data.frame(pred, predict(o, newdata= pred,
interval="confidence"))
# Try prediction using the predict.segment version to compare
test <- predict.segmented(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0,1, col = "red")
# At least one segment not being predicted correctly?
Can I use the base R predict() function (not the predict.segmented() function) with the coefficients and breakpoints saved from segmented linear models?
UPDATE
I figured out that the code above has issues (don't use it). Through some reverse engineering of the predict.segmented() function, I produced the design matrix and used that to predict values instead of using predict() directly. I do not consider this a full answer to the original question yet, because predict() can also produce confidence intervals for the prediction and I have not implemented that; the question is still open for someone to add confidence intervals.
library(segmented)
## Define function for making a matrix of dummy variables (based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
    # This function creates a model matrix with dummy variables for a segmented lm with two breakpoints.
    # Inputs:
    #   x.values: the x values of the segmented lm
    #   x_names: the name of the column of x values
    #   psi.est: legacy from the predict.segmented function; leave it set to TRUE
    #   ('obj' below is not an argument; it denotes the fitted segmented lm object in the examples)
    #   nameU: names (class character) of the 3rd and 4th coefs, i.e. "U1.x" "U2.x" for an lm with two breaks. Example: names(c(obj$coef[3], obj$coef[4]))
    #   nameV: names (class character) of the 5th and 6th coefs, i.e. "psi1.x" "psi2.x" for an lm with two breaks. Example: names(c(obj$coef[5], obj$coef[6]))
    #   diffSlope: the coefficients (class numeric) holding the slope differences; called U1.x and U2.x for an lm with two breaks. Example: c(obj$coef[3], obj$coef[4])
    #   est.psi: the estimated breakpoints (class numeric) from segmented.lm. Example: c(obj$psi[1,2], obj$psi[2,2])
    n <- length(x.values)
    k <- length(est.psi)
    PSI <- matrix(rep(est.psi, rep(n, k)), ncol = k)
    newZ <- matrix(x.values, nrow = n, ncol = k, byrow = FALSE)
    dummy1 <- pmax(newZ - PSI, 0)
    if (psi.est) {
        V <- ifelse(newZ > PSI, -1, 0)
        dummy2 <- if (k == 1) V * diffSlope else V %*% diag(diffSlope)
        newd <- cbind(x.values, dummy1, dummy2)
        colnames(newd) <- c(x_names, nameU, nameV)
    } else {
        newd <- cbind(x.values, dummy1)
        colnames(newd) <- c(x_names, nameU)
    }
    # if (!x_names %in% names(coef(obj.seg)))
    #     newd <- newd[, -1, drop = FALSE]
    return(newd)
}
## Test dummy matrix function----------------------------------------------
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) + 15*pmax(zz-0.5,0) + rnorm(100,0,2)
dati <- data.frame(x=xx, y=yy, z=zz)
out.lm <- lm(y~x, data=dati)
# 1 segmented variable, 2 breakpoints: you have to specify starting values (vector) for psi:
o <- segmented(out.lm, seg.Z=~x, psi=c(30,60),
               control=seg.control(display=FALSE))
slope(o)
plot.segmented(o)
summary(o)
# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
diffSlope1 <- c(o$coef[3], o$coef[4])
est.psi1 <- c(o$psi[1,2], o$psi[2,2])
test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE,
nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)
# Predict response variable using matrix multiplication
col1 <- matrix(1, nrow = dim(test)[1])
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
predY <- coef(o) %*% t(test)
plot(predY[1,])
lines(predict.segmented(o), col = "blue") # good, predict.segmented gives same answer
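For the confidence-interval part that is still open, here is a rough sketch under the simplifying assumption that the breakpoints are fixed at their estimates (it ignores the uncertainty in psi, so the intervals will be somewhat too narrow):
# sketch: standard linear-model intervals, treating the breakpoints as known
V <- vcov(o)                              # covariance matrix of the coefficients
fit <- as.vector(test %*% coef(o))        # point predictions ('test' includes the intercept column)
se <- sqrt(diag(test %*% V %*% t(test)))  # standard errors of the fitted values
lwr <- fit - qt(0.975, df.residual(o)) * se
upr <- fit + qt(0.975, df.residual(o)) * se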
The following code is used to produce the probability output of binary classification with random forest.
library(randomForest)
rf <- randomForest(train, train_label,importance=TRUE,proximity=TRUE)
prediction<-predict(rf, test, type="prob")
The result of prediction is a two-column matrix of per-class probabilities. The true labels of the test data are known (named test_label). Now I want to compute the logarithmic loss for the probability output. My LogLoss function is as follows:
LogLoss <- function(actual, predicted) {
    result <- -1/length(actual) * sum(actual*log(predicted) + (1-actual)*log(1-predicted))
    return(result)
}
How do I compute the logarithmic loss from this probability output? Thank you.
library(randomForest)
rf <- randomForest(Species~., data = iris, importance=TRUE, proximity=TRUE)
prediction <- predict(rf, iris, type="prob")
#bound the results away from 0 and 1, otherwise you might get infinite log loss
prediction <- apply(prediction, c(1,2), function(x) min(max(x, 1E-15), 1-1E-15))
#model.matrix generates a matrix of true-class indicators (each element is 1 or 0)
#we subtract the predictions; entries greater than 0 occur exactly where the indicator is 1
#(since every prediction is < 1), so the subsetting picks the predicted probability of the true class
logLoss = function(pred, actual){
-1*mean(log(pred[model.matrix(~ actual + 0) - pred > 0]))
}
logLoss(prediction, iris$Species)
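To see why the subsetting trick works, it helps to look at what model.matrix(~ actual + 0) produces for a factor (a small illustration):
actual <- factor(c("setosa", "virginica", "setosa"))
model.matrix(~ actual + 0)
#   actualsetosa actualvirginica
# 1            1               0
# 2            0               1
# 3            1               0
# one column per level, with a single 1 per row marking the true class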
I think the logLoss formula above is a little bit wrong. With a numeric 0/1 actual, model.matrix(~ actual + 0) is just a single column of those 0/1 values rather than a one-hot matrix, so rows with actual = 0 are never selected and only the positive class contributes:
model <- glm(vs ~ mpg, data = mtcars, family = "binomial")
### OP's formula (Wrong)
logLoss1 <- function(pred, actual){
-1*mean(log(pred[model.matrix(~ actual + 0) - pred > 0]))
}
logLoss1(actual = model$y, pred = model$fitted.values)
# [1] 0.4466049
### Correct formula in native R
logLoss2 <- function(pred, actual){
-mean(actual * log(pred) + (1 - actual) * log(1 - pred))
}
logLoss2(actual = model$y, pred = model$fitted.values)
# [1] 0.3989584
## Results from various packages to verify the correct answer
### From ModelMetrics package
ModelMetrics::logLoss(actual = model$y, pred = model$fitted.values)
# [1] 0.3989584
### From MLmetrics package
MLmetrics::LogLoss(y_pred = model$fitted.values, y_true = model$y)
# [1] 0.3989584
### From reticulate package
library(reticulate)
sklearn.metrics <- import("sklearn.metrics")
sklearn.metrics$log_loss(y_true = model$y, y_pred = model$fitted.values)
# [1] 0.3989584
I used R version 4.1.0 (2021-05-18).
I'm using the fGarch package in R to fit an ARMA-GARCH(1,1) model to a time series. I want to extract the standardized residuals, meaning the residuals divided by the corresponding daily volatility estimate. I tried two things:
res <- residuals(m1, standardize=FALSE)
vol <- volatility(m1)
stand.res <- res/vol
and
stand.res <- residuals(m1, standardize=TRUE)
If I plot both results, they differ from each other. Why is that?
Thank you very much.
I'm having a similar problem; please consider:
rm(list=ls(all=TRUE))
library(fGarch)
set.seed(4)
x <- runif(6587, -0.10, 0.10)
gfit <- garchFit(formula = ~ garch(2,2), cond.dist = "std", data = x, include.shape=TRUE, trace=FALSE)
and
condVar = gfit@h.t
resid <- (x / sqrt(condVar));
tail(resid) # Standardized Residuals
#[1] -0.4201041 -0.8342208 1.5639541 1.0237848 -0.1779349 -0.7820030
#or
tail(x / gfit@sigma.t)
#[1] -0.4201041 -0.8342208 1.5639541 1.0237848 -0.1779349 -0.7820030
vs
tail(residuals(gfit, standardize = TRUE))
#[1] -0.4156200 -0.8297368 1.5684382 1.0282689 -0.1734509 -0.7775190
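A plausible explanation for the constant shift (inferred from the numbers, so treat it as a hypothesis): residuals(gfit, standardize = TRUE) subtracts the fitted mean before dividing by sigma.t, while the manual versions divide the raw series. A quick check:
# untested sketch: remove the fitted mean term before standardizing
mu <- coef(gfit)["mu"]
tail((x - mu) / gfit@sigma.t)
# if this matches residuals(gfit, standardize = TRUE), the gap was just the mean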
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relation to the target variable. There are two approaches that I know of:
the continuous variables are binned such that their IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the WoE values that you can then feed into your regression instead of Variable_name (a usage sketch follows)
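For instance, you could then fit the logistic regression directly on the WoE-encoded column (a hypothetical sketch; target_name and Variable_name are the placeholders from above):
# hypothetical usage: regress the target on the WoE-encoded predictor
model <- glm(target_name ~ Variable_name.woe.binned,
             data = dataset_woe, family = binomial)
summary(model)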
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
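A minimal sketch with discretization (untested, as said; chiM() implements ChiMerge and assumes the class label is the last column of the data frame):
library(discretization)
# ChiMerge: merge adjacent intervals whose class distributions do not differ
# significantly at the given alpha; the class must be in the last column
disc <- chiM(iris, alpha = 0.05)
disc$cutp             # cut points for each continuous variable
head(disc$Disc.data)  # the discretized dataset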
The methods used by regression splines to set knot locations might be considered; the rpart package probably has relevant code. You do need to penalize the inferential statistics, because this approach implicitly hides the degrees of freedom spent moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV = 1. Something like this untested code (a cut() sketch for applying the breaks follows):
cont.var.vec <- c("x1", "x2")  # replace with the names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
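Applying the resulting break points is then a one-liner with cut(); a sketch, assuming dfrm$x1 is one of the continuous columns and reusing the breaks() helper above:
# hypothetical follow-up: bin one variable at its quantile break points
brks <- breaks(dfrm$x1, n = 5)
dfrm$x1.binned <- cut(dfrm$x1, breaks = unique(brks), include.lowest = TRUE)
table(dfrm$x1.binned)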
setwd("D:")
rm(list=ls())
options(scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping: cap each numeric column at 1.5 * its 75th percentile + 1
for (i in 1:ncol(mydata)) {
    if (is.numeric(mydata[, i])) {
        val.quant <- unname(quantile(mydata[, i], probs = 0.75))
        mydata[, i] <- sapply(mydata[, i],
                              function(x) if (x > (1.5*val.quant + 1)) 1.5*val.quant + 1 else x)
    }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for (i in 1:vartoremove) {
    rf <- fit(Y~., data=mydata2, model="randomForest", mtry=10, ntree=100)
    varImportance <- Importance(rf, mydata2, method="sensg")
    Z <- order(varImportance$imp, decreasing=FALSE)
    IND <- Z[2]
    var_to_remove <- names(mydata2[IND])
    mydata2[IND] <- NULL
    print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(fit1)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf, test))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
for (i in 1:cv_fold_count) {
    smpl <- folds[[i]]
    g_train <- mydata2[-smpl, !names(mydata2) %in% c("Y")]
    g_test <- mydata2[smpl, !names(mydata2) %in% c("Y")]
    cost_train <- mydata2[-smpl, "Y"]
    cost_test <- mydata2[smpl, "Y"]
    rf <- randomForest(g_train, cost_train)
    logit.data <- cbind(cost_train, g_train)
    logit.fit <- glm(cost_train~., data=logit.data, family=binomial)
    prediction <- data.frame(actual = cost_test, predicted = predict(rf, g_test))
}