I have fitted a logistic regression and now I want to calculate the predictive accuracy for various cutoff values ranging from 0 to 1. Below is the for loop I have been using, but I am getting the error
subscript out of bounds
Here predtrain contains 300 predicted probabilities, each between 0 and 1. Each value is compared to the cutoff eff, and finally a confusion matrix should be generated comparing the original values (train$CAN) with f1. Something like this:
tab
#    pred2
#       0   1
#   0   1  93
#   1   0 206
The code I have written is this:
predtrain <- predict(logreg1, newdata = train, type = 'response')
eff <- seq(0, 1, by = 0.05)
for (i in 1:length(eff)) {
  f1 <- ifelse(predtrain > eff[i], 1, 0)
  t1 <- table(train$CAN, f1)
  effy <- (t1[1,1] + t1[2,2]) / (t1[1,1] + t1[1,2] + t1[2,2] + t1[2,1])
  eff[[i]] <- effy
}
The reason you're getting subscript out of bounds errors is that you're building confusion matrices at extreme cutoffs like 0 and 1 -- at those cutoffs every prediction falls on the same side, so the table has a single column and indexing like t1[2,2] fails.
In reality all you're trying to do is compute the predictive accuracy at different cutoffs, which can be accomplished without creating tables at all, with something like:
cutoffs <- seq(0, 1, by = 0.05)
eff <- sapply(cutoffs, function(cutoff) {
  sum((predtrain > cutoff) == train$CAN) / length(predtrain)
})
To see this in action, let's consider a small example model:
set.seed(144)
x <- runif(100)
train <- data.frame(x, CAN=as.numeric(runif(100)+x >= 1))
logreg1 <- glm(CAN~x, data=train, family="binomial")
predtrain <- predict(logreg1, newdata = train, type = 'response')
Now we can get the predictive accuracy at each cutoff:
eff <- sapply(cutoffs, function(cutoff) {
  sum((predtrain > cutoff) == train$CAN) / length(predtrain)
})
plot(cutoffs, eff)
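If you then want the single accuracy-maximizing cutoff, it is just:
cutoffs[which.max(eff)]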
You could alternatively use a package like ROCR to grab metrics. For instance, here is how you could grab the sensitivity at each cutoff:
library(ROCR)
pred <- prediction(predtrain, train$CAN)
perf <- performance(pred, "sens")
eff <- sapply(cutoffs, function(cutoff) max(perf@y.values[[1]][perf@x.values[[1]] >= cutoff]))
plot(cutoffs, eff)
But doesn't it become more difficult to calculate something like specificity and sensitivity? I have written a version using two for loops; I know it is not very efficient, but I do get the tables from which I can calculate the performance measures. Can this method be improved?
z <- seq(0, 1, by = 0.05)
t1 <- vector(mode = "list", length = length(z))
for (i in 1:length(z)) {
  predtrain <- predict(logreg1, newdata = train, type = 'response')
  for (j in 1:length(predtrain)) {
    predtrain[j] <- ifelse(predtrain[j] > z[i], 1, 0)
  }
  t1[[i]] <- table(train$CAN, predtrain)
}
t1
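One way to drop both loops (a sketch on my part, untested against your data): vectorize the thresholding and force both 0/1 levels with factor(), so the table keeps two columns even at the extreme cutoffs and the row/column indexing stays safe:
z <- seq(0, 1, by = 0.05)
predtrain <- predict(logreg1, newdata = train, type = 'response')
t1 <- lapply(z, function(cut) {
  f1 <- factor(as.numeric(predtrain > cut), levels = c(0, 1))
  table(train$CAN, f1)  # always two columns, so t1[[i]][2, 2] etc. never fail
})
# sensitivity and specificity at each cutoff, straight from the tables
sens <- sapply(t1, function(tt) tt[2, 2] / sum(tt[2, ]))
spec <- sapply(t1, function(tt) tt[1, 1] / sum(tt[1, ]))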
I'm performing a time series analysis with multiple breakpoints in R. I managed to identify three breakpoints using the procedure suggested in the strucchange package, but I'm struggling to get the significance (p-values) for these breakpoints.
Here are a dummy dataset and the code I was working with.
The dataset:
x = 1:94
z=c(128,103,29,117,53,49,84,67,76,111,81,38,36,-35,-12,21,121,38,84,173,153,99,
91,110,69,50,15,-50,15,-97,-2,13,107,47,137,25,-19,54,4,87,72,58,32,-4,75,50,
80,65,124,56,58,-3,30,42,55,212,245,18,106,128,88,216,205,234,120,171,195,230,
237,143,225,253,202,218,283,227,291,192,179,197,337,259,261,215,290,293,255,
316,355,312,337,341,388,338)
df=data.frame(z,x)
plot(x,z)
The code:
# https://www.marinedatascience.co/blog/2019/09/28/comparison-of-change-point-detection-methods/
library(strucchange)
library(sandwich)
library(fxregime)
# get best model
opt_bpts <- function(x) {
  # x = bpts_sum$RSS["BIC", ]
  n <- length(x)
  lowest <- vector("logical", length = n)
  lowest[1] <- FALSE
  for (i in 2:(n - 1)) {
    lowest[i] <- x[i] < x[i - 1] & x[i] < x[i + 1]
  }
  # the last element has no right-hand neighbour, so only compare leftwards
  # (the original loop read past the end of x here and produced NA)
  lowest[n] <- x[n] < x[n - 1]
  out <- as.integer(names(x)[lowest])
  return(out)
}
#################################################################
# Zeileis, A., Leisch, F., Hornik, K. & Kleiber, C. (2002), strucchange: An R Package for Testing for Structural Change in Linear Regression Models. J Stat Softw 7(2), 38p., doi: 10.18637/jss.v007.i02
z_ts <- as.ts(df$z)  # create time series
bpts <- breakpoints(z ~ x, data = df)
plot(bpts)
bpts_sum <- summary(bpts)
opt_brks <- opt_bpts(bpts_sum$RSS["BIC",])
opt_brks
# Nested syntax with 3 breaks:
ci=confint(bpts,breaks = 3)#, level = 0.99)
bpts <- breakpoints(breakpoints(z ~ x, data = df), breaks = 3)
Fst=Fstats(z_ts~1)
#here I get a p-value for the analysis with three breakpoints
plot(Fst)
sctest(Fst)
The output I get from sctest(Fst) is:
supF test
data: Fst
sup.F = 203.23, p-value < 2.2e-16
I would like to obtain (if it's possible) the p-value of each breakpoint.
Something like this:
F test
data: Fst
breakpoint1:p-value < brk1.pvalue
breakpoint2:p-value < brk2.pvalue
breakpoint3:p-value < brk3.pvalue
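I don't know of a built-in that reports one p-value per breakpoint, but a possible workaround (untested, and only an approximation, not something from the strucchange documentation) is to run a separate supF test on the data window spanning each break, i.e. the observations between its two neighbouring breakpoints:
bp3 <- breakpoints(breakpoints(z ~ x, data = df), breaks = 3)$breakpoints  # the three break indices
bounds <- c(0, bp3, nrow(df))
brk.pvalues <- sapply(seq_along(bp3), function(k) {
  window <- (bounds[k] + 1):bounds[k + 2]  # data surrounding the k-th break only
  sctest(Fstats(z[window] ~ 1))$p.value
})
brk.pvalues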
I want to replicate the results of the multinom() function with the optim() function in R, but it does not yield the same results. What went wrong?
First, I imported a public dataset as "ml".
require(foreign)
ml <- read.dta("https://stats.idre.ucla.edu/stat/data/hsbdemo.dta")
The code to get the summary statistics of the "ml" data, and the results, are below:
with(ml, table(ses,prog))
with(ml, do.call(rbind,tapply(write, prog, function(x) c(M = mean(x), SD = sd(x)))))
        prog
ses      general academic vocation
  low         16       19       12
  middle      20       44       31
  high         9       42        7

                M       SD
general  51.33333 9.397775
academic 56.25714 7.943343
vocation 46.76000 9.318754
The code to run the multinom() function, which fits the multinomial logistic regression, and the corresponding results are below:
library(nnet)
ml$prog2 <- relevel(ml$prog, ref = "academic")
ml_pckg <- multinom(prog2 ~ write + ses, data = ml)
summary(ml_pckg)
Call:
multinom(formula = prog2 ~ write + ses, data = ml)
Coefficients:
         (Intercept)      write  sesmiddle    seshigh
general     2.852198 -0.0579287 -0.5332810 -1.1628226
vocation    5.218260 -0.1136037  0.2913859 -0.9826649

Std. Errors:
         (Intercept)      write sesmiddle   seshigh
general     1.166441 0.02141097 0.4437323 0.5142196
vocation    1.163552 0.02221996 0.4763739 0.5955665
Residual Deviance: 359.9635
AIC: 375.9635
The code to get the z statistics and the results are below:
z <- summary(ml_pckg)$coefficients/summary(ml_pckg)$standard.errors
z
         (Intercept)     write  sesmiddle   seshigh
general     2.445214 -2.705562 -1.2018081 -2.261334
vocation    4.484769 -5.112689  0.6116747 -1.649967
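(If you also want p-values for these z statistics, the usual follow-up, assuming asymptotic normality, is p <- (1 - pnorm(abs(z), 0, 1)) * 2.)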
Next, I wrote code to replicate the results above.
I generated dummy variables for the categorical dependent/independent variables as below:
ml$prog_academic <- ifelse(ml$prog == "academic", 1, 0)
ml$prog_general <- ifelse(ml$prog == "general", 1, 0)
ml$prog_vocational <- ifelse(ml$prog == "vocational", 1, 0)
ml$ses_low <- ifelse(ml$ses == "low", 1, 0)
ml$ses_middle <- ifelse(ml$ses == "middle", 1, 0)
ml$ses_high <- ifelse(ml$ses == "high", 1, 0)
I generated a vector of ones to multiply with the intercept, and subsetted write, ses_middle, and ses_high as the explanatory variables (ses_low is the baseline here). I assigned these covariates to a new data frame named "X".
one <-as.data.frame(rep(1,200))
covar <- ml[,c(7,19,20)]
X <- data.frame(one,covar) #200*4
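As an aside, a less error-prone way to build the same design matrix (a suggestion, assuming ses has levels low/middle/high with low as the first level) is to let R expand the factor:
X <- model.matrix(~ write + ses, data = ml)  # columns: (Intercept), write, sesmiddle, seshigh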
Next, I created another data frame for the dependent variables, named "Y", that consists of prog_general and prog_vocational. Here, prog_academic is the baseline.
Y <- ml[,16:17] #200*2
I set the initial values of the parameters close to the results of the multinom() function so that the optimization converges.
B_0 <- c(3, -0.1, -0.5, -1, 5, -0.1, 0.2, -1) #8*1 #initial value as vector
Here, I referred to a document to find the likelihood of the multinomial logistic regression; the likelihood is in equation 31 on page 12. I found out that the second part of the equation should be summed with respect to i as well.
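Written out, the log-likelihood the function below computes (with J - 1 = 2 non-baseline categories) should be

\ell(B) = \sum_{i=1}^{n}\left[\sum_{j=1}^{J-1} y_{ij}\, x_i B_j^\top - \log\left(1 + \sum_{j=1}^{J-1} \exp\left(x_i B_j^\top\right)\right)\right]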
I generated a blank matrix "xb" to hold the linear-predictor part (each observation's covariates x_i multiplied by the coefficients B_j):
xb <- matrix(0, nrow=200, ncol=2) #200*2
I ran the code below in one go to get the results of the optimization.
mlogit <- function(B) {
  B <- matrix(B, nrow = 2, ncol = 4, byrow = TRUE)
  for (i in 1:nrow(xb)) {  # i indexes individuals: 200
    for (j in 1:ncol(xb)) {  # j indexes the non-baseline outcomes: 2
      xb[i, j] <- sum(X[i, ] * B[j, ])  # 200*2
    }
  }
  expxb <- exp(xb)  # 200*2, renamed so base::exp is not shadowed
  sumexp <- as.numeric(rowSums(expxb))  # 200*1
  yxb <- Y * xb  # 200*2
  sumyxb <- sum(yxb)
  ll <- sumyxb - sum(log(1 + sumexp))
  -ll
}
mlogit_result <- optim(par = B_0, fn = mlogit)
mlogit_result
The results are below:
$par
[1] 0.05325004 -0.01417267 -0.64375499 -0.96137147 6.33471560 -0.86154161 0.92387035 -0.65728823
$value
[1] 103.7692
$counts
function gradient
353 NA
$convergence
[1] 0
$message
NULL
If the results corresponded with those of the multinom() function, $par should be as below:
2.852198 -0.0579287 -0.5332810 -1.1628226 5.218260 -0.1136037 0.2913859 -0.9826649
I reviewed my code and the likelihood function again and again but could not find anything wrong. I suspect either the initial parameters are badly set or the function I created has a problem.
Could anyone please give me suggestions for dealing with this problem?
Thank you!
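One thing worth double-checking (an observation from the summary output above, not a verified diagnosis): table(ses, prog) shows the factor level is "vocation", while the dummy is built with "vocational", so prog_vocational may be all zeros, which would distort the likelihood even if the formula is right:
# hypothetical fix: match the level name exactly as it appears in the data
ml$prog_vocational <- ifelse(ml$prog == "vocation", 1, 0)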
I cross-posted this question on Cross Validated, but it seemed unlikely to receive an answer there, so I am posting it here as well.
I'm running the classification method bagging trees (bootstrap aggregation) and comparing its misclassification error rate with that from a single tree. We expect the result from bagging to be better than that from a single tree, i.e. the error rate from bagging should be lower.
I repeat the whole procedure M = 100 times (each time randomly splitting the original data set into a training set and a test set) to obtain 100 test errors and 100 bagging test errors (using a for loop). Then I use boxplots to compare the distributions of these two types of errors.
# Loading package and data
library(rpart)
library(boot)
library(mlbench)
data(PimaIndiansDiabetes)
# Initialization
n <- 768
ntrain <- 468
ntest <- 300
B <- 100
M <- 100
single.tree.error <- vector(length = M)
bagging.error <- vector(length = M)
# Define statistic
estim.pred <- function(a.sample, vector.of.indices)
{
  current.train <- a.sample[vector.of.indices, ]
  current.fitted.model <- rpart(diabetes ~ ., data = current.train, method = "class")
  predict(current.fitted.model, test.set, type = "class")
}
for (j in 1:M)
{
  # Split the data into test/train sets
  train.idx <- sample(1:n, ntrain, replace = FALSE)
  train.set <- PimaIndiansDiabetes[train.idx, ]
  test.set <- PimaIndiansDiabetes[-train.idx, ]
  # Train a direct tree model
  fitted.tree <- rpart(diabetes ~ ., data = train.set, method = "class")
  pred.test <- predict(fitted.tree, test.set, type = "class")
  single.tree.error[j] <- mean(pred.test != test.set$diabetes)
  # Bootstrap estimates
  res.boot <- boot(train.set, estim.pred, B)
  pred.boot <- vector(length = ntest)
  for (i in 1:ntest)
  {
    pred.boot[i] <- ifelse(mean(res.boot$t[, i] == "pos") >= 0.5, "pos", "neg")
  }
  bagging.error[j] <- mean(pred.boot != test.set$diabetes)
}
boxplot(single.tree.error, bagging.error, ylab = "Misclassification errors", names = c("single.tree", "bagging"))
The resulting boxplot (image omitted) shows the bagging misclassification errors sitting well above the single-tree errors.
Could you please explain why the error rate for bagging trees is much higher than that of a single tree? I feel this does not make sense. I've checked my code but could not find anything unusual.
Update: I've received an answer at https://stats.stackexchange.com/questions/452882/why-is-the-error-rate-from-bagging-trees-much-higher-than-that-from-a-single-tre. I post it here to close this question and for future visitors.
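For future visitors, the gist of the linked answer as I understand it (a paraphrase, so please verify against the link): boot() stores the factor returned by estim.pred as its integer codes (1/2) in res.boot$t, so res.boot$t[, i] == "pos" is never TRUE and every bagged prediction collapses to "neg". Returning a numeric 0/1 vector avoids the silent coercion:
estim.pred <- function(a.sample, vector.of.indices)
{
  current.train <- a.sample[vector.of.indices, ]
  current.fitted.model <- rpart(diabetes ~ ., data = current.train, method = "class")
  # 1 = "pos", 0 = "neg": numeric output survives boot()'s result matrix intact
  as.numeric(predict(current.fitted.model, test.set, type = "class") == "pos")
}
# and inside the loop:
# pred.boot[i] <- ifelse(mean(res.boot$t[, i]) >= 0.5, "pos", "neg")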
I have written the kNN cross-validation code below using the iris dataset in R. How would I get the best value of k from this and create a confusion matrix based on it? Any help would be great.
library(class)
data("iris")
kfolds = 5
iris$folds = cut(seq(1,nrow(iris)),breaks=kfolds,labels=FALSE)
iris$folds
# Sets the columns to use as predictors
pred = c("Petal.Width", "Petal.Length")
accuracies = c()
ks = c(1,3,5,7,9,11,13,15)
for (k in ks) {
  k.accuracies = c()
  for (i in 1:kfolds) {
    # Builds the training set and test set for this fold.
    train.items.this.fold = iris[iris$folds != i, ]
    validation.items.this.fold = iris[iris$folds == i, ]
    # Fit knn model on this fold.
    predictions = knn(train.items.this.fold[, pred],
                      validation.items.this.fold[, pred],
                      train.items.this.fold$Species, k = k)
    predictions.table <- table(predictions, validation.items.this.fold$Species)
    # Work out the amount of correct and incorrect predictions.
    correct.list <- predictions == validation.items.this.fold$Species
    nr.correct = nrow(validation.items.this.fold[correct.list, ])
    # Get accuracy rate of cv.
    accuracy.rate = nr.correct / nrow(validation.items.this.fold)
    # Adds the accuracy to the list.
    k.accuracies <- cbind(k.accuracies, accuracy.rate)
  }
  # Adds the mean accuracy to the total accuracy list.
  accuracies <- cbind(accuracies, mean(k.accuracies))
}
# Accuracy for each value of k: visualisation.
accuracies
Update:
predictions.table <- table(predictions == ks[which.max(accuracies)], validation.items.this.fold$Species)
Your code has some problems; this version runs:
library(class)
data("iris")
kfolds = 5
iris$folds = cut(seq(1,nrow(iris)),breaks=kfolds,labels=FALSE)
iris$folds
# Sets the columns to use as predictors
pred = c("Petal.Width", "Petal.Length")
accuracies = c()
ks = c(1,3,5,7,9,11,13,15)
k.accuracies = c()
predictions.list = list()
for (k in ks) {
  k.accuracies = c()
  for (i in 1:kfolds) {
    # Builds the training set and test set for this fold.
    train.items.this.fold = iris[iris$folds != i, ]
    validation.items.this.fold = iris[iris$folds == i, ]
    # Fit knn model on this fold.
    predictions = knn(train.items.this.fold[, pred],
                      validation.items.this.fold[, pred],
                      train.items.this.fold$Species, k = k)
    # Store this fold's predictions under the current k (as character,
    # so the folds can be concatenated without losing the class labels).
    predictions.list[[as.character(k)]] = c(predictions.list[[as.character(k)]],
                                            as.character(predictions))
    predictions.table <- table(predictions, validation.items.this.fold$Species)
    # Work out the amount of correct and incorrect predictions.
    correct.list <- predictions == validation.items.this.fold$Species
    nr.correct = nrow(validation.items.this.fold[correct.list, ])
    # Get accuracy rate of cv.
    accuracy.rate = nr.correct / nrow(validation.items.this.fold)
    # Adds the accuracy to the list.
    k.accuracies <- cbind(k.accuracies, accuracy.rate)
  }
  # Adds the mean accuracy to the total accuracy list.
  accuracies <- cbind(accuracies, mean(k.accuracies))
}
accuracies
# The folds are contiguous row blocks, so concatenating them in fold order
# lines the predictions up with iris$Species.
best.k <- ks[which.max(accuracies)]
predictions.table <- table(predictions.list[[as.character(best.k)]], iris$Species)
When you call predictions.table <- table(predictions, validation.items.this.fold$Species), that already is the confusion matrix. You are using accuracy as the evaluation metric, so the best k is the one with the best accuracy. You can get the best k value like this:
ks[which.max(accuracies)]
UPDATE
I created a list to store each fold's predictions under its k, and then built the confusion matrix using the k with the best accuracy.
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way with respect to the target variable. There are two approaches that I know of:
bin the continuous variable such that its IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which holds the bin labels
Variable_name.woe.binned, which holds the replacement WoE values that you can then pass into your regression instead of Variable_name
For the second point, on the chi-square criterion, the package discretization seems to handle it, but I haven't tested it.
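An untested sketch with discretization (my assumption: its chiM() function expects a data frame whose last column is the class variable):
library(discretization)
num_df <- dataset[, c("Variable_name", "target_name")]  # predictor first, class last
chi_out <- chiM(num_df, alpha = 0.05)  # ChiMerge: merge adjacent bins whose chi-square is not significant
chi_out$cutp[[1]]        # cut points chosen for Variable_name
head(chi_out$Disc.data)  # the discretized copy of the data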
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics, because this approach implicitly hides the degrees of freedom expended in moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV = 1. Something like this untested code:
cont.var.vec <- c("var1", "var2")  # placeholder: the names of your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for (i in 1:ncol(mydata)) {
  if (is.numeric(mydata[, i])) {
    val.quant <- unname(quantile(mydata[, i], probs = 0.75))
    mydata[, i] <- sapply(mydata[, i], function(x) { if (x > (1.5 * val.quant + 1)) { 1.5 * val.quant + 1 } else { x } })
  }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for (i in 1:vartoremove) {
  rf <- fit(Y ~ ., data = mydata2, model = "randomForest", mtry = 10, ntree = 100)
  varImportance <- Importance(rf, mydata2, method = "sensg")
  Z <- order(varImportance$imp, decreasing = FALSE)
  IND <- Z[2]
  var_to_remove <- names(mydata2[IND])
  mydata2[IND] <- NULL
  print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf, test))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
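# NOTE: `i` is never set below; this block was presumably meant to run inside
# something like: for (i in 1:cv_fold_count) { ... }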
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf, test))