I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))
Related
I am trying to plot my SVM classification with usinf e1071 library.However, the classification plot shows only single value for my parameters.Even though I change the selected parameters to create 2d classification plot it is wrong.
require(caTools)
library(caret)
dataset <-read.csv("income_evaluation.csv")
# fnlwgt row remowed since it is not necessary
df_income <- subset(dataset,select=-c(fnlwgt))
# turn binary attribute into 0 and 1
df_income$income <-ifelse(df_income$income==" >50K",1,0)
df_income$native.country
apply(X=df_income,2,FUN=function(x) length(which(x==' ?')))
# handling missing values
#define function to calculate mode
find_mode <- function(x) {
u <- unique(x)
tab <- tabulate(match(x, u))
u[tab == max(tab)]
}
mod_workclass_df = find_mode(df_income$workclass)
mod_occupation_df = find_mode(df_income$occupation)
mod_country_df = find_mode(df_income$native.country)
# replacing the missing values with the mod values
df_income$workclass[df_income$workclass == ' ?'] <- mod_workclass_df
df_income$occupation[df_income$occupation == ' ?'] <- mod_occupation_df
df_income$native.country[df_income$native.country == ' ?'] <- mod_country_df
# one hot encoding for train set
dmy <- dummyVars(" ~ .", data = df_income, fullRank = T)
df_income <- data.frame(predict(dmy, newdata = df_income))
# sampling
set.seed(101)
sample = sample.split(df_income$income, SplitRatio = .75)
trainingSet = subset(df_income, sample == TRUE)
testSet = subset(df_income, sample == TRUE)
# isolaate y cariable
Y_train <- trainingSet$income
Y_test <- testSet$income
#isolate x cariable
X_test <- subset(testSet,select=-c(income))
# evalution of svm
library(e1071)
svm_classifier = svm(formula=income ~ .,data=trainingSet,type="C-classification",kernel="radial",scale=TRUE,cost=10)
Y_pred = predict(svm_classifier,newdata= X_test)
confusionMatrix(table(Y_test,Y_pred))
# cross validation
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(trainingSet$income, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into it's 10 pieces
training_fold = trainingSet[-x, ] # training fold = training set minus (-) it's sub test fold
test_fold = trainingSet[x, ] # here we describe the test fold individually
# now apply (train) the classifer on the training_fold
classifier = svm_classifier
Y_pred = predict(svm_classifier,newdata= test_fold[-97])
cm = table(test_fold[, 97], Y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
trainingSet$income <-as.factor(trainingSet$income)
# Visualising the Training set results
plot(svm_classifier,trainingSet,education.num~age)
library(ggplot2)
svm_classifier
table(predicted=svm_classifier$fitted,actual=trainingSet$income)
Here is my code above and the plot below.I could not find the problem why there is only one color background and why there is any red color in the background.
Having a binary Classification problem:
how would be possible to get the Shap Contribution for variables for a Ranger model?
Sample data:
library(ranger)
library(tidyverse)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
I have tried with several libraries(DALEX, shapr, fastshap, shapper) but I didnt get any solution.
I wish getting some result like SHAPforxgboost for xgboost like:
the output of shap.values which is the shap contribution of variables
the shap.plot.summary
Good Morning!,
According to what I have found, you can use ranger() with fastshap() as following:
library(fastshap)
library(ranger)
library(tidyverse)
data(iris)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
x <- df %>% select(-Target)
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
# Prediction wrapper
pfun <- function(object, newdata) {
predict(object, data = newdata)$predictions
}
# Compute fast (approximate) Shapley values using 10 Monte Carlo repetitions
system.time({ # estimate run time
set.seed(5038)
shap <- fastshap::explain(model, X = x, pred_wrapper = pfun, nsim = 10)
})
# Load required packages
library(ggplot2)
theme_set(theme_bw())
# Aggregate Shapley values
shap_imp <- data.frame(
Variable = names(shap),
Importance = apply(shap, MARGIN = 2, FUN = function(x) sum(abs(x)))
)
Then for example, for variable importance, you can do:
# Plot Shap-based variable importance
ggplot(shap_imp, aes(reorder(Variable, Importance), Importance)) +
geom_col() +
coord_flip() +
xlab("") +
ylab("mean(|Shapley value|)")
Also, if you want individual predictions, the following is possible:
# Plot individual explanations
expl <- fastshap::explain(model, X = x ,pred_wrapper = pfun, nsim = 10, newdata = x[1L, ])
autoplot(expl, type = "contribution")
All this information has been found in here, and there is more to it: https://bgreenwell.github.io/fastshap/articles/fastshap.html
Check the link and solve your doubts ! :)
I launched two R packages to perform such tasks: One is "kernelshap" (crunching), the other one is "shapviz" (plotting).
library(randomForest)
library(kernelshap)
Ilibrary(shapviz)
set.seed(1)
fit <- randomForest(Sepal.Length ~ ., data = iris,)
# bg_X is usually a small (50-200 rows) subset of the data
# Step 1: Calculate Kernel SHAP values
s <- kernelshap(fit, iris[-1], bg_X = iris)
# Step 2: Turn them into a shapviz object
sv <- shapviz(s)
# Step 3: Gain insights...
sv_importance(sv, show_numbers = TRUE)
sv_dependence(sv, v = "Petal.Length", color_var = "auto")
My dataset contains 5851 observations, and is split into a train (3511 observations) and test (2340 observations) set. I now want to train a model using KNN, with two variables. I want to do 10-fold CV, repeated 5 times, using ROC metric and the one-standard error rule and the variables are preprocessed. The code is shown below.
set.seed(44780)
ctrl_repcvSE <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
summaryFunction = twoClassSummary, classProbs = TRUE,
selectionFunction = "oneSE")
tune_grid <- expand.grid(k = 45:75)
mod4 <- train(purchased ~ total_policies + total_contrib,
data = mhomes_train, method = "knn",
trControl= ctrl_repcvSE, metric = "ROC",
tuneGrid = tune_grid, preProcess = c("center", "scale"))
The problem I have is that I already have tried so many different values of K (e.g., K = 10:20, 30:40, 50:60, 150:160 + different tuning lengths. However, every time the output says that the chosen value for K is the one which is last, so for example for values of K = 70:80, the chosen value for K = 80, every time I do this. This means I should look further, because if the chosen value is K in that case then there are better values of K available which are above 80. How should I eventually find this one?
The assignment only specifies: For k-nearest neighbours, explore reasonable values of k using the total_policies and total_contrib variables only.
Welcome to Stack Overflow. Your question isn't easy to answer.
For k-nearest neighbours I use another function knn3 part of the caret library.
I'll give an example using the iris dataset. We try to get the accuracy of our model for different values for k and plot those accuracies.
library(data.table)
library(tidyverse)
library(scales)
library(caret)
dt <- as.data.table(iris)
# converting and scaling data ----
dt$Species <- dt$Species %>% as.factor()
dt$Sepal.Length <- dt$Sepal.Length %>% scale()
dt$Sepal.Width <- dt$Sepal.Width %>% scale()
dt$Petal.Length <- dt$Petal.Length %>% scale()
dt$Petal.Width <- dt$Petal.Width %>% scale()
# remove in the real run ----
set.seed(1234567)
# split data into train and test - 3:1 ----
train_index <- createDataPartition(dt$Species, p = 0.75, list = FALSE)
train <- dt[train_index, ]
test <- dt[-train_index, ]
# values to check for k ----
K_VALUES <- 20:1
test_acc <- numeric(0)
train_acc <- numeric(0)
# calculate different models for each value of k ----
for (x in K_VALUES){
model <- knn3(Species ~ ., data = train, k = x)
pred_test <- predict(model, test, type = "class")
pred_test_acc <- confusionMatrix(table(pred_test,
test$Species))$overall["Accuracy"]
test_acc <- c(test_acc, pred_test_acc)
pred_train <- predict(model, train, type = "class")
pred_train_acc <- confusionMatrix(table(pred_train,
train$Species))$overall["Accuracy"]
train_acc <- c(train_acc, pred_train_acc)
}
data <- data.table(x = K_VALUES, train = train_acc, test = test_acc)
# plot a validation curve ----
plot_data <- gather(data, "type", "value", -x)
g <- qplot(x = x,
y = value,
data = plot_data,
color = type,
geom = "path",
xlim = c(max(K_VALUES),min(K_VALUES)-1))
print(g)
Now find a k with a good accuracy for your test data. That's the value you're looking for.
Disclosure: That's simplified but this approach should help you solving your problem.
I am currently using genomic expression levels, age, and smoking intensity levels to predict the number of days Lung Cancer Patients have to live. I have a small amount of data; 173 patients and 20,438 variables, including gene expression levels (which make up for 20,436). I have split up my data into test and training, utilizing an 80:20 ratio. There are no missing values in the data.
I am using knn() to train the model. Here is what the code looks like:
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
Nothing seems out of the ordinary until you notice that k=1. "Why is k=1?" you may ask. The reason k=1 is because when k=1, the model is the most accurate. This makes no sense to me. There are quite a few concerns:
I am using knn() to predict a continuous variable. I should be using something along the lines of, cox maybe.
The model is waaaaaaay too accurate. Here are a few examples of the test answer and the model's predictions. For the first patient, the number of days to death is 274. The model predicts 268. For the second patient, test: 1147, prediction: 1135. 3rd, test: 354, prediction: 370. 4th, test: 995, prediction 995. How is this possible? Out of the entire test data, the model was only off by and average of 9.0625 days! The median difference was 7 days, and the mode was 6 days. Here is a graph of the results:
Bar Graph.
So I guess my main question is what does knn() do, what does k represent, and how is the model so accurate when k=1? Here is my entire code (I am unable to attach the actual data):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
library(caret)
# Gather the data and store it in variables
LUAD <- read.csv('/Users/username/Documents/ClinicalData.csv')
geneData <- read.csv('/Users/username/Documents/GenomicExpressionLevelData.csv')
geneData <- data.frame(geneData)
row.names(geneData) = geneData$X
geneData <- geneData[2:514]
colNamesGeneData <- gsub(".","-",colnames(geneData),fixed = TRUE)
colnames(geneData) = colNamesGeneData
# Organize the data
# Important columns are 148 (smoking), 123 (OS Month, basically how many days old), and the gene data. And column 2 (barcode).
LUAD = data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)[complete.cases(data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)), ]
rownames(LUAD)=LUAD$LUAD.patient
LUAD <- LUAD[2:4]
# intersect(rownames(LUAD),colnames(geneData))
# ind=which(colnames(geneData)=="TCGA-778-7167-01A-11R-2066-07")
gene_expression=geneData[, rownames(LUAD)]
# Merge the two datasets to use the geneomic expression levels in your model
LUAD <- data.frame(LUAD,t(gene_expression))
LUAD.days_to_death <- LUAD[,3]
LUAD <- LUAD[,c(1:2,4:20438)]
LUAD <- data.frame(LUAD.days_to_death,LUAD)
set.seed(401)
# Number of Rows in the training data (createDataPartition(dataSet, percentForTraining, boolReturnAsList))
trainRowNum <- createDataPartition(LUAD$LUAD.days_to_death, p=0.8, list=FALSE)
# Training/Test Dataset
trainData <- LUAD[trainRowNum, ]
testData <- LUAD[-trainRowNum, ]
x = trainData[, c(2:20438)]
y = trainData$LUAD.days_to_death
v = testData[, c(2:20438)]
w = testData$LUAD.days_to_death
# Imputing missing values into the data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
library(RANN)
if (anyNA(trainData)) {
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
}
anyNA(trainData)
# Normalizing the data
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$LUAD.days_to_death <- y
apply(trainData[,1:20438], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
preProcess_range_model_Test <- preProcess(testData, method='range')
testData <- predict(preProcess_range_model_Test, newdata = testData)
testData$LUAD.days_to_death <- w
apply(testData[,1:20438], 2, FUN=function(v){c('min'=min(v), 'max'=max(v))})
# To uncomment, select the text and press 'command' + 'shift' + 'c'
# set.seed(401)
# options(warn=-1)
# subsets <- c(1:10)
# ctrl <- rfeControl(functions = rfFuncs,
# method = "repeatedcv",
# repeats = 5,
# verbose = TRUE)
# lmProfile <- rfe(x=trainData[1:20437], y=trainAnswers,
# sizes = subsets,
# rfeControl = ctrl)
# lmProfile
trainAnswers <- trainData[,1]
testAnswers <- testData[,1]
library(class)
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
#install.packages("plotly")
library(plotly)
Test_Question_Number <- c(1:32)
prediction2 <- data.frame(prediction[1:32])
prediction2 <- as.numeric(as.vector(prediction2[c(1:32),]))
data <- data.frame(Test_Question_Number, prediction2, testAnswers)
names(data) <- c("Test Question Number","Prediction","Answer")
p <- plot_ly(data, x = ~Test_Question_Number, y = ~prediction2, type = 'bar', name = 'Prediction') %>%
add_trace(y = ~testAnswers, name = 'Answer') %>%
layout(yaxis = list(title = 'Days to Death'), barmode = 'group')
p
merge <- data.frame(prediction2,testAnswers)
difference <- abs((merge[,1])-(merge[,2]))
difference <- sort(difference)
meanDifference <- mean(difference)
medianDifference <- median(difference)
modeDifference <- names(table(difference))[table(difference)==max(table(difference))]
cat("Mean difference:", meanDifference, "\n")
cat("Median difference:", medianDifference, "\n")
cat("Mode difference:", modeDifference,"\n")
Lastly, for clarification purposes, ClinicalData.csv is the age, days to death, and smoking intensity data. The other .csv is the genomic expression data. The data above line 29 doesn't really matter, so you can just skip to the part of the code where it says "set.seed(401)".
Edit: Some samples of the data:
days_to_death OS_MONTHS
121 3.98
NACC1 2001.5708 2363.8063 1419.879
NACC2 58.2948 61.8157 43.4386
NADK 706.868 1053.4424 732.1562
NADSYN1 1628.7634 912.1034 638.6471
NAE1 832.8825 793.3014 689.7123
NAF1 140.3264 165.4858 186.355
NAGA 1523.3441 1524.4619 1858.9074
NAGK 983.6809 899.869 1168.2003
NAGLU 621.3457 510.9453 1172.511
NAGPA 346.9762 257.5654 275.5533
NAGS 460.7732 107.2116 321.9763
NAIF1 217.1219 202.5108 132.3054
NAIP 101.2305 87.8942 77.261
NALCN 13.9628 36.7031 48.0809
NAMPT 3245.6584 1257.8849 5465.6387
Because K = 1 is the most complex knn model. It has the most flexible decision boundary. It creates an overfit. It will perform well within the training data by poorly on a holdout set (but not always).
I'm using a R package called logistf to make a Logistc Regression and I saw that there's no predict function for new data in this package and predict package does not work with this, so I found a code that show how making this with new data:
fit<-logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data=trainData)
betas <- coef(fit)
X <- model.matrix(fit, data=testData)
probs <- 1 / (1 + exp(-X %*% betas))
I want to make a cross validation version with this using fit$predict and the probabilities that probs generate for me. Has anyone ever done something like this before?
Other thing that I want to know is about fit$predict I'm making a binary logistic regression, and this function returns many values, are these values from class 0 or 1, how can I know this? Thanks
While the code that you wrote works perfectly, there is a concise way of getting the same results seemingly:
brglm_model <- brglm(formula = response ~ predictor , family = "binomial", data = train )
brglm_pred <- predict(object = brglm_model, newdata = test , type = "response")
About the CV, you have to write a few lines of code I guess:
#Setting the number of folds, and number of instances in each fold
n_folds <- 5
fold_size <- nrow(dataset) %/% 5
residual <- nrow(dataset) %% 5
#label the instances based on the number of folds
cv_labels <- c(rep(1,fold_size),rep(2,fold_size), rep(3,fold_size), rep(4,fold_size), rep(5,fold_size), rep(5,residual))
# the error term would differ based on each threshold value
t_seq <- seq(0.1,0.9,by = 0.1)
index_mat <- matrix(ncol = (n_folds+1) , nrow = length(t_seq))
index_mat[,1] <- t_seq
# the main loop for calculation of the CV error on each fold
for (i in 1:5){
train <- dataset %>% filter(cv_labels != i)
test <- dataset %>% filter(cv_labels == i )
brglm_cv_model <- brglm(formula = response_var ~ . , family = "binomial", data = train )
brglm_cv_pred <- predict(object = brglm_model, newdata = test , type = "response")
# error formula that you want, e.g. misclassification
counter <- 0
for (treshold in t_seq ) {
counter <- counter + 1
conf_mat <- table( factor(test$response_var) , factor(brglm_cv_pred>treshold, levels = c("FALSE","TRUE") ))
sen <- conf_mat[2,2]/sum(conf_mat[2,])
# other indices can be computed as follows
#spec <- conf_mat[1,1]/sum(conf_mat[1,])
#prec <- conf_mat[2,2]/sum(conf_mat[,2])
#F1 <- (2*prec * sen)/(prec+sen)
#accuracy <- (conf_mat[1,1]+conf_mat[2,2])/sum(conf_mat)
#here I am only interested in sensitivity
index_mat[counter,(i+1)] <- sen
}
}
# final data.frame would be the mean of sensitivity over each threshold value
final_mat <- matrix(nrow = length(t_seq), ncol = 2 )
final_mat[,1] <- t_seq
final_mat[,2] <- apply(X = index_mat[,-1] , MARGIN = 1 , FUN = mean)
final_mat <- data.frame(final_mat)
colnames(final_mat) <- c("treshold","sensitivity")
#why not having a look at the CV-sensitivity of the model over threshold values?
ggplot(data = final_mat) +
geom_line(aes(x = treshold, y = sensitivity ), color = "blue")