Why is my model so accurate when using knn(), where k=1? - r

I am currently using genomic expression levels, age, and smoking intensity levels to predict the number of days Lung Cancer Patients have to live. I have a small amount of data; 173 patients and 20,438 variables, including gene expression levels (which make up for 20,436). I have split up my data into test and training, utilizing an 80:20 ratio. There are no missing values in the data.
I am using knn() to train the model. Here is what the code looks like:
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
Nothing seems out of the ordinary until you notice that k=1. "Why is k=1?" you may ask. The reason k=1 is because when k=1, the model is the most accurate. This makes no sense to me. There are quite a few concerns:
I am using knn() to predict a continuous variable. I should be using something along the lines of, cox maybe.
The model is waaaaaaay too accurate. Here are a few examples of the test answer and the model's predictions. For the first patient, the number of days to death is 274. The model predicts 268. For the second patient, test: 1147, prediction: 1135. 3rd, test: 354, prediction: 370. 4th, test: 995, prediction 995. How is this possible? Out of the entire test data, the model was only off by and average of 9.0625 days! The median difference was 7 days, and the mode was 6 days. Here is a graph of the results:
Bar Graph.
So I guess my main question is what does knn() do, what does k represent, and how is the model so accurate when k=1? Here is my entire code (I am unable to attach the actual data):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
library(caret)
# Gather the data and store it in variables
LUAD <- read.csv('/Users/username/Documents/ClinicalData.csv')
geneData <- read.csv('/Users/username/Documents/GenomicExpressionLevelData.csv')
geneData <- data.frame(geneData)
row.names(geneData) = geneData$X
geneData <- geneData[2:514]
colNamesGeneData <- gsub(".","-",colnames(geneData),fixed = TRUE)
colnames(geneData) = colNamesGeneData
# Organize the data
# Important columns are 148 (smoking), 123 (OS Month, basically how many days old), and the gene data. And column 2 (barcode).
LUAD = data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)[complete.cases(data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)), ]
rownames(LUAD)=LUAD$LUAD.patient
LUAD <- LUAD[2:4]
# intersect(rownames(LUAD),colnames(geneData))
# ind=which(colnames(geneData)=="TCGA-778-7167-01A-11R-2066-07")
gene_expression=geneData[, rownames(LUAD)]
# Merge the two datasets to use the geneomic expression levels in your model
LUAD <- data.frame(LUAD,t(gene_expression))
LUAD.days_to_death <- LUAD[,3]
LUAD <- LUAD[,c(1:2,4:20438)]
LUAD <- data.frame(LUAD.days_to_death,LUAD)
set.seed(401)
# Number of Rows in the training data (createDataPartition(dataSet, percentForTraining, boolReturnAsList))
trainRowNum <- createDataPartition(LUAD$LUAD.days_to_death, p=0.8, list=FALSE)
# Training/Test Dataset
trainData <- LUAD[trainRowNum, ]
testData <- LUAD[-trainRowNum, ]
x = trainData[, c(2:20438)]
y = trainData$LUAD.days_to_death
v = testData[, c(2:20438)]
w = testData$LUAD.days_to_death
# Imputing missing values into the data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
library(RANN)
if (anyNA(trainData)) {
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
}
anyNA(trainData)
# Normalizing the data
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$LUAD.days_to_death <- y
apply(trainData[,1:20438], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
preProcess_range_model_Test <- preProcess(testData, method='range')
testData <- predict(preProcess_range_model_Test, newdata = testData)
testData$LUAD.days_to_death <- w
apply(testData[,1:20438], 2, FUN=function(v){c('min'=min(v), 'max'=max(v))})
# To uncomment, select the text and press 'command' + 'shift' + 'c'
# set.seed(401)
# options(warn=-1)
# subsets <- c(1:10)
# ctrl <- rfeControl(functions = rfFuncs,
# method = "repeatedcv",
# repeats = 5,
# verbose = TRUE)
# lmProfile <- rfe(x=trainData[1:20437], y=trainAnswers,
# sizes = subsets,
# rfeControl = ctrl)
# lmProfile
trainAnswers <- trainData[,1]
testAnswers <- testData[,1]
library(class)
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
#install.packages("plotly")
library(plotly)
Test_Question_Number <- c(1:32)
prediction2 <- data.frame(prediction[1:32])
prediction2 <- as.numeric(as.vector(prediction2[c(1:32),]))
data <- data.frame(Test_Question_Number, prediction2, testAnswers)
names(data) <- c("Test Question Number","Prediction","Answer")
p <- plot_ly(data, x = ~Test_Question_Number, y = ~prediction2, type = 'bar', name = 'Prediction') %>%
add_trace(y = ~testAnswers, name = 'Answer') %>%
layout(yaxis = list(title = 'Days to Death'), barmode = 'group')
p
merge <- data.frame(prediction2,testAnswers)
difference <- abs((merge[,1])-(merge[,2]))
difference <- sort(difference)
meanDifference <- mean(difference)
medianDifference <- median(difference)
modeDifference <- names(table(difference))[table(difference)==max(table(difference))]
cat("Mean difference:", meanDifference, "\n")
cat("Median difference:", medianDifference, "\n")
cat("Mode difference:", modeDifference,"\n")
Lastly, for clarification purposes, ClinicalData.csv is the age, days to death, and smoking intensity data. The other .csv is the genomic expression data. The data above line 29 doesn't really matter, so you can just skip to the part of the code where it says "set.seed(401)".
Edit: Some samples of the data:
days_to_death OS_MONTHS
121 3.98
NACC1 2001.5708 2363.8063 1419.879
NACC2 58.2948 61.8157 43.4386
NADK 706.868 1053.4424 732.1562
NADSYN1 1628.7634 912.1034 638.6471
NAE1 832.8825 793.3014 689.7123
NAF1 140.3264 165.4858 186.355
NAGA 1523.3441 1524.4619 1858.9074
NAGK 983.6809 899.869 1168.2003
NAGLU 621.3457 510.9453 1172.511
NAGPA 346.9762 257.5654 275.5533
NAGS 460.7732 107.2116 321.9763
NAIF1 217.1219 202.5108 132.3054
NAIP 101.2305 87.8942 77.261
NALCN 13.9628 36.7031 48.0809
NAMPT 3245.6584 1257.8849 5465.6387

Because K = 1 is the most complex knn model. It has the most flexible decision boundary. It creates an overfit. It will perform well within the training data by poorly on a holdout set (but not always).

Related

How can I resolve the issue of invalid format when trying to plot a ROCR::performance object?

I want to plot ROC curve for my prediction and have this error
Error: Format of predictions is invalid. It couldn't be coerced to a list.
I've already tried to find the answer on google, asked chatgpt but nothing helped.
Variables svm_p and test$stroke both have the same data type and the same length.
If you can think of better way to plot ROC curve feel free to share it.
The code is provided below:
library(caret)
library(dplyr)
library(tidyr)
library(gridExtra)
library(Amelia)
library(naniar)
library(tidyverse)
library(ROCR)
start_time <- Sys.time()
#
#
prepared_dataset <- read_csv('prepared_data.csv')
prepared_dataset <- prepared_dataset[1:500, ]
#
set.seed(42)
# # total number of observations
n_obs <- nrow(prepared_dataset)
#
# # shuffle the dataset randomly
permuted_rows <- sample(n_obs)
#
# # Randomly order data
stroke_shuffled <- prepared_dataset[permuted_rows,]
#
# # Identify row to split on
split <- round(n_obs * 0.8)
#
# # Create train
train <- stroke_shuffled[1:split,]
#
# # Create test
test <- stroke_shuffled[(split + 1):nrow(stroke_shuffled),]
test[, 13]
#check if train is really 80% of the original
nrow(train) / nrow(prepared_dataset)
train$stroke = factor(train$stroke, levels = c(0, 1))
test$stroke = factor(test$stroke, levels = c(0, 1))
myGrid <- expand.grid(
C = c(1, 10, 100),
sigma = c(0.1, 1, 10)
)
svm <- train(
stroke ~ .,
train,
method = "svmRadial",
tuneGrid = myGrid
)
print(svm)
# # accuracy
svm_p <- predict(svm, newdata=test)
confusionMatrix(table(svm_p, test$stroke))
end_time <- Sys.time()
end_time - start_time
pred <- prediction(svm_p, test$stroke)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)

SHAP Importance for Ranger in R

Having a binary Classification problem:
how would be possible to get the Shap Contribution for variables for a Ranger model?
Sample data:
library(ranger)
library(tidyverse)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
I have tried with several libraries(DALEX, shapr, fastshap, shapper) but I didnt get any solution.
I wish getting some result like SHAPforxgboost for xgboost like:
the output of shap.values which is the shap contribution of variables
the shap.plot.summary
Good Morning!,
According to what I have found, you can use ranger() with fastshap() as following:
library(fastshap)
library(ranger)
library(tidyverse)
data(iris)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
x <- df %>% select(-Target)
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
# Prediction wrapper
pfun <- function(object, newdata) {
predict(object, data = newdata)$predictions
}
# Compute fast (approximate) Shapley values using 10 Monte Carlo repetitions
system.time({ # estimate run time
set.seed(5038)
shap <- fastshap::explain(model, X = x, pred_wrapper = pfun, nsim = 10)
})
# Load required packages
library(ggplot2)
theme_set(theme_bw())
# Aggregate Shapley values
shap_imp <- data.frame(
Variable = names(shap),
Importance = apply(shap, MARGIN = 2, FUN = function(x) sum(abs(x)))
)
Then for example, for variable importance, you can do:
# Plot Shap-based variable importance
ggplot(shap_imp, aes(reorder(Variable, Importance), Importance)) +
geom_col() +
coord_flip() +
xlab("") +
ylab("mean(|Shapley value|)")
Also, if you want individual predictions, the following is possible:
# Plot individual explanations
expl <- fastshap::explain(model, X = x ,pred_wrapper = pfun, nsim = 10, newdata = x[1L, ])
autoplot(expl, type = "contribution")
All this information has been found in here, and there is more to it: https://bgreenwell.github.io/fastshap/articles/fastshap.html
Check the link and solve your doubts ! :)
I launched two R packages to perform such tasks: One is "kernelshap" (crunching), the other one is "shapviz" (plotting).
library(randomForest)
library(kernelshap)
Ilibrary(shapviz)
set.seed(1)
fit <- randomForest(Sepal.Length ~ ., data = iris,)
# bg_X is usually a small (50-200 rows) subset of the data
# Step 1: Calculate Kernel SHAP values
s <- kernelshap(fit, iris[-1], bg_X = iris)
# Step 2: Turn them into a shapviz object
sv <- shapviz(s)
# Step 3: Gain insights...
sv_importance(sv, show_numbers = TRUE)
sv_dependence(sv, v = "Petal.Length", color_var = "auto")

How to find the optimal value for K in K-nearest neighbors using R?

My dataset contains 5851 observations, and is split into a train (3511 observations) and test (2340 observations) set. I now want to train a model using KNN, with two variables. I want to do 10-fold CV, repeated 5 times, using ROC metric and the one-standard error rule and the variables are preprocessed. The code is shown below.
set.seed(44780)
ctrl_repcvSE <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
summaryFunction = twoClassSummary, classProbs = TRUE,
selectionFunction = "oneSE")
tune_grid <- expand.grid(k = 45:75)
mod4 <- train(purchased ~ total_policies + total_contrib,
data = mhomes_train, method = "knn",
trControl= ctrl_repcvSE, metric = "ROC",
tuneGrid = tune_grid, preProcess = c("center", "scale"))
The problem I have is that I already have tried so many different values of K (e.g., K = 10:20, 30:40, 50:60, 150:160 + different tuning lengths. However, every time the output says that the chosen value for K is the one which is last, so for example for values of K = 70:80, the chosen value for K = 80, every time I do this. This means I should look further, because if the chosen value is K in that case then there are better values of K available which are above 80. How should I eventually find this one?
The assignment only specifies: For k-nearest neighbours, explore reasonable values of k using the total_policies and total_contrib variables only.
Welcome to Stack Overflow. Your question isn't easy to answer.
For k-nearest neighbours I use another function knn3 part of the caret library.
I'll give an example using the iris dataset. We try to get the accuracy of our model for different values for k and plot those accuracies.
library(data.table)
library(tidyverse)
library(scales)
library(caret)
dt <- as.data.table(iris)
# converting and scaling data ----
dt$Species <- dt$Species %>% as.factor()
dt$Sepal.Length <- dt$Sepal.Length %>% scale()
dt$Sepal.Width <- dt$Sepal.Width %>% scale()
dt$Petal.Length <- dt$Petal.Length %>% scale()
dt$Petal.Width <- dt$Petal.Width %>% scale()
# remove in the real run ----
set.seed(1234567)
# split data into train and test - 3:1 ----
train_index <- createDataPartition(dt$Species, p = 0.75, list = FALSE)
train <- dt[train_index, ]
test <- dt[-train_index, ]
# values to check for k ----
K_VALUES <- 20:1
test_acc <- numeric(0)
train_acc <- numeric(0)
# calculate different models for each value of k ----
for (x in K_VALUES){
model <- knn3(Species ~ ., data = train, k = x)
pred_test <- predict(model, test, type = "class")
pred_test_acc <- confusionMatrix(table(pred_test,
test$Species))$overall["Accuracy"]
test_acc <- c(test_acc, pred_test_acc)
pred_train <- predict(model, train, type = "class")
pred_train_acc <- confusionMatrix(table(pred_train,
train$Species))$overall["Accuracy"]
train_acc <- c(train_acc, pred_train_acc)
}
data <- data.table(x = K_VALUES, train = train_acc, test = test_acc)
# plot a validation curve ----
plot_data <- gather(data, "type", "value", -x)
g <- qplot(x = x,
y = value,
data = plot_data,
color = type,
geom = "path",
xlim = c(max(K_VALUES),min(K_VALUES)-1))
print(g)
Now find a k with a good accuracy for your test data. That's the value you're looking for.
Disclosure: That's simplified but this approach should help you solving your problem.

How to predict test data using a GAM with MRF smooth and neighborhood structure?

I am having a problem using the predict() function for a mgcv::gam (training) model on a new (testing) dataset. The problem arises due to a mrf smooth I have integrated to account for the spatial nature of my data.
I use the following call to create my GAM model
## Run GAM with MRF
m <- gam(crime ~ s(district,k=nrow(traindata),
bs ='mrf',xt=list(nb=nbtrain)), #define MRF smooth
data = traindata,
method = 'REML',
family = scat(), #fit scaled t distribution
gamma = 1.4
)
where I predict the dependent variable crime using the neighbourhood structure, parsed into the model in the smooth term argument xt. The neighbourhood structure comes as a nb object that I created using the poly2nb() function.
Now, if I want to use predict() on a new testing dataset, I don't know how to pass the according neighbourhood structure into the call. Providing just the new data
pred <- predict.gam(m,newdata=testdata)
throws the following error:
Error in predict.gam(m, newdata = testdata) :
7, 16, 20, 28, 35, 36, 37, 43 not in original fit
Here's a full reproduction of the error using the Columbus dataset called from within R directly:
#ERROR REPRODUCTION
## Load packages
require(mgcv)
require(spdep)
require(dplyr)
## Load Columbus Ohio crime data (see ?columbus for details and credits)
data(columb.polys) #Columbus district shapes list
columb.polys <- lapply(columb.polys,na.omit) #omit NAs (unfortunate problem with the Columbus sample data)
data(columb) #Columbus data frame
df <- data.frame(district=numeric(0),x=numeric(0),y= numeric(0)) #Create empty df to store x, y and IDs for each polygon
## Extract x and y coordinates from each polygon and assign district ID
for (i in 1:length(columb.polys)) {
district <- i-1
x <- columb.polys[[i]][,1]
y <- columb.polys[[i]][,2]
df <- rbind(df,cbind(district,x,y)) #Save in df data.frame
}
## Convert df into SpatialPolygons
sp <- df %>%
group_by(district) %>%
do(poly=select(., x, y) %>%Polygon()) %>%
rowwise() %>%
do(polys=Polygons(list(.$poly),.$district)) %>%
{SpatialPolygons(.$polys)}
## Merge SpatialPolygons with data
spdf <- SpatialPolygonsDataFrame(sp,columb)
## Split into training and test sample (80/20 ratio)
splt <- sample(1:2,size=nrow(spdf),replace=TRUE,prob=c(0.8,0.2))
train <- spdf[splt==1,]
test <- spdf[splt==2,]
## Prepapre both samples and create NB objects
traindata <- train#data #Extract data from SpatialPolygonsDataFrame
testdata <- test#data
traindata <- droplevels(as(train, 'data.frame')) #Drop levels
testdata <- droplevels(as(test, 'data.frame'))
traindata$district <- as.factor(traindata$district) #Factorize
testdata$district <- as.factor(testdata$district)
nbtrain <- poly2nb(train, row.names=train$Precinct, queen=FALSE) #Create NB objects for training and test sample
nbtest <- poly2nb(test, row.names=test$Precinct, queen=FALSE)
names(nbtrain) <- attr(nbtrain, "region.id") #Set region.id
names(nbtest) <- attr(nbtest, "region.id")
## Run GAM with MRF
m <- gam(crime ~ s(district, k=nrow(traindata), bs = 'mrf',xt = list(nb = nbtrain)), # define MRF smooth
data = traindata,
method = 'REML', # fast version of REML smoothness selection; alternatively 'GCV.Cp'
family = scat(), #fit scaled t distribution
gamma = 1.4
)
## Run prediction using new testing data
pred <- predict.gam(m,newdata=testdata)
SOLUTION:
I finally found the time to update this post with the solution. Thanks to everyone for helping me out. Here is the code for implementing k-fold CV with a random training-testing split:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = data,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Generate test dataset
testdata <- data[data$weight==0,] #Select test data by weight
#Predict test data
pred <- predict(m,newdata=testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
I have one question criticism with the current solution, being that the full dataset was used to "train" the model meaning that the predictions are going to be biased since the testdata was used to train it.
This only requires a couple minor tweaks to fix:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
#For loop for each fold
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Generate training dataset
trainingdata <- data[data$weight == 1, ] #Select test data by weight
#Generate test dataset
testdata <- data[data$weight == 0, ] #Select test data by weight
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = trainingdata,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Predict test data
pred <- predict(m,newdata = testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
#Get average scores from each k-fold test
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))

Resources