SVM with rolling window - r

I have a code which predict the change in the sign of future returns.
library(quantmod)
library(PerformanceAnalytics)
library(forecast)
library(e1071)
library(caret)
library(kernlab)
library(dplyr)
library(roll)
# get data yahoo finance
getSymbols("^GSPC", from = "1990-01-01", to = "2017-12-01")
# take logreturns
rnull <- CalculateReturns(prices = GSPC$GSPC.Adjusted ,method ="log")
# lags 1, 2, 3, 4, 5 as features
feat <- merge(na.trim(lag(rnull,1)),na.trim(lag(rnull,2)),na.trim(lag(rnull,3)),na.trim(lag(rnull,4)),na.trim(lag(rnull,5)),all=FALSE)
# create dataset. 6th column is actural. Previous is lagged
dataset <- merge(feat,rnull,all=FALSE)
# set columns' names
colnames(dataset) = c("lag.1", "lag.2", "lag.3","lag.4","lag.5","TARGET")
# get signs and make a data.frame
x <- sign(dataset)%>%as.data.frame
# exclude 0 sign and assume that these values are positive
x[x==0] <- 1
# for svm purposes we need to set dependent variable as factor and make levels to interpretation
x$TARGET <- as.factor(as.character(x$TARGET))
levels(x$TARGET) <- list(positive = "1", negative = "-1")
# divide sample to training and test subsamples
trainindex <- x[1:5792,]
testindex <- x[5792:7030,]
# run svm
svmFit <- ksvm(TARGET~.,data=trainindex,type="C-svc",kernel= "rbfdot")
# prediction
predsvm <- predict(svmFit, newdata=testindex)
# results
confusionMatrix(predsvm, testindex$TARGET)
The next thing I am going to do is add a rolling window (1 step forecast) to my model.
However the basic methods as rollapply does not work with dataframe. Commom methods of one step forecast for time-series are also not valid for data.frame used in e1071 package.
I wrote the following function:
svm_next_day_prediction <- function(x){
svmFit <- svm(TARGET~., data=x)
prediction <- predict(object = svmFit, newdata = tail(x,1) )
return(prediction)
}
apl = rollapplyr(data = x, width = 180, FUN = svm_next_day_prediction, by.column = TRUE)
but recieved a error because rollapply does not understand data.frames:
Error in terms.formula(formula, data = data) : '.' in formula and
no 'data' argument
Can you please explain how to apply rolling window for svm classification model with dataframe?

A few points
rollapply works with data frames that can be coerced to a matrix so be sure that your input is entirely numeric -- not a mix of numeric and factor. For example, this works using the built-in data frame BOD which has two numeric columns. Note that x passed to pred is a matrix here.
pred <- function(x) predict(svm(demand ~ Time, x))
rollapplyr(BOD, 3, FUN = pred, by.column = FALSE)
giving
## 1 2 3
## [1,] 8.868888 10.86889 17.25474
## [2,] 11.661666 17.24870 16.00000
## [3,] 18.328435 16.18583 15.78583
## [4,] 16.230474 15.83247 19.56886
I can't reproduce the error you get. I get a different error.
the code in the question has by.column = TRUE (which is the default anyways)
but that has the result of passing only a single vector to the function which
is not what you want. You want by.column = FALSE.
Try this:
x0 <- data.matrix(x)
rollapplyr(data = x0, width = 180, FUN = svm_next_day_prediction, by.column = FALSE)

you can create a list with the individual data frames and then apply your function. I rename x to df to avoid confusion:
df=x
rowwindow=179
dfList=lapply(1:(nrow(df)-rowwindow),function(x) df[x:(rowwindow+x),])
result=sapply(dfList,svm_next_day_prediction)

Related

How to use the replicate function in R to repeat the function

I have a problem when using replicate to repeat the function.
I tried to use the bootstrap to fit
a quadratic model using concentration as the predictor and Total_lignin as the response and going to report an estimate of the maximum with a corresponding standard error.
My idea is to create a function called bootFun that essentially did everything within one iteration of a for loop. bootFun took in only the data set the predictor, and the response to use (both variable names in quotes).
However, the SD is 0, not correct. I do not know where is the wrong place. Could you please help me with it?
# Load the libraries
library(dplyr)
library(tidyverse)
# Read the .csv and only use M.giganteus and S.ravennae.
dat <- read_csv('concentration.csv') %>%
filter(variety == 'M.giganteus' | variety == 'S.ravennae') %>%
arrange(variety)
# Check the data
head(dat)
# sample size
n <- nrow(dat)
# A function to do one iteration
bootFun <- function(dat, pred, resp){
# Draw the sample size from the dataset
sample <- sample_n(dat, n, replace = TRUE)
# A quadratic model fit
formula <- paste0('resp', '~', 'pred', '+', 'I(pred^2)')
fit <- lm(formula, data = sample)
# Derive the max of the value of concentration
max <- -fit$coefficients[2]/(2*fit$coefficients[3])
return(max)
}
max <- bootFun(dat = dat, pred = 'concentration', resp = 'Total_lignin' )
# Iterated times
N <- 5000
# Use 'replicate' function to do a loop
maxs <- replicate(N, max)
# An estimate of the max of predictor and corresponding SE
mean(maxs)
sd(maxs)
Base package boot, function boot, can ease the job of calling the bootstrap function repeatedly. The first argument must be the data set, the second argument is an indices argument, that the user does not set and other arguments can also be passed toit. In this case those other arguments are the predictor and the response names.
library(boot)
bootFun <- function(dat, indices, pred, resp){
# Draw the sample size from the dataset
dat.sample <- dat[indices, ]
# A quadratic model fit
formula <- paste0(resp, '~', pred, '+', 'I(', pred, '^2)')
formula <- as.formula(formula)
fit <- lm(formula, data = dat.sample)
# Derive the max of the value of concentration
max <- -fit$coefficients[2]/(2*fit$coefficients[3])
return(max)
}
N <- 5000
set.seed(1234) # Make the bootstrap results reproducible
results <- boot(dat, bootFun, R = N, pred = 'concentration', resp = 'Total_lignin')
results
#
#ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
#Call:
#boot(data = dat, statistic = bootFun, R = N, pred = "concentration",
# resp = "Total_lignin")
#
#
#Bootstrap Statistics :
# original bias std. error
#t1* -0.4629808 -0.0004433889 0.03014259
#
results$t0 # this is the statistic, not bootstrapped
#concentration
# -0.4629808
mean(results$t) # bootstrap value
#[1] -0.4633233
Note that to fit a polynomial, function poly is much simpler than to explicitly write down the polynomial terms one by one.
formula <- paste0(resp, '~ poly(', pred, ',2, raw = TRUE)')
Check the distribution of the bootstrapped statistic.
op <- par(mfrow = c(1, 2))
hist(results$t)
qqnorm(results$t)
qqline(results$t)
par(op)
Test data
set.seed(2020) # Make the results reproducible
x <- cumsum(rnorm(100))
y <- x + x^2 + rnorm(100)
dat <- data.frame(concentration = x, Total_lignin = y)

Predict segmented lm outside of package

I have an array of outputs from hundreds of segmented linear models (made using the segmented package in R). I want to be able to use these outputs on new data, using the predict function. To be clear, I do not have the segmented linear model objects in my workspace; I just saved and reimported the relevant outputs (e.g. the coefficients and breakpoints). For this reason I can't simply use the predict.segmented function from the segmented package.
Below is a toy example based on this link that seems promising, but does not match the output of the predict.segmented function.
library(segmented)
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) +
15*pmax(zz-0.5,0) + rnorm(100,0,2)
dati <- data.frame(x=xx,y=yy,z=zz)
out.lm<-lm(y~x,data=dati)
o<-## S3 method for class 'lm':
segmented(out.lm,seg.Z=~x,psi=list(x=c(30,60)),
control=seg.control(display=FALSE))
# Note that coefficients with U in the name are differences in slopes, not slopes.
# Compare:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]
# prediction
pred <- data.frame(x = 1:100)
pred$dummy1 <- pmax(pred$x - o$psi[1,2], 0)
pred$dummy2 <- pmax(pred$x - o$psi[2,2], 0)
pred$dummy3 <- I(pred$x > o$psi[1,2]) * (coef(o)[2] + coef(o)[3])
pred$dummy4 <- I(pred$x > o$psi[2,2]) * (coef(o)[2] + coef(o)[3] + coef(o)[4])
names(pred)[-1]<- names(model.frame(o))[-c(1,2)]
# compute the prediction, using standard predict function
# computing confidence intervals further
# suppose that the breakpoints are fixed
pred <- data.frame(pred, predict(o, newdata= pred,
interval="confidence"))
# Try prediction using the predict.segment version to compare
test <- predict.segmented(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0,1, col = "red")
# At least one segment not being predicted correctly?
Can I use the base r predict() function (not the segmented.predict() function) with the coefficients and break points saved from segmented linear models?
UPDATE
I figured out that the code above has issues (don't use it). Through some reverse engineering of the segmented.predict() function, I produced the design matrix and use that to predict values instead of directly using the predict() function. I do not consider this a full answer of the original question yet because predict() can also produce confidence intervals for the prediction, and I have not yet implemented that--question still open for someone to add confidence intervals.
library(segmented)
## Define function for making matrix of dummy variables (this is based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
# This function creates a model matrix with dummy variables for a segmented lm with two breakpoints.
# Inputs:
# x.values: the x values of the segmented lm
# x_names: the name of the column of x values
# psi.est: this is legacy from the predict.segmented function, leave it set to 'TRUE'
# obj: the segmented lm object
# nameU: names (class character) of 3rd and 4th coef, which are "U1.x" "U2.x" for lm with two breaks. Example: names(c(obj$coef[3], obj$coef[4]))
# nameV: names (class character) of 5th and 6th coef, which are "psi1.x" "psi2.x" for lm with two breaks. Example: names(c(obj$coef[5], obj$coef[6]))
# diffSlope: the coefficients (class numeric) with the slope differences; called U1.x and U2.x for lm with two breaks. Example: c(o$coef[3], o$coef[4])
# est.psi: the estimated break points (class numeric); these are the estimated breakpoints from segmented.lm. Example: c(obj$psi[1,2], obj$psi[2,2])
#
n <- length(x.values)
k <- length(est.psi)
PSI <- matrix(rep(est.psi, rep(n, k)), ncol = k)
newZ <- matrix(x.values, nrow = n, ncol = k, byrow = FALSE)
dummy1 <- pmax(newZ - PSI, 0)
if (psi.est) {
V <- ifelse(newZ > PSI, -1, 0)
dummy2 <- if (k == 1)
V * diffSlope
else V %*% diag(diffSlope)
newd <- cbind(x.values, dummy1, dummy2)
colnames(newd) <- c(x_names, nameU, nameV)
} else {
newd <- cbind(x.values, dummy1)
colnames(newd) <- c(x_names, nameU)
}
# if (!x_names %in% names(coef(obj.seg)))
# newd <- newd[, -1, drop = FALSE]
return(newd)
}
## Test dummy matrix function----------------------------------------------
set.seed(12)
xx<-1:100
zz<-runif(100)
yy<-2+1.5*pmax(xx-35,0)-1.5*pmax(xx-70,0)+15*pmax(zz-.5,0)+rnorm(100,0,2)
dati<-data.frame(x=xx,y=yy,z=zz)
out.lm<-lm(y~x,data=dati)
#1 segmented variable, 2 breakpoints: you have to specify starting values (vector) for psi:
o<-segmented(out.lm,seg.Z=~x,psi=c(30,60),
control=seg.control(display=FALSE))
slope(o)
plot.segmented(o)
summary(o)
# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
diffSlope1 <- c(o$coef[3], o$coef[4])
est.psi1 <- c(o$psi[1,2], o$psi[2,2])
test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE,
nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)
# Predict response variable using matrix multiplication
col1 <- matrix(1, nrow = dim(test)[1])
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
predY <- coef(o) %*% t(test)
plot(predY[1,])
lines(predict.segmented(o), col = "blue") # good, predict.segmented gives same answer

Rolling regression and prediction with lm() and predict()

I need to apply lm() to an enlarging subset of my dataframe dat, while making prediction for the next observation. For example, I am doing:
fit model predict
---------- -------
dat[1:3, ] dat[4, ]
dat[1:4, ] dat[5, ]
. .
. .
dat[-1, ] dat[nrow(dat), ]
I know what I should do for a particular subset (related to this question: predict() and newdata - How does this work?). For example to predict the last row, I do
dat1 = dat[1:(nrow(dat)-1), ]
dat2 = dat[nrow(dat), ]
fit = lm(log(clicks) ~ log(v1) + log(v12), data=dat1)
predict.fit = predict(fit, newdata=dat2, se.fit=TRUE)
How can I do this automatically for all subsets, and potentially extract what I want into a table?
From fit, I'd need the summary(fit)$adj.r.squared;
From predict.fit I'd need predict.fit$fit value.
Thanks.
(Efficient) solution
This is what you can do:
p <- 3 ## number of parameters in lm()
n <- nrow(dat) - 1
## a function to return what you desire for subset dat[1:x, ]
bundle <- function(x) {
fit <- lm(log(clicks) ~ log(v1) + log(v12), data = dat, subset = 1:x, model = FALSE)
pred <- predict(fit, newdata = dat[x+1, ], se.fit = TRUE)
c(summary(fit)$adj.r.squared, pred$fit, pred$se.fit)
}
## rolling regression / prediction
result <- t(sapply(p:n, bundle))
colnames(result) <- c("adj.r2", "prediction", "se")
Note I have done several things inside the bundle function:
I have used subset argument for selecting a subset to fit
I have used model = FALSE to not save model frame hence we save workspace
Overall, there is no obvious loop, but sapply is used.
Fitting starts from p, the minimum number of data required to fit a model with p coefficients;
Fitting terminates at nrow(dat) - 1, as we at least need the final column for prediction.
Test
Example data (with 30 "observations")
dat <- data.frame(clicks = runif(30, 1, 100), v1 = runif(30, 1, 100),
v12 = runif(30, 1, 100))
Applying code above gives results (27 rows in total, truncated output for 5 rows)
adj.r2 prediction se
[1,] NaN 3.881068 NaN
[2,] 0.106592619 3.676821 0.7517040
[3,] 0.545993989 3.892931 0.2758347
[4,] 0.622612495 3.766101 0.1508270
[5,] 0.180462206 3.996344 0.2059014
The first column is the adjusted-R.squared value for fitted model, while the second column is the prediction. The first value for adj.r2 is NaN, because the first model we fit has 3 coefficients for 3 data points, hence no sensible statistics is available. The same happens to se as well, as the fitted line has no 0 residuals, so prediction is done without uncertainty.
I just made up some random data to use for this example. I'm calling the object data because that was what it was called in the question at the time that I wrote this solution (call it anything you like).
(Efficient) Solution
data <- data.frame(v1=rnorm(100),v2=rnorm(100),clicks=rnorm(100))
data1 = data[1:(nrow(data)-1), ]
data2 = data[nrow(data), ]
for(i in 3:nrow(data)){
nam <- paste("predict", i, sep = "")
nam1 <- paste("fit", i, sep = "")
nam2 <- paste("summary_fit", i, sep = "")
fit = lm(clicks ~ v1 + v2, data=data[1:i,])
tmp <- predict(fit, newdata=data2, se.fit=TRUE)
tmp1 <- fit
tmp2 <- summary(fit)
assign(nam, tmp)
assign(nam1, tmp1)
assign(nam2, tmp2)
}
All of the results you want will be stored in the data objects this creates.
For example:
> summary_fit10$r.squared
[1] 0.3087432
You mentioned in the comments that you'd like a table of results. You can programmatically create tables of results from the 3 types of output files like this:
rm(data,data1,data2,i,nam,nam1,nam2,fit,tmp,tmp1,tmp2)
frames <- ls()
frames.fit <- frames[1:98] #change index or use pattern matching as needed
frames.predict <- frames[99:196]
frames.sum <- frames[197:294]
fit.table <- data.frame(intercept=NA,v1=NA,v2=NA,sourcedf=NA)
for(i in 1:length(frames.fit)){
tmp <- get(frames.fit[i])
fit.table <- rbind(fit.table,c(tmp$coefficients[[1]],tmp$coefficients[[2]],tmp$coefficients[[3]],frames.fit[i]))
}
fit.table
> fit.table
intercept v1 v2 sourcedf
2 -0.0647017971121678 1.34929652763687 -0.300502017324518 fit10
3 -0.0401617893034109 -0.034750571912636 -0.0843076273486442 fit100
4 0.0132968863522573 1.31283604433593 -0.388846211083564 fit11
5 0.0315113918953643 1.31099122173898 -0.371130010135382 fit12
6 0.149582794027583 0.958692838785998 -0.299479715938493 fit13
7 0.00759688947362175 0.703525856001948 -0.297223988673322 fit14
8 0.219756240025917 0.631961979610744 -0.347851129205841 fit15
9 0.13389223748979 0.560583832333355 -0.276076134872669 fit16
10 0.147258022154645 0.581865844000838 -0.278212722024832 fit17
11 0.0592160359650468 0.469842498721747 -0.163187274356457 fit18
12 0.120640756525163 0.430051839741539 -0.201725012088506 fit19
13 0.101443924785995 0.34966728554219 -0.231560038360121 fit20
14 0.0416637001406594 0.472156988919337 -0.247684504074867 fit21
15 -0.0158319749710781 0.451944113682333 -0.171367482879835 fit22
16 -0.0337969739950376 0.423851304105399 -0.157905431162024 fit23
17 -0.109460218252207 0.32206642419212 -0.055331391802687 fit24
18 -0.100560410735971 0.335862465403716 -0.0609509815266072 fit25
19 -0.138175283219818 0.390418411384468 -0.0873106257144312 fit26
20 -0.106984355317733 0.391270279253722 -0.0560299858019556 fit27
21 -0.0740684978271464 0.385267011513678 -0.0548056844433894 fit28

R code: Rolling regression After stepwise

I have spent whole day today for resolving this.. please help me.
Although I just write very simple example here, my original data has a huge number of variables- about 2,000. Therefore, to run regression I need to pick certain variables.
I do need to develop many models, so I should do this procedure automatically.
I run stepwie.
I don't know how many variables are selected by stepwise.
after selecting variables, I run rolling regression for prediction.
library(car)
library(zoo)
# run regression
m <- lm(mpg~., data=mtcars)
# run stepwise
s<-step(m, direction="both")
# select variables
variable<- attr(s$terms,"term.labels")
b<-paste(dep,paste(s, collapse="+"),sep = "~")
rollapply(mtcars, width = 2,
FUN = function(z) coef(lm(b, data = as.data.frame(z))),
by.column = FALSE, align = "right")
# Here is the automatic model I developed..
models2 <- lapply(1:11, function(x) {
dep<-names(mtcars)[x]
ind<-mtcars[-x]
w<-names(ind)
indep<-paste(dep,paste(w, collapse="+"),sep = "~")
m<-lm(indep,data=mtcars)
s<-step(m, direction="both")
b<-paste(dep,paste(s, collapse="+"),sep = "~")
rollapply(mtcars, width = 2,
FUN = function(z) coef(lm(b, data = as.data.frame(z))),
by.column = FALSE, align = "right")})
I want to calculate prediction from rolling regression..
However, it is very hard to set up
data.frame without pre-knowldege about independent variables..
There is a similar one here, but in this model independent variables are known already.
You do not need to know the independent variables! If you provide a data.frame that contains all variables, the predict function will select the necessary ones. Similar to the post you have linked, you could to this:
mtcars[,"int"] <- seq(nrow(mtcars)) # add variable used to choose newdata
models2 <- lapply(1:11, function(x) {
dep <- names(mtcars)[x]
ind <- mtcars[-x]
w <- names(ind)
form <- paste(dep,paste(w, collapse="+"),sep = "~")
m <- lm(form, data=mtcars)
s <- step(m, direction="both", trace=0) # model selection (don't print trace)
b <- formula(s) # This is clearer than your version
rpl <- rollapply(mtcars, width = 20, # if you use width=2, your model will always be overdetermined
FUN = function(z) {
nextD <- max(z[,'int'])+1 # index of row for new data
fit <- lm(b, data = as.data.frame(z)) # fit the model
c(coef=coef(fit), # coefficients
predicted=predict(fit, newdata=mtcars[nextD,])) # predict using the next row
},
by.column = FALSE, align = "right")
rpl
})

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))

Resources