I'm confused while calculating the train data prediction and the test data prediction, since the value for each feature is being calculated as the same and the test accuracy is not very accurate.
library(dbplyr)
library(tidyverse)
library(caret)
# Placeholders for test & train accuracy
train_Data_prediction=rep(1,100)
test_Data_prediction=rep(1,100)
# Execute 100 times and later average the accuracy
for(count in c(1:100))
{
data_train <- read.csv("parktraining.csv",FALSE,",")
data_train = as.matrix(data_train)
x_index=ncol(data_train)
data_Without_lable <- data_train[,-x_index]
lable <- data_train[,x_index]
Train_mean = apply(data_Without_lable,2,mean)
Train_sd = apply(data_Without_lable,2,sd)
Train_offsets <- t(t(data_Without_lable) - Train_mean)
Train_scaled_data <- t(t(Train_offsets) / Train_sd)
positive_ids = which(data_train[,x_index] == 1)
negative_ids = which(data_train[,x_index] == 0)
positive_data = Train_scaled_data[positive_ids,]
negative_data = Train_scaled_data[negative_ids,]
pos_Mean = apply(positive_data,2,mean)
positive_sd=apply(positive_data,2,sd)
neg_Mean = apply(negative_data,2,mean)
negative_sd=apply(negative_data,2,sd)
tested_data <- read.csv("parktesting.csv",FALSE,",")
tested_data = as.matrix(tested_data)
testing_data = tested_data[,-x_index]
predict=function(testing_data_row){
target=0;
# Use dnorm() for the normal densities and compute the log-probability of each class
p_pos=sum(log(dnorm(testing_data_row,pos_Mean,positive_sd))) +
  log(length(positive_ids)/length(lable))
p_neg=sum(log(dnorm(testing_data_row,neg_Mean,negative_sd))) +
  log(1 - (length(negative_ids)/length(lable)))
if(p_pos>p_neg){
target=1
}else{
target=0
}
}
test_mean = apply(testing_data,2,mean)
test_sd = apply(testing_data,2,sd)
test_offset <- t(t(testing_data) - test_mean)
test_scaled_data <- t(t(test_offset) / test_sd)
test_prediction <- apply(test_scaled_data,1,predict)
target=tested_data[,x_index]
target
test_Data_prediction[count]=length(which((test_prediction == target)==TRUE))/length(target)
test_Data_prediction
#Predict for train data ( optional, output not printed )
train_prediction =apply(Train_scaled_data,1,predict)
train_Data_prediction[count]=length(which((train_prediction == lable)==TRUE))/length(lable)
}
test_Data_prediction
train_Data_prediction
print(paste("Average Train Data Accuracy:",mean(train_Data_prediction)*100.0,sep = " "))
print(paste("Average Test Data Accuracy:",mean(test_Data_prediction)*100.0,sep = " "))
I am trying to plot my SVM classification using the e1071 library. However, the classification plot shows only a single value for my parameters. Even though I change the selected parameters to create the 2D classification plot, it is wrong.
require(caTools)
library(caret)
dataset <-read.csv("income_evaluation.csv")
# fnlwgt column removed since it is not necessary
df_income <- subset(dataset,select=-c(fnlwgt))
# turn binary attribute into 0 and 1
df_income$income <-ifelse(df_income$income==" >50K",1,0)
df_income$native.country
apply(X=df_income,2,FUN=function(x) length(which(x==' ?')))
# handling missing values
#define function to calculate mode
find_mode <- function(x) {
u <- unique(x)
tab <- tabulate(match(x, u))
u[tab == max(tab)]
}
mod_workclass_df = find_mode(df_income$workclass)
mod_occupation_df = find_mode(df_income$occupation)
mod_country_df = find_mode(df_income$native.country)
# replacing the missing values with the mod values
df_income$workclass[df_income$workclass == ' ?'] <- mod_workclass_df
df_income$occupation[df_income$occupation == ' ?'] <- mod_occupation_df
df_income$native.country[df_income$native.country == ' ?'] <- mod_country_df
# one hot encoding for train set
dmy <- dummyVars(" ~ .", data = df_income, fullRank = T)
df_income <- data.frame(predict(dmy, newdata = df_income))
# sampling
set.seed(101)
sample = sample.split(df_income$income, SplitRatio = .75)
trainingSet = subset(df_income, sample == TRUE)
testSet = subset(df_income, sample == FALSE)
# isolate y variable
Y_train <- trainingSet$income
Y_test <- testSet$income
# isolate x variable
X_test <- subset(testSet,select=-c(income))
# evaluation of svm
library(e1071)
svm_classifier = svm(formula=income ~ .,data=trainingSet,type="C-classification",kernel="radial",scale=TRUE,cost=10)
Y_pred = predict(svm_classifier,newdata= X_test)
confusionMatrix(table(Y_test,Y_pred))
# cross validation
# in creating the folds we specify the target feature (dependent variable) and # of folds
folds = createFolds(trainingSet$income, k = 10)
# in cv we are going to applying a created function to our 'folds'
cv = lapply(folds, function(x) { # start of function
# in the next two lines we will separate the Training set into its 10 pieces
training_fold = trainingSet[-x, ] # training fold = training set minus (-) its sub test fold
test_fold = trainingSet[x, ] # here we describe the test fold individually
# now train the classifier on the training_fold
classifier = svm(formula=income ~ .,data=training_fold,type="C-classification",kernel="radial",scale=TRUE,cost=10)
Y_pred = predict(classifier,newdata= test_fold[-97])
cm = table(test_fold[, 97], Y_pred)
accuracy = (cm[1,1] + cm[2,2]) / (cm[1,1] + cm[2,2] + cm[1,2] + cm[2,1])
return(accuracy)
})
accuracy = mean(as.numeric(cv))
accuracy
trainingSet$income <-as.factor(trainingSet$income)
# Visualising the Training set results
plot(svm_classifier,trainingSet,education.num~age)
library(ggplot2)
svm_classifier
table(predicted=svm_classifier$fitted,actual=trainingSet$income)
Here is my code above and the plot below. I could not find the problem: why is there only one background color, and why isn't there any red color in the background?
I’m trying to replicate the deep learning example below with the same Boston housing dataset from another source.
https://jjallaire.github.io/deep--with-r-notebooks/notebooks/3.6-predicting-house-prices.nb.html
Originally the data source is:
library(keras)
dataset <- dataset_boston_housing()
Alternatively I try to use:
library(mlbench)
data(BostonHousing)
The differences between the datasets are:
the dataset from mlbench contains column names.
the dataset from keras is already split between test and train.
the set from keras is organised with lists containing matrices while the dataset from mlbench is a dataframe
the fourth column contains a categorical variable "chas" which could not be preprocessed from the mlbench dataset while it can be preprocessed from the keras dataset. To compare apples with apples I have deleted this column from both datasets.
In order to compare both datasets I have merged the train and test set from keras into one dataset. After this I have compared the merged dataset from keras with the one from mlbench using summary(), and they are identical for every feature (min, max, median, mean).
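For reference, this is roughly how I did the merge and comparison (a quick sketch; it drops the "chas" column and the target from both sources, as described above):
library(keras)
library(mlbench)
# Merge the keras train and test splits back into one feature matrix
dataset <- dataset_boston_housing()
c(c(train_data, train_targets), c(test_data, test_targets)) %<-% dataset
keras_features <- rbind(train_data, test_data)[, -4]   # column 4 is "chas"
# mlbench version without the target ("medv") and "chas"
data(BostonHousing)
mlbench_features <- subset(BostonHousing, select = -c(medv, chas))
# Compare the per-feature summaries (the keras matrix has no column names)
summary(keras_features)
summary(mlbench_features)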
Since the dataset from keras is already split between test and train (80-20), I can only use one training set for the deep learning process. This training set gives a validation_mae of around 2.5. See this graph:
If I partition the data from mlbench at 0.8 to construct a training set of similar size, run the deep learning code and do this several times, I never reach a validation_mae of around 2.5. The range is between 4 and 6. An example of the output is this graph:
Does someone know what can be the cause for this difference?
Code with dataset from keras:
library(keras)
dataset <- dataset_boston_housing()
c(c(train_data, train_targets), c(test_data, test_targets)) %<-% dataset
train_data <- train_data[,-4]
test_data <- test_data[,-4]
mean <- apply(train_data, 2, mean)
std <- apply(train_data, 2, sd)
train_data <- scale(train_data, center = mean, scale = std)
test_data <- scale(test_data, center = mean, scale = std)
# After this line the code is the same for both code examples.
# =========================================
# Because we will need to instantiate the same model multiple times,
# we use a function to construct it.
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu",
input_shape = dim(train_data)[[2]]) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1)
model %>% compile(
optimizer = "rmsprop",
loss = "mse",
metrics = c("mae")
)
}
k <- 4
indices <- sample(1:nrow(train_data))
folds <- cut(1:length(indices), breaks = k, labels = FALSE)
num_epochs <- 100
all_scores <- c()
for (i in 1:k) {
cat("processing fold #", i, "\n")
# Prepare the validation data: data from partition # k
val_indices <- which(folds == i, arr.ind = TRUE)
val_data <- train_data[val_indices,]
val_targets <- train_targets[val_indices]
# Prepare the training data: data from all other partitions
partial_train_data <- train_data[-val_indices,]
partial_train_targets <- train_targets[-val_indices]
# Build the Keras model (already compiled)
model <- build_model()
# Train the model (in silent mode, verbose=0)
model %>% fit(partial_train_data, partial_train_targets,
epochs = num_epochs, batch_size = 1, verbose = 0)
# Evaluate the model on the validation data
results <- model %>% evaluate(val_data, val_targets, verbose = 0)
all_scores <- c(all_scores, results$mean_absolute_error)
}
all_scores
mean(all_scores)
# Some memory clean-up
k_clear_session()
num_epochs <- 500
all_mae_histories <- NULL
for (i in 1:k) {
cat("processing fold #", i, "\n")
# Prepare the validation data: data from partition # k
val_indices <- which(folds == i, arr.ind = TRUE)
val_data <- train_data[val_indices,]
val_targets <- train_targets[val_indices]
# Prepare the training data: data from all other partitions
partial_train_data <- train_data[-val_indices,]
partial_train_targets <- train_targets[-val_indices]
# Build the Keras model (already compiled)
model <- build_model()
# Train the model (in silent mode, verbose=0)
history <- model %>% fit(
partial_train_data, partial_train_targets,
validation_data = list(val_data, val_targets),
epochs = num_epochs, batch_size = 1, verbose = 1
)
mae_history <- history$metrics$val_mean_absolute_error
all_mae_histories <- rbind(all_mae_histories, mae_history)
}
average_mae_history <- data.frame(
epoch = seq(1:ncol(all_mae_histories)),
validation_mae = apply(all_mae_histories, 2, mean)
)
library(ggplot2)
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + geom_line()
Code with dataset from mlbench (after the line with "=====", the code is the same as in the code above):
library(dplyr)
library(mlbench)
library(groupdata2)
data(BostonHousing)
parts <- partition(BostonHousing, p = 0.2)
test_data <- parts[[1]]
train_data <- parts[[2]]
train_targets <- train_data$medv
test_targets <- test_data$medv
train_data$medv <- NULL
test_data$medv <- NULL
train_data$chas <- NULL
test_data$chas <- NULL
mean <- apply(train_data, 2, mean)
std <- apply(train_data, 2, sd)
train_data <- scale(train_data, center = mean, scale = std)
test_data <- scale(test_data, center = mean, scale = std)
library(keras)
# After this line the code is the same for both code examples.
# =========================================
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu",
input_shape = dim(train_data)[[2]]) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1)
model %>% compile(
optimizer = "rmsprop",
loss = "mse",
metrics = c("mae")
)
}
k <- 4
indices <- sample(1:nrow(train_data))
folds <- cut(1:length(indices), breaks = k, labels = FALSE)
num_epochs <- 100
all_scores <- c()
for (i in 1:k) {
cat("processing fold #", i, "\n")
# Prepare the validation data: data from partition # k
val_indices <- which(folds == i, arr.ind = TRUE)
val_data <- train_data[val_indices,]
val_targets <- train_targets[val_indices]
# Prepare the training data: data from all other partitions
partial_train_data <- train_data[-val_indices,]
partial_train_targets <- train_targets[-val_indices]
# Build the Keras model (already compiled)
model <- build_model()
# Train the model (in silent mode, verbose=0)
model %>% fit(partial_train_data, partial_train_targets,
epochs = num_epochs, batch_size = 1, verbose = 0)
# Evaluate the model on the validation data
results <- model %>% evaluate(val_data, val_targets, verbose = 0)
all_scores <- c(all_scores, results$mean_absolute_error)
}
all_scores
mean(all_scores)
# Some memory clean-up
k_clear_session()
num_epochs <- 500
all_mae_histories <- NULL
for (i in 1:k) {
cat("processing fold #", i, "\n")
# Prepare the validation data: data from partition # k
val_indices <- which(folds == i, arr.ind = TRUE)
val_data <- train_data[val_indices,]
val_targets <- train_targets[val_indices]
# Prepare the training data: data from all other partitions
partial_train_data <- train_data[-val_indices,]
partial_train_targets <- train_targets[-val_indices]
# Build the Keras model (already compiled)
model <- build_model()
# Train the model (in silent mode, verbose=0)
history <- model %>% fit(
partial_train_data, partial_train_targets,
validation_data = list(val_data, val_targets),
epochs = num_epochs, batch_size = 1, verbose = 1
)
mae_history <- history$metrics$val_mean_absolute_error
all_mae_histories <- rbind(all_mae_histories, mae_history)
}
average_mae_history <- data.frame(
epoch = seq(1:ncol(all_mae_histories)),
validation_mae = apply(all_mae_histories, 2, mean)
)
library(ggplot2)
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + geom_line()
Thank you!
Writing here because I can't comment...
I checked the mlbench dataset here and it says that it contains the 14 columns of the original Boston data set and 5 additional columns. Not sure whether you might have a faulty dataset, because you state that there are no differences in the column counts of the datasets.
Another guess might be that the second example graph is from a model which is stuck in a local minimum. To get more comparable models, you might want to work with the same seeds to make sure that the initialisations of the weights etc. are the same, so you get the same results.
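For example, something along these lines before building the models (treat this as a sketch; exact seed handling depends on your keras/tensorflow versions):
# Fix R's RNG so sample()-based shuffling/partitioning is reproducible
set.seed(123)
# Fix the TensorFlow RNG so the weight initialisations are reproducible;
# set_random_seed() exists in recent versions of the R tensorflow package,
# while older keras versions offered use_session_with_seed() instead
library(tensorflow)
set_random_seed(123)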
Hope this helps and feel free to ask.
I'm using an R package called logistf to fit a logistic regression, and I saw that there's no predict function for new data in this package (the usual predict() does not work with it), so I found code that shows how to do this with new data:
fit<-logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data=trainData)
betas <- coef(fit)
X <- model.matrix(fit, data=testData)
probs <- 1 / (1 + exp(-X %*% betas))
I want to make a cross-validation version of this using fit$predict and the probabilities that probs generates for me. Has anyone ever done something like this before?
Another thing I want to know is about fit$predict: I'm fitting a binary logistic regression and this function returns many values. Are these values for class 0 or class 1, and how can I know this? Thanks
While the code that you wrote works perfectly, there is seemingly a more concise way of getting the same results:
library(brglm)
brglm_model <- brglm(formula = response ~ predictor , family = "binomial", data = train )
brglm_pred <- predict(object = brglm_model, newdata = test , type = "response")
About the CV, you have to write a few lines of code I guess:
library(dplyr)    # for filter()
library(ggplot2)  # for the plot at the end
#Setting the number of folds, and number of instances in each fold
n_folds <- 5
fold_size <- nrow(dataset) %/% n_folds
residual <- nrow(dataset) %% n_folds
#label the instances based on the number of folds
cv_labels <- c(rep(1,fold_size),rep(2,fold_size), rep(3,fold_size), rep(4,fold_size), rep(5,fold_size), rep(5,residual))
# the error term would differ based on each threshold value
t_seq <- seq(0.1,0.9,by = 0.1)
index_mat <- matrix(ncol = (n_folds+1) , nrow = length(t_seq))
index_mat[,1] <- t_seq
# the main loop for calculation of the CV error on each fold
for (i in 1:5){
train <- dataset %>% filter(cv_labels != i)
test <- dataset %>% filter(cv_labels == i )
brglm_cv_model <- brglm(formula = response_var ~ . , family = "binomial", data = train )
brglm_cv_pred <- predict(object = brglm_cv_model, newdata = test , type = "response")
# error formula that you want, e.g. misclassification
counter <- 0
for (treshold in t_seq ) {
counter <- counter + 1
conf_mat <- table( factor(test$response_var) , factor(brglm_cv_pred>treshold, levels = c("FALSE","TRUE") ))
sen <- conf_mat[2,2]/sum(conf_mat[2,])
# other indices can be computed as follows
#spec <- conf_mat[1,1]/sum(conf_mat[1,])
#prec <- conf_mat[2,2]/sum(conf_mat[,2])
#F1 <- (2*prec * sen)/(prec+sen)
#accuracy <- (conf_mat[1,1]+conf_mat[2,2])/sum(conf_mat)
#here I am only interested in sensitivity
index_mat[counter,(i+1)] <- sen
}
}
# final data.frame would be the mean of sensitivity over each threshold value
final_mat <- matrix(nrow = length(t_seq), ncol = 2 )
final_mat[,1] <- t_seq
final_mat[,2] <- apply(X = index_mat[,-1] , MARGIN = 1 , FUN = mean)
final_mat <- data.frame(final_mat)
colnames(final_mat) <- c("treshold","sensitivity")
#why not having a look at the CV-sensitivity of the model over threshold values?
ggplot(data = final_mat) +
geom_line(aes(x = treshold, y = sensitivity ), color = "blue")
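If you would rather stay with logistf itself, the same fold loop can reuse the manual prediction trick from your question. A rough, untested sketch (same formula and data frame names as above):
library(logistf)
library(dplyr)
n_folds <- 5
cv_labels <- sample(rep(1:n_folds, length.out = nrow(dataset)))
cv_probs <- vector("list", n_folds)
for (i in 1:n_folds){
  train <- dataset %>% filter(cv_labels != i)
  test <- dataset %>% filter(cv_labels == i)
  # Fit the Firth-penalised model on the training fold
  cv_fit <- logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data = train)
  # Manual prediction on the held-out fold, exactly as in the question
  betas <- coef(cv_fit)
  X <- model.matrix(~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data = test)
  # predicted probability of the "1" (second) level of Tax for each test row
  cv_probs[[i]] <- 1 / (1 + exp(-X %*% betas))
}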
I am using the car evaluation dataset from UCI and am trying to use SVM classification on it. After model creation, when I calculate the accuracy using the confusion matrix, I get the same accuracy every time, even if I change the parameters of the SVM. Posting my code below.
require("e1071");
#Code to read data from csv and convert to numeric
car_data <- read.csv("car.data.csv",header = TRUE,sep = ",",quote = "\"");
#backup original data to other data frame
car_data_bkp <- car_data;
car_data$buying<-as.numeric(car_data$buying);
car_data$maint<-as.numeric(car_data$maint);
car_data$doors<-as.numeric(car_data$doors);
car_data$persons<-as.numeric(car_data$persons);
car_data$lug_boot<-as.numeric(car_data$lug_boot);
car_data$safety<-as.numeric(car_data$safety);
car_data$class<-as.numeric(car_data$class);
#scaling of data
maxs = apply(car_data, MARGIN = 2, max);
mins = apply(car_data, MARGIN = 2, min);
scaled = as.data.frame(scale(car_data, center = mins, scale = maxs - mins));
#sampling of data for train and testing
trainIndex <- sample(1:nrow(scaled), 0.8 * nrow(scaled));
train <- scaled[trainIndex, ];
test <- scaled[-trainIndex, ];
n <- names(train);
f <- as.formula(paste("class ~", paste(n[!n %in% "class"], collapse = " + ")));
svm_model <- svm(formula=f,train,cross = 2,tolerance= 0.00001, cost = 1000,gamma=1);
summary(svm_model);
svm.pred <- predict(svm_model, test[,-7],type = "class");
table(pred = svm.pred, true = test[,7]);
#calculate accuracy
sum(diag(svm.pred))/sum(svm.pred);
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relation to the target variable. There are two things that I know of:
the continuous variables are binned such that their IV (information value) is maximized (see the small sketch after this list for how I understand IV to be computed)
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
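To make the first criterion concrete, this is how I understand IV to be computed for one candidate binning (a hand-rolled sketch; "good"/"bad" are target = 0/1 and bins is a factor of bucket labels):
# Information value of one candidate binning of a continuous predictor
iv_of_binning <- function(bins, target) {
  tab <- table(bins, target)                  # counts per bucket and class
  dist_bad  <- tab[, "1"] / sum(tab[, "1"])   # share of events (target = 1) per bucket
  dist_good <- tab[, "0"] / sum(tab[, "0"])   # share of non-events (target = 0) per bucket
  woe <- log(dist_good / dist_bad)            # weight of evidence per bucket
  sum((dist_good - dist_bad) * woe)           # IV = sum over buckets
}
# e.g. compare two candidate binnings of the same variable:
# iv_of_binning(cut(df$x, breaks = 5), df$target)
# iv_of_binning(cut(df$x, breaks = quantile(df$x, probs = 0:4/4), include.lowest = TRUE), df$target)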
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woeBinning, which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replaced woe values that you can then pass into your regression instead of Variable_name, for example:
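(Just an illustrative sketch; target_name and dataset_woe are the placeholder names used above, with a 0/1 target.)
# Logistic regression on the woe-transformed predictor
model <- glm(target_name ~ Variable_name.woe.binned, data = dataset_woe, family = binomial)
summary(model)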
For the second point, on the chi-square criterion, the package discretization seems to handle it, but I haven't tested it.
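Untested as well, but it would look roughly like this (chiM() implements ChiMerge and expects the class variable as the last column of the data frame it is given; the names are the same placeholders as above):
library(discretization)
# ChiMerge (chi-square based) discretisation of one predictor against the target
disc <- chiM(dataset[, c("Variable_name", "target_name")], alpha = 0.05)
disc$cutp              # the chosen cut points for Variable_name
head(disc$Disc.data)   # the data with Variable_name replaced by bin indices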
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code (a small sketch of that idea follows after the snippet below). You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n = 5)
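And for the rpart idea mentioned above, one way to let a classification tree pick the break points for a single predictor is something like this (untested sketch; x and target are placeholder column names in dfrm):
library(rpart)
# Let rpart choose split points for one continuous predictor against the 0/1 target
tree <- rpart(as.factor(target) ~ x, data = dfrm, method = "class",
              control = rpart.control(cp = 0.01, minbucket = 50))
# For continuous predictors, the "index" column of tree$splits holds the split values
split_points <- sort(unique(tree$splits[, "index"]))
dfrm$x_binned <- cut(dfrm$x, breaks = c(-Inf, split_points, Inf))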
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf,test))