Different RF predictions when pasting code from different sources - r

I'm playing around with the German Credit dataset from the "caret" package.
First, I build a very simple model:
# Load the modelling packages and the GermanCredit data set.
library(caret)
library(randomForest)
library(pmml)
data(GermanCredit)
# Drop the columns whose indices nearZeroVar() returns.
GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)]
# Remove further indicator columns by name.
GermanCredit$CheckingAccountStatus.lt.0 <- NULL
GermanCredit$SavingsAccountBonds.lt.100 <- NULL
GermanCredit$EmploymentDuration.lt.1 <- NULL
GermanCredit$EmploymentDuration.Unemployed <- NULL
GermanCredit$Personal.Male.Married.Widowed <- NULL
GermanCredit$Property.Unknown <- NULL
GermanCredit$Housing.ForFree <- NULL
# Seed the RNG before partitioning so the train/test split is repeatable.
set.seed(100)
inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]]
GermanCreditTrain <- GermanCredit[ inTrain, ]
GermanCreditTest <- GermanCredit[-inTrain, ]
# Re-seed immediately before fitting the 500-tree random forest on Class.
set.seed(1056)
credit.rf <- randomForest(Class~., data = GermanCreditTrain, ntree = 500)
Now, if I predict the outcome Class on the test set, and do this several times, and then compare the results:
# Predict on the held-out rows three times with the same fitted model.
credit.pred1 <- predict(credit.rf, GermanCreditTest)
credit.pred2 <- predict(credit.rf, GermanCreditTest)
credit.pred3 <- predict(credit.rf, GermanCreditTest)
# Compare the three prediction vectors pairwise.
all.equal(credit.pred1, credit.pred2)
all.equal(credit.pred2, credit.pred3)
all.equal(credit.pred1, credit.pred3)
I get the same predictions for all 3 passes. So far, I have been doing this by manually typing the code into the RStudio interpreter. But if I copy-paste the code from my text editor (which I've posted here: https://gist.github.com/anonymous/32b3c8194362d2e10527), I get an error message saying that there are 3 string differences in the second and third comparisons!
How is this possible?

Try using caret's train function:
credit.rf <- train(Class~., data = GermanCreditTrain, method="rf")
instead of
credit.rf <- randomForest(Class~., data = GermanCreditTrain, ntree = 500)
I was able to reproduce the issue, but I am not sure what is causing it. However, the above seems to work when pasted:
credit.rf <- train(Class~., data = GermanCreditTrain, method="rf")
>
> credit.pred1 <- predict(credit.rf, GermanCreditTest)
> credit.pred2 <- predict(credit.rf, GermanCreditTest)
> credit.pred3 <- predict(credit.rf, GermanCreditTest)
>
> all.equal(credit.pred1, credit.pred2)
[1] TRUE
> all.equal(credit.pred2, credit.pred3)
[1] TRUE
> all.equal(credit.pred1, credit.pred3)
[1] TRUE

Related

Finding the precision, recall and the f1 in R

I want to run several models in a loop and then store their performance metrics in a table. I do not want to use the confusionMatrix function in caret; instead, I want to compute the precision, recall and F1 myself and then store those in a table. Please assist; edits to the code are welcome.
My attempt is below.
library(MASS) # provides the biopsy data set
library(caret)
data("biopsy")
# Drop the ID column and give the remaining columns descriptive names.
biopsy$ID<-NULL
names(biopsy)<-c('clump thickness','uniformity cell size','uniformity cell shape',
'marginal adhesion','single epithelial cell size','bare nuclei',
'bland chromatin','normal nuclei','mitosis','class')
# Count the missing values, drop the incomplete rows, then count again.
sum(is.na(biopsy))
biopsy<-na.omit(biopsy)
sum(is.na(biopsy))
head(biopsy,5)
# Seed the RNG, then split the rows into training and testing sets using the class column.
set.seed(123)
inTraining <- createDataPartition(biopsy$class, p = .75, list = FALSE)
training <- biopsy[ inTraining,]
testing <- biopsy[-inTraining,]
# Resampling setup: repeated cross-validation (number = 10, repeats = 5) with class probabilities.
control <- trainControl(method="repeatedcv", number=10,repeats = 5, verboseIter = F, classProbs = T)
# Rebuild the training data frame, converting character columns to factors.
training<- as.data.frame(unclass(training),
stringsAsFactors = TRUE)
# Rebuild the testing data frame, converting character columns to factors.
testing <- as.data.frame(unclass(testing),
stringsAsFactors = TRUE)
# Models to fit, and a results table with one row per model name.
models<-c("svmRadial","rf")
results_table <- data.frame(models = models, stringsAsFactors = F)
for (i in models){
model_train<-train(class~., data=training, method=i,
trControl=control,metric="Accuracy")
predictions<-predict(model_train, newdata=testing)
# NOTE(review): the whole `testing` data frame is passed as the second argument
# in the next two calls; the "inputs must be factors" error is reported for
# the posPredValue() call.
precision_<-posPredValue(predictions,testing)
recall_<-sensitivity(predictions,testing)
f1<-(2*precision_*recall_)/(precision_+recall_)
# Store the metrics in the results table.
# NOTE(review): `i` is the model name (a string), so it is used as a row name here.
results_table[i, "Precision"] <- precision_
results_table[i, "Recall"] <- recall_
results_table[i, "F1score"] <- f1
}
However, I get an error which says: Error in posPredValue.default(predictions, testing) : inputs must be factors. I do not know where I went wrong, and any edits to my code are welcome.
I know that I could get precision,recall, f1 by just using the code below (B), however this is a tutorial question where I am required not to use the code example below (B):
(B)
# Fit each model named in `models`, predict on the hold-out set, and print
# caret's confusion matrix in precision/recall mode.
for (i in models) {
  model_train <- train(
    class ~ .,
    data = training,
    method = i,
    trControl = control,
    metric = "Accuracy"
  )
  predictions <- predict(model_train, newdata = testing)
  print(
    confusionMatrix(
      data = predictions,
      reference = testing$class,
      mode = "prec_recall"
    )
  )
}
A few things need to happen.
1) You have to change the function calls for posPredValue and sensitivity. For both, change testing to testing$class.
2) For the results_table, i is a word, not a value, so you're assigning results_table["rf", "Precision"] <- precision_ (this makes a new row, where the row name is "rf").
Here is your for statement, with changes to those functions mentioned in 1) and a modification to address the issue in 2).
# For every model name: fit on the training set, predict on the hold-out set,
# compute precision, recall and F1 against the true labels, and write them into
# the row of results_table whose `models` entry matches the current name.
for (i in models) {
  model_train <- train(
    class ~ .,
    data = training,
    method = i,
    trControl = control,
    metric = "Accuracy"
  )
  fit <- model_train
  predictions <- predict(model_train, newdata = testing)

  precision_ <- posPredValue(data = predictions, reference = testing$class)
  recall_ <- sensitivity(data = predictions, reference = testing$class)
  f1 <- 2 * precision_ * recall_ / (precision_ + recall_)

  # Select the row by matching the model name rather than using `i` as a
  # row name, which would append a new row.
  results_table[results_table$models %in% i, "Precision"] <- precision_
  results_table[results_table$models %in% i, "Recall"] <- recall_
  results_table[results_table$models %in% i, "F1score"] <- f1
}
This is what it looks like for me.
results_table
# models Precision Recall F1score
# 1 svmRadial 0.9722222 0.9459459 0.9589041
# 2 rf 0.9732143 0.9819820 0.9775785

preProcess function R

I'm very new to R.
I'm trying to follow the procedure below:
# Estimate the preprocessing steps (centering, scaling, kNN imputation,
# near-zero-variance filtering, Yeo-Johnson) from `train` without SalePrice.
imputedData <- preProcess( select(train, -SalePrice),
method = c("center", "scale", "knnImpute", "nzv", 'YeoJohnson')
)
#install.packages('RANN')
library(RANN)
# Apply the estimated preprocessing to the training data.
# NOTE(review): the error quoted below is reported after this step; `train`
# is presumably a tibble (it is read from CSV) -- check whether passing a
# plain data.frame avoids it.
trainTrans <- predict(imputedData, train)
And I have this error
Must subset rows with a valid subscript vector. x Subscript nn$nn.idx must be a simple vector, not a matrix.
I have already installed the caret package.
The train dataset is a table that I have imported from a CSV file
This is the program
Loading of packages
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(caret)
library(GGally)
library(lattice)
library(corrplot)
library(factoextra)
library(FactoMineR)
library(magrittr)
theme_set(theme_bw())
set.seed(181019)
Data loading
# Read the training and test sets from CSV files given by relative path.
train <- readr::read_csv("train.csv")
test <- readr::read_csv("test.csv")
Missing value
# Keep only the predictors whose share of missing values does not exceed
# `missing_threshold`, in both the training and the test set.
missing_threshold <- .4
# One logical per predictor (SalePrice excluded): TRUE when the proportion of
# NA values in that column is above the threshold.
is_too_scarce <- map_lgl(
  select(train, -SalePrice),
  ~ mean(is.na(.x)) > missing_threshold
)
not_too_scarce <- names(is_too_scarce)[!is_too_scarce]
# all_of() marks `not_too_scarce` as an external character vector of column
# names instead of relying on a bare vector inside select().
train <- select(train, SalePrice, all_of(not_too_scarce))
test <- select(test, all_of(not_too_scarce))
Preprocessing
# Estimate the preprocessing steps (centering, scaling, kNN imputation,
# near-zero-variance filtering, Yeo-Johnson) from `train` without SalePrice.
imputedData <- preProcess( select(train, -SalePrice),
method = c("center", "scale", "knnImpute", "nzv", 'YeoJohnson')
)
#install.packages('RANN')
library(RANN)
# Apply the same estimated preprocessing to the test and training sets.
# NOTE(review): `train` and `test` are presumably tibbles (read with
# readr::read_csv) -- check whether passing plain data.frames avoids the
# reported subscript error.
testTrans <- predict(imputedData, test)
trainTrans <- predict(imputedData, train)

R-Studio SVM classAgreement how-to?

I am an absolute newbie to R-Studio and want to use svm() of the e1071 package.
I went through David Meyer's paper.
I can't get classAgreement() to run.
What do I need to do before I can use classAgreement()?
Thanks a lot!
library(e1071)
data(Glass, package="mlbench")
# Sample a third of the row indices as the test set; the remaining rows form the training set.
index <- 1:nrow(Glass)
testindex <- sample(index, trunc(length(index)/3))
testset <- Glass[testindex,]
trainset <- Glass[-testindex,]
# Fit the SVM on the training rows, then predict on the test rows with column 10 removed.
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[,-10])
# Cross-tabulate predictions against column 10 of the test set; the result is printed, not stored.
table(pred = svm.pred, true = testset[,10])
# NOTE(review): no object named `table` was assigned above, so this passes the function `table` itself.
classAgreement(table)
When I run your code, the call classAgreement(table) throws the following error:
Error in sum(tab) : invalid 'type' (closure) of argument
This is because table here is a function: you never created an object called table, which I think is what you intended to do in the previous line.
So you can either do one of the following:
svm.tab <- table(pred = svm.pred, true = testset[,10])
classAgreement(svm.tab)
Or just in one go
classAgreement(table(pred = svm.pred, true = testset[,10]))

Dredge error in is.dataframe(data) object not found

I am trying to run pdredge using the following example code, where the data is located at
https://github.com/aditibhaskar/help/blob/master/gages_urbanizing_and_ref_with_trends_cut_to_20years_2018-12-02.Rdata
library(MuMIn)
require(snow)
require(parallel)
# Predictor terms for the global model; two of them are log-transformed.
variable.list <- c("log(hden_change_divided_by_hdenStart)", "hden_peak_change", "DRAIN_SQKM", "PPTAVG_BASIN", "SNOW_PCT_PRECIP", "log(HIRES_LENTIC_PCT)", "FRAGUN_BASIN", "BFI_AVE", "CLAYAVE", "WTDEPAVE", "AWCAVE", "RD_STR_INTERS", "RIP800_FOREST", "BAS_COMPACTNESS", "STREAMS_KM_SQ_KM", "RRMEAN", "SLOPE_PCT", "PCT_1ST_ORDER", "FRESHW_WITHDRAWAL")
# Build the model formula as a string (response ~ all terms joined by "+") and fit it on `gages`.
y.365 <- lm(paste0("(SlopePct.365 - Ref1.SlopePct.365) ~", paste(variable.list, collapse="+")), data=gages)
# Set the global na.action option to "na.fail".
options(na.action = "na.fail")
# Use one fewer worker than the number of detected cores.
no_cores <- detectCores() - 1
# NOTE(review): `cl` is not referenced again in this snippet; `clust` below is the cluster passed to pdredge().
cl <- makeCluster(no_cores)
# Use a "SOCK" cluster when the snow package is found, otherwise "PSOCK".
clusterType <- if(length(find.package("snow", quiet = TRUE))) "SOCK" else "PSOCK"
clust <- try(makeCluster(getOption("cl.cores", no_cores), type = clusterType))
# Run the model selection on the cluster, ranking models by AICc.
# NOTE(review): the reported "object 'gages' not found" messages arise from this call.
dredge.365 <- pdredge(y.365, rank="AICc", trace=2, cluster=clust)
From this I get these errors:
"2097152: In is.data.frame(data):
object 'gages' not found (model 2097151 skipped)
Error in pdredge(y.365, rank="AICc", trace=2, cluster=clust) :
the result is empty
What am I doing wrong? Thanks.
You forgot to export the model's data to the cluster nodes:
clusterExport(clust, "gages")

How to solve "The data cannot have more levels than the reference" error when using confusionMatrix?

I'm using R programming.
I divided the data as train & test for predicting accuracy.
This is my code:
library("tree")
# Read the credit data from a local CSV file.
credit<-read.csv("C:/Users/Administrator/Desktop/german_credit (2).csv")
library("caret")
# Seed the RNG, then split the rows into train and test sets using Creditability.
set.seed(1000)
intrain<-createDataPartition(y=credit$Creditability,p=0.7,list=FALSE)
train<-credit[intrain, ]
test<-credit[-intrain, ]
# Fit a tree for Creditability on all other columns and plot it with labels.
treemod<-tree(Creditability~. , data=train)
plot(treemod)
text(treemod)
# Cross-validate the tree with prune.tree and plot the result.
cv.trees<-cv.tree(treemod,FUN=prune.tree)
plot(cv.trees)
# Prune the tree with best = 3 and plot the pruned tree.
prune.trees<-prune.tree(treemod,best=3)
plot(prune.trees)
text(prune.trees,pretty=0)
install.packages("e1071")
library("e1071")
# Predict on the test rows and compare against the observed Creditability.
# NOTE(review): the "data cannot have more levels than the reference" error is reported for the confusionMatrix() call.
treepred<-predict(prune.trees, newdata=test)
confusionMatrix(treepred, test$Creditability)
The following error message happens in confusionMatrix:
Error in confusionMatrix.default(rpartpred, test$Creditability) : the data cannot have more levels than the reference
The credit data can be downloaded from this site:
http://freakonometrics.free.fr/german_credit.csv
If you look carefully at your plots, you will see that you are training a regression tree and not a classification tree.
If you run credit$Creditability <- as.factor(credit$Creditability) after reading in the data and use type = "class" in the predict function, your code should work.
code:
# Read the data and convert the response to a factor, so that a
# classification tree is fitted instead of a regression tree.
credit <- read.csv("http://freakonometrics.free.fr/german_credit.csv")
credit$Creditability <- as.factor(credit$Creditability)
library(caret)
library(tree)
library(e1071)
# Seed the RNG, then split the rows into train and test sets using Creditability.
set.seed(1000)
intrain <- createDataPartition(y = credit$Creditability, p = 0.7, list = FALSE)
train <- credit[intrain, ]
test <- credit[-intrain, ]
# Fit the tree (no stray trailing comma, which would pass an empty argument).
treemod <- tree(Creditability ~ ., data = train)
# Cross-validate with prune.tree and plot the result.
cv.trees <- cv.tree(treemod, FUN = prune.tree)
plot(cv.trees)
# Prune the tree with best = 3 and plot the pruned tree.
prune.trees <- prune.tree(treemod, best = 3)
plot(prune.trees)
text(prune.trees, pretty = 0)
# type = "class" makes predict() return class labels, which confusionMatrix()
# can compare with the factor response.
treepred <- predict(prune.trees, newdata = test, type = "class")
confusionMatrix(treepred, test$Creditability)
I had the same issue in classification. It turned out that there were ZERO observations in a specific group, and therefore I got the error "the data cannot have more levels than the reference".
Make sure that all groups in your test set also appear in your training set.

Resources