I've looked into the h2o.predict_contributions function, which exposes the SHAP values of XGBoost and GBM models. Does this function also provide these metrics for the cross-validation predictions? I can't seem to find them.
library(h2o)
h2o.init()
library(mlbench)
data(Sonar)
Sonar.h2o = as.h2o(Sonar)
mdl = h2o.xgboost(x = setdiff(names(Sonar), "Class"), y = 'Class', training_frame = Sonar.h2o, nfolds = 5, keep_cross_validation_predictions = TRUE)
Yes, you can apply the function to a single fold of interest. Here is some example code:
library(h2o)
h2o.init()
prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
prostate <- h2o.uploadFile(path = prostate_path)
prostate_gbm <- h2o.gbm(3:9, "AGE", prostate, nfolds = 3)
h2o.predict(prostate_gbm, prostate)
h2o.predict_contributions(prostate_gbm, prostate)
# take a look at the output to see which key you want to use
# there are also other ways to get at the key names
prostate_gbm@model$cross_validation_models
# update this with the key of interest
key = 'GBM_model_R_1557326910287_7702_cv_2'
cv2 = h2o.getModel(key)
h2o.predict_contributions(cv2, prostate)
# RACE DPROS DCAPS PSA VOL GLEASON __internal_cv_weights__ BiasTerm
# 1 -0.006481315 -0.19211742 -0.0836791 -0.06186131 -0.9217098 -0.20128664 0 66.37209
# 2 -0.005238285 -1.09128833 0.9614767 -0.95340544 -0.7698430 0.06820074 0 66.37209
# 3 -0.006481315 0.98101193 0.1770813 1.21195042 -1.0359415 -0.23213011 0 66.37209
# 4 0.069538474 -0.01738315 -0.2000238 4.11799049 0.1177490 -0.01457024 0 66.37209
# 5 0.012923095 0.40362182 -0.1132747 1.21669090 0.9920316 -0.37245926 0 66.37209
# 6 -0.002282504 -0.91798097 0.9024866 -0.17398398 -0.6048008 0.42300656 0 66.37209
Note: you can ignore the __internal_cv_weights__ column. I've created a ticket to clean up the output that can be tracked here.
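If you want the contributions for every fold rather than a single one, a short loop over the CV models should work. A minimal sketch, assuming your h2o version exposes h2o.cross_validation_models() (otherwise pull the keys from prostate_gbm@model$cross_validation_models and use h2o.getModel() as above):
# collect SHAP contributions from each cross-validation model (sketch)
cv_models <- h2o.cross_validation_models(prostate_gbm)
cv_contribs <- lapply(cv_models, function(m) h2o.predict_contributions(m, prostate))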
Good Day
I have run a Random Forest with tuning and added the predictions to the Train data, which ran perfectly well with no issues. However, when I try running the random forest model on the Test dataset I get the above error. Any idea as to what could be causing this? Below is my code; I appreciate any help with it. The Train dataset has 3500 rows and the Test has 1500 rows, as the full dataset is made up of 5000 rows.
R CODE:
####Clearing the global environment
rm(list = ls())
##Setting the working directory
setwd("D:/Great Learning/Module 3 -Machine Learning/Project")
##Packages required to be loaded
install.packages("DataExplorer")
install.packages("xlsx")
##install.packages("magrittr")
install.packages("dplyr")
install.packages("tidyverse")
install.packages("mice")
install.packages("NbClust")
##Reading in the dataset
library(xlsx)
LoanModelRaw = read.xlsx("Thera Bank_Personal_Loan_Modelling-dataset- 1.xlsx",sheetName = "Bank_Personal_Loan_Modelling",header = T)
##LoanModelRaw = read.csv("Thera Bank_Personal_Loan_Modelling-dataset-1.csv", sep = ";",header = T)
##Viewing the dataset in R
View(LoanModelRaw)
dim(LoanModelRaw)
colnames(LoanModelRaw)
str(LoanModelRaw)
summary(LoanModelRaw)
nrow(LoanModelRaw)
attach(LoanModelRaw)
#Correcting column names
names(LoanModelRaw)[2] = "AgeInYears"
names(LoanModelRaw)[3] = "ExperienceInYears"
names(LoanModelRaw)[4] = "IncomeInKMonth"
names(LoanModelRaw)[5] = "ZIPCode"
names(LoanModelRaw)[6] = "FamilyMembers"
names(LoanModelRaw)[10] = "PersonalLoan"
names(LoanModelRaw)[11] = "SecuritiesAccount"
names(LoanModelRaw)[12] = "CDAccount"
colnames(LoanModelRaw)
#############################################################1 EDA of the data#######################################################
library(DataExplorer)
##introduce(LoanModelRaw)
plot_intro(LoanModelRaw)
plot_missing(LoanModelRaw)
##plot_bar(LoanModelRaw)
plot_histogram(LoanModelRaw)
create_report(LoanModelRaw)
?plot_boxplot
#Missing Value Treatment
library(mice)
sum(is.na(LoanModelRaw))
md.pattern(LoanModelRaw)
LoanModelRawImpute = mice(LoanModelRaw, m =5, method = 'pmm', seed = 1000)
LoanModelRawNoNa = complete(LoanModelRawImpute, 3)
md.pattern(LoanModelRawNoNa)
#Correcting negative experience
LoanModel = abs(LoanModelRawNoNa[2:14])
attach(LoanModel)
#View(LoanModel)
#summary(LoanModel)
#nrow(LoanModel)
#
library(caTools) # sample.split() comes from caTools
LoanModel$Split = sample.split(LoanModel$PersonalLoan, SplitRatio = 0.7)
View(LoanModel)
LoanModelTrainRaw = subset(LoanModel,LoanModel$Split == TRUE)
LoanModelTestRaw = subset(LoanModel,LoanModel$Split == FALSE)
#Installing the packages for the running random forest
install.packages("randomForest")
install.packages("dplyr")
library(randomForest)
library(dplyr)
#Need to exclude the split and move columns
LoanModelTrain = LoanModelTrainRaw[1:13]
LoanModelTest = LoanModelTestRaw[1:13]
attach(LoanModelTrain)
str(LoanModelTrain)
LoanModelTrain = LoanModelTrain %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg,everything())
LoanModelTest = LoanModelTest %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg, everything())
head(LoanModelTrain)
head(LoanModelTest)
###Converting the data set to a factor variable in order to be read
#Train
fcol = c(5:13)
LoanModelTrain[,fcol] = lapply(LoanModelTrain[,fcol], factor)
str(LoanModelTrain)
nrow(LoanModelTrain)
#Test
fcol = c(5:13)
LoanModelTest[,fcol] = lapply(LoanModelTest[,fcol], factor)
str(LoanModelTest)
##Running the random forest
seed = 1000
set.seed(seed)
LoanModelTrainRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 501, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRF)
plot(LoanModelTrainRF)
importance(LoanModelTrainRF)
?randomForest
###Tuning the random Forest
set.seed(seed)
LoanModelTrain = LoanModelTrain %>% select(PersonalLoan,everything())
str(LoanModelTrain)
LoanModelTrainRFTuned = tuneRF(x = LoanModelTrain[,-c(1)],
y = LoanModelTrain$PersonalLoan,
mtryStart = 10,
stepFactor = 1.5,
improve = 0.001,
trace = TRUE,
plot = TRUE,
doBest = TRUE,
importance = TRUE)
###Running refined random forest
LoanModelTrainRefinedRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 95, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRefinedRF)
plot(LoanModelTrainRefinedRF)
###Adding the prediction columns and probability columns
LoanModelTrain$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "class")
LoanModelTrain$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "prob")
head(LoanModelTrain)
###Check the accuracy of the model
install.packages("caret")
library(caret)
caret::confusionMatrix(LoanModelTrain$Predict, LoanModelTrain$PersonalLoan)
###Run the model against the Test Data
str(LoanModelTest)
# the two lines below are the ones that throw the error
LoanModelTest$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "class")
LoanModelTest$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "prob")
AgeInYears ExperienceInYears IncomeInKMonth ZIPCode FamilyMembers CCAvg Education
25 1 49 91107 4 1.6 1
45 19 34 90089 3 1.5 1
39 15 11 94720 1 1.0 1
35 9 100 94112 1 2.7 2
35 8 45 91330 4 1.0 2
37 13 29 92121 4 0.4 2
Mortgage PersonalLoan SecuritiesAccount CDAccount Online CreditCard Split
0 0 1 0 0 0 FALSE
0 0 1 0 0 0 FALSE
0 0 0 0 0 0 TRUE
0 0 0 0 0 0 TRUE
0 0 0 0 0 1 TRUE
155 0 0 0 1 0 TRUE
I got the same error trying to predict a single outcome from a simple glm model. In the model I specified the outcome and predictors using the format "dataset$outcome", etc. In the "test" set (really just one row of observations), I named the columns "outcome", etc. If I remove the $s from the model and instead specify "data=dataset", then the error disappears. So perhaps it's an issue with how objects are being called.
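A minimal sketch of that gotcha, using hypothetical data:
d <- data.frame(y = rbinom(20, 1, 0.5), x = rnorm(20))
bad <- glm(d$y ~ d$x, family = binomial)         # predictors hard-wired to d
good <- glm(y ~ x, data = d, family = binomial)  # predictors resolved in newdata
nd <- data.frame(x = 0.5)
# predict(bad, newdata = nd) ignores nd and returns the 20 fitted values
predict(good, newdata = nd, type = "response")   # a single prediction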
This error means that you are trying to append a column vector of length 3500 to a data frame that has 1500 rows. Of course, this does not work, because R does not automatically create NA for the empty rows (and that is a good thing).
Try to check the dimensions (number of rows and number of columns) of LoanModelTest and LoanModelTrain. Also, check the return dimensions of the predict functions.
I hope this helps!
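For what it's worth, the mismatch here usually comes from the predict() calls: predict.randomForest expects the new data in the newdata argument, not data. The data argument is silently swallowed, so predict() falls back to the 3500 out-of-bag training predictions, which cannot be bound to the 1500-row test set. A sketch of the fix:
dim(LoanModelTrain); dim(LoanModelTest)  # check the row counts first
LoanModelTest$Predict = predict(LoanModelTrainRefinedRF, newdata = LoanModelTest, type = "class")
# type = "prob" returns a matrix with one column per class; keep the positive-class column
LoanModelTest$Score = predict(LoanModelTrainRefinedRF, newdata = LoanModelTest, type = "prob")[, 2]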
In my working project, I use the rfe function from the caret package to do recursive feature elimination. I'll use a toy example to illustrate my point.
library(mlbench)
library(caret)
data(PimaIndiansDiabetes)
rfFuncs$summary <- twoClassSummary
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(PimaIndiansDiabetes[,1:8], PimaIndiansDiabetes[,9], sizes=c(1:8), rfeControl=control, metric="ROC")
The optimal variable selected is based on those variables that give highest auroc in the process and can be retrieved by results$optVariables.
However, what I want to do is use the '1 standard error rule' to select fewer features (code below). The number of variables identified is 4.
# auc that is 1-se from the highest auc
library(dplyr) # for the pipe and mutate()
df.results = results$results %>% dplyr::mutate(ROCSE = ROCSD/sqrt(10-1))
idx = which.max(df.results$ROC)
ROC.1se = df.results$ROC[idx] - df.results$ROCSE[idx]
# plot ROC vs feature size
library(ggplot2)
g = ggplot(df.results, aes(x=Variables, y=ROC)) +
geom_errorbar(aes(ymin=ROC-ROCSE, ymax=ROC+ROCSE),
width=.2, alpha=0.4, linetype=1) +
geom_line() +
geom_point()+
scale_color_brewer(palette="Paired")+
geom_hline(yintercept = ROC.1se)+
labs(x ="Number of Variables", y = "AUROC")
print(g)
The number of variables I identified is 4. Now I need to know which four variables they are, so I tried the following:
results$variables %>% filter(Variables==4) %>% distinct(var)
It shows me 5 variables!
Does anyone know how I can retrieve those variables? Basically, the same approach should work for any number of selected variables.
Thanks a lot in advance!
One-line Answer
If you know you want only the best 4 variables from the rfe resampling, this will give you what you are looking for.
results$optVariables[1:4]
# [1] "glucose" "mass" "age" "pregnant"
dplyr Answer
library(dplyr)
results$variables %>%
  group_by(var) %>%
  summarize(Overall = mean(Overall)) %>%
  arrange(-Overall)
#
# A tibble: 8 x 2
# var Overall
# <chr> <dbl>
# 1 glucose 34.2
# 2 mass 15.8
# 3 age 12.7
# 4 pregnant 7.92
# 5 pedigree 5.09
# 6 insulin 4.87
# 7 triceps 3.25
# 8 pressure 1.95
Why your attempt gives more than 4 variables
You are filtering 40 observations: 10 folds of the best 4 variables. The best 4 variables are not always the same within each fold. Hence, to get the top 4 variables across the resamples you need to average their performance across the folds, as the code above does. Even simpler, the variables within optVariables are sorted in this order, so you can just grab the first 4 (as in my one-line answer). The proof that this is the case takes a bit of digging into the source code (shown below).
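A quick way to see this fold-to-fold variation for yourself (a sketch, assuming dplyr is loaded): count how often each variable appears in the per-fold top 4. Variables that appear in fewer than all 10 folds are why distinct(var) returns more than 4 names.
results$variables %>%
  filter(Variables == 4) %>%
  count(var, sort = TRUE)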
Details: Digging into the source code
A good first thing to do with objects returned from functions like rfe is to try functions like print, summary, or plot. Often custom methods will exist that will give you very helpful information. For example...
# Run rfe with a random seed
library(dplyr)
library(mlbench)
library(caret)
data(PimaIndiansDiabetes)
rfFuncs$summary <- twoClassSummary
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
set.seed(1)
results <- rfe(PimaIndiansDiabetes[,1:8], PimaIndiansDiabetes[,9], sizes=c(1:8),
               rfeControl=control, metric="ROC")

# The next two lines are identical...
results
print(results)
# Recursive feature selection
#
# Outer resampling method: Cross-Validated (10 fold)
#
# Resampling performance over subset size:
#
# Variables ROC Sens Spec ROCSD SensSD SpecSD Selected
# 1 0.7250 0.870 0.4071 0.07300 0.07134 0.10322
# 2 0.7842 0.840 0.5677 0.04690 0.04989 0.05177
# 3 0.8004 0.824 0.5789 0.02823 0.04695 0.10456
# 4 0.8139 0.842 0.6269 0.03210 0.03458 0.05727
# 5 0.8164 0.844 0.5969 0.02850 0.02951 0.07288
# 6 0.8263 0.836 0.6078 0.03310 0.03978 0.07959
# 7 0.8314 0.844 0.5966 0.03075 0.04502 0.07232
# 8 0.8316 0.860 0.6081 0.02359 0.04522 0.07316 *
#
# The top 5 variables (out of 8):
# glucose, mass, age, pregnant, pedigree
Hmm, that gives 5 variables, but you said you wanted 4. We can pretty quickly dig into the source code to explore how it is calculating and returning those 5 variables as the top 5 variables.
print(caret:::print.rfe)
#
# Only a snippet code shown below...
# cat("The top ", min(top, x$bestSubset), " variables (out of ",
# x$bestSubset, "):\n ", paste(x$optVariables[1:min(top,
# x$bestSubset)], collapse = ", "), "\n\n", sep = "")
So, basically it is pulling the top 5 variables directly from results$optVariables. How is that getting populated?
# print(caret:::rfe.default)
#
# Snippet 1 of code...
# bestVar <- rfeControl$functions$selectVar(selectedVars,
#     bestSubset)
#
# Snippet 2 of code...
# bestSubset = bestSubset, fit = fit, optVariables = bestVar,
Ok, optVariables is getting populated by rfeControl$functions$selectVar.
print(rfeControl)
#
# Snippet of code...
# list(functions = if (is.null(functions)) caretFuncs else functions,
From above, we see that caretFuncs$selectVar is being used...
Details: Source code that is populating optVariables
print(caretFuncs$selectVar)
# function (y, size)
# {
# finalImp <- ddply(y[, c("Overall", "var")], .(var), function(x) mean(x$Overall,
# na.rm = TRUE))
# names(finalImp)[2] <- "Overall"
# finalImp <- finalImp[order(finalImp$Overall, decreasing = TRUE),
# ]
# as.character(finalImp$var[1:size])
# }
I want to compare a multinomial logit model and a random forest using a grouped Brier score within cross-validation. The theoretical foundation of this approach is: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3702649/pdf/nihms461154.pdf
My dependent variable has three outcomes and my dataset comprises lifetime data, where the lifetime lies between 0 and 5.
To make things reproducible, my dataset looks like:
library(data.table)
library(dplyr) # for ntile()
N <- 1000
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N, 30, 8)
length <- sample(0:5,N,T)
Ycont <- 0.5*X1 - 0.3*X2 + 10 + rnorm(N, 0, 6)
Ycateg <- ntile(Ycont,3)
df <- data.frame(id=1:N,length,X1, X2, Ycateg)
df$Ycateg=ifelse(df$Ycateg==1,"current",ifelse(df$Ycateg==2,"default","prepaid"))
df=setDT(df)[,.SD[rep(1L,length)],by = id]
df=df[ , time := 1:.N , by=id]
df=df[,-c("length")]
head(df)
id X1 X2 Ycateg time
1: 1 178.0645 10.84313 1 1
2: 2 169.4208 34.39831 1 1
3: 2 169.4208 34.39831 1 2
4: 2 169.4208 34.39831 1 3
5: 2 169.4208 34.39831 1 4
6: 2 169.4208 34.39831 1 5
What I did so far is:
library(caret)
fitControl <- trainControl(method = 'cv',number=5)
cv=train(as.factor(Ycateg)~.,
data = df,
method = "multinom",
maxit=150,
trControl = fitControl)
cv
Since the models are used to predict probabilities at each time point, I want to compute the following for each fold:
The Brier score for each category of the dependent variable: BS_{i,t,k} = (Y_{i,t,k} - p_{i,t,k})^2, where i denotes observation i of the test fold, t the time, and k the class of the dependent variable
A per-fold summary, computed as (1/n_t) * sum_i BS_{i,t,k}, where n_t is the number of observations with observed time t - i.e. a grouped computation (see the sketch below)
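As a standalone illustration of that grouped computation, here is a minimal sketch with hypothetical objects: obs is the factor of observed classes for the test-fold rows, probs the matrix of predicted class probabilities (one column per class), and time each row's observed time.
library(dplyr)
Y <- model.matrix(~ obs - 1)       # one-hot encode the observed classes
sq <- (Y - probs)^2                # (Y_itk - p_itk)^2 for every row and class
data.frame(time = time, sq) %>%
  group_by(time) %>%
  summarise(across(everything(), mean))  # (1/n_t) * sum over the rows at time t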
So in the end, what I want to report - for example for a 3-fold CV, knowing that time ranges from 0-5 - is an output like this:
fold time Brier_0 Brier_1 Brier_2
1 1 0 0.39758714 0.11703814 0.8711775
2 1 1 0.99461281 0.95051037 0.1503217
3 1 2 0.01791559 0.83653814 0.1553521
4 1 3 0.92067849 0.55275340 0.6466206
5 1 4 0.73112563 0.07603891 0.5769286
6 1 5 0.29500600 0.66219814 0.7590742
7 2 0 0.24691469 0.06736522 0.8612998
8 2 1 0.13629191 0.55973431 0.5617303
9 2 2 0.48006915 0.01357407 0.4515544
10 2 3 0.01257112 0.40250469 0.1814620
. . . . . .
I know that I have to set up a customized version of the summaryFunction, but I'm really lost on how to do this. So my main aim is not to tune a model but to validate it.
There is one thing that should be remarked: the summaryFunction can only return a single numeric vector - correct me if I'm wrong. Further, the data parameter of the summaryFunction contains a column rowIndex which can be used to extract additional variables from the original data set.
customSummary <- function (data, lev = NULL, model = NULL) { # for training on a next-period return
#browser() #essential for debugging
dat=dim(data)
# get observed dummy
Y_obs = model.matrix( ~ data[, "obs"] - 1) # create dummy - for each level of the outcome
# get predicted probabilities
Y_pre=as.data.frame(data[ , c("current","default","prepaid")])
# get rownumbers
rows=data[,"rowIndex"]
# get time of each obs
time=df[rows,]$time
# put it all together
df_temp=data.frame(Y_obs,Y_pre,time)
names(df_temp)=c("Y_cur","Y_def","Y_pre","p_cur","p_def","p_pre","time")
# group by time and compute the Brier score (dplyr must be loaded)
out=df_temp %>% group_by(time) %>% summarise(BS_cur=1/n()*sum((Y_cur-p_cur)^2),BS_def=1/n()*sum((Y_def-p_def)^2),BS_pre=1/n()*sum((Y_pre-p_pre)^2))
# name
names(out)=c("time","BS_cur","BS_def","BS_pre")
# now create one line of return - caret seems to be able to handle only one
out=as.data.frame(out)
out_stack=stack(out)
out_stack=out_stack[(max(out$time)):length(out_stack[,1]),]
out_stack=out_stack[-1,]
out_stack$ind=paste(out_stack$ind,out$time,sep = "_")
# recall, the return type must be simply numeric
out_final=(t(out_stack[,1]))
names(out_final)=(out_stack[,2])
return(out_final)
}
# which type of cross validation to do
fitControl <- trainControl(method = 'cv',number=5,classProbs=TRUE,summaryFunction=customSummary, selectionFunction = "best", savePredictions = TRUE)
grid <- expand.grid(decay = 0 )
cv=train(as.factor(Ycateg)~.,
data = df,
method = "multinom",
maxit=150,
trControl = fitControl,
tuneGrid = grid
)
cv$resample
BS_cur_1 BS_cur_2 BS_cur_3 BS_cur_4 BS_cur_5 BS_def_1 BS_def_2 BS_def_3 BS_def_4 BS_def_5 BS_pre_1 BS_pre_2 BS_pre_3 BS_pre_4 BS_pre_5
1 0.1657623 0.1542842 0.1366912 0.1398001 0.2056348 0.1915512 0.2256758 0.2291467 0.2448737 0.2698545 0.1586134 0.2101389 0.1432483 0.2076886 0.1663780
2 0.1776843 0.1919503 0.1615440 0.1654297 0.1200515 0.2108787 0.2185783 0.2209958 0.2467931 0.2199898 0.1580643 0.1595971 0.2015860 0.1826029 0.1947144
3 0.1675981 0.1818885 0.1893253 0.1402550 0.1400997 0.2358501 0.2342476 0.2079819 0.1870549 0.2065355 0.2055711 0.1586077 0.1453172 0.1638555 0.2106146
4 0.1796041 0.1573086 0.1500860 0.1738538 0.1171626 0.2247850 0.2168341 0.2031590 0.1807209 0.2616180 0.1677508 0.1965577 0.1873078 0.1859176 0.1344115
5 0.1909324 0.1640292 0.1556209 0.1371598 0.1566207 0.2314311 0.1991000 0.2255612 0.2195158 0.2071910 0.1976272 0.1777507 0.1843828 0.1453439 0.1736540
Resample
1 Fold1
2 Fold2
3 Fold3
4 Fold4
5 Fold5
I have the following dataset (obtained here):
item survivalpoints weight
1 pocketknife 10 1
2 beans 20 5
3 potatoes 15 10
4 unions 2 1
5 sleeping bag 30 7
6 rope 10 5
7 compass 30 1
I can cluster this dataset into three clusters with kmeans() using a binary string as my initial choice of centers. For example:
## 1 represents the initial centers
chromosome = c(1,1,1,0,0,0,0)
## exclude the first column (kmeans only supports continuous data)
cl <- kmeans(dataset[, -1], dataset[chromosome == 1, -1])
## check the memberships
cl$cluster
# [1] 1 3 3 1 2 1 2
Using this fundamental concept, I tried it out with the GA package to conduct the search, where I am trying to optimize (minimize) the Davies-Bouldin (DB) index.
library(GA) ## for ga() function
library(clusterSim) ## for index.DB() function
## defining my fitness function (Davies-Bouldin)
DBI <- function(x) {
## converting matrix to vector to access each row
binary_rep <- split(x, row(x))
## evaluate the fitness of each chromosome
for(each in 1:nrow(x)){
cl <- kmeans(dataset, dataset[binary_rep[[each]] == 1, -1])
dbi <- index.DB(dataset, cl$cluster, centrotypes = "centroids")
## minimizing db
return(-dbi)
}
}
g<- ga(type = "binary", fitness = DBI, popSize = 100, nBits = nrow(dataset))
Of course (I have no idea what's happening), I received the following error message:
Warning messages:
Error in row(x) : a matrix-like object is required as argument to 'row'
Here are my questions:
How can I correctly use the GA package to solve my problem?
How can I make sure the randomly generated chromosomes contain the same number of 1s as the number of clusters k (e.g. if k=3 then each chromosome must contain exactly three 1s)?
I can't comment on the sense of combining k-means with ga, but I can point out that you had an issue in your fitness function. Also, errors are produced when all genes are on or off, so fitness is only calculated when that is not the case:
DBI <- function(x) {
  # ga() maximises the fitness, so return the negative DB index (a lower DB
  # index is better); chromosomes with all genes on or off cannot seed kmeans,
  # so give them a heavy penalty instead of letting them error out
  if(sum(x) == nrow(dataset) | sum(x) == 0){
    score <- -1e3
  } else {
    cl <- kmeans(dataset[, -1], dataset[x == 1, -1])
    dbi <- index.DB(dataset[, -1], cl = cl$cluster, centrotypes = "centroids")
    score <- -dbi$DB
  }
  return(score)
}
g <- ga(type = "binary", fitness = DBI, popSize = 100, nBits = nrow(dataset))
plot(g)
g@solution
g@fitnessValue
It looks like several gene combinations produced the same "best" fitness value.
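To actually use the result, you can take one of the (possibly tied) best chromosomes and re-run kmeans with it; a small sketch:
best <- g@solution[1, ]  # first row if several chromosomes tie for best
cl <- kmeans(dataset[, -1], dataset[best == 1, -1])
cl$cluster               # final cluster memberships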
original data:
head(df.num_kmeans)
datausage mou revenue calldrop handset2g handset3g smartphone
1 896804.7 2854801 40830.404 27515 7930 19040 20810
2 155932.1 419109 5512.498 5247 2325 2856 3257
3 674983.3 2021183 25252.265 21068 6497 13056 14273
4 522787.2 1303221 14547.380 8865 4693 9439 10746
5 523465.7 1714641 24177.095 25441 8668 12605 14766
6 527062.3 1651303 20153.482 18219 6822 11067 12994
rechargecount rechargesum arpu subscribers
1 4461 235430 197704.10 105822
2 843 39820 34799.21 18210
3 2944 157099 133842.38 71351
4 2278 121697 104681.58 44975
5 2802 144262 133190.55 75860
6 2875 143333 119389.91 63740
I have the following PCA data on which I am doing Kmeans clustering:
head(pcdffinal)
PC1 PC2 PC3 PC4 PC5 PC6
1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095 0.4455297
2 2.927245 0.05666389 0.5085896 0.1472800 0.18193152 0.1041490
3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927 0.3222719
4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933 0.5533013
5 -4.689454 -2.77185636 2.4323856 0.7387788 0.49237229 -0.4817083
6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143 0.2814843
opt.cluster<-3
set.seed(115)
pccomp.km <- kmeans(pcdffinal,opt.cluster,nstart=25)
head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
barplot(table(pccomp.km$cluster), col="steelblue")
pccomp.km$tot.withinss #For total within cluster sum of squares.
[1] 13172.59
We can also use a plot to illustrate the groups that the data have been arranged into.
par(mfrow=c(1,1))
plot(pcdffinal[,1:2],col=(pccomp.km$cluster+1),main=paste('K-Means Clustering result with k = ', opt.cluster,sep=" "),pch=20,cex=2)
points(pccomp.km$centers, pch=15,cex=2)#plotting the centres of the cluster as black squares
library("factoextra")
fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex")+ theme_minimal()
df.num_kmeans<-df.num
df.num_kmeans$cluster.kmeans <- pccomp.km$cluster # vector of cluster assignments from kmeans(), added as a column to the original dataset
# save this dataset & kmeans model for further use
saveRDS(pccomp.km, "kmeans_model.RDS")
write.csv(df.num_kmeans,"dfnum_kmeans.cluster.csv")
library(cluster)
clusplot(df.num_kmeans,pccomp.km$cluster,color = TRUE,shade=TRUE,labels = 2,lines = 0)
library(ggfortify)
autoplot(pccomp.km, data=pcdffinal, frame=TRUE,frame.type='norm')
Then I run a random forest model and compute the accuracy:
dfnum.kmeans <- read.csv("dfnum_kmeans.cluster.csv")
table(dfnum.kmeans$cluster.kmeans) # size of each cluster
# convert the cluster var into a factor
dfnum.kmeans$cluster.kmeans <- as.factor(dfnum.kmeans$cluster.kmeans)
is.factor(dfnum.kmeans$cluster.kmeans)
# create training and test sets (75:25 split) using the 'caret' package
set.seed(128) # for reproducibility
inTrain_kmeans <- caret::createDataPartition(y = dfnum.kmeans$cluster.kmeans, p = 0.75, list = FALSE)
training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]
set.seed(122)
library(caret)
control <- trainControl(method = "repeatedcv", number = 10, allowParallel = TRUE)
modFit.rfcaret_kmeans <- caret::train(cluster.kmeans~ ., method = "rf",data = training_kmeans, trControl = control, number = 25)
modFit.rfcaret_kmeans$finalModel
pred.test_kmeans = predict(modFit.rfcaret_kmeans, testing_kmeans); confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )
confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )$overall[1]
I would like to do Kmeans and random forest for a range of k's, say k = 2:6, each time making the plots for the respective k, saving the model, and writing the clustered data to a CSV, each done separately for the different k's. Then, for the random forest, I would like to import the saved CSVs, create train and test data, run the random forest model, predict on each test set, and finally compute the confusion matrix and accuracy, thus getting the solution iteratively.
I need help converting the above code into a loop with a counter i going from 2 to 6.
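One possible skeleton for that loop, as a sketch only: file names follow the pattern above, and the modelling settings are copied from the code above and may need adjusting.
for (i in 2:6) {
  set.seed(115)
  km <- kmeans(pcdffinal, centers = i, nstart = 25)
  plot(pcdffinal[, 1:2], col = km$cluster + 1,
       main = paste("K-Means Clustering result with k =", i), pch = 20, cex = 2)
  saveRDS(km, paste0("kmeans_model_k", i, ".RDS"))
  dat <- df.num
  dat$cluster.kmeans <- as.factor(km$cluster)
  write.csv(dat, paste0("dfnum_kmeans.cluster_k", i, ".csv"), row.names = FALSE)

  # read the file back in, split, fit the random forest, and score it
  dat <- read.csv(paste0("dfnum_kmeans.cluster_k", i, ".csv"))
  dat$cluster.kmeans <- as.factor(dat$cluster.kmeans)
  set.seed(128)
  inTrain <- caret::createDataPartition(y = dat$cluster.kmeans, p = 0.75, list = FALSE)
  training <- dat[inTrain, ]
  testing <- dat[-inTrain, ]
  set.seed(122)
  fit <- caret::train(cluster.kmeans ~ ., method = "rf", data = training,
                      trControl = caret::trainControl(method = "repeatedcv", number = 10))
  pred <- predict(fit, testing)
  print(caret::confusionMatrix(pred, testing$cluster.kmeans)$overall[1])
}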