How can i iteratively do clustering for different clusters (k) values

How can i iteratively do clustering for different clusters (k) values - r

I have the following PCA data on which i am doing Kmeans clustering:
head(pcdffinal)
PC1 PC2 PC3 PC4 PC5 PC6
1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095 0.4455297
2 2.927245 0.05666389 0.5085896 0.1472800 0.18193152 0.1041490
3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927 0.3222719
4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933 0.5533013
5 -4.689454 -2.77185636 2.4323856 0.7387788 0.49237229 -0.4817083
6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143 0.2814843
opt.cluster<-3
set.seed(115)
pccomp.km <- kmeans(pcdffinal,opt.cluster,nstart=25)
head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
barplot(table(pccomp.km$cluster), col="steelblue")
pccomp.km$tot.withinss #For total within cluster sum of squares.
[1] 13172.59
We can also use a plot to illustrate the groups that the data have been arranged into.
par(mfrow=c(1,1))
plot(pcdffinal[,1:2],col=(pccomp.km$cluster+1),main=paste('K-Means Clustering result with k = ', opt.cluster,sep=" "),pch=20,cex=2)
points(pccomp.km$centers, pch=15,cex=2)#plotting the centres of the cluster as black squares
library("factoextra")
fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex")+ theme_minimal()
df.num_kmeans<-df.num
df.num_kmeans$cluster.kmeans <- pccomp.km$cluster# is a vector of cluster assignment from kmeans() added as a column to the original dataset as
save this dataset & kmeans model for further use
saveRDS(pccomp.km, "kmeans_model.RDS")
write.csv(df.num_kmeans,"dfnum_kmeans.cluster.csv")
library(cluster)
clusplot(df.num_kmeans,pccomp.km$cluster,color = TRUE,shade=TRUE,labels = 2,lines = 0)
library(ggfortify)
autoplot(pccomp.km, data=pcdffinal, frame=TRUE,frame.type='norm')
I would like to do Kmeans iteratively for a range of Ks say k=2:6 each time making plots for the respective k as well as saving the models as well as the data as a csv but each done separately for different k's.
Need help to convert the above codes into an iterative with the counter i going from 2 till 6.
original data:
head(df.num_kmeans)
datausage mou revenue calldrop handset2g handset3g smartphone
1 896804.7 2854801 40830.404 27515 7930 19040 20810
2 155932.1 419109 5512.498 5247 2325 2856 3257
3 674983.3 2021183 25252.265 21068 6497 13056 14273
4 522787.2 1303221 14547.380 8865 4693 9439 10746
5 523465.7 1714641 24177.095 25441 8668 12605 14766
6 527062.3 1651303 20153.482 18219 6822 11067 12994
rechargecount rechargesum arpu subscribers
1 4461 235430 197704.10 105822
2 843 39820 34799.21 18210
3 2944 157099 133842.38 71351
4 2278 121697 104681.58 44975
5 2802 144262 133190.55 75860
6 2875 143333 119389.91 63740
Using random forest for accuracy comparison
dfnum.kmeans <- read.csv("dfnum_kmeans.cluster.csv")
table(dfnum.kmeans$cluster.kmeans) # size of each cluster
convert cluster var into a factor
dfnum.kmeans$cluster.kmeans <- as.factor(dfnum.kmeans$cluster.kmeans)
is.factor(dfnum.kmeans$cluster.kmeans)
create training and test sets (75:25 split) using 'caret' package
set.seed(128) # for reproducibility
inTrain_kmeans <- caret::createDataPartition(y = dfnum.kmeans$cluster.kmeans, p = 0.75, list = FALSE)
training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]
set.seed(122)
control <- trainControl(method = "repeatedcv", number = 10,allowParallel = TRUE)
modFit.rfcaret_kmeans <- caret::train(cluster.kmeans~ ., method = "rf",data = training_kmeans, trControl = control, number = 25)
modFit.rfcaret_kmeans$finalModel
pred.test_kmeans = predict(modFit.rfcaret_kmeans, testing_kmeans); confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )
confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )$overall[1]

Assuming that your original dataframe is df.num, the following could save all the files (for different k values) in your working directory:
for (k in 2:6) {
set.seed(115)
pccomp.km <- kmeans(pcdffinal,k,nstart=25)
head(pccomp.km$cluster)
print(paste(k, pccomp.km$tot.withinss)) #For total within cluster sum of squares.
png(paste0('kmeans_proj_',k, '.png'))
par(mfrow=c(1,1))
plot(pcdffinal[,1:2],col=(pccomp.km$cluster+1),main=paste('K-Means Clustering result with k = ', k,sep=" "),pch=20,cex=2)
points(pccomp.km$centers, pch=15,cex=2)#plotting the centres of the cluster as black squares
dev.off()
png(paste0('kmeans_fviz_',k, '.png'))
print(fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex")+ theme_minimal())
dev.off()
df.num_kmeans<-df.num
df.num_kmeans$cluster.kmeans <- pccomp.km$cluster# is a vector of cluster assignment from kmeans() added as a column to the original dataset as
saveRDS(pccomp.km, paste0("kmeans_model_", k, ".RDS"))
write.csv(df.num_kmeans,paste0("dfnum_kmeans_", k, ".cluster.csv"))
png(paste0('clusplot_',k, '.png'))
clusplot(df.num_kmeans,pccomp.km$cluster,color = TRUE,shade=TRUE,labels = 2,lines = 0)
dev.off()
png(paste0('autoplot_',k, '.png'))
print(autoplot(pccomp.km, data=pcdffinal, frame=TRUE,frame.type='norm'))
dev.off()
}

Related

Generate totals of multinomial distribution directly

Let's assume we want to generate n samples from a multinomial distribution from given probabilities p. This works well with sample or rmultinorm. The totals can then be counted with table. Now I wonder if there is a direct way (or another distribution) available to get the result of table directly without generating complete sample vectors.
Here an example:
set.seed(123)
n <- 10000 # sample size
p <- c(0.1, 0.2, 0.7) # probabilities, sum up to 1.0
## 1) approach with sample
x <- sample(1:3, size = n, prob = p, replace = TRUE)
table(x)
# x
# 1 2 3
# 945 2007 7048
## 2) approach with rmultinorm
x <- rmultinom(n, size = 1, prob = p) * 1:3
table(x[x != 0])
# 1 2 3
# 987 1967 7046

How can I use SOM algorithm for classification prediction

I would like to see If SOM algorithm can be used for classification prediction.
I used to code below but I see that the classification results are far from being right. For example, In the test dataset, I get a lot more than just the 3 values that I have in the training target variable. How can I create a prediction model that will be in alignment to the training target variable?
library(kohonen)
library(HDclassif)
data(wine)
set.seed(7)
training <- sample(nrow(wine), 120)
Xtraining <- scale(wine[training, ])
Xtest <- scale(wine[-training, ],
center = attr(Xtraining, "scaled:center"),
scale = attr(Xtraining, "scaled:scale"))
som.wine <- som(Xtraining, grid = somgrid(5, 5, "hexagonal"))
som.prediction$pred <- predict(som.wine, newdata = Xtest,
trainX = Xtraining,
trainY = factor(Xtraining$class))
And the result:
$unit.classif
[1] 7 7 1 7 1 11 6 2 2 7 7 12 11 11 12 2 7 7 7 1 2 7 2 16 20 24 25 16 13 17 23 22
[33] 24 18 8 22 17 16 22 18 22 22 18 23 22 18 18 13 10 14 15 4 4 14 14 15 15 4

This might help:
SOM is an unsupervised classification algorithm, so you shouldn't expect it to be trained on a dataset that contains a classifier label (if you do that it will need this information to work, and will be useless with unlabelled datasets)
The idea is that it will kind of "convert" an input numeric vector to a network unit number (try to run your code again with a 1 per 3 grid and you'll have the output you expected)
You'll then need to convert those network units numbers back into the categories you are looking for (that is the key part missing in your code)
Reproducible example below will output a classical classification error. It includes one implementation option for the "convert back" part missing in your original post.
Though, for this particular dataset, the model overfitts pretty quickly: 3 units give the best results.
#Set and scale a training set (-1 to drop the classes)
data(wine)
set.seed(7)
training <- sample(nrow(wine), 120)
Xtraining <- scale(wine[training, -1])
#Scale a test set (-1 to drop the classes)
Xtest <- scale(wine[-training, -1],
center = attr(Xtraining, "scaled:center"),
scale = attr(Xtraining, "scaled:scale"))
#Set 2D grid resolution
#WARNING: it overfits pretty quickly
#Errors are 36% for 1 unit, 63% for 2, 93% for 3, 89% for 4
som_grid <- somgrid(xdim = 1, ydim=3, topo="hexagonal")
#Create a trained model
som_model <- som(Xtraining, som_grid)
#Make a prediction on test data
som.prediction <- predict(som_model, newdata = Xtest)
#Put together original classes and SOM classifications
error.df <- data.frame(real = wine[-training, 1],
predicted = som.prediction$unit.classif)
#Return the category number that has the strongest association with the unit
#number (0 stands for ambiguous)
switch <- sapply(unique(som_model$unit.classif), function(x, df){
cat <- as.numeric(names(which.max(table(
error.df[error.df$predicted==x,1]))))
if(length(cat)<1){
cat <- 0
}
return(c(x, cat))
}, df = data.frame(real = wine[training, 1], predicted = som_model$unit.classif))
#Translate units numbers into classes
error.df$corrected <- apply(error.df, MARGIN = 1, function(x, switch){
cat <- switch[2, which(switch[1,] == x["predicted"])]
if(length(cat)<1){
cat <- 0
}
return(cat)
}, switch = switch)
#Compute a classification error
sum(error.df$corrected == error.df$real)/length(error.df$real)

How to Iteratively run Kmeans / Random Forest and compare accuracy

original data:
head(df.num_kmeans)
datausage mou revenue calldrop handset2g handset3g smartphone
1 896804.7 2854801 40830.404 27515 7930 19040 20810
2 155932.1 419109 5512.498 5247 2325 2856 3257
3 674983.3 2021183 25252.265 21068 6497 13056 14273
4 522787.2 1303221 14547.380 8865 4693 9439 10746
5 523465.7 1714641 24177.095 25441 8668 12605 14766
6 527062.3 1651303 20153.482 18219 6822 11067 12994
rechargecount rechargesum arpu subscribers
1 4461 235430 197704.10 105822
2 843 39820 34799.21 18210
3 2944 157099 133842.38 71351
4 2278 121697 104681.58 44975
5 2802 144262 133190.55 75860
6 2875 143333 119389.91 63740
I have the following PCA data on which i am doing Kmeans clustering:
head(pcdffinal)
PC1 PC2 PC3 PC4 PC5 PC6
1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095 0.4455297
2 2.927245 0.05666389 0.5085896 0.1472800 0.18193152 0.1041490
3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927 0.3222719
4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933 0.5533013
5 -4.689454 -2.77185636 2.4323856 0.7387788 0.49237229 -0.4817083
6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143 0.2814843
opt.cluster<-3
set.seed(115)
pccomp.km <- kmeans(pcdffinal,opt.cluster,nstart=25)
head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
barplot(table(pccomp.km$cluster), col="steelblue")
pccomp.km$tot.withinss #For total within cluster sum of squares.
[1] 13172.59
We can also use a plot to illustrate the groups that the data have been arranged into.
par(mfrow=c(1,1))
plot(pcdffinal[,1:2],col=(pccomp.km$cluster+1),main=paste('K-Means Clustering result with k = ', opt.cluster,sep=" "),pch=20,cex=2)
points(pccomp.km$centers, pch=15,cex=2)#plotting the centres of the cluster as black squares
library("factoextra")
fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex")+ theme_minimal()
df.num_kmeans<-df.num
df.num_kmeans$cluster.kmeans <- pccomp.km$cluster# is a vector of cluster assignment from kmeans() added as a column to the original dataset as
save this dataset & kmeans model for further use
saveRDS(pccomp.km, "kmeans_model.RDS")
write.csv(df.num_kmeans,"dfnum_kmeans.cluster.csv")
library(cluster)
clusplot(df.num_kmeans,pccomp.km$cluster,color = TRUE,shade=TRUE,labels = 2,lines = 0)
library(ggfortify)
autoplot(pccomp.km, data=pcdffinal, frame=TRUE,frame.type='norm')
Then i run random forest model and compute accuracy:
dfnum.kmeans <- read.csv("dfnum_kmeans.cluster.csv")
table(dfnum.kmeans$cluster.kmeans) # size of each cluster
convert cluster var into a factor
dfnum.kmeans$cluster.kmeans <- as.factor(dfnum.kmeans$cluster.kmeans)
is.factor(dfnum.kmeans$cluster.kmeans)
create training and test sets (75:25 split) using 'caret' package
set.seed(128) # for reproducibility
inTrain_kmeans <- caret::createDataPartition(y = dfnum.kmeans$cluster.kmeans, p = 0.75, list = FALSE)
training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]
set.seed(122)
control <- trainControl(method = "repeatedcv", number = 10,allowParallel = TRUE)
modFit.rfcaret_kmeans <- caret::train(cluster.kmeans~ ., method = "rf",data = training_kmeans, trControl = control, number = 25)
modFit.rfcaret_kmeans$finalModel
pred.test_kmeans = predict(modFit.rfcaret_kmeans, testing_kmeans); confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )
confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )$overall[1]
I would like to do Kmeans and random forest for a range of Ks say k=2:6 each time making plots for the respective k as well as saving the models as well as the data as a csv but each done separately for different k's.Then for random forest would like to import the above saved csv's, create train & test data,run random forest model,then predict for each test data and finally compute confusion matrix and accuracy ....thus iteratively get the solution
Need help to convert the above codes into an iterative with the counter i going from 2 till 6.

mcmcglmm loop to create many chains

Following up from this question (see for reproducible data frame) I want to run MCMCGLMM n times, where n is the number of randomisations. I have tried to construct a loop which runs all the chains, and saves them (to retrieve the posterior distributions of the randomised variable later) but I am encountering problems.
This is what the data frame looks like (when n = 5, hence R1-R5), A = response variable, L and V are random effect variables, B is a fixed effect, R1-R5 are random assignments of L with structure of V maintained:
ID L B V A R1 R2 R3 R4 R5
1 1_1_1 1 1 1 11.1 6 19 21 1 31
2 1_1_1 1 1 1 6.9 6 19 21 1 31
3 1_1_4 1 1 4 7.7 2 24 8 22 22
4 1_1_4 1 1 4 10.5 2 24 8 22 22
5 1_1_5 1 1 5 8.5 11 27 14 17 22
6 1_1_7 1 1 7 11.2 5 24 13 18 25
I can create the names I want to assign to my chains, and the names of the variable that changes with each run of the MCMC chain (R1-Rn):
n = 5
Rs = as.vector(rep(NA,n))
for(i in 1:n){
Rs[i] = paste("R",i, sep = "")
}
Rs
Output:
> Rs
[1] "R1" "R2" "R3" "R4" "R5"
I then tried this loop to produce 5 chains:
for(i in 1:n){
chains[i] = MCMCglmm(A ~1 + B,
random = as.formula(paste0("~" ,Rs[i], " + Vial")),
rcov = ~units,
nitt = 500,
thin = 2,
burnin = 50,
prior = prior2,
family = "gaussian",
start = list(QUASI = FALSE),
data = df)
}
Thanks Roland for helping to get the random effect to call properly, previously I was getting an error Error in buildZ(rmodel.terms[r] ... object Rs[i] not found- fixed by as.formula
But this stores all of the data in chains and seemingly only the $Sol components, but I need to be able to access the values within the VCV, specifically the posterior distributions of the R variables (e.g. summary(chainR1$VCV))
In summary: It seems I am making a mistake in how I assign the chain names, does anyone have a suggestion of how to do this, and save the posterior distributions or even the whole chain?

Using assign was a key point:
n = 10 #Number of chains to run
chainVCVdf = matrix(rep(NA, times = ((nitt-burnin)/thin)*n), ncol = n)
colnames(chainVCVdf)=c(rep("X", times = n))
for(i in 1:n){
assign("chainX",paste0("chain",Rs[i]))
chainX = MCMCglmm(A ~1 + B,
random = as.formula(paste0("~" ,Rs[i], " + V")),
rcov = ~units,
nitt = nitt,
thin = thin,
burnin = burnin,
prior = prior1,
family = "gaussian",
start = list(QUASI = FALSE),
data = df)
assign("chainVCV", chainX$VCV[,1])
chainVCVdf[,i]=(chainVCV)
colnames(chainVCVdf)[i] = colnames(chainX$VCV)[1]
}
It then became possible to build a matrix of the VCV component that I am interested in (namely the randomised L assignment in columns R1-Rn)

It seems as though you want to run a number of different MCMCglmm formulas in a loop. #Roland has helped you found the solution to this (although I personally would create the formulas prior to the loop). #Roland also points out that in order to save the results of each model, you should save them in a list - rather than a chain as you are currently doing. You could also save each model as an .RData file, as seen in the end of the question. To formalize an answer to this question I would perform this in the following way:
Rs = paste0("~R", 1:5, " + V") ## Create all model formulae
chainNames = paste0("chainR", 1:5) ## Names for each model
chains = list() ## Initialize list
## Loop over models
for(i in 1:length(Rs)){
chains[[i]] = MCMCglmm(A ~1 + B,
random = formula(Rs[i]),
rcov = ~units,
nitt = 500,
thin = 2,
burnin = 50,
prior = prior2,
family = "gaussian",
start = list(QUASI = FALSE),
data = df)
}
names(chains) = chainNames ## Name each model
save(chains, "chainsR1-R5.Rdata") ## Save all model output
A side note, paste0 is the same as paste, but with the argument sep="" by default

Get specific elements from clustered data in R

I generate this image using the hclust function. Now I wand to ID of those elements highlighted by squares.
Is there any way to get the ID and related value from the clusted datasets? Thanks
EDIT
I used this R script
library(gplots)
library(geneplotter)
# read the data in from URL
bots <- read.table("expression.txt")
# get just the alpha data
abot <- bots[,c(1:9)]
rownames(abot) <- bots[,1]
abot[1:7,]
# get rid of NAs
abot[is.na(abot)] <- 0
# we need to find a way of reducing the data. Can't do ANOVA as there are no
# replicates. Sort on max difference and take first 1000
min <-apply(abot, 1, min)
max <- apply(abot, 1, max)
sabot <- abot[order(max - min, decreasing=TRUE),][1:1000,]
# cluster on correlation
cdist <- as.dist(1 - cor(t(sabot)))
hc <- hclust(cdist, "average")
# draw a heatmap
x11()
heatmap.2(as.matrix(sabot),
Rowv=as.dendrogram(hc),
Colv=FALSE,
cexRow=1,
cexCol=1,
dendrogram="row",
scale="row",
trace="none",
density.info="none",
key=FALSE,
col=greenred.colors(80))
and my data look like this
YF MF SF YL ML SL Stem Root SULE
1 31.64075611 32.2728151 38.81790359 252.8901009 269.7599455 138.5011042 16.58308894 10.47935935 3.364295997
2 6.484902171 9.141084197 5.748798541 3.637332586 4.762966989 4.149302282 7.194971046 9.932508868 1.600027931
3 14.15218386 8.784155316 9.740794214 6.566584262 6.130503033 7.747728536 12.57014531 15.75181203 9.22907038
4 15.72881736 19.95755802 10.13050089 10.31313758 9.838844457 14.24864327 13.00442008 23.85404067 12.17251862
5 30.45475953 15.57131432 17.15277867 8.884751572 8.78786964 12.4745649 11.90176123 35.9844343 6.904763942
6 15.87149807 19.05523246 13.12846166 12.99750491 15.3775883 19.0044086 21.66051467 20.38501538 39.58478032
7 16.58935728 18.63990933 17.20955634 13.04423927 29.98424087 18.02165996 22.22403582 32.38377369 10.90832984
8 29.91118855 19.65844846 23.45958109 62.56338088 55.3926187 39.85296152 31.4832543 14.8484163 1.326553777
9 4.09192129 15.52499475 12.14321788 1.680854758 3.448485979 5.245481483 15.14443161 28.85873063 1.073855381
10 7.02768911 4.267210165 3.383501945 3.53716686 3.105614581 3.493791292 3.806360251 6.713067543 3.338740245
11 17.61821596 18.03607855 12.939663 8.951935241 15.45268577 15.53817186 20.5098186 23.42760284 27.97680418
12 66.35291651 40.41837702 37.7239447 32.42998176 30.09696289 27.81089554 33.27197681 46.5393928 4.141505618
13 15.45804403 15.98469202 17.21176468 9.105208867 11.76140929 13.9751105 14.72159466 25.68388472 7.493988128