how to rename values in Hierarchical cluster analysis in R? - r

I have this data with one column in character and one column in value.
data = structure(list(Station = c("1A", "1B", "2A", "2B", "3A", "3B",
"4A", "4B", "5A", "5B", "6A", "6B", "7A", "7B"), Particles.kg = c(370L,
420L, 250L, 320L, 130L, 210L, 290L, 390L, 230L, 340L, 60L, 90L,
130L, 170L)), class = "data.frame", row.names = c(NA, -14L))
now i convert the character in to factor by
data$Station = as.factor(data$Station)
then i start Hierarchical Cluster analysis
rownames(data) = c(data$Station)
data = data[,-1]
require(stats)
res.dist = dist(x=data, method = "euclidean")
hcl = hclust(d=res.dist, method = "complete")
plot(x=hcl, hang = -1, cex = 0.6)
(sorry can't upload the picture for network issue) but after this in my picture intead 1A, 2A, 3A, 3B it comes 1,2,3,.....,14.
how can i solve this?

After dropping the 1st column, there is only one column left which collapses the data into a vector. Vector drops the rownames of the dataframe, hence there are no labels.
You can use drop = FALSE to keep the data as dataframe after the subset.
rownames(data) = data$Station
data = data[,-1, drop = FALSE]
res.dist = dist(x=data, method = "euclidean")
hcl = hclust(d=res.dist, method = "complete")
plot(x=hcl, hang = -1, cex = 0.6)

There is no need to keep only the measurements column, the plot can be done by subsetting the data in the dist instruction.
Note that I use data[-1], not data[, -1], which would drop the dim attribute. The former always returns a sub-df.
rownames(data) <- data$Station
res.dist <- dist(x = data[-1], method = "euclidean")
hcl <- hclust(d = res.dist, method = "complete")
plot(x = hcl, hang = -1, cex = 0.6)

Related

How do I scatterplot LD1 vs LD2 in lda analysis?

I'm very very new to R, so thanks in advance for the help
I did the lda analysis on my dataset (tme.lda), in the console I get all my results with LD1, LD2, LD3, LD4, LD5 and LD6 but when I try to plot it I tried a lot of different methods but I get every kind of error: Error LD1 object not found - Error in fortify - Error in as.data.frame just to say a few.
This is my dataset:
dput(head(tme.lda))
structure(list(Word = structure(1:6, levels = c("bene", "bile",
"casa", "come", "posso", "tutto", "vero"), class = "factor"),
f0min = c(184L, 193L, 189L, 199L, 175L, 144L), f0max = c(229L,
226L, 198L, 225L, 192L, 188L), F1 = c(600L, 347L, 980L, 531L,
550L, 432L), F2 = c(2406L, 2695L, 1759L, 997L, 996L, 1901L
), F4 = c(4125L, 4403L, 3837L, 3988L, 3909L, 4171L), max_F0 = c(143L,
130L, 124L, 133L, 123L, 120L)), row.names = c(NA, 6L), class = "data.frame")
And this is the code I wrote, how can I get from here the scatterplot LD1 vs LD2?
View(tme.lda)
#lDFA analysis with "WORD" as grouping factor
tme.lda<-cbind(tme[,5],tme.lda[,1:6])
names(tme.lda)
#> [1] "tme[, 5]" "f0min" "f0max" "F1" "F2" "F4" "max_F0"
names(tme.lda)=c("Word","f0min","f0max","F1","F2","F4","max_F0")
names(tme.lda)
#> [1] "Word" "f0min" "f0max" "F1" "F2" "F4" "max_F0"
library(MASS)
lda(Word~f0min+f0max+F1+F2+F4+max_F0,data = tme.lda)
I tried this:
plot(Word, panel = tme.lda, abbrev = FALSE, xlab = "LD1", ylab = "LD2")
plot(x, panel = panel.lda, cex = 0.7, dimen=2, abbrev = FALSE, xlab = "LD1", ylab = "LD2")
ggplot(Word, panel = tme.lda, cex = 0.7, dimen=2, xlab = "LD1", ylab = "LD2")
ggplot2::aes(LD1,LD2) (Word, panel = tme.lda, cex = 0.7, dimen=2, xlab = "LD1", ylab = "LD2")
plot.lda<-lda(Word~f0min+f0max+F1+F2+F4+max_F0,data = tme.lda)
ggp <- ggplot(plot.lda, aes(x = LD1, y=LD2)) +
geom_point(mapping = aes(colour=Word)) +
ggtitle("LD1 Vs. LD2")
ggp <- ggplot(plot.lda, aes(x = LD1, y=LD2))
Just to say a few things I tried

Create a contingency table with 2 factors from messy data

I have the following data in messy format:
structure(list(com_level = c("B", "B", "B", "B", "A", "A"),
hf_com = c(1, 1, 1, 1, 1, 1),
sal_level = c("2", "3", "1", "2", "1", "4"),
exp_sal = c(NA, 1, 1, NA, 1, NA)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L))
Column com_level is the factor with 2 levels and column hf_com gives the frequency count for that level.
Column sal_level is the factor with 4 levels and column exp_sal gives the frequency count for that level.
I want to create a contingency table similar to this:
structure(list(`1` = c(1L, 2L),
`2` = c(0L, 1L),
`3` = c(0L, 2L),
`4` = c(1L, 0L)),
row.names = c("A", "B"), class = "data.frame")
I have code that works when I want to compare two columns with the same factor:
# 1 step to create table with frequency counts for exp_sal and curr_sal per category of level
cs_es_table <- df_not_na_num %>%
dplyr::count(sal_level, exp_sal, curr_sal) %>%
tidyr::spread(key = sal_level,value = n) %>% # this code spreads on just one key
select(curr_sal, exp_sal, 1, 2, 3, 4, 5, 6, 7, -8) %>% # reorder columns and omit Column 8 (no answer)
as.data.frame()
# step 2- convert cs_es_table to long format and summarise exp_sal and curr_sal frequencies
cs_es_table <- cs_es_table %>%
gather(key, value, -curr_sal,-exp_sal) %>% # crucial step to make data long
mutate(curr_val = ifelse(curr_sal == 1,value,NA),
exp_val = ifelse(exp_sal == 1,value,NA)) %>% #mutate actually cleans up the data and assigns a value to each new column for 'exp' and 'curr'
group_by(key) %>% #for your summary, because you want to sum up your previous rows which are now assigned a key in a new column
summarise_at( .vars = vars(curr_val, exp_val), .funs = sum, na.rm = TRUE)
This code produces this table but just spreads on one key in step 1:
structure(list(curr_val = c(533L, 448L, 237L, 101L, 56L), exp_val = c(179L,
577L, 725L, 401L, 216L)), row.names = c("< 1000 EUR", "1001-1500 EUR",
"2001-3000 EUR", "3001-4000 EUR", "4001-5000 EUR"), class = "data.frame")
Will I need to use pivot_wider as in this example?
Is it possible to use spread on multiple columns in tidyr similar to dcast?
or
tidyr::spread() with multiple keys and values
Any help would be appreciated to compare the two columns with different factors.

Group results under data names with sapply function and plot in R

Let's say I have a data which involves 3 separate data. Here is my data;
data<-structure(list(x = structure(list(value = c(2L, 4L, 5L, 6L, 9L,
4L, 3L, 2L, 10L, 6L)), .Names = "value", class = "data.frame", row.names = c(NA,
-10L)), y = structure(list(value = c(2, 2.1, 4, 3, 0, 1.2, 4.2,
3, 4, 9)), .Names = "value", class = "data.frame", row.names = c(NA,
-10L)), z = structure(list(value = c(1, 2, 7, 6, 0.3, 5.4, 4,
3, 6, 7)), .Names = "value", class = "data.frame", row.names = c(NA,
-10L))), .Names = c("x", "y", "z"))
And here is my sample function;
sam<-function(x) {
ex<-c(3,2,4,5,2)
z<-data.frame(x)
y<-as.matrix(sapply(z, as.numeric))
h<-lapply(c(2,5,10), function(xx) tapply(y, as.integer(gl(nrow(x), xx, nrow(x)) ), FUN = sum))
names(h)<-c("min2", "min5", "min10")
min2<-h[[1]]
pdf("plots.pdf")
plot(min2, ex, main="min. compare",
xlab="Historical Values ", ylab="Disaggregated Values", pch=19, col = "blue")
dev.off()
return(h)
}
In the function, I am aggregating values as shown. And then plotting min2 with ex data.
With the code below, I tried to use the function for all data like;
v1<-sapply(data, sam)
But I can not see calculation's name as min2 min5 min10 in result list. And also results are coming complexly, not under the x, y and z
I desire these two;
1) Grouping results under each data name. Like;
[x] [y] [z]
min2 min2 min2
min5 min5 min5
min10 min10 min10
2) Plotting the desired ones for all x, y and z as mentioned above. And export three plots to one pdf or separately.
To get the output, like #JonnyPhelps suggested, use lapply instead of sapply. To make the plots and get correlation you need to alter the function:
sam<-function(x) {
ex<-c(3,2,4,5,2)
z<-data.frame(x)
y<-as.matrix(sapply(z, as.numeric))
h<-lapply(c(2,5,10), function(xx) tapply(y, as.integer(gl(nrow(x), xx, nrow(x)) ), FUN = sum))
names(h)<-c("min2", "min5", "min10")
min2<-h[[1]]
plot(min2, ex, main="min. compare",
xlab="Historical Values ", ylab="Disaggregated Values",
pch=19, col = "blue")
COR = cor.test(min2,ex)
LABEL = paste("cor=",signif(COR$estimate,3),"\np=",signif(COR$p.value,3))
mtext(LABEL,side=3,padj=2)
return(h)
}
The correlation is calculated and you use mtext to place it at the top of the plot. You can play around with padj and adj to get the text where you need.
In your previous function, you called the plot in the function, this overwrites the file with every iteration. To plot all on a pdf, you need to do:
pdf("plots.pdf")
v1<-lapply(data, sam)
dev.off()
Or if you want them on the same page:
pdf("plots.pdf",width=8,height=4)
par(mfrow=c(1,3))
v1<-lapply(data, sam)
dev.off()

Implementing Tabu Search in R

I am trying to implement Tabu Search on a classification dataset namely Indian patients liver disease available in the UCI repository on https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset) but facing issues.
Following is the code I've used
NF <- 10
NTR <- 193
NTE <- 193
library(class)
library(e1071)
library(caret)
library(party)
library(nnet)
ILPD <- read.csv("C:/Users/Dell/Desktop/Codes and Datasets/ILPD.csv")
nrow(ILPD)
set.seed(9850)
gp<-runif(nrow(ILPD))
ILPD<-ILPD[order(gp),]
idx <- createDataPartition(y = ILPD$Class, p = 0.7, list = FALSE)
train<-ILPD[idx,]
test<-ILPD[-idx,]
ver<-test[,11]
evaluate <- function(th){
if (sum(th) == 0)return(0)
model <- svm(train[ ,th==1], train[,11] , gamma = 0.1, kernel ="sigmoid", na.action = na.omit)
pred <- predict(model, test[ ,th==1])
csRate <- sum(pred == ver)/NTE
penalty <- (NF - sum(th))/NF
return(csRate + penalty)
}
library(tabuSearch)
res <- tabuSearch(size = NF, iters = 2, objFunc = evaluate, config = matrix(1,1,NF), listSize = 5, nRestarts = 4)
plot(res)
plot(res, "tracePlot")
summary(res, verbose = TRUE)
Error:
Error in if (any(co)) { : missing value where TRUE/FALSE needed
In addition: Warning message:
In FUN(newX[, i], ...) : NAs introduced by coercion
Called from: svm.default(train[, th == 1], train[, 11], gamma = 0.1, kernel = "sigmoid", na.action = na.omit)
Some part of the data
structure(list(age = c(55L, 48L, 14L, 17L, 40L, 37L), gender = c(0L,
0L, 0L, 0L, 1L, 0L), TB = c(0.9, 2.4, 0.9, 0.9, 0.9, 0.7), DB = c(0.2,
1.1, 0.3, 0.2, 0.3, 0.2), Alkphos = c(116L, 554L, 310L, 224L,
293L, 235L), SGPT = c(36L, 141L, 21L, 36L, 232L, 96L), sgot = c(16L,
73L, 16L, 45L, 245L, 54L), TP = c(6.2, 7.5, 8.1, 6.9, 6.8, 9.5
), ALB = c(3.2, 3.6, 4.2, 4.2, 3.1, 4.9), AG = c(1, 0.9, 1, 1.55,
0.8, 1), Class = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("One",
"Two"), class = "factor")), .Names = c("age", "gender", "TB",
"DB", "Alkphos", "SGPT", "sgot", "TP", "ALB", "AG", "Class"), row.names = c(216L,
405L, 316L, 103L, 20L, 268L), class = "data.frame")
If anyone could help me with it
I wanted to see how tabu worked anyway so seemed a good place to start.
Basically you need to test your code better, evaluate just did not work. It is easy to test by hand by creating values of th and then calling evaluate on them.
Also use high level comments to organize your code and keep track of what you are doing, especially when posting to SO for help so as to save us time figuring out what you intend.
Not sure if these results are good, the amount of data is so minimal it is hard to tell.
Anyway here is the changed code:
NF <- 10
NTR <- 193
NTE <- 193
library(class)
library(e1071)
library(caret)
library(party)
library(nnet)
ILPD1 <- structure(
list(
age = c(55L,48L,14L,17L,40L,37L),
gender = c(0L,0L,0L,0L,1L,0L),
TB = c(0.9,2.4,0.9,0.9,0.9,0.7),
DB = c(0.2,1.1,0.3,0.2,0.3,0.2),
Alkphos = c(116L,554L,310L,224L,293L,235L),
SGPT = c(36L,141L,21L,36L,232L,96L),
sgot = c(16L,73L,16L,45L,245L,54L),
TP = c(6.2,7.5,8.1,6.9,6.8,9.5),
ALB = c(3.2,3.6,4.2,4.2,3.1,4.9),
AG = c(1,0.9,1,1.55,0.8,1),
Class = structure(c(2L,1L,2L,1L,1L,1L),
.Label = c("One","Two"),
class = "factor")
),
.Names = c("age","gender","TB","DB","Alkphos",
"SGPT","sgot","TP","ALB","AG","Class"),
row.names = c(216L,405L,316L,103L,20L,268L),
class = "data.frame"
)
ILPD <- ILPD1
#ILPD <- read.csv("ILPD.csv")
nrow(ILPD)
set.seed(9850)
# setup test and training data
gp <- runif(nrow(ILPD))
ILPD <- ILPD[order(gp),]
idx <- createDataPartition(y = ILPD$Class,p = 0.7,list = FALSE)
train <- ILPD[idx,]
test <- ILPD[ - idx,]
ver <- test[,11]
evaluate <- function(th) {
# evaluate the tabu for a value of th
# tabuSearch will use this function to evaluate points in its search space
#
# if everything is turned off just return zero as we are not interested
if(sum(th) == 0) return(0)
# we just want to train our svm on the columns for which th==1
svmtrn <- train[,th==1]
# but we need to have the Class varible as our label
if (is.null(trn$Class)) return(0)
# Train up an svm now
# Note that the first argument is the forumula we are training
model <- svm(Class~.,svmtrn,gamma = 0.1,kernel = "sigmoid",na.action = na.omit)
pred <- predict(model,test)
# now evaluate how well our prediction worked
csRate <- sum(pred == ver) / NTE
penalty <- (NF - sum(th)) / NF
return(csRate + penalty)
}
library(tabuSearch)
evaluate(matrix(1,1,NF))
res <- tabuSearch(size = NF,iters = 2,objFunc = evaluate,
config = matrix(1,1,NF),listSize = 5,nRestarts = 4)
plot(res)
plot(res,"tracePlot")
summary(res,verbose = TRUE)
Here are the output results:
[1] 6
[1] 0.005181347
Tabu Settings
Type = binary configuration
No of algorithm repeats = 1
No of iterations at each prelim search = 2
Total no of iterations = 12
No of unique best configurations = 8
Tabu list size = 5
Configuration length = 10
No of neighbours visited at each iteration = 10
Results:
Highest value of objective fn = 0.70518
Occurs # of times = 1
Optimum number of variables = 3
Optimum configuration:
[1] 1 0 0 0 0 1 0 0 0 1
And here is your plot:

Adding a continuous color gradient legend strip to plot

I am using following data and code to plot a barplot:
ddf=structure(list(VAR = structure(1:9, .Label = c("aaa", "bbb",
"ccc", "ddd", "eee", "fff", "ggg", "hhh", "iii"), class = "factor"),
VAL = c(0L, 32L, 64L, 96L, 128L, 160L, 192L, 224L, 256L)), .Names = c("VAR",
"VAL"), class = "data.frame", row.names = c(NA, -9L))
barplot(ddf$VAL,col=rgb(ddf$VAL,256-ddf$VAL,0,maxColorValue=256))
How can I add a legend strip having these colors on it, with corresponding values written there? I want a legend strip as appears in following plot:
library(latticeExtra)
dat <- data.frame(x = rnorm(1000), y = rnorm(1000), z = rnorm(1000, mean = 1))
maxz <- max(abs(dat$z))
levelplot(z ~ x * y, dat, at = seq(-maxz, maxz, length = 100), panel = panel.levelplot.points, par.settings = custom.theme.2())
I have tried to use following function examples but am unable to modify them to my needs:
legend("bottom", legend = LETTERS[1:6], col = c6, ncol = 2, cex = 2, lwd = 3, text.font = i, text.col = c6)
libarary(plotrix)
testcol<-color.gradient(c(0,1),0,c(1,0),nslices=5)
color.legend(11,6,11.8,9,col.labels,testcol,gradient="y")
Thanks in advance.
The trick is to draw the color key as an image in a separate panel. Notice that I decrese the margin between the panels by reducing par(mar=...) and add an outer margin around both panels with par(oma=...).
my.palette <- rgb(ddf$VAL,256-ddf$VAL,0,maxColorValue=256)
# Create two panels side by side
layout(t(1:2), widths=c(5,1))
# Set margins and turn all axis labels horizontally (with `las=1`)
par(mar=rep(.5, 4), oma=rep(3, 4), las=1)
barplot(ddf$VAL,col=my.palette)
# Draw the color legend
image(1, ddf$VAL, t(seq_along(ddf$VAL)), col=my.palette, axes=FALSE)
axis(4)
just for completeness the color.legend version, because I think it's the easiest one:
your data:
ddf=structure(list(VAR = structure(1:9, .Label = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh", "iii"), class = "factor"), VAL = c(0L, 32L, 64L, 96L, 128L, 160L, 192L, 224L, 256L)), .Names = ("VAR","VAL"), class = "data.frame", row.names = c(NA, -9L))
now we plot the barplot with the legend.
The numbers are the legend margins (x1,y1,x2,y2)
par(mar=c(1,3,1,5)+0.1,xpd=TRUE)
col.pal<-rgb(ddf$VAL,256-ddf$VAL,0,maxColorValue=256)
barplot(ddf$VAL,col=rgb(ddf$VAL,256-ddf$VAL,0,maxColorValue=256))
color.legend(12,220,13,80,rev(ddf$VAL),rev(col.pal),gradient="y")
Alternatively you can use colors.scale from the plotrix package and add it for example to the standard legend. It's exactly what the plotZCol function does in (my) spatDataManagement package.
From the examples :
x <- rnorm(1000)
y <- rnorm(1000)
library("spatDataManagement")
plotZCol(x,y,zCol=x+y)

Resources