I am learning about the "kohonen" package in R for the purpose of making Self Organizing Maps (SOM, also called Kohonen Networks - a type of Machine Learning algorithm). I am following this R language tutorial over here: https://www.rpubs.com/loveb/som
I tried to create my own data (this time with both "factor" and "numeric" variables) and run the SOM algorithm (this time using the "supersom()" function instead):
#load libraries and adjust colors
library(kohonen) #fitting SOMs
library(ggplot2) #plots
library(RColorBrewer) #colors, using predefined palettes
contrast <- c("#FA4925", "#22693E", "#D4D40F", "#2C4382", "#F0F0F0", "#3D3D3D") #my own, contrasting pairs
cols <- brewer.pal(10, "Paired")
#create and format data
#every column is drawn with the same length, so data.frame() does not have to
#recycle shorter vectors to fill the rows
n_obs <- 1000
a <- rnorm(n_obs, 10, 10)
b <- rnorm(n_obs, 10, 5)
c <- rnorm(n_obs, 5, 5)
d <- rnorm(n_obs, 5, 10)
e <- sample(LETTERS[1:4], n_obs, replace = TRUE, prob = c(0.25, 0.25, 0.25, 0.25))
f <- sample(LETTERS[1:5], n_obs, replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))
g <- sample(LETTERS[1:2], n_obs, replace = TRUE, prob = c(0.5, 0.5))
data <- data.frame(a, b, c, d, e, f, g)
data$e <- as.factor(data$e)
data$f <- as.factor(data$f)
data$g <- as.factor(data$g)
#standardise the four numeric columns; a separate index name keeps the `cols`
#palette defined above intact
num_cols <- 1:4
data[num_cols] <- scale(data[num_cols])
#som model
#as.list(data) turns every column into its own list element before fitting
som <- supersom(data = as.list(data), grid = somgrid(10, 10, "hexagonal"),
                dist.fcts = "euclidean", keep.data = TRUE)
From here, I was able to successfully make some of the basic plots:
# overview plots of the fitted SOM
# colour gradients computed from the map grid
colour1 <- tricolor(som$grid)
colour4 <- tricolor(som$grid, offset = 0.1, phi = c(pi / 8, 6, -pi / 6))
# the first two plot types need no extra arguments
for (plot_type in c("changes", "count")) {
  plot(som, type = plot_type)
}
plot(som, shape = "straight", type = "quality")
plot(
  som,
  type = "dist.neighbours",
  shape = "straight",
  palette.name = grey.colors
)
However, the problem arises when I try to make individual plots for each variable:
#error: this code stops with "incorrect number of dimensions"
#NOTE(review): `getCodes(som)[, var]` uses matrix-style [row, column] indexing;
#that fails if getCodes() returns a list for this multi-layer model -- confirm
#with str(getCodes(som))
var <- 1 #define the variable to plot
plot(som, type = "property", property = getCodes(som)[,var], main=colnames(getCodes(som))[var], palette.name=terrain.colors)
var <- 6 #define the variable to plot
plot(som, type = "property", property = getCodes(som)[,var], main=colnames(getCodes(som))[var], palette.name=terrain.colors)
This produces an error: "Error: Incorrect Number of Dimensions"
A similar error (NAs by coercion) is produced when attempting to cluster the SOM Network:
#cluster (error)
set.seed(33) #for reproducibility
#NOTE(review): `data` still contains the factor columns e, f and g; kmeans()
#has to coerce its input to numeric, which is where the "NAs by coercion"
#message comes from
fit_kmeans <- kmeans(data, 3) #3 clusters are used, as indicated by the wss development.
#NOTE(review): `data` is the input data frame (columns a-g) and has no
#`unit.classif` column; presumably the fitted `som` object was intended -- confirm
cl_assignmentk <- fit_kmeans$cluster[data$unit.classif]
par(mfrow=c(1,1))
plot(som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey",col=contrast)
add.cluster.boundaries(som, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
Can someone please tell me what I am doing wrong?
Thanks
Sources: https://www.rdocumentation.org/packages/kohonen/versions/2.0.5/topics/supersom
getCodes() returns a list here, so you have to index it like a list rather than like a matrix.
Calling getCodes(som) produces a list containing 7 items named a-g, so you should select items from it using either $ or [[]].
e.g
# select the first layer by position with [[ ]]
plot(
  som,
  type = "property",
  property = getCodes(som)[[1]],
  main = names(getCodes(som))[1],
  palette.name = terrain.colors
)
or
# select layer "a" by name with $
plot(
  som,
  type = "property",
  property = getCodes(som)$a,
  main = "a",
  palette.name = terrain.colors
)
or
# select layer "a" by name with [[ ]]
plot(
  som,
  type = "property",
  property = getCodes(som)[["a"]],
  main = "a",
  palette.name = terrain.colors
)
if you must set the variable prior to calling the plot you can do so like:
# position of the layer to plot
var <- 1
# fetch the list of codebook layers once and reuse it for data and title
codes <- getCodes(som)
plot(
  som,
  type = "property",
  property = codes[[var]],
  main = names(codes)[var],
  palette.name = terrain.colors
)
Regarding kmeans()
kmeans() needs a numeric matrix, or an object that can be coerced to one. Your data contains factors (categorical data), which cannot be coerced to numeric, so either drop the factors or use a different clustering method.
drop the factors:
#cluster the map units on the numeric layers only (factor layers e, f, g are dropped)
set.seed(33)
#for reproducibility
#codebook vectors of the numeric layers a-d, bound column-wise: one row per map unit
codes_num <- do.call(cbind, getCodes(som)[1:4])
fit_kmeans <- kmeans(codes_num, 3)
#3 clusters are used, as indicated by the wss development.
#cluster of each observation, looked up through the map unit it was assigned to
cl_assignmentk <- fit_kmeans$cluster[som$unit.classif]
par(mfrow = c(1, 1))
plot(som, type = "mapping", bg = rgb(colour4), shape = "straight", border = "grey", col = contrast)
#fit_kmeans$cluster holds one label per map unit, which is what the boundaries are drawn from
add.cluster.boundaries(som, fit_kmeans$cluster, lwd = 3, lty = 2, col = contrast[4])
edit:
Alternatively you can specify the code directly from getCodes() by using idx like so:
# idx picks the layer inside getCodes(), so no list indexing is needed afterwards
plot(som, type = "property", property = getCodes(som, idx = 1), main = "a", palette.name = terrain.colors)
Related
This is my first post so please tell me if I am breaking your rules! My problem is described in the comments of this simplified version of my code.
I want to plot an unrooted phylogenetic tree that neatly highlights selected clades.
I like the results of using geom_hilight() from ggtree with type = 'encircle', but I do not like having to individually edit the node and color values for every input. (see method 1)
method 2 is close to what I want, but with the wrong highlight type (roundrect)
method 3 uses the correct highlight type (encircle) but returns an error.
# I don't think all of these packages are needed to reproduce this problem
library(ape)
library(dplyr)
library(GGally)
library(ggalt)
library(ggforce)
library(ggplot2)
library(ggtree)
library(tidyr)
library(tidytree)
library(treeio)
# My pipeline uses RAxML output; this smaller Newick tree stands in for it here.
# The string is written straight to "test.tree".
cat(
  "((((((t24:0.8024311261,t11:0.7828436729):0.3048173019,(t21:0.4867131179,t18:0.2167164627):0.7519672168):0.5776117099,t5:0.4223263576):0.5963104749,(t17:0.1558260066,t20:0.41109852):0.09447153704):0.2661841849,((((t6:0.009324073093,t12:0.2732205035):0.7790091021,t10:0.08588226303):0.3282297731,t9:0.2075715209):0.664191803,(((t15:0.5832811284,t14:0.8461383074):0.6081165755,t19:0.5950602938):0.7095833826,t8:0.7146228608):0.7801561591):0.6674923887):0.654328516,(((t13:0.6356930537,t3:0.8536336934):0.8644152461,t2:0.1784738901):0.7129137593,t23:0.8907998055):0.3618239218,((t16:0.1825823467,t7:0.8856151809):0.4720220205,(t22:0.672613536,(t1:0.9215354125,(t4:0.9248593273,t25:0.5937075356):0.3007316259):0.6941311779):0.6789765966):0.2112918347);",
  file = "test.tree"
)
# read the tree back from disk
tree1 <- read.tree(file = "test.tree")
# root node and fill colour of each clade to highlight (same order in both vectors)
group.roots <- c(34, 28, 44, 41)
group.colors <- c("#fd00fe", "#62ce75", "#9a1073", "#4ad9e1")
# one row per highlighted clade
g <- data.frame(gnode = group.roots, gfill = group.colors)
# base plot in unrooted layout, reused by all methods below
tree1unrooted <- ggtree(tree1, layout = "unrooted")
#method 1: this is how I want my plot to look, but I do not want to write one "geom_hilight()" call per clade
tree1unrooted +
  geom_label(aes(label = node)) +
  geom_hilight(node = 34, fill = "#fd00fe", alpha = 1, type = "encircle", to.bottom = TRUE) +
  geom_hilight(node = 28, fill = "#62ce75", alpha = 1, type = "encircle", to.bottom = TRUE) +
  geom_hilight(node = 44, fill = "#9a1073", alpha = 1, type = "encircle", to.bottom = TRUE) +
  geom_hilight(node = 41, fill = "#4ad9e1", alpha = 1, type = "encircle", to.bottom = TRUE)
#method 2: highlighting several clades from a data frame works with "type = 'roundrect'", but the highlighted regions overlap.
tree1unrooted +
  geom_hilight(
    data = g,
    mapping = aes(node = gnode, fill = gfill),
    type = "roundrect",
    alpha = 1,
    to.bottom = TRUE
  )
#method 3: same call as method 2 but with the "type = 'encircle'" I need. This gives the error: "Error in FUN(X[[i]], ...) : object 'x' not found"
tree1unrooted +
  geom_hilight(
    data = g,
    mapping = aes(node = gnode, fill = gfill),
    type = "encircle",
    alpha = 1,
    to.bottom = TRUE
  )
This seems like a bug to me, since one wouldn't think changing the fill shape should cause an error when a different shape works with the same syntax.
It appears that the data passed to the geom_hilight layer gets merged with the plot data, and for some reason this step goes wrong with the "encircle" shape.
Anyway, one obvious solution is to program a list of single geom_hilight layers and add that to the plot:
# Build one geom_hilight() layer per row of `g` and add the resulting list of
# layers to the plot. seq_len() gives an empty sequence when `g` has no rows,
# so nothing is added in that case (seq(0) would yield c(1, 0) instead).
tree1unrooted +
  lapply(seq_len(nrow(g)), function(i) {
    geom_hilight(
      node = g$gnode[i],
      alpha = 1,
      fill = g$gfill[i],
      type = "encircle",
      to.bottom = TRUE
    )
  })
I am new to R. I would like to create a circular heatmap and split it into sectors, following https://jokergoo.github.io/2020/05/21/make-circular-heatmaps/, which says:
If the value for split argument is a factor, the order of the factor levels controls the order of heatmaps. If split is a simple vector, the order of heatmaps is unique(split).
# circos.clear() was called after the previous plot, so the layout starts
# again from theta = 0; the level order below makes 'e' the first sector
sector_split <- factor(split, levels = c("e", "d", "c", "b", "a"))
circos.heatmap(
  mat1,
  split = sector_split,
  col = col_fun1,
  show.sector.labels = TRUE
)
refered result plot
my data was like this:
esters.csv
This is my code
library(circlize)
library(ComplexHeatmap)
library(dendextend)
mat1 <- read.csv("esters.csv")
row.names(mat1) <- mat1[, 1] # the first column holds the row labels
# drop the label column and convert to a matrix, which is what gets plotted
mat2 <- as.matrix(mat1[, -1])
#Draw circoheatmap
col_fun1 <- colorRamp2(
  c(0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.4, 0.8),
  c("#FAFAFA", "#EAF7E7", "#E0F3DC", "#D7F0D1", "#CDEBC6", "#D5E4FD", "#8CACE3", "#5E7192")
)
circos.par(start.degree = 90, gap.degree = 10, gap.after = c(10))
# Derive the split from the row names: the number after "ester" decides the
# sector, with sector boundaries after ester40, ester80 and ester128.
# assumes every row name has the form "ester<number>" -- the check below stops otherwise
ester_id <- suppressWarnings(as.integer(sub("^ester", "", rownames(mat2))))
stopifnot(!anyNA(ester_id))
# a factor, so the level order fixes the order of the sectors
splits <- cut(
  ester_id,
  breaks = c(0, 40, 80, 128, Inf),
  labels = c("ester1-40", "ester41-80", "ester81-128", "ester129+")
)
circos.heatmap(mat2, col = col_fun1, split = splits,
               dend.track.height = 0.15,
               dend.side = "inside",
               rownames.side = "outside",
               dend.callback = function(dend, m, si) {
                 color_branches(dend, k = 4, col = 1:4)
               }
)
#By default, the numeric matrix is clustered on rows.
#Used to draw legend
lgd <- Legend(title = "Relative abundance", col_fun = col_fun1)
grid.draw(lgd)
circos.clear()
I want to add the split according to the specific row name, like "ester40", "ester80", "ester128". For example, the first split or sector contained 40 rows named "ester1, ester2, ester3, ester4,...to ester40" and all columns from "H6d_T" to "M10d_P".
I tried my best to understand it, but it still did not work.
Could anyone tell me what I should type in
split = ???
I want to plot a graph via Rgraphviz but I can't handle the design attributes of the clusters that I set.
There are similar questions already on SO and elsewhere but none has a real minimal working example and none of them is answered. So I want to try to ask a complete question to receive a complete answer. As an introduction to the package, I read the paper "How To Plot A Graph Using Rgraphviz" by Gentry, Gentleman, and Huber.
My example network:
library(Rgraphviz)
# fixed seed so the random graph is the same on every run
set.seed(123)
V <- letters[seq_len(6)] # node names "a" to "f"
M <- seq_len(4)
g1 <- randomGraph(V = V, M = M, p = 0.2)
If I want to plot it, I can easily give it some attributes via a list:
# node-level and graph-level settings, combined into one attribute list
node_attrs <- list(shape = "rectangle", fixedsize = FALSE)
graph_attrs <- list(layout = "dot", bgcolor = "transparent")
attributes <- list(node = node_attrs, graph = graph_attrs)
plot(g1, attrs = attributes)
Plotting it via plot(g1) gives the following result:
Now I want to define two clusters/subgraphs. This can be done this way:
# two node subsets of g1, each flagged to be drawn as a cluster
sg1 <- subGraph(c("a", "e", "f"), g1)
sg2 <- subGraph(c("b", "c", "d"), g1)
subGList <- lapply(
  list(sg1, sg2),
  function(sg) list(graph = sg, cluster = TRUE)
)
Plotting the graph again now including a subGlist argument:
# same plot as before, now with the subgraph list passed in
plot(g1, subGList = subGList, attrs = attributes)
So, obviously, there has been a change in the setting and even though it would be convenient having the clusters a little bit more separated, the result is ok.
Now if I want to define cluster-specific styles or try to have them framed, I start having problems. According to page 4 of the mentioned introductory paper one can simply add an element called attrs to the sublists of subGlist.
To my understanding, it should work this way:
# replace the first cluster entry with one that carries its own attrs element
subGList[[1]] <- list(graph = sg1,
                      cluster = TRUE,
                      attrs = c(fontcolor = "red"))
# the attribute list defined earlier is called `attributes`; no object named
# `attrs` is defined anywhere in this script
plot(g1, attrs = attributes, subGList = subGList)
Unfortunately, it doesn't. As mentioned, I would like to frame my clusters (similar to this SO post) but as I can't even handle the fontcolors of the clusters, I think I make a somehow more fundamental mistake.
My complete code:
library(Rgraphviz)
set.seed(123)
V <- letters[1:6]
M <- 1:4
g1 <- randomGraph(V, M, 0.2)
# global node and graph attributes
attributes <- list(node = list(shape = "rectangle", fixedsize = FALSE),
                   graph = list(layout = "dot", bgcolor = "transparent"))
#plot(g1, attrs = attributes )
# two node subsets of g1, each to be drawn as a cluster
sg1 <- subGraph(c("a", "e", "f"), g1)
sg2 <- subGraph(c("b", "c", "d"), g1)
subGList <- vector(mode = "list", length = 2)
subGList[[1]] <- list(graph = sg1, cluster = TRUE)
subGList[[2]] <- list(graph = sg2, cluster = TRUE)
#plot(g1, attrs = attributes , subGList = subGList)
# replace the first cluster entry with one that carries its own attrs element
subGList[[1]] <- list(graph = sg1,
                      cluster = TRUE,
                      attrs = c(fontcolor = "red"))
# pass the `attributes` list defined above; no object named `attrs` exists in this script
plot(g1, attrs = attributes, subGList = subGList)
I hope someone can help! Thank you
I'm trying to use the calculate.overlap function within the VennDiagram package to first calculate and then print a Venn Diagram. I was able to calculate the overlap of my data set but looking for help how to print the Venn graphic. Can anyone provide assistance? I read through the documentation but didn't find this.
library("VennDiagram")
# Two example sets: letters a-j and h-x, which share h, i and j
cardiome <- letters[1:10]
superset <- letters[8:24]
overlap <- calculate.overlap(
  x = list(
    "Cardiome" = cardiome,
    "SuperSet" = superset
  )
)
Another simple example that shows how to print a Venn diagram using the VennDiagram package:
library(VennDiagram)
cardiome <- letters[1:10]
superset <- letters[8:24]
# For two input sets calculate.overlap() returns a list with the elements
# a1 and a2 (the two sets) and a3 (their intersection).
overlap <- calculate.overlap(
  x = list("Cardiome" = cardiome, "SuperSet" = superset)
)
venn.plot <- draw.pairwise.venn(
  area1 = length(cardiome),
  area2 = length(superset),
  # size of the intersection itself; length(overlap) would only count the
  # elements of the returned list
  cross.area = length(overlap$a3),
  category = c("Cardiome", "Superset"),
  fill = c("blue", "red"),
  lty = "blank",
  cex = 2,
  cat.cex = 2,
  cat.pos = c(180, 180),
  cat.dist = 0.05,
  cat.just = list(c(0, 1), c(1, 1))
)
grid.draw(venn.plot)
savePlot(filename = "venndiag", type = "png")
Venn diagrams with item labels inside the sets:
library(RAM)
# named list of the sets to compare
vectors <- list(Cardiome = cardiome, Superset = superset)
# label = TRUE is what puts the item labels inside the sets
group.venn(
  vectors = vectors,
  label = TRUE,
  lab.cex = 1.1,
  fill = c("blue", "red"),
  cat.pos = c(180, 180)
)
The function venn.diagram() does it. For instance, in your example:
# the sets to draw, then the name of the output file
venn_sets <- list("Cardiome" = cardiome, "SuperSet" = superset)
venn.diagram(x = venn_sets, filename = "plot_venn")
It saves to working directory. Type getwd() to see what it is set to.
See the
?venn.diagram()
for more info.
?venn.diagram suggests this
library("VennDiagram")
venn_sets <- list(cardiome = letters[1:10], superset = letters[8:24])
# with filename = NULL the result is kept in venn.plot and drawn on the
# current device below
venn.plot <- venn.diagram(x = venn_sets, filename = NULL)
grid.draw(venn.plot)
I found a nice tutorial of self organizing map clustering in R in which, it is explained how to display your input data in the unit space (see below). In order to set up some rules for the labeling, I would like to compute the probability of each class in each neuron and plot it. Computing the probability is rather easy: take for each unit the number of observations of class i and divide it by the total number of observations in this unit. I end up with data.frame pc. Now I struggle to map this result, any clue on how to do it?
library(kohonen)
data(yeast)
# fixed seed so the trained map is the same on every run
set.seed(7)
yeast.supersom <- supersom(yeast, somgrid(8, 8, "hexagonal"), whatmap = 3:6)
classes <- levels(yeast$class)
colors <- c("yellow", "green", "blue", "red", "orange")
par(mfrow = c(3, 2))
# first panel: all observations on the map
plot(
  yeast.supersom,
  type = "mapping",
  main = "All",
  pch = 1,
  bgcol = gray(0.85),
  keepMargins = TRUE
)
library(plyr)
# pc gains one percentage column per class, keyed by unit number in Var1
pc <- data.frame(Var1 = 1:64)
for (i in seq_along(classes)) {
  # keep only the observations of class i in every element of yeast
  in_class <- yeast$class == classes[i]
  X.class <- lapply(yeast, function(layer) subset(layer, in_class))
  X.map <- map(yeast.supersom, X.class)
  plot(
    yeast.supersom,
    type = "mapping",
    classif = X.map,
    main = classes[i],
    col = colors[i],
    pch = 1,
    add = TRUE
  )
  # compute percentage per unit
  v1F <- levels(as.factor(X.map$unit.classif))
  v2F <- levels(as.factor(yeast.supersom$unit.classif))
  fList <- base::union(v2F, v1F)
  # observations per unit: class i only, then all classes, over the same levels
  class_counts <- table(factor(X.map$unit.classif, levels = fList))
  total_counts <- table(factor(yeast.supersom$unit.classif, levels = fList))
  pc <- join(pc, as.data.frame(class_counts / total_counts * 100), by = 'Var1')
  # the joined column is the last one; name it after the class
  colnames(pc)[NCOL(pc)] <- classes[i]
}
Okay, here is a solution:
Once the probability has been computed, a colour for it is derived from a predefined gradient (rbPal). The gradient is defined by a lower and an upper bound, and each probability gets the shade of the interval it falls into. This is done with the function findInterval.
# compute percentage per unit
v1F <- levels(as.factor(X.map$unit.classif))
v2F <- levels(as.factor(yeast.supersom$unit.classif))
fList <- base::union(v2F, v1F)
pc <- join(pc, as.data.frame(table(factor(X.map$unit.classif, levels = fList)) / table(factor(yeast.supersom$unit.classif, levels = fList)) * 100), by = 'Var1')
colnames(pc)[NCOL(pc)] <- classes[i]
# blue -> yellow -> red gradient; rbPal(100) yields 100 shades
rbPal <- colorRampPalette(c('blue', 'yellow', 'red'))
# Map every percentage to a palette index in 1..100. The breaks are 0, 1, ..., 100
# and rightmost.closed = TRUE puts exactly 100% into the last bin. Note that
# seq(0:100) is 1:101, not 0:100, and would send 100% to index 101, for which
# the 100-shade palette has no colour.
shade <- findInterval(pc[[classes[i]]], 0:100, rightmost.closed = TRUE)
plot(yeast.supersom, type = "mapping", bgcol = rbPal(100)[shade], main = paste("Probabily Clusters:", classes[i]))