There is a need to plot the result of a unsupervised rectangular SOM model. Additional requirements: 1) draw each node as a pie chart with corresponding observed classes; size of a chart should reflect the number of samples in the node. Default plot.kohonen doesn't suit such a case.
Here is a possible solution. The first function som.prep.df is called from the second 'som.draw', which has only two parameters SOM model and observed classes of training set.
som.prep.df <- function(som.model, obs.classes, scaled) {
require(reshape2)
lev <- factor(wine.classes)
df <- data.frame(cbind(unit=som.model$unit.classif, class=as.integer(lev)))
# create table
df2 <- data.frame(table(df))
df2 <- dcast(df2, unit ~ class, value.var="Freq")
df2$unit <- as.integer(df2$unit)
# calc sum
df2$sum <- rowSums(df2[,-1])
# calc fraction borders of classes in each node
tmp <- data.frame(cbind(X0=rep(0,nrow(df2)),
t(apply(df2[,-1], 1, function(x) {
cumsum(x[1:(length(x)-1)]) / x[length(x)]
}))))
df2 <- cbind(df2, tmp)
df2 <- melt(df2, id.vars=which(!grepl("^\\d$", colnames(df2))))
df2 <- df2[,-ncol(df2)]
# define border for each classs in each node
tmp <- t(apply(df2, 1, function(x) {
c(x[paste0("X", as.character(as.integer(x["variable"])-1))],
x[paste0("X", as.character(x["variable"]))])
}))
tmp <- data.frame(tmp, stringsAsFactors=FALSE)
tmp <- sapply(tmp, as.numeric)
colnames(tmp) <- c("ymin", "ymax")
df2 <- cbind(df2, tmp)
# scale size of pie charts
if (is.logical(scaled)) {
if (scaled) {
df2$xmax <- log2(df2$sum)
} else {
df2$xmax <- df2$sum
}
}
df2 <- df2[,c("unit", "variable", "ymin", "ymax", "xmax")]
colnames(df2) <- c("unit", "class", "ymin", "ymax", "xmax")
# replace classes with original levels names
df2$class <- levels(lev)[df2$class]
return(df2)
}
som.draw <- function(som.model, obs.classes, scaled=FALSE) {
# scaled - make or not a logarithmic scaling of the size of each node
require(ggplot2)
require(grid)
g <- som.model$grid
df <- som.prep.df(som.model, obs.classes, scaled)
df <- cbind(g$pts, df[,-1])
df$class <- factor(df$class)
g <- ggplot(df, aes(fill=class, ymax=ymax, ymin=ymin, xmax=xmax, xmin=0)) +
geom_rect() +
coord_polar(theta="y") +
facet_wrap(x~y, ncol=g$xdim, nrow=g$ydim) +
theme(axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
panel.margin = unit(0, "cm"),
strip.background = element_blank(),
strip.text = element_blank(),
plot.margin = unit(c(0,0,0,0), "cm"),
panel.background = element_blank(),
panel.grid = element_blank())
return(g)
}
Usage example.
require(kohonen)
data(wines)
som.wines <- som(scale(wines), grid = somgrid(5, 5, "rectangular"))
# Non-scaled map
som.draw(som.wines, wine.classes)
# Scaled map
som.draw(som.wines, wine.classes, TRUE)
This function can also be used for the visualization of supervised models as well. But it suits only for rectangular maps. Hope this will help someone.
There are several possible improvements:
Choose a better scaling function than logarithm. Because now nodes with single sample become invisible after scaling.
Add legend to the whole plot which will reflect the size of nodes.
Or add information about nodes population on each chart.
PS. The code isn't very elegant, so any suggestions and improvements are welcome.
Related
I am testing some templates of ggplot2, and I am interesting to the slope chart, available from here:
Slope Chart - Link
It works perfectly.
I am only trying to make a similar one but with only a group defined, like the following code:
library(dplyr)
library(ggplot2)
theme_set(theme_classic())
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")
source_df <- filter(source_df, group == "Thyroid")
then I copy the remaining code from the example:
# Define functions. Source: https://github.com/jkeirstead/r-slopegraph
tufte_sort <- function(df, x="year", y="value", group="group", method="tufte", min.space=0.05) {
## First rename the columns for consistency
ids <- match(c(x, y, group), names(df))
df <- df[,ids]
names(df) <- c("x", "y", "group")
## Expand grid to ensure every combination has a defined value
tmp <- expand.grid(x=unique(df$x), group=unique(df$group))
tmp <- merge(df, tmp, all.y=TRUE)
df <- mutate(tmp, y=ifelse(is.na(y), 0, y))
## Cast into a matrix shape and arrange by first column
require(reshape2)
tmp <- dcast(df, group ~ x, value.var="y")
ord <- order(tmp[,2])
tmp <- tmp[ord,]
min.space <- min.space*diff(range(tmp[,-1]))
yshift <- numeric(nrow(tmp))
## Start at "bottom" row
## Repeat for rest of the rows until you hit the top
for (i in 2:nrow(tmp)) {
## Shift subsequent row up by equal space so gap between
## two entries is >= minimum
mat <- as.matrix(tmp[(i-1):i, -1])
d.min <- min(diff(mat))
yshift[i] <- ifelse(d.min < min.space, min.space - d.min, 0)
}
tmp <- cbind(tmp, yshift=cumsum(yshift))
scale <- 1
tmp <- melt(tmp, id=c("group", "yshift"), variable.name="x", value.name="y")
## Store these gaps in a separate variable so that they can be scaled ypos = a*yshift + y
tmp <- transform(tmp, ypos=y + scale*yshift)
return(tmp)
}
plot_slopegraph <- function(df) {
ylabs <- subset(df, x==head(x,1))$group
yvals <- subset(df, x==head(x,1))$ypos
fontSize <- 3
gg <- ggplot(df,aes(x=x,y=ypos)) +
geom_line(aes(group=group),colour="grey80") +
geom_point(colour="white",size=8) +
geom_text(aes(label=y), size=fontSize, family="American Typewriter") +
scale_y_continuous(name="", breaks=yvals, labels=ylabs)
return(gg)
}
## Prepare data
df <- tufte_sort(source_df,
x="year",
y="value",
group="group",
method="tufte",
min.space=0.05)
df <- transform(df,
x=factor(x, levels=c(5,10,15,20),
labels=c("5 years","10 years","15 years","20 years")),
y=round(y))
## Plot
plot_slopegraph(df) + labs(title="Estimates of % survival rates") +
theme(axis.title=element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust=0.5,
family = "American Typewriter",
face="bold"),
axis.text = element_text(family = "American Typewriter",
face="bold"))
The problem I find is that the connecting lines disappears if I take only one object in source_df$group ()in this case Thyroid, like the followings:
If I add only another item in the same column everything is fine and the connecting line appears.
Is there a way to have the lines also in this situation? I have tried on a lot of ways, removing the lines containing NA values without success, therefore I do not know how to fix this problem, if ... it is possible to be fixed.
Thank you in advance for every eventual reply!
May I suggest a much easier way, with the {ggh4x} package, which has a base R type = "b" like geom. You can remove the points, and plot text instead.
You'll get your result in three lines of code :)
library(tidyverse)
library(ggh4x)
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")
source_df <- filter(source_df, group == "Thyroid")
ggplot(source_df, aes(year, value)) +
## set shape to NA
geom_pointpath(aes(group = group, mult = 1), shape = NA) +
geom_text(aes(label = value))
Created on 2021-12-30 by the reprex package (v2.0.1)
I fixed simply adding the line:
df <- df[complete.cases(df), ]
Before the graphing instructions. The problem was the generation of many lines with NA values and this line removes lines with null value.
I have a function called "my function" which creates a ggplot graph and saves it within the function.
#define a function "add_data" to transpose peak values into summary table
my_function <- function(exp, cond) {
#Read in appropriate experiment number
#Remove first 2 columns
#Rename first column to "mintime" and convert to minutes
#Normalize raw fluorescence values
flowdata <- read_csv(paste0(exp, ".csv"))
title <- cond
flowdata <- flowdata[, -c(1:2)] %>%
rename(mintime = 1) %>%
transform(mintime = mintime / 60)
flowdata[,-1] <- data.frame(lapply(flowdata[,-1], function(X) X/X[1]))
#Exclude values up to 5 minutes
#Determine number of peaks per cell
#Add number of peaks per cell to summary table
flowdata_cut <- flowdata[which(flowdata$mintime>=5),]
peak_info <- lapply(flowdata_cut[,-1], findpeaks, threshold=2)
numberpeak <- unlist(lapply(peak_info, nrow))
summarypeaks <- add_peaks(summarypeaks, numberpeak, title)
#Prepare data for line graph
melted <- melt(flowdata, id.vars="mintime")
#####CREATE GRAPH#####
#Plot graph
ggplot(data=melted, aes(x=mintime, y=value, group=variable)) +
geom_line(show.legend = FALSE) +
scale_x_continuous(limits = c(3, 12), breaks = seq(3, 12, by = 3)) +
labs(y="Fluo-4 fluorescence (F/F0)", x = "Time (min)") +
ggtitle(title) +
theme_bw() +
# remove elements we don't need
theme(panel.grid = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.background = element_blank())
#####SAVE GRAPH#####
#Save line graph as .png file
ggsave(filename = paste0(exp, "_Line_Graph.jpg"), width = 8, height = 4)
# Return
return(summarypeaks)
}
When I knit the document, the ggplot graph doesn't show in my HTML output despite these parameters
{r fig.cap="Experiment DATE", results = TRUE, eval = TRUE, echo= FALSE, warning=FALSE, message=FALSE, error=FALSE}
#Add experiment and condition
summarypeaks <- my_function(1284, "CONDITION")
Is there a way for the R Markdown HTML document to display the outputs of plots created within a function?
There's a way of getting the plot to show even without returning it, by using side effects. Specifically, you need to save your ggplot object into some variable, say p, and then tell R to print it using plot(). You don't need to change anything about how your function returns its results, it will just happen as a side effect. See below an updated version of your function:
#define a function "add_data" to transpose peak values into summary table
my_function <- function(exp, cond) {
#Read in appropriate experiment number
#Remove first 2 columns
#Rename first column to "mintime" and convert to minutes
#Normalize raw fluorescence values
flowdata <- read_csv(paste0(exp, ".csv"))
title <- cond
flowdata <- flowdata[, -c(1:2)] %>%
rename(mintime = 1) %>%
transform(mintime = mintime / 60)
flowdata[,-1] <- data.frame(lapply(flowdata[,-1], function(X) X/X[1]))
#Exclude values up to 5 minutes
#Determine number of peaks per cell
#Add number of peaks per cell to summary table
flowdata_cut <- flowdata[which(flowdata$mintime>=5),]
peak_info <- lapply(flowdata_cut[,-1], findpeaks, threshold=2)
numberpeak <- unlist(lapply(peak_info, nrow))
summarypeaks <- add_peaks(summarypeaks, numberpeak, title)
#Prepare data for line graph
melted <- melt(flowdata, id.vars="mintime")
#####CREATE GRAPH#####
#Plot graph
p <- ggplot(data=melted, aes(x=mintime, y=value, group=variable)) +
geom_line(show.legend = FALSE) +
scale_x_continuous(limits = c(3, 12), breaks = seq(3, 12, by = 3)) +
labs(y="Fluo-4 fluorescence (F/F0)", x = "Time (min)") +
ggtitle(title) +
theme_bw() +
# remove elements we don't need
theme(panel.grid = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.background = element_blank())
plot(p)
#####SAVE GRAPH#####
#Save line graph as .png file
ggsave(filename = paste0(exp, "_Line_Graph.jpg"), width = 8, height = 4)
# Return
return(summarypeaks)
}
Is there a way to create a function for my code below? I have multiple csv files that I'm running this exact code on, but it has been tiring using this same code over and over to do the same thing (and has made my script very long). Here is my code-
#define a function "add_peaks" to transpose peak values into summary table
add_peaks <- function(df, vec, colname) {
if(colname %in% names(df)) vec <- c(df[[colname]], vec)
new_row <- max(nrow(df), length(vec))
new_df <- df[1:new_row, ,drop = FALSE]
new_df[colname] <- c(vec, rep(NA, new_row - length(vec)))
new_df[is.na(new_df)] <- 0
rownames(new_df) <- NULL
new_df
}
#####INPUT#####
#Read in appropriate experiment number
#Remove first 2 columns
#Rename first column to "mintime" and convert to minutes
#Normalize raw fluorescence values
flowdata <- read_csv("####.csv") ####CHANGE EXP NUMBER
title <- "CONDITION HERE" ####CHANGE CONDITION"
flowdata <- flowdata[, -c(1:2)] %>%
rename(mintime = 1) %>%
transform(mintime = mintime / 60)
flowdata[,-1] <- data.frame(lapply(flowdata[,-1], function(X) X/X[1]))
#Exclude values up to 5 minutes
#Determine number of peaks per cell
#Add number of peaks per cell to summary table
flowdata_cut <- flowdata[which(flowdata$mintime>=5),]
peak_info <- lapply(flowdata_cut[,-1], findpeaks, threshold=2)
numberpeak <- unlist(lapply(peak_info, nrow))
summarypeaks <- add_peaks(summarypeaks, numberpeak, title)
#Prepare data for line graph
melted <- melt(flowdata, id.vars="mintime")
#####CREATE GRAPH#####
#Plot graph
ggplot(data=melted, aes(x=mintime, y=value, group=variable)) +
geom_line(show.legend = FALSE) +
scale_x_continuous(limits = c(3, 12), breaks = seq(3, 12, by = 3)) +
labs(y="Fluo-4 fluorescence (F/F0)", x = "Time (min)") +
ggtitle(title) +
theme_bw() +
# remove elements we don't need
theme(panel.grid = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.background = element_blank())
#####SAVE GRAPH#####
#Save line graph as .png file
ggsave(filename = "####_Line_Graph.png", ####CHANGE EXP NUMBER
width = 8, height = 4)
Please let me know if it is possible (code is appreciated), even if it is part of it only. Obviously I am new to coding. Thank you!
Below is your code, extracted into a function that takes 2 arguments - the experiment number (exp) and the condition (cond).
my_function <- function(exp, cond) {
#Read in appropriate experiment number
#Remove first 2 columns
#Rename first column to "mintime" and convert to minutes
#Normalize raw fluorescence values
flowdata <- read_csv(paste0(exp, ".csv"))
title <- cond
flowdata <- flowdata[, -c(1:2)] %>%
rename(mintime = 1) %>%
transform(mintime = mintime / 60)
flowdata[,-1] <- data.frame(lapply(flowdata[,-1], function(X) X/X[1]))
#Exclude values up to 5 minutes
#Determine number of peaks per cell
#Add number of peaks per cell to summary table
flowdata_cut <- flowdata[which(flowdata$mintime>=5),]
peak_info <- lapply(flowdata_cut[,-1], findpeaks, threshold=2)
numberpeak <- unlist(lapply(peak_info, nrow))
summarypeaks <- add_peaks(summarypeaks, numberpeak, title)
#Prepare data for line graph
melted <- melt(flowdata, id.vars="mintime")
#####CREATE GRAPH#####
#Plot graph
ggplot(data=melted, aes(x=mintime, y=value, group=variable)) +
geom_line(show.legend = FALSE) +
scale_x_continuous(limits = c(3, 12), breaks = seq(3, 12, by = 3)) +
labs(y="Fluo-4 fluorescence (F/F0)", x = "Time (min)") +
ggtitle(title) +
theme_bw() +
# remove elements we don't need
theme(panel.grid = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.background = element_blank())
#####SAVE GRAPH#####
#Save line graph as .png file
ggsave(filename = paste0(exp, "_Line_Graph.png"), width = 8, height = 4)
# Return
return(summarypeaks)
}
If you have experiment number 005 and condition "test", call the function like so, assigning the result to replace the old value of summarypeaks:
summarypeaks <- my_function(005, "test")
The idea is to combine R packages ClustOfVar and ggdendro to give a visual summary of variable clustering.
When there are few columns in the data, the result is pretty good except that there are areas not covered(as circled in the chart below). Using mtcars for example:
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
fit = hclustvar(X.quanti = mtcars)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
When there are a large number of columns, another issue occurs. That is, the height of the color tiles part does not match the height the dendrogram.
library(ClustOfVar)
library(ggdendro)
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
fit = hclustvar(X.quanti = X)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
#Sandy Muspratt has actually provided an excellent solution to this IF we have the R upgraded to version 3.3.1.
R: ggplot slight adjustment for clustering summary
But since I cannot change the version of the R deployed in the corporate server, I wonder if there is any other workaround that can align these two parts.
As far as I can tell, your code is not far wrong. The problem is that you are trying to match a continuous scale to a discrete scale when you merge the two plots. Also, it appears that ggdendrogram() adds additional space to the y-axis.
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
# Data
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
# Cluster analysis
fit = hclustvar(X.quanti = X)
# Labels data frames
df2 <- data.frame(cluster = cutree(fit, k =5),
states = factor(fit$labels, levels = fit$labels[fit$order]))
df3 <- ddply(df2, .(cluster), summarise, pos = mean(as.numeric(states)))
# Dendrogram
# scale_x_continuous() for p1 should match scale_x_discrete() from p2
# scale_x_continuous strips off the labels. I grab them from df2
# scale _y_continuous() puts a little space between the labels and the dendrogram
p1 <- ggdendrogram(as.dendrogram(fit), rotate = TRUE) +
scale_x_continuous(expand = c(0, 0.5), labels = levels(df2$states), breaks = 1:length(df2$states)) +
scale_y_continuous(expand = c(0.02, 0))
# Tiles and labels
p2 <- ggplot(df2,aes(states, y = 1, fill = factor(cluster))) +
geom_tile() +
scale_y_continuous(expand = c(0, 0)) +
scale_x_discrete(expand = c(0, 0)) +
geom_text(data = df3, aes(x = pos, label = cluster)) +
coord_flip() +
theme(axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
legend.position = "none")
# Get the ggplot grobs
gp1 <- ggplotGrob(p1)
gp2 <- ggplotGrob(p2)
# Make sure the heights match
maxHeight <- unit.pmax(gp1$heights, gp2$heights)
gp1$heights <- as.list(maxHeight)
gp2$heights <- as.list(maxHeight)
# Combine the two plots
grid.arrange(gp2, gp1, ncol = 2,widths = c(1/6, 5/6))
I try to draw something like this (but very simplified):
So the thing is that I set height, width and nob - number of bombs and I want to draw table with height*width cells, where there will by nob bombs ramdomly set (it could be for example text 'bomb',. it is not important). Furthermore, for every empty cell I want to count number of bombs in neighborhood and put that number in the middle of that cell (when zero - nothing). But I really have no idea for some "algorithm" for this. I draw board with proper size and that's all I can do. Any ideas, help?
w <- 7
h <- 5
nob <- 5
plot.new()
plot.window(xlim=c(0,w), ylim=c(0,h))
rect(0, 0, w, h)
for (i in 1:h-1){
lines(x=c(0,w), y=c(i,i))
}
for (j in 1:w-1){
lines(x=c(j,j), y=c(0, h))
}
sample(w*h, nob)
Some nice fun for Xmas time:
w <- 7
h <- 5
nob <- 5
nwal <- 7
set.seed(42) #for reproducibility
m <- matrix(0, ncol=w, nrow=h)
#place the walls
m[sample(length(m), nwal)] <- 1
o <- matrix("", ncol=w, nrow=h)
#place the bombs
o[sample(which(m == 0), nob)] <- "o"
#http://stackoverflow.com/a/22573306/1412059
#there is probably an alternative using igraph
sumNeighbors <- function(z) {
rbind(z[-1,],0) +
rbind(0,z[-nrow(z),]) +
cbind(z[,-1],0) +
cbind(0,z[,-ncol(z)]) +
cbind(rbind(z[-1,-1],0),0) +
cbind(0,rbind(z[-1,-ncol(z)],0)) +
cbind(rbind(0,z[-nrow(z),-1]),0) +
cbind(0,rbind(0,z[-nrow(z),-ncol(z)]))
}
library(reshape2)
DF <- melt(m, varnames = c("x", "y"), value.name = "z")
DF <- merge(DF, melt(o, varnames = c("x", "y"), value.name = "b"))
DF <- merge(DF, melt(sumNeighbors(o == "o"), varnames = c("x", "y"), value.name = "n"))
DF$n[DF$n == 0 | DF$b == "o" | DF$z == 1] <- ""
DF$t <- paste0(DF$n, DF$b)
library(ggplot2)
ggplot(DF, aes(x=x, y=y, fill=factor(z))) +
geom_tile(color="dark grey") +
geom_text(aes(label=t)) +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = "none")