R heatmap.2 manual grouping of rows and columns [duplicate] - r

This question already has answers here:
How to create pre-annotated rowside column in heatmap.2
(2 answers)
Closed 6 years ago.
I have the following MWE in which I make a heatmap without performing any clustering and showing any dendrogram. I want to group my rows (genes) together in categories, in a better looking way than how it is now.
This is the MWE:
#MWE
library(gplots)
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
rownames(mymat) <- paste(rownames(mymat), c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)), sep=" --- ")
mymat #50x12 MATRIX. 50 GENES IN 5 CATEGORIES, ACROSS 4 TREATMENTS WITH 3 REPLICATES EACH
png(filename="TEST.png", height=800, width=600)
print(
heatmap.2(mymat, col=greenred(75),
trace="none",
keysize=1,
margins=c(8,14),
scale="row",
dendrogram="none",
Colv = FALSE,
Rowv = FALSE,
cexRow=0.5 + 1/log10(dim(mymat)[1]),
cexCol=1.25,
main="Genes grouped by categories")
)
dev.off()
Which produces this:
I would like to group the CATEGORIES in the rows together (and, if possible, the treatments in the columns as well), so it looks something like the following:
Or, maybe even better, with the CATEGORIES on the left, the same way as when clustering is performed and dendrograms are shown; however is easier and clearer...
Is there any way? Thanks!!
EDIT!!
I was made aware of the RowSideColors in the comments and I made the MWE below. However, I don't seem to be able to print the legend in the output png, plus the colors in the legend are not correct, and I cannot get the position right either. So please help me with the legend in the MWE below.
On another hand, I use the palette "Set3", consisting of 12 colors, but what if I need more than 12 colors (if I have more than 12 categories)??
NEW MWE
library(gplots)
library(RColorBrewer)
col1 <- brewer.pal(12, "Set3")
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
mymat
mydf <- data.frame(gene=paste("gene", 1:dim(mymat)[1], sep="_"), category=c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)))
mydf
png(filename="TEST.png", height=800, width=600)
print(
heatmap.2(mymat, col=greenred(75),
trace="none",
keysize=1,
margins=c(8,6),
scale="row",
dendrogram="none",
Colv = FALSE,
Rowv = FALSE,
cexRow=0.5 + 1/log10(dim(mymat)[1]),
cexCol=1.25,
main="Genes grouped by categories",
RowSideColors=col1[as.numeric(mydf$category)]
)
#THE LEGEND DOESN'T WORK INSIDE print(), AND THE POSITION AND COLORS ARE WRONG
#legend("topright",
# legend = unique(mydf$category),
# col = col1[as.numeric(mydf$category)],
# lty= 1,
# lwd = 5,
# cex=.7
# )
)
dev.off()
Which produces:
Please help me with the legend, and with the hypothetical case I need more than 12 colors. Thanks!

I would use pheatmap package. Your example would look something like that:
library(pheatmap)
library(RColorBrewer)
# Generte data (modified the mydf slightly)
col1 <- brewer.pal(12, "Set3")
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
mydf <- data.frame(row.names = paste("gene", 1:dim(mymat)[1], sep="_"), category = c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)))
# add row annotations
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf)
# Add gaps
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf, gaps_row = c(10, 20, 30, 40))
# Save to file with dimensions that keep both row and column names readable
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf, gaps_row = c(10, 20, 30, 40), cellheight = 10, cellwidth = 20, file = "TEST.png")

Related

The plot does not display when arranged as single plot?

I created a heatmap and a pca plot and tried to merge them as single figure. But the are not displayed as single figure.
library(factoextra)
library(FactoMineR)
library(pheatmap)
library(RColorBrewer)
library(ggpubr)
# make test matrix
test = matrix(rnorm(200), 20, 10)
test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3
test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2
test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4
colnames(test) = paste("Test", 1:10, sep = "")
rownames(test) = paste("Gene", 1:20, sep = "")
# define the annotation
annotation_row = data.frame(GeneClass = factor(rep(c("Path1", "Path2", "Path3"), c(10, 4, 6))),
AdditionalAnnotation = c(rep("random1", 10), rep("random2", 10)))
rownames(annotation_row) = paste("Gene", 1:20, sep = "")
a=pheatmap(test, annotation_row = annotation_row)
# creating pca
# Compute PCA with ncp = 3
res.pca <- PCA(test, ncp = 3, graph = FALSE)
# Compute hierarchical clustering on principal components
res.hcpc <- HCPC(res.pca, graph = FALSE)
# Principal components + tree
b=plot(res.hcpc, choice = "3D.map")
#arranging in a single plot
ggarrange(a$gtable, b, labels = c("A", "B"))
The output was without pca:
plot (or plot.HCPC) returns NULL and therefore b is NULL.
And from ?ggarrange, it expects a list of plots to be arranged into the grid. The plots can be either ggplot2 plot objects or arbitrary gtables.
So one option could be using as.ggplot() function from {ggplotify} package to convert that base plot to ggplot object and then pass it to ggarrange.
b <- ggplotify::as.ggplot(~plot(res.hcpc, choice = "3D.map"))
#arranging in a single plot
ggarrange(a$gtable, b, labels = c("A", "B"))

R pheatmap: Perform clustering and show dendrograms PER ANNOTATION CATEGORY

I know how to group rows (genes) by annotation category using pheatmap, and I know how to perform Person's correlation clustering on the whole set of rows (genes), but what I would like to accomplish would be to perform clustering (and show independent dendrograms) on each category independently.
Is that even possible? Or am I forced to create a separate heat map for each category to do the clustering on a category basis?
Check my MWE below:
set.seed(1)
library(pheatmap)
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
annotdf <- data.frame(row.names = paste("gene", 1:dim(mymat)[1], sep="_"), category = c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)))
pheatmap(mymat,
scale="row",
cluster_rows = FALSE,
cluster_cols = FALSE,
gaps_row=c(10,20,30,40),
gaps_col=c(3,6,9),
cellheight = 6,
cellwidth = 20,
border_color=NA,
fontsize_row = 6,
filename = "TEST1.png",
annotation_row = annotdf
)
pheatmap(mymat,
scale="row",
cluster_rows = TRUE,
cluster_cols = FALSE,
clustering_distance_rows = "correlation",#Pearson's
clustering_method = "average",
gaps_col=c(3,6,9),
cellheight = 6,
cellwidth = 20,
border_color=NA,
fontsize_row = 6,
filename = "TEST2.png",
annotation_row = annotdf
)
Which produces:

how can I set the bin centre values of histogram myself?

Lets say I have a data frame like below
mat <- data.frame(matrix(data = rexp(200, rate = 10), nrow = 100, ncol = 10))
Which then I can calculate the histogram on each of them columns using
matAllCols <- apply(mat, 2, hist)
Now if you look at matAllCols$breaks , you can see sometimes 11, sometimes 12 etc.
what I want is to set a threshold for it. for example it should always be 12 and the distances between each bin centre (which is stored as matAllCols$mids) be 0.01
Doing it for one column at the time seems to be simple, but when I tried to do it for all columns, it does not work. also this is only breaks, how to set the mids is also not straightforward
matAllCols <- apply(mat, 2, function(x) hist(x , breaks = 12))
is there anyway to do this ?
You can solve the probrem by giving the all breakpoints between histogram cells as breaks. (But this is written in stat.ethz.ch/R-manual/R-devel/library/graphics/html/hist.html as #Colonel Beauvel said)
set.seed(1); mat <- data.frame(matrix(data = rexp(200, rate = 10), nrow = 100, ncol = 10))
# You need to check the data range to decide the breakpoints.
range(mat) # [1] 0.002025041 0.483281274
# You can set the breakpoints manually.
matAllCols <- apply(mat, 2, function(x) hist(x , breaks = seq(0, 0.52, 0.04)))
You are looking for
set.seed(1)
mat <- data.frame(matrix(data = rexp(200, rate = 10), nrow = 100, ncol = 10))
matAllCols <- apply(mat, 2, function(x) hist(x , breaks = seq(0, 0.5, 0.05)))
or simply
x <- rexp(200, rate = 10)
hist(x[x>=0 & x <=0.5] , breaks = seq(0, 0.5, 0.05))

Edit row and col names in Pheatmap

I want to edit row and col names in my pheatmap or eventually delete and add new row and col names to be edited. In this case I will set show_colnames and show_rownames to FALSE.
library("pheatmap")
pheatmap(scale(dat), show_colnames = T, show_rownames = T,legend = TRUE,
cluster_rows=F, cluster_cols=F, border_color = "grey60")
Can somebody help me thanks.
You can use labels_row and labels_col parameters.
> set.seed(1)
> mat <- matrix(rnorm(100), 10, 10, dimnames=list(letters[1:10], letters[11:20]))
> pheatmap(mat)
> pheatmap(mat, labels_row=paste0("foo", 1:10), labels_col=paste0("bar", 1:10))
Alternatively you can modify rownames / colnames of the matrix you pass to the pheatmap function.
library(magrittr)
mat %>%
set_rownames(paste0("foo", 1:10)) %>%
set_colnames(paste0("bar", 1:10)) %>%
pheatmap()

Simplest way to plot changes in ranking between two ordered lists in R?

I'm wondering if there is an easy way to plot the changes in position of elements between 2 lists in the form of a directed bipartite graph in R. For example, list 1 and 2 are vectors of character strings, not necessarily containing the same elements:
list.1 <- c("a","b","c","d","e","f","g")
list.2 <- c("b","x","e","c","z","d","a")
I would like to generate something similar to:
I've had a slight bash at using the igraph package, but couldn't easily construct what I would like, which I imagine and hope shouldn't be too hard.
Cheers.
Here is a simple function to do what you want. Essentially it uses match to match elements from one vector to another and arrows to draw arrows.
plotRanks <- function(a, b, labels.offset=0.1, arrow.len=0.1)
{
old.par <- par(mar=c(1,1,1,1))
# Find the length of the vectors
len.1 <- length(a)
len.2 <- length(b)
# Plot two columns of equidistant points
plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8,
xlim=c(0, 3), ylim=c(0, max(len.1, len.2)),
axes=F, xlab="", ylab="") # Remove axes and labels
points(rep(2, len.2), 1:len.2, pch=20, cex=0.8)
# Put labels next to each observation
text(rep(1-labels.offset, len.1), 1:len.1, a)
text(rep(2+labels.offset, len.2), 1:len.2, b)
# Now we need to map where the elements of a are in b
# We use the match function for this job
a.to.b <- match(a, b)
# Now we can draw arrows from the first column to the second
arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b,
length=arrow.len, angle=20)
par(old.par)
}
A few example plots
par(mfrow=c(2,2))
plotRanks(c("a","b","c","d","e","f","g"),
c("b","x","e","c","z","d","a"))
plotRanks(sample(LETTERS, 20), sample(LETTERS, 5))
plotRanks(c("a","b","c","d","e","f","g"), 1:10) # No matches
plotRanks(c("a", "b", "c", 1:5), c("a", "b", "c", 1:5)) # All matches
par(mfrow=c(1,1))
Here's a solution using igraph functions.
rankchange <- function(list.1, list.2){
grp = c(rep(0,length(list.1)),rep(1,length(list.2)))
m = match(list.1, list.2)
m = m + length(list.1)
pairs = cbind(1:length(list.1), m)
pairs = pairs[!is.na(pairs[,1]),]
pairs = pairs[!is.na(pairs[,2]),]
g = graph.bipartite(grp, as.vector(t(pairs)), directed=TRUE)
V(g)$color = c("red","green")[grp+1]
V(g)$label = c(list.1, list.2)
V(g)$x = grp
V(g)$y = c(length(list.1):1, length(list.2):1)
g
}
This builds and then plots the graph from your vectors:
g = rankchange(list.1, list.2)
plot(g)
Adjust the colour scheme and symbolism to suit using options detailed in the igraph docs.
Note this is not thoroughly tested (only tried on your sample data) but you can see how it builds a bipartite graph from the code.
With ggplot2:
v1 <- c("a","b","c","d","e","f","g")
v2 <- c("b","x","e","c","z","d","a")
o <- 0.05
DF <- data.frame(x = c(rep(1, length(v1)), rep(2, length(v2))),
x1 = c(rep(1 + o, length(v1)), rep(2 - o, length(v2))),
y = c(rev(seq_along(v1)), rev(seq_along(v2))),
g = c(v1, v2))
library(ggplot2)
library(grid)
ggplot(DF, aes(x=x, y=y, group=g, label=g)) +
geom_path(aes(x=x1), arrow = arrow(length = unit(0.02,"npc")),
size=1, color="green") +
geom_text(size=10) +
theme_minimal() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
This can of course be wrapped in a function easily.
Here's a generalization of nico's result for use with data frames:
plotRanks <- function(df, rank_col, time_col, data_col, color_col = NA, labels_offset=0.1, arrow_len=0.1, ...){
time_vec <- df[ ,time_col]
unique_dates <- unique(time_vec)
unique_dates <- unique_dates[order(unique_dates)]
rank_ls <- lapply(unique_dates, function(d){
temp_df <- df[time_vec == d, ]
temp_df <- temp_df[order(temp_df[ ,data_col], temp_df[ ,rank_col]), ]
temp_d <- temp_df[ ,data_col]
temp_rank <- temp_df[ ,rank_col]
if(is.na(color_col)){
temp_color = rep("blue", length(temp_d))
}else{
temp_color = temp_df[ ,color_col]
}
temp_rank <- temp_df[ ,rank_col]
temp_ls <- list(temp_rank, temp_d, temp_color)
names(temp_ls) <- c("ranking", "data", "color")
temp_ls
})
first_rank <- rank_ls[[1]]$ranking
first_data <- rank_ls[[1]]$data
first_length <- length(first_rank)
y_max <- max(sapply(rank_ls, function(l) length(l$ranking)))
plot(rep(1, first_length), 1:first_length, pch=20, cex=0.8,
xlim=c(0, length(rank_ls) + 1), ylim = c(1, y_max), xaxt = "n", xlab = NA, ylab="Ranking", ...)
text_paste <- paste(first_rank, "\n", "(", first_data, ")", sep = "")
text(rep(1 - labels_offset, first_length), 1:first_length, text_paste)
axis(1, at = 1:(length(rank_ls)), labels = unique_dates)
for(i in 2:length(rank_ls)){
j = i - 1
ith_rank <- rank_ls[[i]]$ranking
ith_data <- rank_ls[[i]]$data
jth_color <- rank_ls[[j]]$color
jth_rank <- rank_ls[[j]]$ranking
ith_length <- length(ith_rank)
jth_length <- length(jth_rank)
points(rep(i, ith_length), 1:ith_length, pch = 20, cex = 0.8)
i_to_j <- match(jth_rank, ith_rank)
arrows(rep(i - 0.98, jth_length), 1:jth_length, rep(i - 0.02, ith_length), i_to_j
, length = 0.1, angle = 10, col = jth_color)
offset_choice <- ifelse(length(rank_ls) == 2, i + labels_offset, i - labels_offset)
text_paste <- paste(ith_rank, "\n", "(", ith_data, ")", sep = "")
text(rep(offset_choice, ith_length), 1:ith_length, text_paste)
}
}
Here's an example using a haphazard reshape of the presidents dataset:
data(presidents)
years <- rep(1945:1974, 4)
n <- length(presidents)
q1 <- presidents[seq(1, n, 4)]
q2 <- presidents[seq(2, n, 4)]
q3 <- presidents[seq(3, n, 4)]
q4 <- presidents[seq(4, n, 4)]
quarters <- c(q1, q2, q3, q4)
q_label <- c(rep("Q1", n / 4), rep("Q2", n / 4), rep("Q3", n / 4), rep("Q4", n / 4))
q_colors <- c(Q1 = "blue", Q2 = "red", Q3 = "green", Q4 = "orange")
q_colors <- q_colors[match(q_label, names(q_colors))]
new_prez <- data.frame(years, quarters, q_label, q_colors)
new_prez <- na.omit(new_prez)
png("C:/users/fasdfsdhkeos/desktop/prez.png", width = 15, height = 10, units = "in", res = 300)
plotRanks(new_prez[new_prez$years %in% 1960:1970, ], "q_label", "years", "quarters", "q_colors")
dev.off()
This produces a time series ranking plot, and it introduces color if tracking a certain observation is desired:

Resources