Order heatmap rows in ggplot2 facet plot - r

I'm having a problem with faceted heatmap rendering in ggplot2. The idea is that I have several elements (these are genes in the real life) and several experiments (F1 and F2 in the example below). Using the F1 experiment, I'm able to create class of elements/genes based on their mean expression (high, ..., moderate, ..., low). In the heatmap produced through the example below, I would like to order each elements in each class (01, 02, 03, 04) based on its mean expression value in F1. Unfortunately, the elements appear in alphabetic order. I would be very happy to get some hints...
Best
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# For elements related to experiment F1
# we artificially produce a gradient of values that will
# create elements with increasing row means
d[d$experiment =="F1",1:5] <- round(sweep(d[d$experiment =="F1",1:5],
1,
seq(from=1, 10, length.out = 100),
"+"), 2)
# For elements related to experiment F2
# we artificially produce a gradient of values that will
# create elements with decreasing row means
d[d$experiment =="F2",1:5] <- round(sweep(d[d$experiment =="F2",1:5],
1,
seq(from=10, 1, length.out = 100),
"+"), 2)
#print(d[d$experiment =="F1",1:5])
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# For all experiments, we order elements based on the mean expression signal in
# F1.
row.means.F1 <- rowMeans(d.split$F1[,1:5])
pos <- order(row.means.F1)
for(s in names(d.split)){
d.split[[s]] <- d.split[[s]][pos,]
}
# We create several classes of elements based on their
# mean expression signal in F1.
cuts <- cut(1:nrow(d.split$F1), nb.experiment)
levels(cuts) <- sprintf("%02d", 1:nb.experiment)
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "rowMeanClass", "exprs")
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~rowMeanClass +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
ggsave("RPlot_test.jpeg", p)

Using your advises I was able to find a solution (which implies to clearly specify the order of levels for the 'elements' factor). Thank you #hrbrmstr (and all others).
NB: I only added few lines compare to the original code that are denoted below with 'Added: begin' and 'Added: end' flags.
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# For elements related to experiment F1
# we artificially produce a gradient of values that will
# create elements with increasing row means
d[d$experiment =="F1",1:5] <- round(sweep(d[d$experiment =="F1",1:5],
1,
seq(from=1, 10, length.out = 100),
"+"), 2)
# For elements related to experiment F2
# we artificially produce a gradient of values that will
# create elements with decreasing row means
d[d$experiment =="F2",1:5] <- round(sweep(d[d$experiment =="F2",1:5],
1,
seq(from=10, 1, length.out = 100),
"+"), 2)
#print(d[d$experiment =="F1",1:5])
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# For all experiments, we order elements based on the mean expression signal in
# F1.
row.means.F1 <- rowMeans(d.split$F1[,1:5])
pos <- order(row.means.F1)
for(s in names(d.split)){
d.split[[s]] <- d.split[[s]][pos,]
}
## Added: begin ###
#Get the list of elements in proper order (based on row mean)
mean.order <- as.character(d.split$F1$elements)
## Added: end###
# We create several classes of elements based on their
# mean expression signal in F1.
cuts <- cut(1:nrow(d.split$F1), nb.experiment)
levels(cuts) <- sprintf("%02d", 1:nb.experiment)
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "rowMeanClass", "exprs")
## Added: begin###
#Ensure that dm$elements is an ordered factor with levels
# ordered as expected
dm$elements <- factor(dm$elements, levels = mean.order, ordered = TRUE)
## Added: end###
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~rowMeanClass +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
ggsave("RPlot_test.jpeg", p)

Related

number of items to replace is not a multiple of replacement length + several plots in same graph error

I'm trying to draw several plots in the same graph in Rstudio, but to no avail. This is the code I'm using:
for (i in 1:10){
require(ggplot2)
N <- 100
T <- 3
Delta <- T/N
B <- numeric(N+1)
t <- seq(0,T,length=N+1)
for(i in 2:(N+1)){
B[i] <- B[i-1]+rnorm(1) * sqrt(Delta)
}
x <- 0
y <- 3
BB[i] <- x+B-(t/T)*(B[N+1]-y+x)
df <- melt(data = BB, id.vars = "t")
ggplot(data = df, aes(x = t, y = value, colour = variable)) + geom_line()
}
Using ggplot2 as I seen recomended in several Stackoverflow post yieds "number of items to replace is not a multiple of replacement length".
I've seen several answers to that question but being quite a noob in R I don't see how it applies to my problem. Please and thank you in advance.
How about this:
BB <- list() # define BB as a list
for (i in 1:10){
require(ggplot2)
N <- 100
T <- 3
Delta <- T/N
B <- numeric(N+1)
t <- seq(0,T,length=N+1)
for(q in 2:(N+1)){ # Change your index from i to q
B[q] <- B[q-1]+rnorm(1) * sqrt(Delta)
}
x <- 0
y <- 3
BB[[i]] <- x+B-(t/T)*(B[N+1]-y+x) # Assign each iteration to a list entry
}
# Exit the for loop
df <- as.data.frame(cbind(unlist(BB), # unlist the values in BB
rep(t,10), # define t variable by simply repeating it
rep(1:10,each= 101))) # define loop id in a similar manner
names(df) <- c('value','t','variable') # give names to the variables
df$variable <- as.factor(df$variable) # turn variable into a factor
ggplot(data = df, aes(x = t, y = value, colour = variable)) + geom_line()
The resulting plot:

Add error bars to multiple lines to show standard deviation on a plot in R

I have a plot with many diferent lines and I'd like to add error bars to each point on every line.
df <- matrix(runif(25),5,5)
plot(1:5,seq(0,1,1/4),type = 'n')
mapply(lines,as.data.frame(df),col=cols,pch=1:5,type="o")
I have tried to use the arrows function but with no success.
stdev <- matrix(runif(25,0,0.1),5,5)
A <- as.data.frame(df) + as.data.frame(stdev)
B <- as.data.frame(df) - as.data.frame(stdev)
mapply(arrows(1:5,A,1:5,B,col=cols,angle=90,length=0.03, code=3))
Any suggestions?
arrows is a vectorized function. So there is a possibility to avoid mapply call. Consider (I have also replaced your first mapply call by matplot):
## generate example data
set.seed(0)
mat <- matrix(runif(25), 5, 5) ## data to plot
stdev <- matrix(runif(25,0,0.1), 5, 5) ## arbitrary standard error
low <- mat - stdev ## lower bound
up <- mat + stdev ## upper bound
x <- seq(0,1,1/4) ## x-locations to plot against
## your colour setting; should have `ncol(mat)` colours
## as an example I just use `cols = 1:ncol(mat)`
cols <- 1:ncol(mat)
## plot each column of `mat` one by one (set y-axis limit appropriately)
matplot(x, mat, col = cols, pch = 1:5, type = "o", ylim = c(min(low), max(up)))
xx <- rep.int(x, ncol(mat)) ## recycle `x` for each column of `mat`
repcols <- rep(cols, each = nrow(mat)) ## recycle `col` for each row of `mat`
## adding error bars using vectorization power of `arrow`
arrows(xx, low, xx, up, col = repcols, angle = 90, length = 0.03, code = 3)
With ggplot:
set.seed(123) # for reproducibility
data <- as.data.frame(matrix(runif(25),5,5)) # sample data matrix
se <- as.data.frame(matrix(runif(25,0,0.1),5,5)) # SE matrix
data$line <- se$line <- as.factor(1:nrow(data))
library(reshape2)
data <- melt(data, id='line')
se <- melt(se, id='line')
data$ymax <- data$value + se$value
data$ymin <- data$value - se$value
library(ggplot2)
ggplot(data, aes(variable, value, group=line, color=line)) + geom_point() + geom_line() +
geom_errorbar(aes(ymax=ymax, ymin=ymin), width=0.25) + xlab('points')

geom_raster faceted plot with ggplot2: control row height

In the example below I have a dataset containing two experiments F1 and F2. A classification is performed based on F1 signal, and both F1 and F2 values are ordered accordingly. In this diagram, each facet has the same dimension although the number of rows is not the same (e.g class #7 contains only few elements compare to the other classes). I would like to modify the code to force row height to be the same across facets (facets would thus have various blank space below). Any hints would be greatly appreciated.
Thank you
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
nb.class <- 7
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# Now we create classes (here using hierarchical clustering )
# based on F1 experiment
dist.mat <- as.dist(1-cor(t(d.split$F1[,1:5]), method="pearson"))
hc <- hclust(dist.mat)
cuts <- cutree(hc, nb.class)
levels(cuts) <- sprintf("Class %02d", 1:nb.experiment)
# We split F1 and F2 based on classification result
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "class", "exprs")
dm$class <- as.factor(dm$class)
levels(dm$class) <- paste("Class", levels(dm$class))
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~class +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
print(p)
Use facet_grid instead of facet_wrap and set the space attribute:
ggplot(dm, aes(x = pos, y = elements, fill = exprs)) +
geom_raster() +
facet_grid(rowMeanClass ~ experiment , scales = "free", space = "free_y") +
theme_bw()

how to identify where of a column is more rich in one values?

If i generate randomly a binary data frame like below
Mat <- matrix(sample(0:1, 200*50, replace = TRUE),200,50)
If I have 200 rows for each column and I set a threshold like 50 up and 30 down.
how can I check whether the 50 rows in top of each column contains more 1 values or the 30 rows down of each column or the middle ?
how can I then plot something to show graphically the results ?
By doing
f <- function(x, u = 200, d = 200){
res <- list(NA)
for(i in 1:ncol(x)){
res[[i]] <- c(sum(x[1:u,i] == 1), sum(x[(u+1):(nrow(x)- d),i] == 1), sum(x[(nrow(x)-d+1):nrow(x),i] == 1))
}
res <- do.call(rbind, res)
res
}
then calculate
res_value <- f(output)
the res_values can be found here
https://gist.github.com/anonymous/a1f68b9798affe630e65
df <- data.frame(cbind(c(t(res_value)), rep(1:50, each = 3)), X3 = rep(1:3))
ggplot(df, aes(x = factor(X2), y = X1, fill = as.factor(X3))) + geom_bar(position="fill", stat = "identity")
I got a warning like below
Warning message:
In cbind(c(t(res_value)), rep(1:50, each = 3)) :
number of rows of result is not a multiple of vector length (arg 2)
and of course the plot is like below which is not good at all
What about this? First write a function to calculate the number of ones in each of the three groups using the thresholds (u and d) and then plot the result as filled barplot:
f <- function(x, u = 50, d = 30){
res <- list(NA)
for(i in 1:ncol(x)){
res[[i]] <- c(sum(x[1:u,i] == 1), sum(x[(u+1):(nrow(x)- d),i] == 1), sum(x[(nrow(x)-d+1):nrow(x),i] == 1))
}
res <- do.call(rbind, res)
res
}
res <- f(Mat)
df <- data.frame(cbind(c(t(res)), rep(1:50, each = 3)), X3 = rep(1:3))
ggplot(df, aes(x = factor(X2), y = X1, fill = as.factor(X3))) + geom_bar(position="fill", stat = "identity")
Group 1 is the upper group, 2 the middle and 3 the bottom group. If you want the exact numers to be plotted instead of normalized values you can set position = stack
This is how to view the matrix...
image(Mat)
You can try something like this:
Mat <- matrix(sample(0:1, 200*50, replace = TRUE),200,50)
high_t<-70
bottom_t<-70
sums <- rbind(colSums(Mat[1:high_t,]),colSums(Mat[(high_t+1):(nrow(Mat)-bottom_t),]),colSums(Mat[(nrow(Mat)-bottom_t+1):nrow(Mat),]))
res <- apply(sums,2,which.max)
For each interval, use colSums to sum the columns, then rbind the results and use which.max to find which interval has the most 1s, 1 for top, 2 for middle and 3 for bottom.
I changed your thresholds because the middle always wins if you choose 50 and 30 (the middle then has 120 rows)
library(reshape2)
library(gplots)
library(ggplot2)
Mat <- matrix(sample(0:1, 200*50, replace = TRUE), 200, 50)
low_cut <- 50
high_cut <- 30
lows <- apply(Mat, 2, function(x) sum(x[1:low_cut]))
highs <- apply(Mat, 2, function(x) sum(x[(length(x)-high_cut):length(x)]))
totals <- colSums(Mat)
mids <- totals - lows - highs
results <- data.frame(id = 1:NCOL(Mat),
lows = lows,
mids = mids,
highs = highs)
excludeVars <- names(results) %in% c('id')
image(as.matrix(results[!excludeVars]))
heatmap.2(as.matrix(results[!excludeVars]),
trace = "none")
melted_results <- melt(results, id.vars = "id")
ggplot(melted_results, aes(x=variable, y=id)) +
geom_tile(aes(fill=value))

Matrix of density plots with each plot overlaying two distributions

I have a data.frame with 5 columns and I'd like to generate a matrix of density plots, such that each density plot is an overlay of two density plots. (This is akin to plotmatrix, except that in my case, the number of non-NA value in each column differ from column to column and I want overlaid distributions rather than scatter plots).
My first attempt, which didn't work, is given below:
library(ggplot2)
library(reshape)
tmp1 <- data.frame(do.call(cbind, lapply(1:5, function(x) {
r <- rnorm(100)
r[sample(1:100, 20)] <- NA
return(r)
})))
ggplot( melt(tmp1), aes(x=value, fill=variable))+
geom_density(alpha=0.2, position="identity")+opts(legend.position = "none")+
facet_grid(variable ~ variable)
My second approach got me nearly there, but instead of 5 different colors, I only want to use two colors across all the plots. And, I'm sure there is a more elegant way to construct this expanded matrix:
tmp2 <- do.call(rbind, lapply(1:5, function(i) {
do.call(rbind, lapply(1:5, function(j) {
r <- rbind(data.frame(var=sprintf('X%d', i), val=tmp1[,i]),
data.frame(var=sprintf('X%d', j), val=tmp1[,j]))
r <- data.frame(xx=sprintf('X%d', i), yy=sprintf('X%d', j), r)
return(r)
}))
}))
ggplot(tmp2, aes(x=val, fill=var))+
geom_density(alpha=0.2, position="identity")+opts(legend.position = "none")+
facet_grid(xx ~ yy)
My solution was to manually loop through the pairs of columns and generate the overlaid density plots by hand, saving them to a list. I then arranged them in a grid using `grid.arrange' giving the image below.
But is it possible to achieve this using facet_grid instead?
The easiest way is to reshape your data with all permutations (5 * 5 = 25 of them).
require(gregmisc)
perm <- permutations(5, 2, paste0("X", 1:5), repeats.allowed=TRUE)
# instead of gregmisc + permutations, you can use expand.grid from base as:
# perm <- expand.grid(paste0("X", 1:5), paste0("X", 1:5))
o <- apply(perm, 1, function(idx) {
t <- tmp1[idx]
names(t) <- c("A", "B")
t$id1 <- idx[1]
t$id2 <- idx[2]
t
})
require(ggplot2)
require(reshape2)
o <- do.call(rbind, o)
o.m <- melt(o, c("id1", "id2"))
o.m$id1 <- factor(o.m$id1)
o.m$id2 <- factor(o.m$id2)
p <- ggplot(o.m, aes(x = value))
p <- p + geom_density(alpha = 0.2, position = "identity", aes(fill = variable))
p <- p + theme(legend.position = "none")
p <- p + facet_grid(id1 ~ id2)
p

Resources