how to create a heatmap with a fixed external hierarchical cluster - r

I have a matrix data, and want to visualize it with heatmap. The rows are species, so I want visualize the phylogenetic tree aside the rows and reorder the rows of the heatmap according the tree. I know the heatmap function in R can create the hierarchical clustering heatmap, but how can I use my phylogenetic clustering instead of the default created distance clustering in the plot?

First you need to use package ape to read in your data as a phylo object.
library(ape)
dat <- read.tree(file="your/newick/file")
#or
dat <- read.tree(text="((A:4.2,B:4.2):3.1,C:7.3);")
The following only works if your tree is ultrametric.
The next step is to transform your phylogenetic tree into class dendrogram.
Here is an example:
data(bird.orders) #This is already a phylo object
hc <- as.hclust(bird.orders) #Compulsory step as as.dendrogram doesn't have a method for phylo objects.
dend <- as.dendrogram(hc)
plot(dend, horiz=TRUE)
mat <- matrix(rnorm(23*23),nrow=23, dimnames=list(sample(bird.orders$tip, 23), sample(bird.orders$tip, 23))) #Some random data to plot
First we need to order the matrix according to the order in the phylogenetic tree:
ord.mat <- mat[bird.orders$tip,bird.orders$tip]
Then input it to heatmap:
heatmap(ord.mat, Rowv=dend, Colv=dend)
Edit: Here is a function to deal with ultrametric and non-ultrametric trees.
heatmap.phylo <- function(x, Rowp, Colp, ...){
# x numeric matrix
# Rowp: phylogenetic tree (class phylo) to be used in rows
# Colp: phylogenetic tree (class phylo) to be used in columns
# ... additional arguments to be passed to image function
x <- x[Rowp$tip, Colp$tip]
xl <- c(0.5, ncol(x)+0.5)
yl <- c(0.5, nrow(x)+0.5)
layout(matrix(c(0,1,0,2,3,4,0,5,0),nrow=3, byrow=TRUE),
width=c(1,3,1), height=c(1,3,1))
par(mar=rep(0,4))
plot(Colp, direction="downwards", show.tip.label=FALSE,
xlab="",ylab="", xaxs="i", x.lim=xl)
par(mar=rep(0,4))
plot(Rowp, direction="rightwards", show.tip.label=FALSE,
xlab="",ylab="", yaxs="i", y.lim=yl)
par(mar=rep(0,4), xpd=TRUE)
image((1:nrow(x))-0.5, (1:ncol(x))-0.5, x,
xaxs="i", yaxs="i", axes=FALSE, xlab="",ylab="", ...)
par(mar=rep(0,4))
plot(NA, axes=FALSE, ylab="", xlab="", yaxs="i", xlim=c(0,2), ylim=yl)
text(rep(0,nrow(x)),1:nrow(x),Rowp$tip, pos=4)
par(mar=rep(0,4))
plot(NA, axes=FALSE, ylab="", xlab="", xaxs="i", ylim=c(0,2), xlim=xl)
text(1:ncol(x),rep(2,ncol(x)),Colp$tip, srt=90, pos=2)
}
Here is with the previous (ultrametric) example:
heatmap.phylo(mat, bird.orders, bird.orders)
And with a non-ultrametric:
cat("owls(((Strix_aluco:4.2,Asio_otus:4.2):3.1,Athene_noctua:7.3):6.3,Tyto_alba:13.5);",
file = "ex.tre", sep = "\n")
tree.owls <- read.tree("ex.tre")
mat2 <- matrix(rnorm(4*4),nrow=4,
dimnames=list(sample(tree.owls$tip,4),sample(tree.owls$tip,4)))
is.ultrametric(tree.owls)
[1] FALSE
heatmap.phylo(mat2,tree.owls,tree.owls)

First, I create a reproducible example. Without data we can just guess what you want. So please try to do better next time(specially you are confirmed user). For example you can do this to create your tree in newick format:
tree.text='(((XXX:4.2,ZZZ:4.2):3.1,HHH:7.3):6.3,AAA:13.6);'
Like #plannpus, I am using ape to converts this tree to a hclust class. Unfortunatlty, it looks that we can do the conversion only for ultrametric tree: the distance from the root to each tip is the same.
library(ape)
tree <- read.tree(text='(((XXX:4.2,ZZZ:4.2):3.1,HHH:7.3):6.3,AAA:13.6);')
is.ultrametric(tree)
hc <- as.hclust.phylo(tree)
Then I am using dendrogramGrob from latticeExtra to plot my tree. and levelplot from lattice to draw the heatmap.
library(latticeExtra)
dd.col <- as.dendrogram(hc)
col.ord <- order.dendrogram(dd.col)
mat <- matrix(rnorm(4*4),nrow=4)
colnames(mat) <- tree$tip.label
rownames(mat) <- tree$tip.label
levelplot(mat[tree$tip,tree$tip],type=c('g','p'),
aspect = "fill",
colorkey = list(space = "left"),
legend =
list(right =
list(fun = dendrogramGrob,
args =
list(x = dd.col,
side = "right",
size = 10))),
panel=function(...){
panel.fill('black',alpha=0.2)
panel.levelplot.points(...,cex=12,pch=23)
}
)

I adapted plannapus' answer to deal with more than one tree (also cutting out some options I didn't need in the process):
library(ape)
heatmap.phylo <- function(x, Rowp, Colp, breaks, col, denscol="cyan", respect=F, ...){
# x numeric matrix
# Rowp: phylogenetic tree (class phylo) to be used in rows
# Colp: phylogenetic tree (class phylo) to be used in columns
# ... additional arguments to be passed to image function
scale01 <- function(x, low = min(x), high = max(x)) {
x <- (x - low)/(high - low)
x
}
col.tip <- Colp$tip
n.col <- 1
if (is.null(col.tip)) {
n.col <- length(Colp)
col.tip <- unlist(lapply(Colp, function(t) t$tip))
col.lengths <- unlist(lapply(Colp, function(t) length(t$tip)))
col.fraction <- col.lengths / sum(col.lengths)
col.heights <- unlist(lapply(Colp, function(t) max(node.depth.edgelength(t))))
col.max_height <- max(col.heights)
}
row.tip <- Rowp$tip
n.row <- 1
if (is.null(row.tip)) {
n.row <- length(Rowp)
row.tip <- unlist(lapply(Rowp, function(t) t$tip))
row.lengths <- unlist(lapply(Rowp, function(t) length(t$tip)))
row.fraction <- row.lengths / sum(row.lengths)
row.heights <- unlist(lapply(Rowp, function(t) max(node.depth.edgelength(t))))
row.max_height <- max(row.heights)
}
cexRow <- min(1, 0.2 + 1/log10(n.row))
cexCol <- min(1, 0.2 + 1/log10(n.col))
x <- x[row.tip, col.tip]
xl <- c(0.5, ncol(x)+0.5)
yl <- c(0.5, nrow(x)+0.5)
screen_matrix <- matrix( c(
0,1,4,5,
1,4,4,5,
0,1,1,4,
1,4,1,4,
1,4,0,1,
4,5,1,4
) / 5, byrow=T, ncol=4 )
if (respect) {
r <- grconvertX(1, from = "inches", to = "ndc") / grconvertY(1, from = "inches", to = "ndc")
if (r < 1) {
screen_matrix <- screen_matrix * matrix( c(r,r,1,1), nrow=6, ncol=4, byrow=T)
} else {
screen_matrix <- screen_matrix * matrix( c(1,1,1/r,1/r), nrow=6, ncol=4, byrow=T)
}
}
split.screen( screen_matrix )
screen(2)
par(mar=rep(0,4))
if (n.col == 1) {
plot(Colp, direction="downwards", show.tip.label=FALSE,xaxs="i", x.lim=xl)
} else {
screens <- split.screen( as.matrix(data.frame( left=cumsum(col.fraction)-col.fraction, right=cumsum(col.fraction), bottom=0, top=1)))
for (i in 1:n.col) {
screen(screens[i])
plot(Colp[[i]], direction="downwards", show.tip.label=FALSE,xaxs="i", x.lim=c(0.5,0.5+col.lengths[i]), y.lim=-col.max_height+col.heights[i]+c(0,col.max_height))
}
}
screen(3)
par(mar=rep(0,4))
if (n.col == 1) {
plot(Rowp, direction="rightwards", show.tip.label=FALSE,yaxs="i", y.lim=yl)
} else {
screens <- split.screen( as.matrix(data.frame( left=0, right=1, bottom=cumsum(row.fraction)-row.fraction, top=cumsum(row.fraction))) )
for (i in 1:n.col) {
screen(screens[i])
plot(Rowp[[i]], direction="rightwards", show.tip.label=FALSE,yaxs="i", x.lim=c(0,row.max_height), y.lim=c(0.5,0.5+row.lengths[i]))
}
}
screen(4)
par(mar=rep(0,4), xpd=TRUE)
image((1:nrow(x))-0.5, (1:ncol(x))-0.5, x, xaxs="i", yaxs="i", axes=FALSE, xlab="",ylab="", breaks=breaks, col=col, ...)
screen(6)
par(mar=rep(0,4))
plot(NA, axes=FALSE, ylab="", xlab="", yaxs="i", xlim=c(0,2), ylim=yl)
text(rep(0,nrow(x)),1:nrow(x),row.tip, pos=4, cex=cexCol)
screen(5)
par(mar=rep(0,4))
plot(NA, axes=FALSE, ylab="", xlab="", xaxs="i", ylim=c(0,2), xlim=xl)
text(1:ncol(x),rep(2,ncol(x)),col.tip, srt=90, adj=c(1,0.5), cex=cexRow)
screen(1)
par(mar = c(2, 2, 1, 1), cex = 0.75)
symkey <- T
tmpbreaks <- breaks
if (symkey) {
max.raw <- max(abs(c(x, breaks)), na.rm = TRUE)
min.raw <- -max.raw
tmpbreaks[1] <- -max(abs(x), na.rm = TRUE)
tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE)
} else {
min.raw <- min(x, na.rm = TRUE)
max.raw <- max(x, na.rm = TRUE)
}
z <- seq(min.raw, max.raw, length = length(col))
image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks,
xaxt = "n", yaxt = "n")
par(usr = c(0, 1, 0, 1))
lv <- pretty(breaks)
xv <- scale01(as.numeric(lv), min.raw, max.raw)
axis(1, at = xv, labels = lv)
h <- hist(x, plot = FALSE, breaks = breaks)
hx <- scale01(breaks, min.raw, max.raw)
hy <- c(h$counts, h$counts[length(h$counts)])
lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s",
col = denscol)
axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy))
par(cex = 0.5)
mtext(side = 2, "Count", line = 2)
close.screen(all.screens = T)
}
tree <- read.tree(text = "(A:1,B:1);((C:1,D:2):2,E:1);((F:1,G:1,H:2):5,((I:1,J:2):2,K:1):1);", comment.char="")
N <- sum(unlist(lapply(tree, function(t) length(t$tip))))
set.seed(42)
m <- cor(matrix(rnorm(N*N), nrow=N))
rownames(m) <- colnames(m) <- LETTERS[1:N]
heatmap.phylo(m, tree, tree, col=bluered(10), breaks=seq(-1,1,length.out=11), respect=T)

This exact application of a heatmap is already implemented in the plot_heatmap function (based on ggplot2) in the phyloseq package, which is openly/freely developed on GitHub. Examples with complete code and results are included here:
http://joey711.github.io/phyloseq/plot_heatmap-examples
One caveat, and not what you are explicitly asking for here, but phyloseq::plot_heatmap does not overlay a hierarchical tree for either axis. There is a good reason not to base your axis ordering on hierarchical clustering -- and this is because of the way indices at the end of long branches can still be next to each other arbitrarily depending on how branches are rotated at the nodes. This point, and an alternative based on non-metric multidimensional scaling is explained further in an article about the NeatMap package, which is also written for R and uses ggplot2. This dimension-reduction (ordination) approach to ordering the indices in a heatmap is adapted for phylogenetic abundance data in phyloseq::plot_heatmap.

While my suggestion for phlyoseq::plot_heatmap would get you part of the way there, the powerful "ggtree" package can do this, or more, if representing data on trees is really what you are going for.
Some examples are shown on the top of the following ggtree documentation page:
http://www.bioconductor.org/packages/3.7/bioc/vignettes/ggtree/inst/doc/advanceTreeAnnotation.html
Note that I am not affiliated with ggtree dev at all. Just a fan of the project and what it can already do.

After communication with #plannapus, I've modified (just a few) the code to remove some extra xlab="" information on the above code.
Here you will find the code. You can see the commented lines having the extra code and now the new lines just erasing them.
Hope this can help new users like me! :)
heatmap.phylo <- function(x, Rowp, Colp, ...){
# x numeric matrix
# Rowp: phylogenetic tree (class phylo) to be used in rows
# Colp: phylogenetic tree (class phylo) to be used in columns
# ... additional arguments to be passed to image function
x <- x[Rowp$tip, Colp$tip]
xl <- c(0.5, ncol(x) + 0.5)
yl <- c(0.5, nrow(x) + 0.5)
layout(matrix(c(0,1,0,2,3,4,0,5,0),nrow = 3, byrow = TRUE),
width = c(1,3,1), height = c(1,3,1))
par(mar = rep(0,4))
# plot(Colp, direction = "downwards", show.tip.label = FALSE,
# xlab = "", ylab = "", xaxs = "i", x.lim = xl)
plot(Colp, direction = "downwards", show.tip.label = FALSE,
xaxs = "i", x.lim = xl)
par(mar = rep(0,4))
# plot(Rowp, direction = "rightwards", show.tip.label = FALSE,
# xlab = "", ylab = "", yaxs = "i", y.lim = yl)
plot(Rowp, direction = "rightwards", show.tip.label = FALSE,
yaxs = "i", y.lim = yl)
par(mar = rep(0,4), xpd = TRUE)
image((1:nrow(x)) - 0.5, (1:ncol(x)) - 0.5, x,
#xaxs = "i", yaxs = "i", axes = FALSE, xlab = "", ylab = "", ...)
xaxs = "i", yaxs = "i", axes = FALSE, ...)
par(mar = rep(0,4))
plot(NA, axes = FALSE, ylab = "", xlab = "", yaxs = "i", xlim = c(0,2), ylim = yl)
text(rep(0, nrow(x)), 1:nrow(x), Rowp$tip, pos = 4)
par(mar = rep(0,4))
plot(NA, axes = FALSE, ylab = "", xlab = "", xaxs = "i", ylim = c(0,2), xlim = xl)
text(1:ncol(x), rep(2, ncol(x)), Colp$tip, srt = 90, pos = 2)
}

Related

How to plot a two-columned grid of time series with custom titles using R?

I have the following code (although without data, sadly):
detrend_plot <- cbind(l_p_lng,l_vol_lng,l_p_oil,l_rgdpe, ldiff_p_lng,ldiff_vol_lng,ldiff_p_oil,ldiff_rgdpe)
plot.ts(detrend_plot, main="",)
which gives the following plot:
What I want to do is to add custom titles, individual y-axis labels, and x-axis labels. I know that this is possible using GGPLOT, although my knowledge of it is sparse. Has anyone encountered a similar problem? I don't think this is possible using the regular plot.ts( ) function.
I don't think you can pass multiple titles and labels to plot.ts directly, but you can just loop over your columns with vectors of labels for each:
set.seed(1)
z <- ts(matrix(rt(200 * 8, df = 3), 200, 8), start = c(1961, 1), frequency = 12)
## vectors of x, y, and main labels
xl <- sprintf('x label %s', 1:8)
yl <- sprintf('y label %s', 1:8)
ml <- sprintf('main label %s', 1:8)
par(mfrow = c(4, 2), mar = c(5, 5, 1, 1), oma = c(0, 0, 1, 2))
lapply(1:8, function(ii) {
x <- z[, ii, drop = FALSE]
plot(x, xlab = xl[ii], ylab = yl[ii], main = ml[ii])
})
You can also pass vectors of arguments (eg, for x- or y-axis limits) using lists:
ylim <- list(c(-10, 10))
ylim <- rep(ylim, 8)
par(mfrow = c(4, 2), mar = c(5, 5, 1, 1), oma = c(0, 0, 1, 2))
lapply(1:8, function(ii) {
x <- z[, ii, drop = FALSE]
plot(x, xlab = xl[ii], ylab = yl[ii], main = ml[ii], col = ii, ylim = ylim[[ii]])
})
To get a figure closer to the default plot.ts look, you can just set top and bottom margins to 0 and adjust the axes (which is what plot.ts is doing under the hood). This method is a bit more verbose than plot.ts but will allow for more customization:
par(mfrow = c(4, 2), mar = c(0, 5, 0, 1), oma = c(5, 0, 3, 2))
lapply(1:8, function(ii) {
x <- z[, ii, drop = FALSE]
plot(x, xlab = xl[ii], ylab = yl[ii], col = ii, axes = FALSE)
axis(2, las = 1)
box()
if (ii %in% 7:8) {
axis(1)
title(xlab = 'Year', xpd = NA)
}
if (ii %in% 1:2)
title(main = c('Group 1', 'Group 2')[ii], xpd = NA, line = 1)
})

BRT: Add gradient colors to interaction plots using gbm.perspec

I would like to add a gradient of colours following the fitted values (e.g. higher fitted values darker colours, lower fitted values lighter colours) in my three-dimensional dependence plots.
I have used the example presented in dismo package:
library(dismo)
data(Anguilla_train)
angaus.tc5.lr01 <- gbm.step(data=Anguilla_train, gbm.x = 3:13, gbm.y = 2,
family = "bernoulli", tree.complexity = 5, learning.rate = 0.01,
bag.fraction = 0.5)
# Find interactions in the gbm model:
find.int <- gbm.interactions( angaus.tc5.lr01)
find.int$interactions
find.int$rank.list
I have only managed to add the same colour to the whole plot:
gbm.perspec( angaus.tc5.lr01, 7, 1,
x.label = "USRainDays",
y.label = "SegSumT",
z.label = "Fitted values",
z.range=c(0,0.435),
col="blue")
Or to add a gradient colour but not following the fitted values:
gbm.perspec( angaus.tc5.lr01, 7, 1,
x.label = "USRainDays",
y.label = "SegSumT",
z.label = "Fitted values",
col=heat.colors(50),
z.range=c(0,0.435))
I also checked the code of function gbm.perspec, and If I understood correctly the fitted values are call inside the formula as "prediction", and later on are part of the "pred.matrix" that is passed to the final plotting: persp(x = x.var, y = y.var, z = pred.matrix...), but I have no managed to access them from the gbm.perspec formula. I tried to modified the gbm.perpec function by adding "col=heat.colors(100)[round(pred.matrix*100, 0)]" into the persp() inside the function, but it does not do what I am looking for:
persp(x = x.var, y = y.var, z = pred.matrix, zlim = z.range,
xlab = x.label, ylab = y.label, zlab = z.label,
theta = theta, phi = phi, r = sqrt(10), d = 3,
ticktype = ticktype,
col=heat.colors(100)[round(pred.matrix*100, 0)],
mgp = c(4, 1, 0), ...)
I believe the solution might come from modifying the gbm.perpec function, do you know how?
Thank you for your time!
Modifying the gbm.perspec function is certainly an option, although if you use the predicted values from the gbm model and plot them onto a 3D scatterplot from another package you should be able to achieve it as well.
Here's an option using the plot3Drgl package, original code was provided by #Fabrice.
library(dismo); library(plot3Drgl); library(devEMF)
data(Anguilla_train)
angaus.tc5.lr01 <- gbm.step(data=Anguilla_train, gbm.x = 3:13, gbm.y = 2,
family = "bernoulli", tree.complexity = 5, learning.rate = 0.01,
bag.fraction = 0.5)
# Find interactions in the gbm model:
find.int <- gbm.interactions( angaus.tc5.lr01)
find.int$interactions
find.int$rank.list
d<-plot(angaus.tc5.lr01,c(1,7),return.grid=T)
x <- d$SegSumT
y <- d$USRainDays
z <- d$y
grid.lines = 30
elevation.site = loess(z ~ x*y, data=d, span=1, normalize = FALSE)
x.pred <- seq(min(x), max(x), length.out = grid.lines) # x grid
y.pred <- seq(min(y), max(y), length.out = grid.lines) # y grid
xy <- expand.grid( x = x.pred, y = y.pred) # final grid combined
z.site=matrix(predict(elevation.site, newdata = xy), nrow = grid.lines, ncol = grid.lines) # predicedt matrix
scatter3D(x, y, z, theta = 160, phi = 35, # x y z coords and angle of plot
clab = c(""), # Needs moving - label legend
colkey = list(side = 4, length = 0.65,
adj.clab = 0.15, dist = -0.15, cex.clab = 0.6, cex.axis = 0.6), # change the location and length of legend, change position of label and legend
clim = c(-4,0.1),
bty = "b", # type of box
col = ramp.col(col = c("grey", "blue"), 200),
pch = 19, cex = 0.55, # shape and size of points
xlab = "SegSumT",
xlim=c(10,20),ylim=c(0,3.5), zlim=c(-4,0.1), d= 2,
ylab = "USRaindays",
zlab= "Fitted values", #axes labels
cex.lab = 0.8, font.lab = 1, cex.axis = 0.6, font.axis= 1, # size and font of axes and ticks
ticktype = "detailed", nticks = 5, # ticks and numer of ticks
#type = "h", # vertical lines
surf = list(x = x.pred, y = y.pred, z = z.site,
facets = NA, CI=NULL))
enter image description here
By tweaking with grid.lines and reversing the x axis you should be able to produce exactly what you want.
By incorporating some of the code found here into the gbm.perspec() source code you can create the desired effect.
First run
# Color palette (100 colors)
col.pal<-colorRampPalette(c("blue", "red"))
colors<-col.pal(100)
Then, add z.facet.center to gbm.perspec() source code after else and change the z in the code to pred.matrixas follows,
# and finally plot the result
#
if (!perspective) {
image(x = x.var, y = y.var, z = pred.matrix, zlim = z.range)
} else {
z.facet.center <- (pred.matrix[-1, -1] + pred.matrix[-1, -ncol(pred.matrix)] +
pred.matrix[-nrow(pred.matrix), -1] + pred.matrix[-nrow(pred.matrix), -ncol(pred.matrix)])/4
# Range of the facet center on a 100-scale (number of colors)
z.facet.range<-cut(z.facet.center, 100)
persp(x=x.var, y=y.var, z=pred.matrix, zlim= z.range, # input vars
xlab = x.label, ylab = y.label, zlab = z.label, # labels
theta=theta, phi=phi, r = sqrt(10), d = 3,
col=colors[z.facet.range],# viewing pars
ticktype = ticktype, mgp = c(4,1,0), ...) #
which will give you a plot like this (please note, this is not plotted using the sample dataset which is why the interaction effect is different than the plot in the question).
Alternatively, you can create a new function. The following example modifies gbm.perspec() to give a white-to-red gradient. Simply run the code in R, then change gbm.perspec() to gbm.perspec2()
# interaction function
# Color palette (100 colors)
col.pal<-colorRampPalette(c("white", "pink", "red"))
colors<-col.pal(100)
gbm.perspec2 <- function(gbm.object,
x = 1, # the first variable to be plotted
y = 2, # the second variable to be plotted
pred.means = NULL, # allows specification of values for other variables
x.label = NULL, # allows manual specification of the x label
x.range = NULL, # manual range specification for the x variable
y.label = NULL, # and y la seminar committeebel
z.label = "fitted value", #default z label
y.range = NULL, # and the y
z.range = NULL, # allows control of the vertical axis
leg.coords = NULL, #can specify coords (x, y) for legend
ticktype = "detailed",# specifiy detailed types - otherwise "simple"
theta = 55, # rotation
phi=40, # and elevation
smooth = "none", # controls smoothing of the predicted surface
mask = FALSE, # controls masking using a sample intensity model
perspective = TRUE, # controls whether a contour or perspective plot is drawn
...) # allows the passing of additional arguments to plotting routine
# useful options include shade, ltheta, lphi for controlling illumination
# and cex for controlling text size - cex.axis and cex.lab have no effect
{
if (! requireNamespace('gbm') ) { stop('you need to install the gbm package to use this function') }
requireNamespace('splines')
#get the boosting model details
gbm.call <- gbm.object$gbm.call
gbm.x <- gbm.call$gbm.x
n.preds <- length(gbm.x)
gbm.y <- gbm.call$gbm.y
pred.names <- gbm.call$predictor.names
family = gbm.call$family
# and now set up range variables for the x and y preds
have.factor <- FALSE
x.name <- gbm.call$predictor.names[x]
if (is.null(x.label)) {
x.label <- gbm.call$predictor.names[x]
}
y.name <- gbm.call$predictor.names[y]
if (is.null(y.label)) {
y.label <- gbm.call$predictor.names[y]
}
data <- gbm.call$dataframe[ , gbm.x, drop=FALSE]
n.trees <- gbm.call$best.trees
# if marginal variable is a vector then create intervals along the range
if (is.vector(data[,x])) {
if (is.null(x.range)) {
x.var <- seq(min(data[,x],na.rm=T),max(data[,x],na.rm=T),length = 50)
} else {
x.var <- seq(x.range[1],x.range[2],length = 50)
}
} else {
x.var <- names(table(data[,x]))
have.factor <- TRUE
}
if (is.vector(data[,y])) {
if (is.null(y.range)) {
y.var <- seq(min(data[,y],na.rm=T),max(data[,y],na.rm=T),length = 50)
} else {y.var <- seq(y.range[1],y.range[2],length = 50)}
} else {
y.var <- names(table(data[,y]))
if (have.factor) { #check that we don't already have a factor
stop("at least one marginal predictor must be a vector!")
} else {have.factor <- TRUE}
}
pred.frame <- expand.grid(list(x.var,y.var))
names(pred.frame) <- c(x.name,y.name)
pred.rows <- nrow(pred.frame)
#make sure that the factor variable comes first
if (have.factor) {
if (is.factor(pred.frame[,2])) { # swap them about
pred.frame <- pred.frame[,c(2,1)]
x.var <- y.var
}
}
j <- 3
# cycle through the predictors
# if a non-target variable find the mean
for (i in 1:n.preds) {
if (i != x & i != y) {
if (is.vector(data[,i])) {
m <- match(pred.names[i],names(pred.means))
if (is.na(m)) {
pred.frame[,j] <- mean(data[,i],na.rm=T)
} else pred.frame[,j] <- pred.means[m]
}
if (is.factor(data[,i])) {
m <- match(pred.names[i],names(pred.means))
temp.table <- table(data[,i])
if (is.na(m)) {
pred.frame[,j] <- rep(names(temp.table)[2],pred.rows)
} else {
pred.frame[,j] <- pred.means[m]
}
pred.frame[,j] <- factor(pred.frame[,j],levels=names(temp.table))
}
names(pred.frame)[j] <- pred.names[i]
j <- j + 1
}
}
#
# form the prediction
#
#assign("pred.frame", pred.frame, pos=1)
prediction <- gbm::predict.gbm(gbm.object,pred.frame,n.trees = n.trees, type="response")
#assign("prediction", prediction, pos=1, immediate =T)
# model smooth if required
if (smooth == "model") {
pred.glm <- glm(prediction ~ ns(pred.frame[,1], df = 8) * ns(pred.frame[,2], df = 8), data=pred.frame,family=poisson)
prediction <- fitted(pred.glm)
}
# report the maximum value and set up realistic ranges for z
max.pred <- max(prediction)
message("maximum value = ",round(max.pred,2),"\n")
if (is.null(z.range)) {
if (family == "bernoulli") {
z.range <- c(0,1)
} else if (family == "poisson") {
z.range <- c(0,max.pred * 1.1)
} else {
z.min <- min(data[,y],na.rm=T)
z.max <- max(data[,y],na.rm=T)
z.delta <- z.max - z.min
z.range <- c(z.min - (1.1 * z.delta), z.max + (1.1 * z.delta))
}
}
# now process assuming both x and y are vectors
if (have.factor == FALSE) {
# form the matrix
pred.matrix <- matrix(prediction,ncol=50,nrow=50)
# kernel smooth if required
if (smooth == "average") { #apply a 3 x 3 smoothing average
pred.matrix.smooth <- pred.matrix
for (i in 2:49) {
for (j in 2:49) {
pred.matrix.smooth[i,j] <- mean(pred.matrix[c((i-1):(i+1)),c((j-1):(j+1))])
}
}
pred.matrix <- pred.matrix.smooth
}
# mask out values inside hyper-rectangle but outside of sample space
if (mask) {
mask.trees <- gbm.object$gbm.call$best.trees
point.prob <- gbm::predict.gbm(gbm.object[[1]],pred.frame, n.trees = mask.trees, type="response")
point.prob <- matrix(point.prob,ncol=50,nrow=50)
pred.matrix[point.prob < 0.5] <- 0.0
}
#
# and finally plot the result
#
if (!perspective) {
image(x = x.var, y = y.var, z = pred.matrix, zlim = z.range)
} else {
z.facet.center <- (pred.matrix[-1, -1] + pred.matrix[-1, -ncol(pred.matrix)] +
pred.matrix[-nrow(pred.matrix), -1] + pred.matrix[-nrow(pred.matrix), -ncol(pred.matrix)])/4
# Range of the facet center on a 100-scale (number of colors)
z.facet.range<-cut(z.facet.center, 100)
persp(x=x.var, y=y.var, z=pred.matrix, zlim= z.range, # input vars
xlab = x.label, ylab = y.label, zlab = z.label, # labels
theta=theta, phi=phi, r = sqrt(10), d = 3,
col=colors[z.facet.range],# viewing pars
ticktype = ticktype, mgp = c(4,1,0), ...) #
}
}
if (have.factor) {
# we need to plot values of y for each x
factor.list <- names(table(pred.frame[,1]))
n <- 1
#add this bit so z.range still works as expected:
if (is.null(z.range)) {
vert.limits <- c(0, max.pred * 1.1)
} else {
vert.limits <- z.range
}
plot(pred.frame[pred.frame[,1]==factor.list[1],2],
prediction[pred.frame[,1]==factor.list[1]],
type = 'l',
#ylim = c(0, max.pred * 1.1),
ylim = vert.limits,
xlab = y.label,
ylab = z.label, ...)
for (i in 2:length(factor.list)) {
#factor.level in factor.list) {
factor.level <- factor.list[i]
lines(pred.frame[pred.frame[,1]==factor.level,2],
prediction[pred.frame[,1]==factor.level], lty = i)
}
# now draw a legend
if(is.null(leg.coords)){
x.max <- max(pred.frame[,2])
x.min <- min(pred.frame[,2])
x.range <- x.max - x.min
x.pos <- c(x.min + (0.02 * x.range),x.min + (0.3 * x.range))
y.max <- max(prediction)
y.min <- min(prediction)
y.range <- y.max - y.min
y.pos <- c(y.min + (0.8 * y.range),y.min + (0.95 * y.range))
legend(x = x.pos, y = y.pos, factor.list, lty = c(1:length(factor.list)), bty = "n")
} else {
legend(x = leg.coords[1], y = leg.coords[2], factor.list, lty = c(1:length(factor.list)), bty = "n", ncol = 2)
}
}
}

Graphical output of density for the function gammamixEM (package mixtools)

I'm using the function gammamixEM from the package mixtools. How can I return the graphical output of density as in the function normalmixEM (i.e., the second plot in plot(...,which=2)) ?
Update:
Here is a reproducible example for the function gammamixEM:
x <- c(rgamma(200, shape = 0.2, scale = 14), rgamma(200,
shape = 32, scale = 10), rgamma(200, shape = 5, scale = 6))
out <- gammamixEM(x, lambda = c(1, 1, 1)/3, verb = TRUE)
Here is a reproducible example for the function normalmixEM:
data(faithful)
attach(faithful)
out <- normalmixEM(waiting, arbvar = FALSE, epsilon = 1e-03)
plot(out, which=2)
I would like to obtain this graphical output of density from the function gammamixEM.
Here you go.
out <- normalmixEM(waiting, arbvar = FALSE, epsilon = 1e-03)
x <- out
whichplots <- 2
density = 2 %in% whichplots
loglik = 1 %in% whichplots
def.par <- par(ask=(loglik + density > 1), "mar") # only ask and mar are changed
mix.object <- x
k <- ncol(mix.object$posterior)
x <- sort(mix.object$x)
a <- hist(x, plot = FALSE)
maxy <- max(max(a$density), .3989*mix.object$lambda/mix.object$sigma)
I just had to dig into the source code of plot.mixEM
So, now to do this with gammamixEM:
x <- c(rgamma(200, shape = 0.2, scale = 14), rgamma(200,
shape = 32, scale = 10), rgamma(200, shape = 5, scale = 6))
gammamixEM.out <- gammamixEM(x, lambda = c(1, 1, 1)/3, verb = TRUE)
mix.object <- gammamixEM.out
k <- ncol(mix.object$posterior)
x <- sort(mix.object$x)
a <- hist(x, plot = FALSE)
maxy <- max(max(a$density), .3989*mix.object$lambda/mix.object$sigma)
main2 <- "Density Curves"
xlab2 <- "Data"
col2 <- 2:(k+1)
hist(x, prob = TRUE, main = main2, xlab = xlab2,
ylim = c(0,maxy))
for (i in 1:k) {
lines(x, mix.object$lambda[i] *
dnorm(x,
sd = sd(x)))
}
I believe it should be pretty straight forward to continue this example a bit, if you want to add the labels, smooth lines, etc. Here's the source of the plot.mixEM function.

How to get Spearman R2 value using multiple linear regression

The R2 obtained from a linear regression is the Pearson correlation coefficient. However, I am wondering if I could get Spearman rank coefficient instead of Pearson in a linear regression.
I would be especially interested to get it using R with the package leaps:
library(leaps)
#Plotting function plot.regsubsets2 <-
function (x, labels = obj$xnames, main = NULL, scale = c("bic",
"Cp", "adjr2", "r2"), col = gray(seq(0, 0.9, length = 10)),
...)
{
obj <- x
lsum <- summary(obj)
par(mar = c(7, 5, 6, 3) + 0.1)
nmodels <- length(lsum$rsq)
np <- obj$np
propscale <- FALSE
sscale <- pmatch(scale[1], c("bic", "Cp", "adjr2", "r2"),
nomatch = 0)
if (sscale == 0)
stop(paste("Unrecognised scale=", scale))
if (propscale)
stop(paste("Proportional scaling only for probabilities"))
yscale <- switch(sscale, lsum$bic, lsum$cp, lsum$adjr2, lsum$rsq)
up <- switch(sscale, -1, -1, 1, 1)
index <- order(yscale * up)
colorscale <- switch(sscale, yscale, yscale, -log(pmax(yscale,
1e-04)), -log(pmax(yscale, 1e-04)))
image(z = t(ifelse(lsum$which[index, ], colorscale[index],
NA + max(colorscale) * 1.5)), xaxt = "n", yaxt = "n",
x = (1:np), y = 1:nmodels, xlab = "", ylab = scale[1],
col = col)
laspar <- par("las")
on.exit(par(las = laspar))
par(las = 2)
axis(1, at = 1:np, labels = labels, ...) # I modified this line
axis(2, at = 1:nmodels, labels = signif(yscale[index], 2))
if (!is.null(main))
title(main = main)
box()
invisible(NULL)
}
#Leap
leaps = regsubets(y~x1+x2+x2, data=mydf, nbest=10)
summary(leaps)
plot.regsubsets2(leaps, scale='r2')
Again, I would like if it possible to screen Spearman instead of Pearson (r2) in the plot.

Simplest way to plot changes in ranking between two ordered lists in R?

I'm wondering if there is an easy way to plot the changes in position of elements between 2 lists in the form of a directed bipartite graph in R. For example, list 1 and 2 are vectors of character strings, not necessarily containing the same elements:
list.1 <- c("a","b","c","d","e","f","g")
list.2 <- c("b","x","e","c","z","d","a")
I would like to generate something similar to:
I've had a slight bash at using the igraph package, but couldn't easily construct what I would like, which I imagine and hope shouldn't be too hard.
Cheers.
Here is a simple function to do what you want. Essentially it uses match to match elements from one vector to another and arrows to draw arrows.
plotRanks <- function(a, b, labels.offset=0.1, arrow.len=0.1)
{
old.par <- par(mar=c(1,1,1,1))
# Find the length of the vectors
len.1 <- length(a)
len.2 <- length(b)
# Plot two columns of equidistant points
plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8,
xlim=c(0, 3), ylim=c(0, max(len.1, len.2)),
axes=F, xlab="", ylab="") # Remove axes and labels
points(rep(2, len.2), 1:len.2, pch=20, cex=0.8)
# Put labels next to each observation
text(rep(1-labels.offset, len.1), 1:len.1, a)
text(rep(2+labels.offset, len.2), 1:len.2, b)
# Now we need to map where the elements of a are in b
# We use the match function for this job
a.to.b <- match(a, b)
# Now we can draw arrows from the first column to the second
arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b,
length=arrow.len, angle=20)
par(old.par)
}
A few example plots
par(mfrow=c(2,2))
plotRanks(c("a","b","c","d","e","f","g"),
c("b","x","e","c","z","d","a"))
plotRanks(sample(LETTERS, 20), sample(LETTERS, 5))
plotRanks(c("a","b","c","d","e","f","g"), 1:10) # No matches
plotRanks(c("a", "b", "c", 1:5), c("a", "b", "c", 1:5)) # All matches
par(mfrow=c(1,1))
Here's a solution using igraph functions.
rankchange <- function(list.1, list.2){
grp = c(rep(0,length(list.1)),rep(1,length(list.2)))
m = match(list.1, list.2)
m = m + length(list.1)
pairs = cbind(1:length(list.1), m)
pairs = pairs[!is.na(pairs[,1]),]
pairs = pairs[!is.na(pairs[,2]),]
g = graph.bipartite(grp, as.vector(t(pairs)), directed=TRUE)
V(g)$color = c("red","green")[grp+1]
V(g)$label = c(list.1, list.2)
V(g)$x = grp
V(g)$y = c(length(list.1):1, length(list.2):1)
g
}
This builds and then plots the graph from your vectors:
g = rankchange(list.1, list.2)
plot(g)
Adjust the colour scheme and symbolism to suit using options detailed in the igraph docs.
Note this is not thoroughly tested (only tried on your sample data) but you can see how it builds a bipartite graph from the code.
With ggplot2:
v1 <- c("a","b","c","d","e","f","g")
v2 <- c("b","x","e","c","z","d","a")
o <- 0.05
DF <- data.frame(x = c(rep(1, length(v1)), rep(2, length(v2))),
x1 = c(rep(1 + o, length(v1)), rep(2 - o, length(v2))),
y = c(rev(seq_along(v1)), rev(seq_along(v2))),
g = c(v1, v2))
library(ggplot2)
library(grid)
ggplot(DF, aes(x=x, y=y, group=g, label=g)) +
geom_path(aes(x=x1), arrow = arrow(length = unit(0.02,"npc")),
size=1, color="green") +
geom_text(size=10) +
theme_minimal() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
This can of course be wrapped in a function easily.
Here's a generalization of nico's result for use with data frames:
plotRanks <- function(df, rank_col, time_col, data_col, color_col = NA, labels_offset=0.1, arrow_len=0.1, ...){
time_vec <- df[ ,time_col]
unique_dates <- unique(time_vec)
unique_dates <- unique_dates[order(unique_dates)]
rank_ls <- lapply(unique_dates, function(d){
temp_df <- df[time_vec == d, ]
temp_df <- temp_df[order(temp_df[ ,data_col], temp_df[ ,rank_col]), ]
temp_d <- temp_df[ ,data_col]
temp_rank <- temp_df[ ,rank_col]
if(is.na(color_col)){
temp_color = rep("blue", length(temp_d))
}else{
temp_color = temp_df[ ,color_col]
}
temp_rank <- temp_df[ ,rank_col]
temp_ls <- list(temp_rank, temp_d, temp_color)
names(temp_ls) <- c("ranking", "data", "color")
temp_ls
})
first_rank <- rank_ls[[1]]$ranking
first_data <- rank_ls[[1]]$data
first_length <- length(first_rank)
y_max <- max(sapply(rank_ls, function(l) length(l$ranking)))
plot(rep(1, first_length), 1:first_length, pch=20, cex=0.8,
xlim=c(0, length(rank_ls) + 1), ylim = c(1, y_max), xaxt = "n", xlab = NA, ylab="Ranking", ...)
text_paste <- paste(first_rank, "\n", "(", first_data, ")", sep = "")
text(rep(1 - labels_offset, first_length), 1:first_length, text_paste)
axis(1, at = 1:(length(rank_ls)), labels = unique_dates)
for(i in 2:length(rank_ls)){
j = i - 1
ith_rank <- rank_ls[[i]]$ranking
ith_data <- rank_ls[[i]]$data
jth_color <- rank_ls[[j]]$color
jth_rank <- rank_ls[[j]]$ranking
ith_length <- length(ith_rank)
jth_length <- length(jth_rank)
points(rep(i, ith_length), 1:ith_length, pch = 20, cex = 0.8)
i_to_j <- match(jth_rank, ith_rank)
arrows(rep(i - 0.98, jth_length), 1:jth_length, rep(i - 0.02, ith_length), i_to_j
, length = 0.1, angle = 10, col = jth_color)
offset_choice <- ifelse(length(rank_ls) == 2, i + labels_offset, i - labels_offset)
text_paste <- paste(ith_rank, "\n", "(", ith_data, ")", sep = "")
text(rep(offset_choice, ith_length), 1:ith_length, text_paste)
}
}
Here's an example using a haphazard reshape of the presidents dataset:
data(presidents)
years <- rep(1945:1974, 4)
n <- length(presidents)
q1 <- presidents[seq(1, n, 4)]
q2 <- presidents[seq(2, n, 4)]
q3 <- presidents[seq(3, n, 4)]
q4 <- presidents[seq(4, n, 4)]
quarters <- c(q1, q2, q3, q4)
q_label <- c(rep("Q1", n / 4), rep("Q2", n / 4), rep("Q3", n / 4), rep("Q4", n / 4))
q_colors <- c(Q1 = "blue", Q2 = "red", Q3 = "green", Q4 = "orange")
q_colors <- q_colors[match(q_label, names(q_colors))]
new_prez <- data.frame(years, quarters, q_label, q_colors)
new_prez <- na.omit(new_prez)
png("C:/users/fasdfsdhkeos/desktop/prez.png", width = 15, height = 10, units = "in", res = 300)
plotRanks(new_prez[new_prez$years %in% 1960:1970, ], "q_label", "years", "quarters", "q_colors")
dev.off()
This produces a time series ranking plot, and it introduces color if tracking a certain observation is desired:

Resources