I am trying to plot a heatmap with the library pheatmap in R.
I think that by default the branch length is proportional to the "dissimilarity" of the clusters that got merged at this step. I would like to chance that, so it is a fixed value because for my purpose it looks very weird!
If anyone has an idea how I can fix this, I would be very happy.
Here is a sample code
library(pheatmap)
test = matrix(rnorm(6000), 100, 60)
pheatmap(test)
Cheers!
Here is an example of two column groups with high dissimilarity:
library(pheatmap)
test = cbind(matrix(rnorm(3000), 100, 30),
matrix(rnorm(3000)+10, 100, 30))
pheatmap(test)
TIn pheatmapthe dendrogram is plotted by the pheatmap:::draw_dendrogram function
and branch lengths are stored in the h object.
Below I define equal-length branches adding the command
hc$height <- cumsum(rep(1/length(hc$height), length(hc$height)))
as follows:
draw_dendrogram <- function(hc, gaps, horizontal = T) {
# Define equal-length branches
hc$height <- cumsum(rep(1/length(hc$height), length(hc$height)))
h = hc$height/max(hc$height)/1.05
m = hc$merge
o = hc$order
n = length(o)
m[m > 0] = n + m[m > 0]
m[m < 0] = abs(m[m < 0])
dist = matrix(0, nrow = 2 * n - 1, ncol = 2, dimnames = list(NULL,
c("x", "y")))
dist[1:n, 1] = 1/n/2 + (1/n) * (match(1:n, o) - 1)
for (i in 1:nrow(m)) {
dist[n + i, 1] = (dist[m[i, 1], 1] + dist[m[i, 2], 1])/2
dist[n + i, 2] = h[i]
}
draw_connection = function(x1, x2, y1, y2, y) {
res = list(x = c(x1, x1, x2, x2), y = c(y1, y, y, y2))
return(res)
}
x = rep(NA, nrow(m) * 4)
y = rep(NA, nrow(m) * 4)
id = rep(1:nrow(m), rep(4, nrow(m)))
for (i in 1:nrow(m)) {
c = draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1],
dist[m[i, 1], 2], dist[m[i, 2], 2], h[i])
k = (i - 1) * 4 + 1
x[k:(k + 3)] = c$x
y[k:(k + 3)] = c$y
}
x = pheatmap:::find_coordinates(n, gaps, x * n)$coord
y = unit(y, "npc")
if (!horizontal) {
a = x
x = unit(1, "npc") - y
y = unit(1, "npc") - a
}
res = polylineGrob(x = x, y = y, id = id)
return(res)
}
# Replace the non-exported function `draw_dendrogram` in `pheatmap`:
assignInNamespace(x="draw_dendrogram", value=draw_dendrogram, ns="pheatmap")
pheatmap(test)
The result is:
Related
I have to perform many comparisons between different measurement methods and I have to use the Passing-Bablok regression approach.
I would like to take advantage of ggplot2 and faceting, but I don't know how to add a geom_smooth layer based on the Passing-Bablok regression.
I was thinking about something like: https://stackoverflow.com/a/59173260/2096356
Furthermore, I would also need to show the regression line equation, with confidence interval for intercept and slope parameters, in each plot.
Edit with partial solution
I've found a partial solution combining the code provided in this post and in this answer.
## Regression algorithm
passing_bablok.fit <- function(x, y) {
x_name <- deparse(substitute(x))
lx <- length(x)
l <- lx*(lx - 1)/2
k <- 0
S <- rep(NA, lx)
for (i in 1:(lx - 1)) {
for (j in (i + 1):lx) {
k <- k + 1
S[k] <- (y[i] - y[j])/(x[i] - x[j])
}
}
S.sort <- sort(S)
N <- length(S.sort)
neg <- length(subset(S.sort,S.sort < 0))
K <- floor(neg/2)
if (N %% 2 == 1) {
b <- S.sort[(N+1)/2+K]
} else {
b <- sqrt(S.sort[N / 2 + K]*S.sort[N / 2 + K + 1])
}
a <- median(y - b * x)
res <- as.vector(c(a,b))
names(res) <- c("(Intercept)", x_name)
class(res) <- "Passing_Bablok"
res
}
## Computing confidence intervals
passing_bablok <- function(formula, data, R = 100, weights = NULL){
ret <- boot::boot(
data = model.frame(formula, data),
statistic = function(data, ind) {
data <- data[ind, ]
args <- rlang::parse_exprs(colnames(data))
names(args) <- c("y", "x")
rlang::eval_tidy(rlang::expr(passing_bablok.fit(!!!args)), data, env = rlang::current_env())
},
R=R
)
class(ret) <- c("Passing_Bablok", class(ret))
ret
}
## Plotting confidence bands
predictdf.Passing_Bablok <- function(model, xseq, se, level) {
pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
if(se) {
preds <- tcrossprod(model$t, cbind(1, xseq))
data.frame(
x = xseq,
y = pred,
ymin = apply(preds, 2, function(x) quantile(x, probs = (1-level)/2)),
ymax = apply(preds, 2, function(x) quantile(x, probs = 1-((1-level)/2)))
)
} else {
return(data.frame(x = xseq, y = pred))
}
}
An example of usage:
z <- data.frame(x = rnorm(100, mean = 100, sd = 5),
y = rnorm(100, mean = 110, sd = 8))
ggplot(z, aes(x, y)) +
geom_point() +
geom_smooth(method = passing_bablok) +
geom_abline(slope = 1, intercept = 0)
So far, I haven't been able to show the regression line equation, with confidence interval for intercept and slope parameters (as +- or in parentheses).
You've arguably done with difficult part with the PaBa regression.
Here's a basic solution using your passing_bablok.fit function:
z <- data.frame(x = 101:200+rnorm(100,sd=10),
y = 101:200+rnorm(100,sd=8))
mycoefs <- as.numeric(passing_bablok.fit(x = z$x, y=z$y))
paba_eqn <- function(thecoefs) {
l <- list(m = format(thecoefs[2], digits = 2),
b = format(abs(thecoefs[1]), digits = 2))
if(thecoefs[1] >= 0){
eq <- substitute(italic(y) == m %.% italic(x) + b,l)
} else {
eq <- substitute(italic(y) == m %.% italic(x) - b,l)
}
as.character(as.expression(eq))
}
library(ggplot2)
ggplot(z, aes(x, y)) +
geom_point() +
geom_smooth(method = passing_bablok) +
geom_abline(slope = 1, intercept = 0) +
annotate("text",x = 110, y = 220, label = paba_eqn(mycoefs), parse = TRUE)
Note the equation will vary because of rnorm in the data creation..
The solution could definitely be made more slick and robust, but it works for both positive and negative intercepts.
Equation concept sourced from: https://stackoverflow.com/a/13451587/2651663
I'm trying to increase the space between each segment for each successive interval to avoid overplotting. Not sure how to approach this so that it will increment vertically in the loop.
Here is my code and some reproducible data:
set.seed(200)
x <- rnorm(100, 10)
truemean <- mean(x)
mat <- replicate(100, t.test(sample(x, rep = T))$conf.int)
mat <- t(mat)
myfunc <- function(mat, truemean) {
plot(x = c(min(mat[ , 1]), max(mat[ , 2])),
y = c(1, 100),
type = "n",
xlab = "0:100",
ylab = "0:100")
abline(v = truemean)
for (i in 1:nrow(mat)) {
if (mat[i, 1] <= truemean & mat[i, 2] >= truemean) {
segments(x0 = mat[i, 1], y0 = i,
x1 = mat[i, 2], y1 = i,
col = "blue",
lwd = 2)
} else {
segments(x0 = mat[i, 1], y0 = i,
x1 = mat[i, 2], y1 = i,
col = "red",
lwd = 2)
}
}
}
myfunc(mat, truemean)
You can certainly add anything you want in the segment call, but I'm not sure what you are asking. First we can greatly simplify your code:
set.seed(200)
x <- rnorm(100, 10)
truemean <- mean(x)
mat <- replicate(100, t.test(sample(x, rep = T))$conf.int)
mat <- t(mat)
yval <- seq(dim(mat)[1])
clr <- ifelse(mat[, 1] <= truemean & mat[, 2] >= truemean, "blue", "red")
plot(NA, xlim=c(min(mat[ , 1]), max(mat[ , 2])), ylim=c(0, length(yval)), type="n",
xlab="Conf Int", ylab="Trials")
abline(v=truemean)
segments(mat[, 1], yval, mat[, 2], yval, col=clr)
This produces the following plot:
You could replace yval with yval+.1 in the segments function to shift everything up. If there are so many lines that they overlap you can increase the height of the plot to make more room.
I have the following function:
fx <- function(x) {
if(x >= 0 && x < 3) {
res <- 0.2;
} else if(x >=3 && x < 5) {
res <- 0.05;
} else if(x >= 5 && x < 6) {
res <- 0.15;
} else if(x >= 7 && x < 10) {
res <- 0.05;
} else {
res <- 0;
}
return(res);
}
How can I plot it's CDF function on the interval [0,10]?
Try
fx <- Vectorize(fx)
grid <- 0:10
p <- fx(grid)
cdf <- cumsum(p)
plot(grid, cdf, type = 'p', ylim = c(0, 1), col = 'steelblue',
xlab = 'x', ylab = expression(F(x)), pch = 19, las = 1)
segments(x0 = grid, x1 = grid + 1, y0 = cdf)
segments(x0 = grid + 1, y0 = c(cdf[-1], 1), y1 = cdf, lty = 2)
To add a bit accuracy to #Martin Schmelzer's answer. A cummulative distribution function(CDF)
evaluated at x, is the probability that X will take a value less than
or equal to x
So to get CDF from Probability Density Function(PDF), you need to integrate on PDF:
fx <- Vectorize(fx)
dx <- 0.01
x <- seq(0, 10, by = dx)
plot(x, cumsum(fx(x) * dx), type = "l", ylab = "cummulative probability", main = "My CDF")
Just adding up on the previous answers and using ggplot
# cdf
Fx <- function(x, dx) {
cumsum(fx(x)*dx)
}
fx <- Vectorize(fx)
dx <- 0.01
x <- seq(0, 10, dx)
df <- rbind(data.frame(x, value=fx(x), func='pdf'),
data.frame(x, value=Fx(x, dx), func='cdf'))
library(ggplot2)
ggplot(df, aes(x, value, col=func)) +
geom_point() + geom_line() + ylim(0, 1)
I have a fairly large dataframe (df) with pathing information in the form of continuous x,y coordinates:
df$x
df$y
With these data, I would like to:
1. Calculate a set of continuous vectors
2. Determine the angle between each of these vectors (in degrees)
3. Count the number of angles in the dataframe that meet a certain threshold (i.e. <90°)
Thank you!
Please see the post here for reference
require("ggplot2")
Hypocycloid <- function(num_points) {
r = 1
k = 3
theta = seq(from = 0, to = 2*pi, length.out = num_points)
x = r*(k - 1)*cos(theta) + r*cos((k - 1)*theta)
y = r*(k - 1)*sin(theta) - r*sin((k - 1)*theta)
df = data.frame(x = x, y = y)
gg1 = ggplot(df,
aes(x = x, y = y),
size = 1) +
geom_path()
print(gg1)
return(df)
}
ComputeUnitVectors <- function(points_df) {
npoints = nrow(points_df)
vx = points_df$x[2:npoints] - points_df$x[1:(npoints-1)]
vy = points_df$y[2:npoints] - points_df$y[1:(npoints-1)]
length = sqrt(vx^2 + vy^2)
return(data.frame(vx = vx/length, vy = vy/length))
}
ComputeAngles <- function(vectors_df) {
Angle <- function(v1, v2) {
return(acos(as.numeric(v1) %*% as.numeric(v2))*180/pi)
}
nvectors = nrow(vectors_df)
v1 = vectors_df[1:(nvectors-1),]
v2 = vectors_df[2:nvectors,]
v_df = cbind(v1, v2)
angle = apply(v_df, 1, function(row) {Angle(row[1:2], row[3:4])})
return(data.frame(angle))
}
points.df = Hypocycloid(20)
vectors.df = ComputeUnitVectors(points.df)
print(vectors.df)
angles.df = ComputeAngles(vectors.df)
print(angles.df)
I would like to create a plot, where one can see an alternative ticking of the x-axis, e.g. pi/2, pi, 3pi/2, etc. So far, this works for me only with a rather unhandy code (look at the lines where I create pi.halfs, pi.fulls and merge them later into vec.expr):
require (ggplot2)
# Create vectors: breaks and corresponding labels as multiples of pi/2
vec.breaks <- seq(from = pi/2, to = 7*pi/2, by = pi/2)
pi.halfs <- c(paste(expression(pi), "/2"),
paste(seq(from = 3, to = 21, by = 2), "*" , expression(pi), "/2"))
pi.fulls <- c(paste(expression(pi)),
paste(seq(from = 2, to = 11, by = 1), "*" , expression(pi)))
vec.expr <- parse(text = c(rbind(pi.halfs, pi.fulls)))[1:7]
# Create some time and signal
time <- seq(from = 0, to = 4*pi, by = 0.01)
signal <- sin(time)
df <- data.frame(time,signal)
# Now plot the signal with the new x axis labels
fig <- ggplot(data = df, aes(x = time, y = signal)) +
geom_line() +
scale_x_continuous(breaks=vec.breaks, labels=vec.expr)
print(fig)
... resulting in ...
Is anyone aware of an easier approach, where one can change the base of some x-axis labeling in ggplot2, e.g. like here from decimals to multiples of pi? Are there any nice packages, that I missed so far? I found some duplicates of this question, but only in other languages...
You are looking for the scales package, which lets you create arbitrary formatting functions for scales and also has a number of helpful formatting functions already built in. Looking through the scales package help, I was surprised not to find a radian scale, but you should be able to create one using math_formatter(). This code gets the same results, though not with the fractions.
library(ggplot2)
library(scales)
time <- seq(from = 0, to = 4*pi, by = 0.01)
signal <- sin(time)
df <- data.frame(time,signal)
pi_scales <- math_format(.x * pi, format = function(x) x / pi)
fig <- ggplot(data = df, aes(x = time, y = signal)) +
geom_line() +
scale_x_continuous(labels = pi_scales, breaks = seq(pi / 2, 7 * pi / 2, pi / 2))
print(fig)
Here is a function to make fractional labels (maybe a little clunky). It uses fractions from MASS package and allows you to change the multiplier you want to use on the x-axis. You just pass it a symbol (ie. "pi"). If the symbol has a value, the ticks will be scaled by width*value, otherwise just by width.
# Now plot the signal with the new x axis labels
p <- ggplot(data = df, aes(x = time, y = signal)) +
geom_line()
## Convert x-ticks to fractional x-ticks with a symbol multiplier
fracAx <- function(p, symbol, width=0.5) {
require(MASS) # for fractions
val <- tryCatch(eval(parse(text=symbol)), error=function(e) 1)
info <- ggplot_build(p)
xrange <- info[[2]]$ranges[[1]]$x.range/val # get the x-range of figure
vec.breaks <- seq(floor(xrange[1]), ceiling(xrange[2]), by=width)
fracs <- strsplit(attr(fractions(vec.breaks), "fracs"), "/") # convert to fractions
labels <- sapply(fracs, function(i)
if (length(i) > 1) { paste(i[1], "*", symbol, "/", i[2]) }
else { paste(i, "*", symbol) })
p + scale_x_continuous(breaks=vec.breaks*val, labels=parse(text=labels))
}
## Make the graph with pi axis
fracAx(p, "pi")
## Make the graph with e axis, for example
e <- exp(1)
fracAx(p, "e")
## Make the graph with a symbol that has no value
fracAx(p, "theta", width=2)
Based on the other answers here I was able to piece together some functions which implement a general radians format that can be used independently of mucking about with the internals of ggplot2 objects.
numerator <- function(x) {
f = attr(x, "fracs")
s <- as.integer(sign(x))
ifelse(is.finite(x), as.integer(stringr::str_extract(f, "^[^/]*")), s)
}
denominator <- function(x) {
f = attr(x, "fracs")
s <- as.integer(sign(x))
ratio <- str_detect(f, "/")
d <- as.integer(stringr::str_extract(f, "[^/]*$"))
ifelse(is.finite(x), ifelse(ratio, d, 1L), 0L)
}
char_sign <- function(x) {
s <- sign(x)
ifelse(s == 1, "+",
ifelse(s == -1, "-", ""))
}
#' Convert value to radians formatting
radians <- function(x) {
y = x/pi
f = suppressWarnings(MASS::as.fractions(y))
n = suppressWarnings(numerator(f))
d = suppressWarnings(denominator(f))
s <- char_sign(x)
o <- vector(mode = "character", length = length(x))
o <- ifelse(d == 0 & n != 0, paste0(s, "∞"), o)
o <- ifelse(n == 0 & d != 0, "0", o)
o <- ifelse(n != 0 & d != 0, paste0(n, "π/", d), o)
o <- ifelse(n == 1 & d != 0, paste0("π/", d), o)
o <- ifelse(n == -1 & d == 1, paste0(s, "π"), o)
o <- ifelse(n == -1 & d != 0 & d != 1, paste0(s, "π/", d), o)
o <- ifelse(d == 1 & n != 0 & abs(n) != 1, paste0(n, "π"), o)
o <- ifelse(n == d & is.finite(n), "π", o)
o
}
Here it is in use:
```r
time <- seq(from = 0, to = 4*pi, by = 0.01)
signal <- sin(time)
df <- data.frame(time,signal)
ggplot(df, aes(time, signal)) +
geom_line() +
scale_x_continuous(labels = trans_format(radians, force),
breaks = seq(0, 4*pi, pi/2))