How to plot a CDF function from a PDF in R

I have the following function:
fx <- function(x) {
  if (x >= 0 && x < 3) {
    res <- 0.2
  } else if (x >= 3 && x < 5) {
    res <- 0.05
  } else if (x >= 5 && x < 6) {
    res <- 0.15
  } else if (x >= 7 && x < 10) {
    res <- 0.05
  } else {
    res <- 0
  }
  return(res)
}
How can I plot its CDF on the interval [0, 10]?

Try
fx <- Vectorize(fx)
grid <- 0:10
p <- fx(grid)
cdf <- cumsum(p)
plot(grid, cdf, type = 'p', ylim = c(0, 1), col = 'steelblue',
     xlab = 'x', ylab = expression(F(x)), pch = 19, las = 1)
segments(x0 = grid, x1 = grid + 1, y0 = cdf)
segments(x0 = grid + 1, y0 = c(cdf[-1], 1), y1 = cdf, lty = 2)

To add a bit of accuracy to @Martin Schmelzer's answer: a cumulative distribution function (CDF)
evaluated at x is the probability that X will take a value less than or equal to x.
So to get the CDF from the probability density function (PDF), you need to integrate the PDF:
fx <- Vectorize(fx)
dx <- 0.01
x <- seq(0, 10, by = dx)
plot(x, cumsum(fx(x) * dx), type = "l",
     ylab = "cumulative probability", main = "My CDF")
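If you want the exact CDF rather than a Riemann-sum approximation, here is a minimal sketch using base R's integrate() (assuming the vectorized fx from above; Fx_exact is a hypothetical helper name):
# Exact CDF by numerically integrating the PDF from 0 to each q.
# integrate() may warn near the PDF's jump points; a larger
# subdivisions value usually helps with piecewise integrands.
Fx_exact <- function(q) integrate(fx, lower = 0, upper = q,
                                  subdivisions = 1000)$value
x <- seq(0, 10, by = 0.1)
plot(x, sapply(x, Fx_exact), type = "l",
     ylab = "cumulative probability", main = "Exact CDF")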

Just adding on to the previous answers, using ggplot:
# cdf
Fx <- function(x, dx) {
  cumsum(fx(x) * dx)
}
fx <- Vectorize(fx)
dx <- 0.01
x <- seq(0, 10, dx)
df <- rbind(data.frame(x, value = fx(x), func = 'pdf'),
            data.frame(x, value = Fx(x, dx), func = 'cdf'))
library(ggplot2)
ggplot(df, aes(x, value, col = func)) +
  geom_point() + geom_line() + ylim(0, 1)

Related

Add a Passing-Bablok regression line

I have to perform many comparisons between different measurement methods and I have to use the Passing-Bablok regression approach.
I would like to take advantage of ggplot2 and faceting, but I don't know how to add a geom_smooth layer based on the Passing-Bablok regression.
I was thinking about something like: https://stackoverflow.com/a/59173260/2096356
Furthermore, I would also need to show the regression line equation, with confidence interval for intercept and slope parameters, in each plot.
Edit with partial solution
I've found a partial solution combining the code provided in this post and in this answer.
## Regression algorithm
passing_bablok.fit <- function(x, y) {
  x_name <- deparse(substitute(x))
  lx <- length(x)
  l <- lx * (lx - 1) / 2
  k <- 0
  S <- rep(NA, l)   # preallocate one slot per pairwise slope
  for (i in 1:(lx - 1)) {
    for (j in (i + 1):lx) {
      k <- k + 1
      S[k] <- (y[i] - y[j]) / (x[i] - x[j])
    }
  }
  S.sort <- sort(S)
  N <- length(S.sort)
  neg <- length(subset(S.sort, S.sort < 0))
  K <- floor(neg / 2)
  if (N %% 2 == 1) {
    b <- S.sort[(N + 1) / 2 + K]
  } else {
    b <- sqrt(S.sort[N / 2 + K] * S.sort[N / 2 + K + 1])
  }
  a <- median(y - b * x)
  res <- as.vector(c(a, b))
  names(res) <- c("(Intercept)", x_name)
  class(res) <- "Passing_Bablok"
  res
}
## Computing confidence intervals
passing_bablok <- function(formula, data, R = 100, weights = NULL) {
  ret <- boot::boot(
    data = model.frame(formula, data),
    statistic = function(data, ind) {
      data <- data[ind, ]
      args <- rlang::parse_exprs(colnames(data))
      names(args) <- c("y", "x")
      rlang::eval_tidy(rlang::expr(passing_bablok.fit(!!!args)), data,
                       env = rlang::current_env())
    },
    R = R
  )
  class(ret) <- c("Passing_Bablok", class(ret))
  ret
}
## Plotting confidence bands
predictdf.Passing_Bablok <- function(model, xseq, se, level) {
  pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
  if (se) {
    preds <- tcrossprod(model$t, cbind(1, xseq))
    data.frame(
      x = xseq,
      y = pred,
      ymin = apply(preds, 2, function(x) quantile(x, probs = (1 - level) / 2)),
      ymax = apply(preds, 2, function(x) quantile(x, probs = 1 - (1 - level) / 2))
    )
  } else {
    data.frame(x = xseq, y = pred)
  }
}
An example of usage:
z <- data.frame(x = rnorm(100, mean = 100, sd = 5),
                y = rnorm(100, mean = 110, sd = 8))
ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0)
So far, I haven't been able to show the regression line equation, with confidence interval for intercept and slope parameters (as +- or in parentheses).
You've arguably done the difficult part with the Passing-Bablok regression.
Here's a basic solution using your passing_bablok.fit function:
z <- data.frame(x = 101:200 + rnorm(100, sd = 10),
                y = 101:200 + rnorm(100, sd = 8))
mycoefs <- as.numeric(passing_bablok.fit(x = z$x, y = z$y))

paba_eqn <- function(thecoefs) {
  l <- list(m = format(thecoefs[2], digits = 2),
            b = format(abs(thecoefs[1]), digits = 2))
  if (thecoefs[1] >= 0) {
    eq <- substitute(italic(y) == m %.% italic(x) + b, l)
  } else {
    eq <- substitute(italic(y) == m %.% italic(x) - b, l)
  }
  as.character(as.expression(eq))
}

library(ggplot2)
ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0) +
  annotate("text", x = 110, y = 220, label = paba_eqn(mycoefs), parse = TRUE)
Note that the equation will vary between runs because of the rnorm call in the data creation.
The solution could definitely be made more slick and robust, but it works for both positive and negative intercepts.
Equation concept sourced from: https://stackoverflow.com/a/13451587/2651663
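As a starting point for the parameter intervals, here is a hedged sketch that pulls percentile bounds straight out of the bootstrap replicates stored by the passing_bablok() wrapper above (fit$t holds one row per replicate: column 1 the intercept, column 2 the slope; R = 500 is an arbitrary choice):
fit <- passing_bablok(y ~ x, data = z, R = 500)
# Percentile 95% CIs from the bootstrap replicates in fit$t
ci <- apply(fit$t, 2, quantile, probs = c(0.025, 0.975))
colnames(ci) <- c("(Intercept)", "slope")
ci  # row 1: lower bounds, row 2: upper bounds
These bounds could then be formatted into the annotate() label alongside the point estimates.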

Increasing vertical space between segments in base R

I'm trying to increase the space between each segment for each successive interval to avoid overplotting. Not sure how to approach this so that it will increment vertically in the loop.
Here is my code and some reproducible data:
set.seed(200)
x <- rnorm(100, 10)
truemean <- mean(x)
mat <- replicate(100, t.test(sample(x, rep = T))$conf.int)
mat <- t(mat)
myfunc <- function(mat, truemean) {
  plot(x = c(min(mat[, 1]), max(mat[, 2])),
       y = c(1, 100),
       type = "n",
       xlab = "0:100",
       ylab = "0:100")
  abline(v = truemean)
  for (i in 1:nrow(mat)) {
    if (mat[i, 1] <= truemean & mat[i, 2] >= truemean) {
      segments(x0 = mat[i, 1], y0 = i,
               x1 = mat[i, 2], y1 = i,
               col = "blue",
               lwd = 2)
    } else {
      segments(x0 = mat[i, 1], y0 = i,
               x1 = mat[i, 2], y1 = i,
               col = "red",
               lwd = 2)
    }
  }
}
myfunc(mat, truemean)
You can certainly add anything you want in the segments() call, but I'm not sure exactly what you are asking. First, we can greatly simplify your code:
set.seed(200)
x <- rnorm(100, 10)
truemean <- mean(x)
mat <- replicate(100, t.test(sample(x, rep = T))$conf.int)
mat <- t(mat)
yval <- seq(dim(mat)[1])
clr <- ifelse(mat[, 1] <= truemean & mat[, 2] >= truemean, "blue", "red")
plot(NA, xlim = c(min(mat[, 1]), max(mat[, 2])), ylim = c(0, length(yval)),
     type = "n", xlab = "Conf Int", ylab = "Trials")
abline(v=truemean)
segments(mat[, 1], yval, mat[, 2], yval, col=clr)
This produces a plot of the 100 intervals, colored blue where they cover the true mean and red where they miss it.
You could replace yval with yval + .1 in the segments() call to shift everything up. If there are so many lines that they overlap, you can increase the height of the plot device to make more room.
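For the last suggestion, a minimal sketch writing to a taller graphics device (the file name and dimensions are arbitrary):
# A tall device gives the 100 intervals more vertical room.
png("intervals.png", width = 480, height = 1200)
plot(NA, xlim = c(min(mat[, 1]), max(mat[, 2])), ylim = c(0, length(yval)),
     xlab = "Conf Int", ylab = "Trials")
abline(v = truemean)
segments(mat[, 1], yval, mat[, 2], yval, col = clr)
dev.off()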

Changing branch length in dendrogram (pheatmap)

I am trying to plot a heatmap with the library pheatmap in R.
I think that by default the branch length is proportional to the dissimilarity of the clusters that got merged at that step. I would like to change that so it is a fixed value, because for my purpose it looks very weird!
If anyone has an idea how I can fix this, I would be very happy.
Here is some sample code:
library(pheatmap)
test = matrix(rnorm(6000), 100, 60)
pheatmap(test)
Cheers!
Here is an example of two column groups with high dissimilarity:
library(pheatmap)
test = cbind(matrix(rnorm(3000), 100, 30),
             matrix(rnorm(3000) + 10, 100, 30))
pheatmap(test)
In pheatmap the dendrogram is plotted by the pheatmap:::draw_dendrogram function,
and the branch lengths come from the heights stored in the hc object passed to it.
Below I define equal-length branches by adding the command
hc$height <- cumsum(rep(1/length(hc$height), length(hc$height)))
as follows:
draw_dendrogram <- function(hc, gaps, horizontal = T) {
  # Define equal-length branches
  hc$height <- cumsum(rep(1/length(hc$height), length(hc$height)))
  h = hc$height/max(hc$height)/1.05
  m = hc$merge
  o = hc$order
  n = length(o)
  m[m > 0] = n + m[m > 0]
  m[m < 0] = abs(m[m < 0])
  dist = matrix(0, nrow = 2 * n - 1, ncol = 2,
                dimnames = list(NULL, c("x", "y")))
  dist[1:n, 1] = 1/n/2 + (1/n) * (match(1:n, o) - 1)
  for (i in 1:nrow(m)) {
    dist[n + i, 1] = (dist[m[i, 1], 1] + dist[m[i, 2], 1])/2
    dist[n + i, 2] = h[i]
  }
  draw_connection = function(x1, x2, y1, y2, y) {
    res = list(x = c(x1, x1, x2, x2), y = c(y1, y, y, y2))
    return(res)
  }
  x = rep(NA, nrow(m) * 4)
  y = rep(NA, nrow(m) * 4)
  id = rep(1:nrow(m), rep(4, nrow(m)))
  for (i in 1:nrow(m)) {
    c = draw_connection(dist[m[i, 1], 1], dist[m[i, 2], 1],
                        dist[m[i, 1], 2], dist[m[i, 2], 2], h[i])
    k = (i - 1) * 4 + 1
    x[k:(k + 3)] = c$x
    y[k:(k + 3)] = c$y
  }
  x = pheatmap:::find_coordinates(n, gaps, x * n)$coord
  y = unit(y, "npc")
  if (!horizontal) {
    a = x
    x = unit(1, "npc") - y
    y = unit(1, "npc") - a
  }
  res = polylineGrob(x = x, y = y, id = id)
  return(res)
}
# Replace the non-exported function `draw_dendrogram` in `pheatmap`:
assignInNamespace(x="draw_dendrogram", value=draw_dendrogram, ns="pheatmap")
pheatmap(test)
The result is a heatmap whose dendrogram branches all have equal length.
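As an alternative to patching the namespace, pheatmap also exposes a clustering_callback argument that can modify the hclust object before it is drawn. Here is a sketch of the same equal-height trick through that hook (assuming the test matrix above; equalize_heights is a hypothetical name):
# Apply the equal-branch-length trick without touching pheatmap internals.
equalize_heights <- function(hc, ...) {
  hc$height <- cumsum(rep(1/length(hc$height), length(hc$height)))
  hc
}
pheatmap(test, clustering_callback = equalize_heights)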

Trying to recreate plot with thresholds in ggplot

So basically I'm trying to recreate this plot in ggplot to match my theme,
and I've come pretty close,
but I can't recreate the threshold axis in my plot. How can I possibly add this to my ggplot? Here is the source code of the original plotting function:
function (data, option = c("alpha", "xi", "quantile"), start = 15, end = NA,
          reverse = FALSE, p = NA, ci = 0.95, auto.scale = TRUE,
          labels = TRUE, ...)
{
  if (is.timeSeries(data))
    data <- as.vector(series(data))
  data <- as.numeric(data)
  ordered <- rev(sort(data))
  ordered <- ordered[ordered > 0]
  n <- length(ordered)
  option <- match.arg(option)
  if ((option == "quantile") && (is.na(p)))
    stop("\nInput a value for the probability p.\n")
  if ((option == "quantile") && (p < 1 - start/n)) {
    cat("Graph may look strange !! \n\n")
    cat(paste("Suggestion 1: Increase `p' above",
              format(signif(1 - start/n, 5)), "\n"))
    cat(paste("Suggestion 2: Increase `start' above ",
              ceiling(length(data) * (1 - p)), "\n"))
  }
  k <- 1:n
  loggs <- logb(ordered)
  avesumlog <- cumsum(loggs)/(1:n)
  xihat <- c(NA, (avesumlog - loggs)[2:n])
  alphahat <- 1/xihat
  y <- switch(option, alpha = alphahat, xi = xihat,
              quantile = ordered * ((n * (1 - p))/k)^(-1/alphahat))
  ses <- y/sqrt(k)
  if (is.na(end))
    end <- n
  x <- trunc(seq(from = min(end, length(data)), to = start))
  y <- y[x]
  ylabel <- option
  yrange <- range(y)
  if (ci && (option != "quantile")) {
    qq <- qnorm(1 - (1 - ci)/2)
    u <- y + ses[x] * qq
    l <- y - ses[x] * qq
    ylabel <- paste(ylabel, " (CI, p =", ci, ")", sep = "")
    yrange <- range(u, l)
  }
  if (option == "quantile")
    ylabel <- paste("Quantile, p =", p)
  index <- x
  if (reverse)
    index <- -x
  if (auto.scale) {
    plot(index, y, ylim = yrange, type = "l", xlab = "",
         ylab = "", axes = FALSE, ...)
  }
  else {
    plot(index, y, type = "l", xlab = "", ylab = "", axes = FALSE,
         ...)
  }
  axis(1, at = index, labels = paste(x), tick = FALSE)
  axis(2)
  threshold <- findthreshold(data, x)
  axis(3, at = index, labels = paste(format(signif(threshold, 3))),
       tick = FALSE)
  box()
  if (ci && (option != "quantile")) {
    lines(index, u, lty = 2, col = 2)
    lines(index, l, lty = 2, col = 2)
  }
  if (labels) {
    title(xlab = "Order Statistics", ylab = ylabel)
    mtext("Threshold", side = 3, line = 3)
  }
  return(invisible(list(x = index, y = y)))
}
Thanks for your help!
I think you are looking for the sec.axis argument of scale_x_continuous(). To make sure everything lined up, I had to create a function to find every nth value. Hope this helps.
library(tibble)
library(ggplot2)

set.seed(1234)
df <- tibble(
  x = 1:50,
  threshold = round(1 + rnorm(1:50), 2),
  base_y = c(0.1, runif(49, -1, 2) / 5),
  mid = cumsum(base_y),
  upper = mid + 5,
  lower = mid - 5
)

every_nth <- function(x, n) {
  x[seq(0, length(x), n)]
}

ggplot(df, aes(x = x)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
  geom_line(aes(y = mid)) +
  scale_x_continuous(
    breaks = every_nth(df$x, 10),
    sec.axis = dup_axis(
      labels = every_nth(df$threshold, 10),
      name = "Threshold"
    )
  ) +
  theme_minimal()
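To see why the secondary-axis labels line up with the primary breaks, here is what the every_nth() helper returns (the zero produced by seq(0, ...) is simply dropped by R's subsetting):
every_nth(1:50, 10)
# [1] 10 20 30 40 50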

PCA Scaling with ggbiplot

I'm trying to plot a principal component analysis using prcomp and ggbiplot. I'm getting data values outside of the unit circle, and haven't been able to rescale the data prior to calling prcomp in such a way that I can constrain the data to the unit circle.
require(ggbiplot)
data(wine)
wine.pca <- prcomp(wine[, 1:3], scale. = TRUE)
ggbiplot(wine.pca, obs.scale = 1, var.scale = 1,
         groups = wine.class, ellipse = TRUE, circle = TRUE)
I tried scaling by subtracting the mean and dividing by the standard deviation before calling prcomp:
wine2 <- wine[, 1:3]
mean <- apply(wine2, 2, mean)
sd <- apply(wine2, 2, sd)
for (i in 1:ncol(wine2)) {
  wine2[, i] <- (wine2[, i] - mean[i]) / sd[i]
}
wine2.pca <- prcomp(wine2, scale. = TRUE)
ggbiplot(wine2.pca, obs.scale = 1,
         var.scale = 1, groups = wine.class, ellipse = TRUE, circle = TRUE)
The ggbiplot package was installed as follows:
require(devtools)
install_github('vqv/ggbiplot')
Either code chunk produces output with data values outside the unit circle.
Per @Brian Hanson's comment below, I'm adding an additional image reflecting the output I'm trying to get.
I edited the code for the plot function and was able to get the functionality I wanted.
ggbiplot2 <- function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE,
                      obs.scale = 1 - scale, var.scale = scale,
                      groups = NULL, ellipse = FALSE, ellipse.prob = 0.68,
                      labels = NULL, labels.size = 3, alpha = 1,
                      var.axes = TRUE,
                      circle = FALSE, circle.prob = 0.69,
                      varname.size = 3, varname.adjust = 1.5,
                      varname.abbrev = FALSE, ...)
{
  library(ggplot2)
  library(plyr)
  library(scales)
  library(grid)
  stopifnot(length(choices) == 2)
  # Recover the SVD
  if (inherits(pcobj, 'prcomp')) {
    nobs.factor <- sqrt(nrow(pcobj$x) - 1)
    d <- pcobj$sdev
    u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$rotation
  } else if (inherits(pcobj, 'princomp')) {
    nobs.factor <- sqrt(pcobj$n.obs)
    d <- pcobj$sdev
    u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$loadings
  } else if (inherits(pcobj, 'PCA')) {
    nobs.factor <- sqrt(nrow(pcobj$call$X))
    d <- unlist(sqrt(pcobj$eig)[1])
    u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- sweep(pcobj$var$coord, 2,
               sqrt(pcobj$eig[1:ncol(pcobj$var$coord), 1]), FUN = "/")
  } else {
    stop('Expected an object of class prcomp, princomp or PCA')
  }
  # Scores
  df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = '*'))
  # Directions
  v <- sweep(v, 2, d^var.scale, FUN = '*')
  df.v <- as.data.frame(v[, choices])
  names(df.u) <- c('xvar', 'yvar')
  names(df.v) <- names(df.u)
  if (pc.biplot) {
    df.u <- df.u * nobs.factor
  }
  # Scale the radius of the correlation circle so that it corresponds to
  # a data ellipse for the standardized PC scores
  r <- 1
  # Scale directions
  v.scale <- rowSums(v^2)
  df.v <- df.v / sqrt(max(v.scale))
  ## Scale scores
  r.scale <- sqrt(max(df.u[, 1]^2 + df.u[, 2]^2))
  df.u <- .99 * df.u / r.scale
  # Change the labels for the axes
  if (obs.scale == 0) {
    u.axis.labs <- paste('standardized PC', choices, sep = '')
  } else {
    u.axis.labs <- paste('PC', choices, sep = '')
  }
  # Append the proportion of explained variance to the axis labels
  u.axis.labs <- paste(u.axis.labs,
                       sprintf('(%0.1f%% explained var.)',
                               100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))
  # Score labels
  if (!is.null(labels)) {
    df.u$labels <- labels
  }
  # Grouping variable
  if (!is.null(groups)) {
    df.u$groups <- groups
  }
  # Variable names
  if (varname.abbrev) {
    df.v$varname <- abbreviate(rownames(v))
  } else {
    df.v$varname <- rownames(v)
  }
  # Variables for text label placement
  df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
  df.v$hjust <- with(df.v, (1 - varname.adjust * sign(xvar)) / 2)
  # Base plot
  g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) +
    xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()
  if (var.axes) {
    # Draw circle
    if (circle) {
      theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
      circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
      g <- g + geom_path(data = circle, color = muted('white'),
                         size = 1/2, alpha = 1/3)
    }
    # Draw directions
    g <- g +
      geom_segment(data = df.v,
                   aes(x = 0, y = 0, xend = xvar, yend = yvar),
                   arrow = arrow(length = unit(1/2, 'picas')),
                   color = muted('red'))
  }
  # Draw either labels or points
  if (!is.null(df.u$labels)) {
    if (!is.null(df.u$groups)) {
      g <- g + geom_text(aes(label = labels, color = groups),
                         size = labels.size)
    } else {
      g <- g + geom_text(aes(label = labels), size = labels.size)
    }
  } else {
    if (!is.null(df.u$groups)) {
      g <- g + geom_point(aes(color = groups), alpha = alpha)
    } else {
      g <- g + geom_point(alpha = alpha)
    }
  }
  # Overlay a concentration ellipse if there are groups
  if (!is.null(df.u$groups) && ellipse) {
    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
    circle <- cbind(cos(theta), sin(theta))
    ell <- ddply(df.u, 'groups', function(x) {
      if (nrow(x) < 2) {
        return(NULL)
      } else if (nrow(x) == 2) {
        sigma <- var(cbind(x$xvar, x$yvar))
      } else {
        sigma <- diag(c(var(x$xvar), var(x$yvar)))
      }
      mu <- c(mean(x$xvar), mean(x$yvar))
      ed <- sqrt(qchisq(ellipse.prob, df = 2))
      data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'),
                 groups = x$groups[1])
    })
    names(ell)[1:2] <- c('xvar', 'yvar')
    g <- g + geom_path(data = ell, aes(color = groups, group = groups))
  }
  # Label the variable axes
  if (var.axes) {
    g <- g +
      geom_text(data = df.v,
                aes(label = varname, x = xvar, y = yvar,
                    angle = angle, hjust = hjust),
                color = 'darkred', size = varname.size)
  }
  # Change the name of the legend for groups
  # if(!is.null(groups)) {
  #   g <- g + scale_color_brewer(name = deparse(substitute(groups)),
  #                               palette = 'Dark2')
  # }
  # TODO: Add a second set of axes
  return(g)
}
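A usage sketch, assuming the wine data from ggbiplot and the ggbiplot2() function above; because the scores are rescaled inside the function, the points now stay within the unit circle:
library(ggbiplot)   # for the wine data and wine.class
data(wine)
wine.pca <- prcomp(wine[, 1:3], scale. = TRUE)
ggbiplot2(wine.pca, obs.scale = 1, var.scale = 1,
          groups = wine.class, ellipse = TRUE, circle = TRUE)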
