Nonparametric regression ggplot - r

I'm trying to plot some nonparametric regression curves with ggplot2. I achieved It with the base plot()function:
library(KernSmooth)
set.seed(1995)
X <- runif(100, -1, 1)
G <- X[which (X > 0)]
L <- X[which (X < 0)]
u <- rnorm(100, 0 , 0.02)
Y <- -exp(-20*L^2)-exp(-20*G^2)/(X+1)+u
m <- lm(Y~X)
plot(Y~X)
abline(m, col="red")
m2 <- locpoly(X, Y, bandwidth = 0.05, degree = 0)
lines(m2$x, m2$y, col = "red")
m3 <- locpoly(X, Y, bandwidth = 0.15, degree = 0)
lines(m3$x, m3$y, col = "black")
m4 <- locpoly(X, Y, bandwidth = 0.3, degree = 0)
lines(m4$x, m4$y, col = "green")
legend("bottomright", legend = c("NW(bw=0.05)", "NW(bw=0.15)", "NW(bw=0.3)"),
lty = 1, col = c("red", "black", "green"), cex = 0.5)
With ggplot2 have achieved plotting the linear regression:
With this code:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25))
But I dont't know how to add the other lines to this plot, as in the base R plot. Any suggestions? Thanks in advance.

Solution
The shortest solution (though not the most beautiful one) is to add the lines using the data= argument of the geom_line function:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = as.data.frame(m2), mapping = aes(x=x,y=y))
Beautiful solution
To get beautiful colors and legend, use
# Need to convert lists to data.frames, ggplot2 needs data.frames
m2 <- as.data.frame(m2)
m3 <- as.data.frame(m3)
m4 <- as.data.frame(m4)
# Colnames are used as names in ggplot legend. Theres nothing wrong in using
# column names which contain symbols or whitespace, you just have to use
# backticks, e.g. m2$`NW(bw=0.05)` if you want to work with them
colnames(m2) <- c("x","NW(bw=0.05)")
colnames(m3) <- c("x","NW(bw=0.15)")
colnames(m4) <- c("x","NW(bw=0.3)")
# To give the different kernel density estimates different colors, they must all be in one data frame.
# For merging to work, all x columns of m2-m4 must be the same!
# the merge function will automatically detec columns of same name
# (that is, x) in m2-m4 and use it to identify y values which belong
# together (to the same x value)
mm <- Reduce(x=list(m2,m3,m4), f=function(a,b) merge(a,b))
# The above line is the same as:
# mm <- merge(m2,m3)
# mm <- merge(mm,m4)
# ggplot needs data in long (tidy) format
mm <- tidyr::gather(mm, kernel, y, -x)
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = mm, mapping = aes(x=x,y=y,color=kernel))
Solution which will settle this for everyone and for eternity
The most beautiful and reproducable way though will be to create a custom stat in ggplot2 (see the included stats in ggplot).
There is this vignette of the ggplot2 team to this topic: Extending ggplot2. I have never undertaken such a heroic endeavour though.

Related

How to plot multiple Poisson distribution in one plot

I would like to plot multiple Poisson (with different lambdas (1:10))
I found the following function to draw a plot
plot_pois = function(lambda = 5)
{
plot(0:20, dpois( x=0:20, lambda=lambda ), xlim=c(-2,20))
normden <- function(x){dnorm(x, mean= lambda, sd=sqrt(lambda))}
curve(normden, from=-4, to=20, add=TRUE, col=lambda)
}
plot.new()
plot_pois(2)
But I can't plot another Poisson over it. I tried to change plot to points or lines but it totally changes the plot. I would also like to add a legends containing different colors for different values of lambda.
If I could plot it using ggplot, it would be a better option.
Another possible tidyverse solution:
library(tidyverse)
# Build Poisson distributions
p_dat <- map_df(1:10, ~ tibble(
l = paste(.),
x = 0:20,
y = dpois(0:20, .)
))
# Build Normal distributions
n_dat <- map_df(1:10, ~ tibble(
l = paste(.),
x = seq(0, 20, by = 0.001),
y = dnorm(seq(0, 20, by = 0.001), ., sqrt(.))
))
# Use ggplot2 to plot
ggplot(n_dat, aes(x, y, color = factor(l, levels = 1:10))) +
geom_line() +
geom_point(data = p_dat, aes(x, y, color = factor(l, levels = 1:10))) +
labs(color = "Lambda:") +
theme_minimal()
Created on 2019-05-06 by the reprex package (v0.2.1)
In ggplot2 you can use lapply to loop over different lambdas:
library(ggplot2)
lambdas <- c(5, 2)
ggplot(data = data.frame(x = 0:20)) +
lapply(lambdas, function(l) geom_point(aes(x = x, y = dpois(x, lambda = l), col = factor(l)))) +
lapply(lambdas, function(l) stat_function(fun = dnorm, args = list(mean = l, sd = sqrt(l)),
aes(x = x, col = factor(l))))
Axes titles and limits, the legend title etc. can then be customized as usual in ggplot2.

Add jitterred points to a ggnetwork plot

I have a graph of vertices and edges which I'd like to plot using a fruchtermanreingold layout.
Here's the graph edges matrix:
edge.mat <- matrix(as.numeric(strsplit("3651,0,0,1,0,0,0,0,2,0,11,2,0,0,0,300,0,1,0,0,66,0,78,9,0,0,0,0,0,0,11690,0,1,0,0,0,0,0,0,0,0,493,1,1,0,4288,5,0,0,36,0,9,7,3,0,6,1,0,1,7,490,0,0,0,6,0,0,628,6,12,0,0,0,0,0,641,0,0,4,0,0,0,0,0,0,66,0,0,0,0,3165,0,281,0,0,0,0,0,0,0,0,45,1,0,0,35248,0,1698,2,0,1,0,2,99,0,0,6,29,286,0,31987,0,1,10,0,8,0,16,0,21,1,0,0,1718,0,51234,0,0,17,3,12,0,0,7,0,0,0,1,0,2,16736,0,0,0,3,0,0,4,630,0,0,0,9,0,0,29495,53,6,0,0,0,0,5,0,0,0,0,3,0,19,186,0,0,0,482,8,12,0,1,0,7,1,0,6,0,26338",
split = ",")[[1]]),
nrow = 14,
dimnames = list(LETTERS[1:14], LETTERS[1:14]))
I then create an igraph object from that using:
gr <- igraph::graph_from_adjacency_matrix(edge.mat, mode="undirected", weighted=T, diag=F)
And then use ggnetwork to convert gr to a data.frame, with specified vertex colors:
set.seed(1)
gr.df <- ggnetwork::ggnetwork(gr,
layout="fruchtermanreingold",
weights="weight",
niter=50000,
arrow.gap=0)
And then I plot it using ggplot2 and ggnetwork:
vertex.colors <- strsplit("#00BE6B,#DC2D00,#F57962,#EE8044,#A6A400,#62B200,#FF6C91,#F77769,#EA8332,#DA8E00,#C59900,#00ACFC,#C49A00,#DC8D00",
split=",")[[1]]
library(ggplot2)
library(ggnetwork)
ggplot(gr.df, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "gray", aes(size = weight)) +
geom_nodes(color = "black")+
geom_nodelabel(aes(label = vertex.names),
color = vertex.colors, fontface = "bold")+
theme_minimal() +
theme(axis.text=element_blank(),
axis.title=element_blank(),
legend.position="none")
In my case each vertex actually represents many points, where each vertex has a different number of points. Adding that information to gr.df:
gr.df$n <- NA
gr.df$n[which(is.na(gr.df$weight))] <- as.integer(runif(length(which(is.na(gr.df$weight))), 100, 500))
What I'd like to do is add to the plot gr.df$n radially jittered points around each vertex (i.e., with its corresponding n), with the same vertex.colors coding. Any idea how to do that?
I think sampling and then plotting with geom_point is a reasonable strategy. (otherwise you could create your own geom).
Here is some rough code, starting from the relevant bit of your question
gr.df$n <- 1
gr.df$n[which(is.na(gr.df$weight))] <- as.integer(runif(length(which(is.na(gr.df$weight))), 100, 500))
# function to sample
# https://stackoverflow.com/questions/5837572/generate-a-random-point-within-a-circle-uniformly
circSamp <- function(x, y, R=0.1){
n <- length(x)
A <- a <- runif(n,0,1)
b <- runif(n,0,1)
ind <- b < a
a[ind] <- b[ind]
b[ind] <- A[ind]
xn = x+b*R*cos(2*pi*a/b)
yn = y+b*R*sin(2*pi*a/b)
cbind(x=xn, y=yn)
}
# sample
d <- with(gr.df, data.frame(vertex.names=rep(vertex.names, n),
circSamp(rep(x,n), rep(y,n))))
# p is your plot
p + geom_point(data=d, aes(x, y, color = vertex.names),
alpha=0.1, inherit.aes = FALSE) +
scale_color_manual(values = vertex.colors)
Giving

Draw line through 2d density plot

I have a large dataset of gene expression from ~10,000 patient samples (TCGA), and I'm plotting a predicted expression value (x) and the actual observed value (y) of a certain gene signature. For my downstream analysis, I need to draw a precise line through the plot and calculate different parameters in samples above/below the line.
No matter how I draw a line through the data (geom_smooth(method = 'lm', 'glm', 'gam', or 'loess')), the line always seems imperfect - it doesn't cut through the data to my liking (red line is lm in figure).
After playing around for a while, I realized that the 2d kernel density lines (geom_density2d) actually do a good job of showing the slope/trends of my data, so I manually drew a line that kind of cuts through the density lines (black line in figure).
My question: how can I automatically draw a line that cuts through the kernel density lines, as for the black line in the figure? (Rather than manually playing with different intercepts and slopes till something looks good).
The best approach I can think of is to somehow calculate intercept and slope of the longest diameter for each of the kernel lines, take an average of all those intercepts and slopes and plot that line, but that's a bit out of my league. Maybe someone here has experience with this and can help?
A more hacky approach may be getting the x,y coords of each kernel density line from ggplot_build, and going from there, but it feels too hacky (and is also out of my league).
Thanks!
EDIT: Changed a few details to make the figure/analysis easier. (Density lines are smoother now).
Reprex:
library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
lm(y ~ x, test.df)
ggplot(test.df, aes(x, y)) +
geom_point(color = 'grey') +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) + ### EDIT: h = c(2,2)
geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
geom_abline(intercept = 0, slope = 0.7, lwd = 1, col = 'black') ## EDIT: slope to 0.7
Figure:
I generally agree with #Hack-R.
However, it was kind of a fun problem and looking into ggplot_build is not such a big deal.
require(dplyr)
require(ggplot2)
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2))
#basic version of your plot
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),] %>%
select(x,y) # extracts the x/y coordinates of the points on the largest ellipse from your 2d-density contour
Now this answer helped me to find the points on this ellipse which are furthest apart.
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) +
# geom_segment using the coordinates of the points farthest apart
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y']))) +
geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
# as per your request with your geom_smooth line
coord_equal()
coord_equal is super important, because otherwise you will get super weird results - it messed up my brain too. Because if the coordinates are not set equal, the line will seemingly not pass through the point furthest apart from the mean...
I leave it to you to build this into a function in order to automate it. Also, I'll leave it to you to calculate the y-intercept and slope from the two points
Tjebo's approach was kind of good initially, but after a close look, I found that it found the longest distance between two points on an ellipse. While this is close to what I wanted, it failed with either an irregular shape of the ellipse, or the sparsity of points in the ellipse. This is because it measured the longest distance between two points; whereas what I really wanted is the longest diameter of an ellipse; i.e.: the semi-major axis. See image below for examples/details.
Briefly:
To find/draw density contours of specific density/percentage:
R - How to find points within specific Contour
To get the longest diameter ("semi-major axis") of an ellipse:
https://stackoverflow.com/a/18278767/3579613
For function that returns intercept and slope (as in OP), see last piece of code.
The two pieces of code and images below compare two Tjebo's approach vs. my new approach based on the above posts.
#### Reprex from OP
require(dplyr)
require(ggplot2)
require(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
#### From Tjebo
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 2)
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points
p_maxring = p_maxring[round(seq(1, nrow(p_maxring), nrow(p_maxring)/23)),] #### Make a small ellipse to illustrate flaws of approach
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
farthest_2_points = data.frame(t(cbind(coord_farthest, coord_fff)))
plot(p_maxring[,1:2], asp=1)
lines(farthest_2_points, col = 'blue', lwd = 2)
#### From answer in another post
d = cbind(p_maxring[,1], p_maxring[,2])
r = ellipsoidhull(d)
exy = predict(r) ## the ellipsoid boundary
lines(exy)
me = colMeans((exy))
dist2center = sqrt(rowSums((t(t(exy)-me))^2))
max(dist2center) ## major axis
lines(exy[dist2center == max(dist2center),], col = 'red', lwd = 2)
#### The plot here is made from the data in the reprex in OP, but with h = 0.5
library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
## MAKE BLUE LINE
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5) ## NOTE h = 0.5
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
## MAKE RED LINE
## h = 0.5
## Given the highly irregular shape of the contours, I will use only the largest contour line (0.95) for draing the line.
## Thus, average = 1. See function below for details.
ln = long.diam("x", "y", test.df, h = 0.5, average = 1) ## NOTE h = 0.5
## PLOT
ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5) + ## NOTE h = 0.5
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 2) +
geom_abline(intercept = ln[1], slope = ln[2], color = 'red', lwd = 2) +
coord_equal()
Finally, I came up with the following function to deal with all this. Sorry for the lack of comments/clarity
#### This will return the intercept and slope of the longest diameter (semi-major axis).
####If Average = TRUE, it will average the int and slope across different density contours.
long.diam = function(x, y, df, probs = c(0.95, 0.5, 0.1), average = T, h = 2) {
fun.df = data.frame(cbind(df[,x], df[,y]))
colnames(fun.df) = c("x", "y")
dens = kde2d(fun.df$x, fun.df$y, n = 200, h = h)
dx <- diff(dens$x[1:2])
dy <- diff(dens$y[1:2])
sz <- sort(dens$z)
c1 <- cumsum(sz) * dx * dy
levels <- sapply(probs, function(x) {
approx(c1, sz, xout = 1 - x)$y
})
names(levels) = paste0("L", str_sub(formatC(probs, 2, format = 'f'), -2))
#plot(fun.df$x,fun.df$y, asp = 1)
#contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
#contour(dens, add = T, col = 'red', lwd = 2)
#abline(lm(fun.df$y~fun.df$x))
ls <- contourLines(dens, levels = levels)
names(ls) = names(levels)
lines.info = list()
for (i in 1:length(ls)) {
d = cbind(ls[[i]]$x, ls[[i]]$y)
exy = predict(ellipsoidhull(d))## the ellipsoid boundary
colnames(exy) = c("x", "y")
me = colMeans((exy)) ## center of the ellipse
dist2center = sqrt(rowSums((t(t(exy)-me))^2))
#plot(exy,type='l',asp=1)
#points(d,col='blue')
#lines(exy[order(dist2center)[1:2],])
#lines(exy[rev(order(dist2center))[1:2],])
max.dist = data.frame(exy[rev(order(dist2center))[1:2],])
line.fit = lm(max.dist$y ~ max.dist$x)
lines.info[[i]] = c(as.numeric(line.fit$coefficients[1]), as.numeric(line.fit$coefficients[2]))
}
names(lines.info) = names(ls)
#plot(fun.df$x,fun.df$y, asp = 1)
#contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
#abline(lines.info[[1]], col = 'red', lwd = 2)
#abline(lines.info[[2]], col = 'blue', lwd = 2)
#abline(lines.info[[3]], col = 'green', lwd = 2)
#abline(apply(simplify2array(lines.info), 1, mean), col = 'black', lwd = 4)
if (isTRUE(average)) {
apply(simplify2array(lines.info), 1, mean)
} else {
lines.info[[average]]
}
}
Finally, here's the final implementation of the different answers:
library(MASS)
set.seed(123)
samples = 10000
r = 0.9
data = mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x = data[, 1] # standard normal (mu=0, sd=1)
y = data[, 2] # standard normal (mu=0, sd=1)
#plot(x, y)
test.df = data.frame(x = x, y = y)
#### Find furthest two points of contour
## BLUE
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 2, contour = T, h = 2)
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
#### Find the average intercept and slope of 3 contour lines (0.95, 0.5, 0.1), as in my long.diam function above.
## RED
ln = long.diam("x", "y", test.df)
#### Plot everything. Black line is GLM
ggplot(test.df, aes(x, y)) +
geom_point(color = 'grey') +
geom_density2d(color = 'red', lwd = 1, contour = T, h = 2) +
geom_smooth(method = "glm", se = F, lwd = 1, color = 'black') +
geom_abline(intercept = ln[1], slope = ln[2], col = 'red', lwd = 1) +
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 1) +
coord_equal()

Transform color scale to probability-transformed color distribution with scale_fill_gradientn()

I am trying to visualize heavily tailed raster data, and I would like a non-linear mapping of colors to the range of the values. There are a couple of similar questions, but they don't really solve my specific problem (see links below).
library(ggplot2)
library(scales)
set.seed(42)
dat <- data.frame(
x = floor(runif(10000, min=1, max=100)),
y = floor(runif(10000, min=2, max=1000)),
z = rlnorm(10000, 1, 1) )
# colors for the colour scale:
col.pal <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
fill.colors <- col.pal(64)
This is how the data look like if not transformed in some way:
ggplot(dat, aes(x = x, y = y, fill = z)) +
geom_tile(width=2, height=30) +
scale_fill_gradientn(colours=fill.colors)
My question is sort of a follow-up question related to
this one or this one , and the solution given here actually yields exactly the plot I want, except for the legend:
qn <- rescale(quantile(dat$z, probs=seq(0, 1, length.out=length(fill.colors))))
ggplot(dat, aes(x = x, y = y, fill = z)) +
geom_tile(width=2, height=30) +
scale_fill_gradientn(colours=fill.colors, values = qn)
Now I want the colour scale in the legend to represent the non-linear distribution of the values (now only the red part of the scale is visible), i.e. the legend should as well be based on quantiles. Is there a way to accomplish this?
I thought the trans argument within the colour scale might do the trick, as suggested here , but that throws an error, I think because qnorm(pnorm(dat$z)) results in some infinite values (I don't completely understand the function though..).
norm_trans <- function(){
trans_new('norm', function(x) pnorm(x), function(x) qnorm(x))
}
ggplot(dat, aes(x = x, y = y, fill = z)) +
geom_tile(width=2, height=30) +
scale_fill_gradientn(colours=fill.colors, trans = 'norm')
> Error in seq.default(from = best$lmin, to = best$lmax, by = best$lstep) : 'from' must be of length 1
So, does anybody know how to have a quantile-based colour distribution in the plot and in the legend?
This code will make manual breaks with a pnorm transformation. Is this what you are after?
ggplot(dat, aes(x = x, y = y, fill = z)) +
geom_tile(width=2, height=30) +
scale_fill_gradientn(colours=fill.colors,
trans = 'norm',
breaks = quantile(dat$z, probs = c(0, 0.25, 1))
)
I believe you have been looking for a quantile transform. Unfortunately there is none in scales, but it is not to hard to build one yourself (on the fly):
make_quantile_trans <- function(x, format = scales::label_number()) {
name <- paste0("quantiles_of_", deparse1(substitute(x)))
xs <- sort(x)
N <- length(xs)
transform <- function(x) findInterval(x, xs)/N # find the last element that is smaller
inverse <- function(q) xs[1+floor(q*(N-1))]
scales::trans_new(
name = name,
transform = transform,
inverse = inverse,
breaks = function(x, n = 5) inverse(scales::extended_breaks()(transform(x), n)),
minor_breaks = function(x, n = 5) inverse(scales::regular_minor_breaks()(transform(x), n)),
format = format,
domain = xs[c(1, N)]
)
}
ggplot(dat, aes(x = x, y = y, fill = z)) +
geom_tile(width=2, height=30) +
scale_fill_gradientn(colours=fill.colors, trans = make_quantile_trans(dat$z))
Created on 2021-11-12 by the reprex package (v2.0.1)

Computing weighted average using lowess mthod in R

I am trying to use the lowess method from R to compute the weighted average of a data set which is not uniformly distributed along x axis. For example, the first 5 data points are like this, where the first column is the x and the second is the y.
375.0 2040.0
472.0 5538.0
510.0 4488.0
573.0 2668.0
586.0 7664.0
I used the following command in R:
x<-read.table(add,header=FALSE,sep="\t")
y<-lowess(x[,1],x[,2],f=0.01)
write.table(y, file = results , sep = "\t", col.names =FALSE, row.names =FALSE)
The output looks like this:
The green line shows the average computed by the smooth function in matlab (tri-cubic kernel), and the red line is the average line computed by lowess method in R. The blue dots are the data points.
I can't find why the method in R does not work. Do you have any idea?
Here is a link to part of the data.
Thanks a lot for your help.
Th smooth function in matlab is like a filter ,
yy = smooth(y)
yy(1) = y(1)
yy(2) = (y(1) + y(2) + y(3))/3
yy(3) = (y(1) + y(2) + y(3) + y(4) + y(5))/5 ## convolution of size 5
yy(4) = (y(2) + y(3) + y(4) + y(5) + y(6))/5
I think it is better to do a simple smooth here.
Here some attempts using loess, lowesss with f = 0.2(1/5) and using smooth.spline
I am using ggplot2 to plot ( to use geom_jitter with some alpha )
library(ggplot2)
dat <- subset(data, V2 < 5000)
#dat <- data
xy <- lowess(dat$V1,dat$V2,f = 0.8)
xy <- as.data.frame(do.call(cbind,xy))
p1<- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_smooth()
xy <- lowess(dat$V1,dat$V2,f = 0.2)
xy <- as.data.frame(do.call(cbind,xy))
xy.smooth <- smooth.spline(dat$V1,dat$V2)
xy.smooth <- data.frame(x= xy.smooth$x,y = xy.smooth$y)
p2 <- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_line(data = xy, aes(x=x, y = y, group = 1 ), color = 'red')+
geom_line(data = xy.smooth, aes(x=x, y = y, group = 1 ), color = 'blue')
library(gridExtra)
grid.arrange(p1,p2)

Resources