Vertically distribute multiple lines with smart spacing - R

A common way to display spectroscopic data (intensity vs. wavelength) is used below to compare the position of peaks across multiple spectra. Assuming they all share a baseline at 0, it is convenient to offset the lines vertically by a constant spacing, to avoid the distraction of overlapping lines.
Thus [figure: overlapping spectra] becomes [figure: vertically offset spectra].
I'm looking for a better strategy to perform this vertical shift automatically, starting from data in long format. Here is a minimal example.
# fake data (5 similar-looking spectra)
spec <- function(){
  x <- runif(100, 0, 100)
  data.frame(x=x, y=jitter(dnorm(x, mean=jitter(50), sd=jitter(5)), amount=0.01))
}
require(plyr)
all <- ldply(1:5, function(ii) data.frame(spec(), id=ii))
My current strategy is as follows:
1. convert the spectra from long format to wide format. This involves interpolation, as the spectra do not necessarily have identical x-axis values.
2. find the minimum offset between spectra to avoid overlap between neighbours.
3. shift the spectra by multiples of this distance.
4. melt back to long format.
I implemented this using plyr,
# function that evenly spaces the spectra to avoid overlap
# d is in long format, s is a scaling factor for the vertical shift
require(plyr); require(ggplot2)

spread_plot <- function(d, s=1){
  ranges <- ddply(d, "id", with, each(min, max, length)(x))
  common_x <- seq(max(ranges$min), min(ranges$max), length=max(ranges$length))
  new_y <- dlply(d, "id", function(x) approx(x$x, x$y, common_x)$y)
  mat <- do.call(cbind, new_y)
  test <- apply(mat, 1, diff)
  shift <- max(-test[test < 0])
  origins <- s*seq(0, by=shift, length=ncol(mat))
  for(ii in seq_along(origins)){
    current <- unique(d[["id"]])[ii]
    d[d[["id"]] == current, "y"] <- d[d[["id"]] == current, "y"] + origins[ii]
  }
  d
}
test <- spread_plot(all)
ggplot(test, aes(x, y, colour=id, group=id))+
geom_line() + guides(colour=guide_legend())
This strategy suffers from a few shortcomings:
- it is slow;
- the offset is not a pretty number. I do not know how to round it automatically so that spectra are offset by, e.g., 0.02, or 50, etc., depending on the range of the intensities; pretty(origins) is problematic in that it can return a different number of values.
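For example (a minimal sketch of the problem; the exact output of pretty() depends on its input, which is precisely the issue):
## pretty() returns however many "nice" values it sees fit, so the length
## of its result varies with the range of the offsets:
origins1 <- seq(0, by = 0.083, length = 5)
origins2 <- seq(0, by = 47, length = 5)
length(pretty(origins1))  # need not match...
length(pretty(origins2))  # ...the length of this one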
I feel I'm missing a simpler solution, perhaps working directly with the original data in long format.

Interesting question.
Here's a possibility, offered without detailed comment, except to point out that it:
- should be very fast, due to a combination of avoiding plyr, using data.table, and operating on the data in its original long format;
- uses pretty() to pick a pretty offset;
- like your code, is not guaranteed to produce no intersections of lines, since overlap can still happen between the lattice of points formed by common_x.
Here's the code:
## Setup
library(data.table)
library(plyr)
library(ggplot2)

spec <- function(){
  x <- runif(100, 0, 100)
  data.frame(x=x, y=jitter(dnorm(x, mean=jitter(50), sd=jitter(5)), amount=0.01))
}
all <- ldply(1:5, function(ii) data.frame(spec(), id=ii))

## Function that uses data.table rather than plyr to compute and add offsets
spread_plot <- function(d, s=1){
  d <- data.table(d, key="id")
  ranges <- d[, list(min=min(x), max=max(x), length=length(x)), by="id"]
  common_x <- seq(max(ranges$min), min(ranges$max), length=max(ranges$length))
  new_y <- d[, list(y=approx(x, y, common_x)$y, N=seq_along(common_x)), by="id"]
  shift <- max(new_y[, max(abs(diff(y))), by="N"][[2]])
  shift <- pretty(c(0, shift), n=0)[2]
  origins <- s*seq(0, by=shift, length=length(unique(d$id)))
  d[, y := y + origins[.GRP], by="id"]
  d
}
## Try it out
test <- spread_plot(all)
ggplot(test, aes(x, y, colour=id, group=id))+
geom_line() + guides(colour=guide_legend())
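To see what the pretty(c(0, shift), n=0)[2] line buys you, here is a small illustration (the value of shift is made up; the exact result depends on your data):
## Asking pretty() for (about) zero intervals yields the coarsest "pretty"
## cover of the range, so the second element is a single nice number >= shift:
shift <- 0.083
pretty(c(0, shift), n = 0)      # something like c(0, 0.1)
pretty(c(0, shift), n = 0)[2]   # the offset actually used above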

I still think you could rely on some assumptions about typical data from spectroscopy. Usually, the x values are sorted, their number is the same for all spectra, and the values themselves are quite similar:
# new fake data (5 similar-looking spectra)
spec <- function(){
  x <- jitter(seq(0, 100, 1), 0.1)
  data.frame(x=x, y=jitter(dnorm(x, mean=jitter(50), sd=jitter(5)), amount=0.01))
}
require(plyr)
all <- ldply(1:5, function(ii) data.frame(spec(), id=ii))
If these assumptions are valid, you could treat the spectra as having identical x values:
library(ggplot2)

spread_plot <- function(d, s=0.05) {
  # add some checks here, e.g., for equal length (see the sketch below)
  d <- d[order(d$x), ]
  d$id <- factor(d$id)
  l <- levels(d$id)
  pretty_offset <- pretty(s*min(tapply(d$y, d$id, function(x) abs(diff(range(x))))))[2]
  for (i in seq_len(length(l)-1)+1) {
    mean_delta_y <- mean(d[d$id == l[i], "y"] - d[d$id == l[i-1], "y"])
    d[d$id == l[i], "y"] <- d[d$id == l[i], "y"] - mean_delta_y
    min_delta_y <- abs(1.05 * min(d[d$id == l[i], "y"] - d[d$id == l[i-1], "y"]))
    pretty_delta_y <- max(min_delta_y, pretty_offset)
    d[d$id == l[i], "y"] <- d[d$id == l[i], "y"] + pretty_delta_y
  }
  p <- ggplot(d, aes(x=x, y=y, col=id)) + geom_line()
  print(p)
}
spread_plot(all, s=0)
spread_plot(all, s=0.5)
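A minimal sketch of the equal-length check hinted at in the comment above (check_equal_length is a hypothetical helper, not part of the original answer):
## Hypothetical helper: the pairwise subtraction inside the loop assumes all
## spectra have the same number of points, so fail early if they do not:
check_equal_length <- function(d) {
  lens <- tapply(d$x, d$id, length)
  if (length(unique(lens)) != 1L)
    stop("all spectra must have the same number of points")
}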

As suggested by hadley, the for loop can be avoided very simply,
d$y <- d$y + origins[d$id]
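Why this works: indexing a vector by the id column performs one lookup per row. A tiny illustration (if id were a factor rather than an integer, origins[as.integer(d$id)] would be the safer form):
origins <- c(0, 10, 20)
id <- c(1, 1, 2, 3, 3)
origins[id]  # 0 0 10 20 20 -- one offset per row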
Full code:
spread_plot <- function(d, s=1){
  ranges <- ddply(d, "id", with, each(min, max, length)(x))
  common_x <- seq(max(ranges$min), min(ranges$max), length=max(ranges$length))
  new_y <- dlply(d, "id", function(x) approx(x$x, x$y, common_x)$y)
  mat <- do.call(cbind, new_y)
  test <- apply(mat, 1, diff)
  shift <- max(-test[test < 0])
  origins <- s*seq(0, by=shift, length=ncol(mat))
  d$y <- d$y + origins[d$id]
  d
}
test <- spread_plot(all)
ggplot(test, aes(x, y, colour=id, group=id))+
geom_line() + guides(colour=guide_legend())

Related

scale_x_continuous sends NA values to trans when there are no NA values in x [duplicate]

I want to skip part of my y-axis for a dataset with most values between -10 and 100, and then a few around 400. So I want to squeeze this empty area. I am already using facet_grid in my plot for 3 different scenarios, so I would prefer to just "squash" the y-axis rather than create multiple plots.
I found the squash_axis function on RPubs (https://rpubs.com/huanfaChen/squash_remove_y_axix_ggplot_), which may be able to help me. But I cannot get it to work with my own dataset, nor even with the example dataset.
Example dataset (mine looks quite similar except that there is another column with time)
dat <- data.frame(group=rep(c('A', 'B', 'C', 'D'), each = 10),
                  value=c(rnorm(10), rnorm(10)+100))
Then the Squash axis function:
require(ggplot2)

squash_axis <- function(from, to, factor) {
  # A transformation that squashes the range [from, to] by `factor` on a given axis
  # Args:
  #   from: left end of the axis
  #   to: right end of the axis
  #   factor: the compression factor of the range [from, to]
  # Returns:
  #   A transformation named "squash_axis", encapsulated by the trans_new() function
  trans <- function(x) {
    # get indices for the relevant regions
    isq <- x > from & x < to
    ito <- x >= to
    # apply transformation
    x[isq] <- from + (x[isq] - from)/factor
    x[ito] <- from + (to - from)/factor + (x[ito] - to)
    return(x)
  }
  inv <- function(x) {
    # get indices for the relevant regions
    isq <- x > from & x < from + (to - from)/factor
    ito <- x >= from + (to - from)/factor
    # apply transformation
    x[isq] <- from + (x[isq] - from) * factor
    x[ito] <- to + (x[ito] - (from + (to - from)/factor))
    return(x)
  }
  # return the transformation (trans_new() lives in the scales package)
  return(scales::trans_new("squash_axis", trans, inv))
}
And the plot from the example:
ggplot(dat, aes(x=group, y=value)) +
  geom_point() +
  scale_y_continuous(trans = squash_axis(5, 95, 10))
I then get the error:
Error in x[isq] <- from + (x[isq] - from) * factor :
NAs are not allowed in subscripted assignments
I don't understand, because there are no NAs in my data, nor in the example data.
What is going on?
Using browser(), I figured out that the problem arises in the inv part of the squash transformation. I could only guess at the reason; it probably has to do with how the breaks are set. However, instead of applying the transformation inside scale_y_continuous, I tried applying it via coord_trans ... et voilà, it worked:
library(ggplot2)

dat <- data.frame(group=rep(c('A', 'B', 'C', 'D'), each = 10),
                  value=c(rnorm(10), rnorm(10)+100))

squash_axis <- function(from, to, factor) {
  # A transformation that squashes the range [from, to] by `factor` on a given axis
  # Args:
  #   from: left end of the axis
  #   to: right end of the axis
  #   factor: the compression factor of the range [from, to]
  # Returns:
  #   A transformation named "squash_axis", encapsulated by the trans_new() function
  trans <- function(x) {
    # get indices for the relevant regions
    isq <- x > from & x < to
    ito <- x >= to
    # apply transformation
    x[isq] <- from + (x[isq] - from)/factor
    x[ito] <- from + (to - from)/factor + (x[ito] - to)
    return(x)
  }
  inv <- function(x) {
    # get indices for the relevant regions
    isq <- x > from & x < from + (to - from)/factor
    ito <- x >= from + (to - from)/factor
    # apply transformation
    x[isq] <- from + (x[isq] - from) * factor
    x[ito] <- to + (x[ito] - (from + (to - from)/factor))
    return(x)
  }
  # return the transformation, now restricted to the [from, to] domain
  return(scales::trans_new("squash_axis", trans, inv, domain = c(from, to)))
}

ggplot(dat, aes(x=group, y=value)) +
  geom_point() +
  coord_trans(y = squash_axis(5, 95, 10))
Created on 2020-04-03 by the reprex package (v0.3.0)
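For what it's worth, here is a hedged guess at the trigger: the scale (unlike coord_trans) also runs the transformation over candidate break positions and the expanded range, some of which can come back as NA, and assigning through a logical subscript containing NA raises exactly this error:
x <- c(NA, 50)
isq <- x > 5 & x < 95  # NA TRUE -- the NA survives the comparison
try(x[isq] <- 0)       # Error: NAs are not allowed in subscripted assignments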

Plot Sphere with custom gridlines in R

I would like to plot a sphere in R with gridlines on the surface corresponding to an equal-area gridding of the sphere using the arccos transformation.
I have been experimenting with the R package rgl and got some help from:
Plot points on a sphere in R
which plots the gridlines with equal lat/long spacing.
I have the function below, which returns a data frame of points that are the crossover points of the grid lines I want, but I am not sure how to proceed.
plot_sphere <- function(theta_num, phi_num){
  theta <- seq(0, 2*pi, (2*pi)/(theta_num))
  phi <- seq(0, pi, pi/(phi_num))
  tmp <- seq(0, 2*phi_num, 2)/phi_num
  phi <- acos(1 - tmp)
  tmp <- cbind(rep(seq(1, theta_num), each = phi_num),
               rep(seq(1, phi_num), times = theta_num))
  results <- as.data.frame(cbind(theta[tmp[,1]], phi[tmp[,2]]))
  names(results) <- c("theta", "phi")
  results$x <- cos(results$theta)*sin(results$phi)
  results$y <- sin(results$theta)*sin(results$phi)
  results$z <- cos(results$phi)
  return(results)
}
sphere <- plot_sphere(10, 10)
Can anyone help? In general I am finding the rgl functions tricky to work with.
If you use lines3d or plot3d(..., type="l"), you'll get a plot joining the points in your dataframe. To get breaks (you don't want one long line), add rows containing NA values.
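Here is the NA-break trick in miniature (a sketch, assuming rgl is installed):
library(rgl)
xyz <- rbind(c(0, 0, 0), c(1, 1, 1),  # first segment
             NA,                      # NA row: rgl starts a new line here
             c(1, 0, 0), c(0, 1, 1))  # second segment
lines3d(xyz)  # draws two separate segments rather than one long line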
The code in your plot_sphere function seems really messed up (you compute phi twice, you don't generate vectors of the requested length, etc.), but this function based on it works:
library(rgl)

plot_sphere <- function(theta_num, phi_num){  # named here so it can be called
  theta0 <- seq(0, 2*pi, len = theta_num)
  tmp <- seq(0, 2, len = phi_num)
  phi0 <- acos(1 - tmp)
  # meridians (constant theta); indexing one past the end of phi0 yields the NA breaks
  i <- seq(1, (phi_num + 1)*theta_num) - 1
  theta <- theta0[i %/% (phi_num + 1) + 1]
  phi <- phi0[i %% (phi_num + 1) + 1]
  # parallels (constant phi); indexing one past the end of theta0 yields the NA breaks
  i <- seq(1, phi_num*(theta_num + 1)) - 1
  theta <- c(theta, theta0[i %% (theta_num + 1) + 1])
  phi <- c(phi, phi0[i %/% (theta_num + 1) + 1])
  results <- data.frame(x = cos(theta)*sin(phi),
                        y = sin(theta)*sin(phi),
                        z = cos(phi))
  lines3d(results)
}
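The NA breaks above come from a small indexing trick worth spelling out:
phi0 <- c(0.1, 0.2, 0.3)
phi0[4]  # NA -- reading one past the end of a vector returns NA,
         # which lines3d treats as a break between segments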

Flexible log10 grid with ggplot2

I am trying to make a flexible log10 grid in ggplot2. The idea is that between 0.1 and 1 the gridline breaks are 0.1 apart, between 1 and 10 they are 1 apart, between 10 and 100 they are 10 apart, etc.
This way the grid lines repeat the same pattern as many times as required, based on a data vector (CAfails) I supply. This is what I came up with after a lot of tweaking:
CAfails<-data.frame(c(2.5,5.8,10.7,16.2,23,36.2,45.3,49.5,70.1,80.3,83.6,90))
LOG.as<-c(t((10^((floor(log10(min(CAfails)))-1):ceiling(log10(max(CAfails)))))%o%c(1:10)))
LOG.as<-LOG.as[-10*((floor(log10(min(CAfails))):ceiling(log10(max(CAfails))))+1)]
After which I pass it to ggplot2:
scale_x_log10(limits=c(1,10^(ceiling(log10(max(CAfails))))),breaks=LOG.as)
scale_y_log10(limits=c(0.1,10^(ceiling(log10(max(CAfails))))),breaks=LOG.as)
It works OK, but I was wondering if there isn't a simpler and easier way to do this.
Here is a complete example:
CAfails <- data.frame(x=c(2.5,5.8,10.7,16.2,23,36.2,45.3,49.5,70.1,80.3,83.6,90))
LOG.as <- c(t((10^((floor(log10(min(CAfails)))-1):ceiling(log10(max(CAfails)))))%o%c(1:10)))
LOG.as <- LOG.as[-10*((floor(log10(min(CAfails))):ceiling(log10(max(CAfails))))+1)]

library(ggplot2)
pdf <- data.frame(x = 1:nrow(CAfails), y = CAfails$x)  # pdf must exist before it can be used

ggplot(data=pdf, aes(x, y)) + geom_point() +
  scale_x_log10(limits=c(1,10^(ceiling(log10(max(CAfails))))), breaks=LOG.as) +
  scale_y_log10(limits=c(0.1,10^(ceiling(log10(max(CAfails))))), breaks=LOG.as)
Which yields this: [figure: the points on log10 axes with the custom repeating breaks]
I think this works a bit better:
CAfails <- data.frame(x=c(2.5,5.8,10.7,16.2,23,36.2,45.3,49.5,70.1,80.3,83.6,90))
pdf <- data.frame(x = 1:nrow(CAfails), y = CAfails$x)

genbreaks <- function(x){
  # only works on positive vals
  minx <- min(x)
  maxx <- max(x)
  flminx <- floor(log10(minx))
  clmaxx <- ceiling(log10(maxx))  # note: ceiling(), not the non-existent ceil()
  rv <- c()
  xlo <- 10^flminx
  for (i in flminx:clmaxx) {
    rv <- c(rv, seq(xlo, xlo*10, xlo))
    xlo <- xlo*10
  }
  return(rv)
}

ggplot(data=pdf, aes(x, y)) + geom_point(color="blue") +
  scale_x_log10(breaks=genbreaks(pdf$x)) +
  scale_y_log10(breaks=genbreaks(pdf$y))
Yielding: [figure: the same plot using the breaks from genbreaks()]
But there could be a standard way of doing it.
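For what it's worth, the same 1, 2, ..., 9, 10, 20, ... pattern can be built in one line with outer(), which may be as close to a "standard way" as base R offers (a sketch; adjust the exponent range to your data):
breaks <- sort(unique(as.vector(outer(1:10, 10^(-1:1)))))
breaks  # 0.1 0.2 ... 0.9 1 2 ... 9 10 20 ... 100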

R: Draw a polygon with conditional colour

I want to colour the area under a curve. The area with y > 0 should be red, the area with y < 0 should be green.
x <- c(1:4)
y <- c(0,1,-1,2,rep(0,4))
plot(y[1:4],type="l")
abline(h=0)
Using ifelse() does not work:
polygon(c(x,rev(x)),y,col=ifelse(y>0,"red","green"))
What I achieved so far is the following:
polygon(c(x,rev(x)),y,col="green")
polygon(c(x,rev(x)),ifelse(y>0,y,0),col="red")
But then the red area is too large. Do you have any ideas how to get the desired result?
If you want two different colors, you need two different polygons. You can either call polygon multiple times, or add NA values in your x and y vectors to indicate a new polygon. R will not automatically calculate the intersection for you; you must do that yourself. Here's how you could draw it with different colors.
x <- c(1, 2, 2.5, NA, 2.5, 3, 4)
y <- c(0, 1, 0, NA, 0, -1, 0)
# calculate color based on the most extreme y value in each NA-delimited group
g <- cumsum(is.na(x))
gc <- ifelse(tapply(y, g, function(x) x[which.max(abs(x))]) > 0,
             "red", "green")
plot(c(1, 4), c(-1, 1), type = "n")
polygon(x, y, col = gc)
abline(h=0)
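The grouping line deserves a brief note: cumsum(is.na(x)) labels each NA-delimited run of points, which is what lets tapply() pick one colour per sub-polygon:
x <- c(1, 2, 2.5, NA, 2.5, 3, 4)
cumsum(is.na(x))  # 0 0 0 1 1 1 1 -- group 0 before the NA, group 1 after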
In the more general case, it might not be as easy to split a polygon into different regions. There seems to be some support for this type of operation in GIS packages, where this type of thing is more common. However, I've put together a somewhat general case that may work for simple polygons.
First, I define a closure that will define a cutting line. The function will take a slope and y-intercept for a line and will return the functions we need to cut a polygon.
getSplitLine <- function(m=1, b=0) {
  force(m); force(b)
  classify <- function(x, y) {
    y >= m*x + b
  }
  intercepts <- function(x, y, class=classify(x, y)) {
    w <- which(diff(class) != 0)
    m2 <- (y[w+1]-y[w])/(x[w+1]-x[w])
    b2 <- y[w] - m2*x[w]
    ix <- (b2-b)/(m-m2)
    iy <- ix*m + b
    data.frame(x=ix, y=iy, idx=w+.5,
               dir=((rank(ix, ties="first")+1) %/% 2) %% 2 + 1)
  }
  plot <- function(...) {
    abline(b, m, ...)
  }
  list(
    intercepts=intercepts,
    classify=classify,
    plot=plot
  )
}
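A quick sanity check of the closure (a horizontal cut at y = 0; the points are chosen for illustration):
sl <- getSplitLine(0, 0)
sl$classify(c(1, 2), c(-1, 1))  # FALSE TRUE: first point below the line, second above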
Now we will define a function to actually split a polygon using the splitter we've just defined.
splitPolygon <- function(x, y, splitter) {
  # append an NA separator row if one is not already present
  addnullrow <- function(x) if (!all(is.na(x[nrow(x),]))) rbind(x, NA) else x
  # rotate the first i rows to the end
  rollup <- function(x, i=1) rbind(x[(i+1):nrow(x),], x[1:i,])
  idx <- cumsum(is.na(x) | is.na(y))
  polys <- split(data.frame(x=x, y=y)[!is.na(x),], idx[!is.na(x)])
  r <- lapply(polys, function(P) {
    x <- P$x; y <- P$y
    side <- splitter$classify(x, y)
    if (side[1] != side[length(side)]) {
      ints <- splitter$intercepts(c(x, x[1]), c(y, y[1]), c(side, side[1]))
    } else {
      ints <- splitter$intercepts(x, y, side)
    }
    sideps <- lapply(unique(side), function(ss) {
      pts <- data.frame(x=x[side==ss], y=y[side==ss],
                        idx=seq_along(x)[side==ss], dir=0)
      mm <- rbind(pts, ints)
      mm <- mm[order(mm$idx), ]
      br <- cumsum(mm$dir!=0 & c(0, head(mm$dir,-1))!=0 &
                     c(0, diff(mm$idx))>1)
      if (length(unique(br))>1) {
        mm <- rollup(mm, sum(br==br[1]))
      }
      br <- cumsum(c(FALSE, abs(diff(mm$dir*mm$dir))==3))
      do.call(rbind, lapply(split(mm, br), addnullrow))
    })
    pss <- rep(unique(side), sapply(sideps, nrow))
    ps <- do.call(rbind, lapply(sideps, addnullrow))[, c("x","y")]
    attr(ps, "side") <- pss
    ps
  })
  pss <- unname(unlist(lapply(r, attr, "side")))
  src <- rep(seq_along(r), sapply(r, nrow))
  r <- do.call(rbind, r)
  attr(r, "source") <- src
  attr(r, "side") <- pss
  r
}
The input is just the values of x and y as you would pass to polygon along with the cutter. It will return a data.frame with x and y values that can be used with polygon.
For example
x <- c(1,2,2.5,NA,2.5,3,4)
y <- c(1,-2,2,NA,-1,2,-2)
sl<-getSplitLine(0,0)
plot(range(x, na.rm=T),range(y, na.rm=T), type = "n")
p <- splitPolygon(x,y,sl)
g <- cumsum(c(F, is.na(head(p$y,-1))))
gc <- ifelse(attr(p,"side")[is.na(p$y)],
"red","green")
polygon(p, col=gc)
sl$plot(lty=2, col="grey")
This should work for simple concave polygons as well, and with sloped lines. Here's another example:
x <- c(1, 2, 3, 4, 5, 4, 3, 2)
y <- c(-2, 2, 1, 2, -2, .5, -.5, .5)
sl <- getSplitLine(.5, -1.25)
plot(range(x, na.rm=TRUE), range(y, na.rm=TRUE), type = "n")
p <- splitPolygon(x, y, sl)
g <- cumsum(c(FALSE, is.na(head(p$y, -1))))
gc <- ifelse(attr(p, "side")[is.na(p$y)],
             "red", "green")
polygon(p, col=gc)
sl$plot(lty=2, col="grey")
Right now things can get a bit messy when a vertex of the polygon falls directly on the splitting line. I may try to correct that in the future.
A faster, but not very accurate, solution is to split the data frame into a list according to a grouping variable (e.g., above = red and below = blue). This is a pretty nice workaround for rather big datasets (I would say > 100 elements). For smaller chunks some discontinuity may be visible:
x <- 1:100
y1 <- sin(1:100/10)*0.8
y2 <- sin(1:100/10)*1.2
plot(x, y2, type='l')
lines(x, y1, col='red')

df <- data.frame(x=x, y1=y1, y2=y2)
df$pos_neg <- ifelse(df$y2-df$y1 > 0, 1, -1)  # above (1) or below (-1) average
# create the chunk numbers used to split the data into a list
# (the first element needs to be added by hand):
df$chunk <- c(1, cumsum(abs(diff(df$pos_neg)))/2 + 1)
df$colors <- ifelse(df$pos_neg > 0, "red", "blue")  # fill colours for the polygons
# create the list of chunks and plot each as a polygon:
l <- split(df, df$chunk)  # we should get 4 sub-lists
lapply(l, function(x) polygon(c(x$x, rev(x$x)), c(x$y2, rev(x$y1)), col=x$colors))
As I said, for smaller datasets some discontinuity may be visible where sharp changes occur between the positive and negative areas, but when a horizontal line separates the two, or when more points are plotted, the effect is negligible.

Plotting elements from repeat in R

I am trying to create a plot of the (X0, Ujn) points generated in the repeat loop. Is there a way to do this in R? Here is my code:
LaxFriedrichs <- function(X0, delx, delt, t){
  repeat{
    Uj <- sin(X0)
    U <- sin(X0 + 2*delx)
    Ujn <- (Uj + U)/2 + (Uj - U)*(t/(2*delx))
    X0 <- X0 + delx
    t <- delt + t
    plot(X0, Ujn)
    if (X0 > 2*pi/40) break
  }
}
This might not be the most efficient implementation, but it at least gets all your points plotted (it keeps appending to the all.x and all.y vectors and then plots them at the end):
LaxFriedrichs <- function(X0, delx, delt, t){
  all.x <- c()
  all.y <- c()
  repeat{
    Uj <- sin(X0)
    U <- sin(X0 + 2*delx)
    Ujn <- (Uj + U)/2 + (Uj - U)*(t/(2*delx))
    X0 <- X0 + delx
    t <- delt + t
    all.x <- c(all.x, X0)
    all.y <- c(all.y, Ujn)
    if (X0 > 2*pi/40) break
  }
  plot(all.x, all.y)
}
LaxFriedrichs(.001, .001, .001, 0.5)
A slightly shorter version that takes advantage of R's vector operations.
f <- function(x0, dx, dt, t0) {
  x <- seq(x0, 2*pi/40, by=dx)
  t <- seq(t0, t0 + (length(x)-1)*dt, by=dt)
  Uj <- sin(x)
  U <- sin(x + 2*dx)
  Ujn <- (Uj + U)/2 + (Uj - U)*(t/(2*dx))
  plot(x, Ujn)
}
f(.001, .001, .001, .5)
Here, x and t are vectors, so Uj, U, and finally Ujn are calculated in one step, rather than in a loop.
One thing to note: in the original algorithm, at each step Ujn is calculated at x but x+dx is stored, so you end up plotting Ujn(x) vs (x+dx). The approach here corrects that, so the x-axis is offset by -dx.
