Adding loess regresion line on a hexbin plot - r

I have been trying to find method to add a loess regression line on a hexbin plot. So far I do not have any success... Any suggestions?
My code is as follow:
bin<-hexbin(Dataset$a, Dataset$b, xbins=40)
plot(bin, main="Hexagonal Binning",
xlab = "a", ylab = "b",
type="l")

I would suggest using ggplot2 to build the plot.
Since you didn't include any example data, I've used the palmerpenguins package dataset for the example below.
library(palmerpenguins) # For the data
library(ggplot2) # ggplot2 for plotting
ggplot(penguins, aes(x = body_mass_g,
y = bill_length_mm)) +
geom_hex(bins = 40) +
geom_smooth(method = 'loess', se = F, color = 'red')
Created on 2021-01-05 by the reprex package (v0.3.0)

I don't have a solution for base, but it's possible to do this with ggplot. It should be possible with base too, but if you look at the documentation for ?hexbin, you can see the quote:
Note that when plotting a hexbin object, the grid package is used. You must use its graphics (or those from package lattice if you know how) to add to such plots.
I'm not familiar with how to modify these. I did try ggplotify to convert the base to ggplot and edit that way, but couldn't get the loess line added to the plot window properly.
So here is a solution with ggplot with some fake data that you can try on your Datasets:
library(hexbin)
library(ggplot2)
# fake data with a random walk, replace with your data
set.seed(100)
N <- 1000
x <- rnorm(N)
x <- sort(x)
y <- vector("numeric", length=N)
for(i in 2:N){
y[i] <- y[i-1] + rnorm(1, sd=0.1)
}
# current method
# In documentation for ?hexbin it says:
# "You must use its graphics (or those from package lattice if you know how) to add to such plots."
(bin <- hexbin(x, y, xbins=40))
plot(bin)
# ggplot option. Can play around with scale_fill_gradient to
# get the colour scale similar or use other ggplot options
df <- data.frame(x=x, y=y)
d <- ggplot(df, aes(x, y)) +
geom_hex(bins=40) +
scale_fill_gradient(low = "grey90", high = "black") +
theme_bw()
d
# easy to add a loess fit to the data
# span controls the degree of smoothing, decrease to make the line
# more "wiggly"
model <- loess(y~x, span=0.2)
fit <- predict(model)
loess_data <- data.frame(x=x, y=fit)
d + geom_line(data=loess_data, aes(x=x, y=y), col="darkorange",
size=1.5)

Here are two options; you will need to decide if you want to smooth over the raw data or the binned data.
library(hexbin)
library(grid)
# Some data
set.seed(101)
d <- data.frame(x=rnorm(1000))
d$y <- with(d, 2*x^3 + rnorm(1000))
Method A - binned data
# plot hexbin & smoother : need to grab plot viewport
# From ?hexVP.loess : "Fit a loess line using the hexagon centers of mass
# as the x and y coordinates and the cell counts as weights."
bin <- hexbin(d$x, d$y)
p <- plot(bin)
hexVP.loess(bin, hvp = p$plot.vp, span = 0.4, col = "red", n = 200)
Method B - raw data
# calculate loess predictions outside plot on raw data
l = loess(y ~ x, data=d, span=0.4)
xp = with(d, seq(min(x), max(x), length=200))
yp = predict(l, xp)
# plot hexbin
bin <- hexbin(d$x, d$y)
p <- plot(bin)
# add loess line
pushHexport(p$plot.vp)
grid.lines(xp, yp, gp=gpar(col="red"), default.units = "native")
upViewport()

Related

How do I add the curve from GauPro in ggplot?

GauPro is an R library for fitting gaussian processes. You can also get it to produce a nuce predicted curve for you.
The documentation for GauPro uses builtin r plotting functions to do plots like this:
gp <- GauPro(x,y) ## fit a gaussian process model to x & y
plot(x,y) ## plots the x,y points
curve(gp$predict(x), add=T, col=2) ## adds the predicted curve from the gaussian process
What would be the equivalent using ggplot? I can get the points to show up, but I can't quite figure out how to add the curve.
GauPro documentation I refer to is here
We can do this by building a little data frame of predictions. Let's start by loading the necessary packages and creating some sample data:
library(GauPro)
library(ggplot2)
set.seed(69)
x <- 1:10
y <- cumsum(runif(10))
Now we can create our model and plot it using the same plotting functions shown in the vignette you linked:
gp <- GauPro(x, y)
plot(x, y)
curve(gp$predict(x), add = TRUE, col = 2)
Now if we want to customize this plot using ggplot, we need a data frame with columns for the x values at which we wish to predict, the y prediction at that point, and a column each for upper and lower 95% confidence intervals. We can obtain the x values like this:
new_x <- seq(min(x), max(x), length.out = 100)
and we can get the three sets of corresponding y values using predict like this:
predict_df <- predict(gp, new_x, se.fit = TRUE)
predict_df$x <- new_x
predict_df$y <- predict_df$mean
predict_df$lower <- predict_df$y - 1.96 * predict_df$se
predict_df$upper <- predict_df$y + 1.96 * predict_df$se
this is now quite straightforward to plot in ggplot with themes customized as you choose:
ggplot(data.frame(x, y), aes(x, y)) +
geom_point() +
geom_line(data = predict_df, color = "deepskyblue4", linetype = 2) +
geom_ribbon(data = predict_df, aes(ymin = lower, ymax = upper),
alpha = 0.2, fill = "deepskyblue4") +
theme_minimal()
Created on 2020-07-29 by the reprex package (v0.3.0)

How to plot loess surface with ggplot

i have this code and i create a loess surface of my dataframe.
library(gstat)
library(sp)
x<-c(0,55,105,165,270,65,130,155,155,225,250,295,
30,100,110,135,160,190,230,300,30,70,105,170,
210,245,300,0,85,175,300,15,60,90,90,140,210,
260,270,295,5,55,55,90,100,140,190,255,285,270)
y<-c(305,310,305,310,310,260,255,265,285,280,250,
260,210,240,225,225,225,230,210,215,160,190,
190,175,160,160,170,120,135,115,110,85,90,90,
55,55,90,85,50,50,25,30,5,35,15,0,40,20,5,150)
z<-c(870,793,755,690,800,800,730,728,710,780,804,
855,813,762,765,740,765,760,790,820,855,812,
773,812,827,805,840,890,820,873,875,873,865,
841,862,908,855,850,882,910,940,915,890,880,
870,880,960,890,860,830)
dati<-data.frame(x,y,z)
x.range <- as.numeric(c(min(x), max(x)))
y.range <- as.numeric(c(min(y), max(y)))
meuse.loess <- loess(z ~ x * y, dati, degree=2, span = 0.25,
normalize=F)
meuse.mar <- list(x = seq(from = x.range[1], to = x.range[2], by = 1), y = seq(from = y.range[1],
to = y.range[2], by = 1))
meuse.lo <- predict(meuse.loess, newdata=expand.grid(meuse.mar), se=TRUE)
Now I want to plot meuse.lo[[1]] with ggplot2 function... but i don't know how to convert meuse.lo[[1]] in a dataframe with x,y (grid's coordinates) and z (interpolated value) columns. Thanks.
Your problem here is that loess() returns a matrix if you use grid.expand() to generate the new data for loess().
This is mentioned in the help for ?loess.predict:
If newdata was the result of a call to expand.grid, the predictions (and s.e.'s if requested) will be an array of the appropriate dimensions.
Now, you can still use grid.expand() to compute the new data, but force this function to return a data frame and dropping the attributes.
From ?grid.expand:
KEEP.OUT.ATTRS: a logical indicating the "out.attrs" attribute (see below) should be computed and returned.
So, try this:
nd <- expand.grid(meuse.mar, KEEP.OUT.ATTRS = FALSE)
meuse.lo <- predict(meuse.loess, newdata=nd, se=TRUE)
# Add the fitted data to the `nd` object
nd$z <- meuse.lo$fit
library(ggplot2)
ggplot(nd, aes(x, y, col = z)) +
geom_tile() +
coord_fixed()
The result:
ggplot2 is probably not the best choice for 3d graphs. However here is an easy solution with rgl
library(rgl)
plot3d(x, y, z, type="s", size=0.75, lit=FALSE,col="red")
surface3d(meuse.mar[[1]], meuse.mar[[2]], meuse.lo[[1]],
alpha=0.4, front="lines", back="lines")

plot/ggplot2 - Fill area with too many points

Final implementation - not finished but heading the right way
Idea/Problem: You have a plot with many overlapping points and want to replace them by a plain area, therefore increasing performance viewing the plot.
Possible implementation: Calculate a distance matrix between all points and connect all points below a specified distance.
Todo/Not finished: This currently works for manually set distances depending on size of the printed plot. I stopped here because the outcome didnt meet my aesthetic sense.
Minimal example with intermediate plots
set.seed(074079089)
n.points <- 3000
mat <- matrix(rnorm(n.points*2, 0,0.2), nrow=n.points, ncol=2)
colnames(mat) <- c("x", "y")
d.mat <- dist(mat)
fit.mat <-hclust(d.mat, method = "single")
lims <- c(-1,1)
real.lims <- lims*1.1 ## ggplot invokes them approximately
# An attempt to estimate the point-sizes, works for default pdfs pdf("test.pdf")
cutsize <- sum(abs(real.lims))/100
groups <- cutree(fit.mat, h=cutsize) # cut tree at height cutsize
# plot(fit.mat) # display dendogram
# draw dendogram with red borders around the 5 clusters
# rect.hclust(fit.mat, h=cutsize, border="red")
library(ggplot2)
df <- data.frame(mat)
df$groups <- groups
plot00 <- ggplot(data=df, aes(x,y, col=factor(groups))) +
geom_point() + guides(col=FALSE) + xlim(lims) + ylim(lims)+
ggtitle("Each color is a group")
pdf("plot00.pdf")
print(plot00)
dev.off()
# If less than 4 points are connected, show them seperately
t.groups <- table(groups) # how often which group
drop.group <- as.numeric(names(t.groups[t.groups<4])) # groups with less than 4 points are taken together
groups[groups %in% drop.group] <- 0 # in group 0
df$groups <- groups
plot01 <- ggplot(data=df, aes(x,y, col=factor(groups))) +
geom_point() + xlim(lims)+ ylim(lims) +
scale_color_hue(l=10)
pdf("plot01.pdf")
print(plot01)
dev.off()
find_hull <- function(df_0)
{
return(df_0[chull(df_0$x, df_0$y), ])
}
library(plyr)
single.points.df <- df[df$groups == 0 , ]
connected.points.df <- df[df$groups != 0 , ]
hulls <- ddply(connected.points.df, "groups", find_hull) # for all groups find a hull
plot02 <- ggplot() +
geom_point(data=single.points.df, aes(x,y, col=factor(groups))) +
xlim(lims)+ ylim(lims) +
scale_color_hue(l=10)
pdf("plot02.pdf")
print(plot02)
dev.off()
plot03 <- plot02
for(grp in names(table(hulls$groups)))
{
plot03 <- plot03 + geom_polygon(data=hulls[hulls$groups==grp, ],
aes(x,y), alpha=0.4)
}
# print(plot03)
plot01 <- plot01 + theme(legend.position="none")
plot03 <- plot03 + theme(legend.position="none")
# multiplot(plot01, plot03, cols=2)
pdf("plot03.pdf")
print(plot03)
dev.off()
Initial Question
I have a (maybe odd) question.
In some plots, I have thousands of points in my analysis. To display them, the pc takes quite a bit of time because there are so many points.
After now, many of these points can overlap, I have a filled area (which is fine!).
To save time/effort displaying, it would be usefull to just fill this area but plotting each point on its own.
I know there are possibilities in heatmaps and so on, but this is not the idea I have in mind. My idea is something like:
#plot00: ggplot with many many points and a filled area of points
plot00 <- plot00 + fill.crowded.areas()
# with plot(), I sadly have an idea how to manage it
Any ideas? Or is this nothing anyone would do anytime?
# Example code
# install.packages("ggplot2")
library(ggplot2)
n.points <- 10000
mat <- matrix(rexp(n.points*2), nrow=n.points, ncol=2)
colnames(mat) <- c("x", "y")
df <- data.frame(mat)
plot00 <- ggplot(df, aes(x=x, y=y)) +
theme_bw() + # white background, grey strips
geom_point(shape=19)# Aussehen der Punkte
print(plot00)
# NO ggplot2
plot(df, pch=19)
Edit:
To have density-plots like mentioned by fdetsch (how can I mark the name?) there are some questions concerning this topic. But this is not the thing I want exactly. I know my concern is a bit strange, but the densities make a plot more busy sometimes as necessary.
Links to topics with densities:
Scatterplot with too many points
High Density Scatter Plots
How about using panel.smoothScatter from lattice? It displays a certain number of points in low-density regions (see argument 'nrpoints') and everywhere else, point densities are displayed rather than single (and possibly overlapping) points, thus providing more meaningful insights into your data. See also ?panel.smoothScatter for further information.
## load 'lattice'
library(lattice)
## display point densities
xyplot(y ~ x, data = df, panel = function(x, y, ...) {
panel.smoothScatter(x, y, nbin = 250, ...)
})
You could use a robust estimator to estimate the location of the majority of your points and plot the convex hull of the points as follows:
set.seed(1337)
n.points <- 500
mat <- matrix(rexp(n.points*2), nrow=n.points, ncol=2)
colnames(mat) <- c("x", "y")
df <- data.frame(mat)
require(robustbase)
my_poly <- function(data, a, ...){
cov_rob = covMcd(data, alpha = a)
df_rob = data[cov_rob$best,]
ch = chull(df_rob$x, df_rob$y)
geom_polygon(data = df_rob[ch,], aes(x,y), ...)
}
require(ggplot2)
ggplot() +
geom_point(data=df, aes(x,y)) +
my_poly(df, a = 0.5, fill=2, alpha=0.5) +
my_poly(df, a = 0.7, fill=3, alpha=0.5)
This leads to:
by controlling the alpha-value of covMcd you can increase/decrease the size of the area. See ?robustbase::covMcd for details.
Btw.: Mcd stands for Minimum Covariance Determinant. Instead of it you can also use MASS::cov.mve to calculate the minimum valume ellipsoid with MASS::cov.mve(..., quantile.used=-percent of points within the ellipsoid.
For 2+ classes:
my_poly2 <- function(data, a){
cov_rob = covMcd(data, alpha = a)
df_rob = data[cov_rob$best,]
ch = chull(df_rob[,1], df_rob[,2])
df_rob[ch,]
}
ggplot(faithful, aes(waiting, eruptions, color = eruptions > 3)) +
geom_point() +
geom_polygon(data = my_poly2(faithful[faithful$eruptions > 3,], a=0.5), aes(waiting, eruptions), fill = 2, alpha = 0.5) +
geom_polygon(data = my_poly2(faithful[faithful$eruptions < 3,], a=0.5), aes(waiting, eruptions), fill = 3, alpha = 0.5)
Or if you are ok with un-robust ellipsoids have a look at stat_ellipse
Do you mean something like the convex hull of your points:
set.seed(1337)
n.points <- 100
mat <- matrix(rexp(n.points*2), nrow=n.points, ncol=2)
colnames(mat) <- c("x", "y")
df <- data.frame(mat)
ch <- chull(df$x, df$y) # This computes the convex hull
require(ggplot2)
ggplot() +
geom_point(data=df, aes(x,y)) +
geom_polygon(data = df[ch,], aes(x,y), alpha=0.5)

How to plot a contour line showing where 95% of values fall within, in R and in ggplot2

Say we have:
x <- rnorm(1000)
y <- rnorm(1000)
How do I use ggplot2 to produce a plot containing the two following geoms:
The bivariate expectation of the two series of values
A contour line showing where 95% of the estimates fall within?
I know how to do the first part:
df <- data.frame(x=x, y=y)
p <- ggplot(df, aes(x=x, y=y))
p <- p + xlim(-10, 10) + ylim(-10, 10) # say
p <- p + geom_point(x=mean(x), y=mean(y))
And I also know about the stat_contour() and stat_density2d() functions within ggplot2.
And I also know that there are 'bins' options within stat_contour.
However, I guess what I need is something like the probs argument within quantile, but over two dimensions rather than one.
I have also seen a solution within the graphics package. However, I would like to do this within ggplot.
Help much appreciated,
Jon
Unfortunately, the accepted answer currently fails with Error: Unknown parameters: breaks on ggplot2 2.1.0. I cobbled together an alternative approach based on the code in this answer, which uses the ks package for computing the kernel density estimate:
library(ggplot2)
set.seed(1001)
d <- data.frame(x=rnorm(1000),y=rnorm(1000))
kd <- ks::kde(d, compute.cont=TRUE)
contour_95 <- with(kd, contourLines(x=eval.points[[1]], y=eval.points[[2]],
z=estimate, levels=cont["5%"])[[1]])
contour_95 <- data.frame(contour_95)
ggplot(data=d, aes(x, y)) +
geom_point() +
geom_path(aes(x, y), data=contour_95) +
theme_bw()
Here's the result:
TIP: The ks package depends on the rgl package, which can be a pain to compile manually. Even if you're on Linux, it's much easier to get a precompiled version, e.g. sudo apt install r-cran-rgl on Ubuntu if you have the appropriate CRAN repositories set up.
Riffing off of Ben Bolker's answer, a solution that can handle multiple levels and works with ggplot 2.2.1:
library(ggplot2)
library(MASS)
library(reshape2)
# create data:
set.seed(8675309)
Sigma <- matrix(c(0.1,0.3,0.3,4),2,2)
mv <- data.frame(mvrnorm(4000,c(1.5,16),Sigma))
# get the kde2d information:
mv.kde <- kde2d(mv[,1], mv[,2], n = 400)
dx <- diff(mv.kde$x[1:2]) # lifted from emdbook::HPDregionplot()
dy <- diff(mv.kde$y[1:2])
sz <- sort(mv.kde$z)
c1 <- cumsum(sz) * dx * dy
# specify desired contour levels:
prob <- c(0.95,0.90,0.5)
# plot:
dimnames(mv.kde$z) <- list(mv.kde$x,mv.kde$y)
dc <- melt(mv.kde$z)
dc$prob <- approx(sz,1-c1,dc$value)$y
p <- ggplot(dc,aes(x=Var1,y=Var2))+
geom_contour(aes(z=prob,color=..level..),breaks=prob)+
geom_point(aes(x=X1,y=X2),data=mv,alpha=0.1,size=1)
print(p)
The result:
This works, but is quite inefficient because you actually have to compute the kernel density estimate three times.
set.seed(1001)
d <- data.frame(x=rnorm(1000),y=rnorm(1000))
getLevel <- function(x,y,prob=0.95) {
kk <- MASS::kde2d(x,y)
dx <- diff(kk$x[1:2])
dy <- diff(kk$y[1:2])
sz <- sort(kk$z)
c1 <- cumsum(sz) * dx * dy
approx(c1, sz, xout = 1 - prob)$y
}
L95 <- getLevel(d$x,d$y)
library(ggplot2); theme_set(theme_bw())
ggplot(d,aes(x,y)) +
stat_density2d(geom="tile", aes(fill = ..density..),
contour = FALSE)+
stat_density2d(colour="red",breaks=L95)
(with help from http://comments.gmane.org/gmane.comp.lang.r.ggplot2/303)
update: with a recent version of ggplot2 (2.1.0) it doesn't seem possible to pass breaks to stat_density2d (or at least I don't know how), but the method below with geom_contour still seems to work ...
You can make things a little more efficient by computing the kernel density estimate once and plotting the tiles and contours from the same grid:
kk <- with(dd,MASS::kde2d(x,y))
library(reshape2)
dimnames(kk$z) <- list(kk$x,kk$y)
dc <- melt(kk$z)
ggplot(dc,aes(x=Var1,y=Var2))+
geom_tile(aes(fill=value))+
geom_contour(aes(z=value),breaks=L95,colour="red")
doing the 95% level computation from the kk grid (to reduce the number of kernel computations to 1) is left as an exercise
I'm not sure why stat_density2d(geom="tile") and geom_tile give slightly different results (the former is smoothed)
I haven't added the bivariate mean, but something like annotate("point",x=mean(d$x),y=mean(d$y),colour="red") should work.
I had an example where the MASS::kde2d() bandwidth specifications were not flexible enough, so I ended up using the ks package and the ks::kde() function and, as an example, the ks::Hscv() function to estimate flexible bandwidths that captured the smoothness better. This computation can be a bit slow, but it has much better performance in some situations. Here is a version of the above code for that example:
set.seed(1001)
d <- data.frame(x=rnorm(1000),y=rnorm(1000))
getLevel <- function(x,y,prob=0.95) {
kk <- MASS::kde2d(x,y)
dx <- diff(kk$x[1:2])
dy <- diff(kk$y[1:2])
sz <- sort(kk$z)
c1 <- cumsum(sz) * dx * dy
approx(c1, sz, xout = 1 - prob)$y
}
L95 <- getLevel(d$x,d$y)
library(ggplot2); theme_set(theme_bw())
ggplot(d,aes(x,y)) +
stat_density2d(geom="tile", aes(fill = ..density..),
contour = FALSE)+
stat_density2d(colour="red",breaks=L95)
## using ks::kde
hscv1 <- Hscv(d)
fhat <- ks::kde(d, H=hscv1, compute.cont=TRUE)
dimnames(fhat[['estimate']]) <- list(fhat[["eval.points"]][[1]],
fhat[["eval.points"]][[2]])
library(reshape2)
aa <- melt(fhat[['estimate']])
ggplot(aa, aes(x=Var1, y=Var2)) +
geom_tile(aes(fill=value)) +
geom_contour(aes(z=value), breaks=fhat[["cont"]]["50%"], color="red") +
geom_contour(aes(z=value), breaks=fhat[["cont"]]["5%"], color="purple")
For this particular example, the differences are minimal, but in an example where the bandwidth specification requires more flexibility, this modification may be important. Note that the 95% contour is specified using the breaks=fhat[["cont"]]["5%"], which I found a little bit counter-intuitive, because it is called here the "5% contour".
Just mixing answers from above, putting them in a more tidyverse friendly way, and allowing for multiple contour levels. I use here geom_path(group=probs), adding them manually geom_text. Another approach is to use geom_path(colour=probs) which will automatically label the contours as legend.
library(ks)
library(tidyverse)
set.seed(1001)
## data
d <- MASS::mvrnorm(1000, c(0, 0.2), matrix(c(1, 0.4, 1, 0.4), ncol=2)) %>%
magrittr::set_colnames(c("x", "y")) %>%
as_tibble()
## density function
kd <- ks::kde(d, compute.cont=TRUE, h=0.2)
## extract results
get_contour <- function(kd_out=kd, prob="5%") {
contour_95 <- with(kd_out, contourLines(x=eval.points[[1]], y=eval.points[[2]],
z=estimate, levels=cont[prob])[[1]])
as_tibble(contour_95) %>%
mutate(prob = prob)
}
dat_out <- map_dfr(c("10%", "20%","80%", "90%"), ~get_contour(kd, .)) %>%
group_by(prob) %>%
mutate(n_val = 1:n()) %>%
ungroup()
## clean kde output
kd_df <- expand_grid(x=kd$eval.points[[1]], y=kd$eval.points[[2]]) %>%
mutate(z = c(kd$estimate %>% t))
ggplot(data=kd_df, aes(x, y)) +
geom_tile(aes(fill=z)) +
geom_point(data = d, alpha = I(0.4), size = I(0.4), colour = I("yellow")) +
geom_path(aes(x, y, group = prob),
data=filter(dat_out, !n_val %in% 1:3), colour = I("white")) +
geom_text(aes(label = prob), data =
filter(dat_out, (prob%in% c("10%", "20%","80%") & n_val==1) | (prob%in% c("90%") & n_val==20)),
colour = I("black"), size =I(3))+
scale_fill_viridis_c()+
theme_bw() +
theme(legend.position = "none")
Created on 2019-06-25 by the reprex package (v0.3.0)

using multiple size scales in a ggplot

I'm trying to construct a plot which shows transitions from one class to another. I want to have circles representing each class sized according to a class attribute, and arrows from one class to another, sized according to the number of transitions from one class to another.
As an example:
library(ggplot2)
points <- data.frame( x=runif(10), y=runif(10),class=1:10, size=runif(10,min=1000,max=100000) )
trans <- data.frame( from=rep(1:10,times=10), to=rep(1:10,each=10), amount=runif(100)^3 )
trans <- merge( trans, points, by.x="from", by.y="class" )
trans <- merge( trans, points, by.x="to", by.y="class", suffixes=c(".to",".from") )
ggplot( points, aes( x=x, y=y ) ) + geom_point(aes(size=size),color="red") +
scale_size_continuous(range=c(4,20)) +
geom_segment( data=trans, aes( x=x.from, y=y.from, xend=x.to, yend=y.to, size=amount ),lineend="round",arrow=arrow(),alpha=0.5)
I'd like to be able to scale the arrows on a different scale to the circles. Ideally, I'd like a legend with both scales on, but I understand this may not be possible (using two scale colour gradients on one ggplot)
Is there a more elegant way to do this than applying arbitrary scaling to the underlying data?
A nice option is to generate the circumference of your classes as a series of points, adjusting the scale (diameter) according to your data. Then you draw the circles either as paths or polygons.
Follows some example code. The circleFun was shared by #joran in a previous post. Does this work? I think you should tweak the circle scales acording to your real data.
Important note:
Also, from your use of arrow without attaching grid, I assume you have not updated ggplot2. I changed that code to work with my setup, and tried not to include any ggplot2 code that might cause backward compatibility issues.
# Load packages
library(package=ggplot2) # You should update ggplot2
library(package=plyr) # To proccess each class separately
# Your data generating code
points <- data.frame(x=runif(10), y=runif(10),class=1:10,
size=runif(10,min=1000,max=100000) )
trans <- data.frame(from=rep(1:10,times=10), to=rep(1:10,each=10),
amount=runif(100)^3 )
trans <- merge(trans, points, by.x="from", by.y="class" )
trans <- merge(trans, points, by.x="to", by.y="class", suffixes=c(".to",".from") )
# Generate a set of points in a circumference
# Originally posted by #joran in
# https://stackoverflow.com/questions/6862742/draw-a-circle-with-ggplot2
circleFun <- function(center = c(0,0), diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
# Get max and min sizes and min distances to estimate circle scales
min_size <- min(points$size, na.rm=TRUE)
max_size <- max(points$size, na.rm=TRUE)
xs <- apply(X=combn(x=points$x, m=2), MARGIN=2, diff, na.rm=TRUE)
ys <- apply(X=combn(x=points$y, m=2), MARGIN=2, diff, na.rm=TRUE)
min_dist <- min(abs(c(xs, ys))) # Seems too small
mean_dist <- mean(abs(c(xs, ys)))
# Adjust sizes
points$fit_size <- points$size * (mean_dist/max_size)
# Generate the circles based on the points
circles <- ddply(.data=points, .variables='class',
.fun=function(class){
with(class,
circleFun(center = c(x, y), diameter=fit_size))
})
circles <- merge(circles, points[, c('class', 'size', 'fit_size')])
# Plot
ggplot(data=circles, aes(x=x, y=y)) +
geom_polygon(aes(group=factor(class), fill=size)) +
geom_segment(data=trans,
aes(x=x.from, y=y.from, xend=x.to, yend=y.to, size=amount),
alpha=0.6, lineend="round", arrow=grid::arrow()) +
coord_equal()

Resources