Automated way to prevent ggplot hexbin from cutting geoms off axes - r

This is a slightly different question from an earlier post(ggplot hexbin shows different number of hexagons in plot versus data frame).
I am using hexbin() to bin data into hexagon objects, and ggplot() to plot the results. I notice that, sometimes, the hexagons on the edge of the plot are cut in half. Below is an example.
library(hexbin)
library(ggplot2)
set.seed(1)
data <- data.frame(A=rnorm(100), B=rnorm(100), C=rnorm(100), D=rnorm(100), E=rnorm(100))
maxVal = max(abs(data))
maxRange = c(-1*maxVal, maxVal)
x = data[,c("A")]
y = data[,c("E")]
h <- hexbin(x=x, y=y, xbins=5, shape=1, IDs=TRUE, xbnds=maxRange, ybnds=maxRange)
hexdf <- data.frame (hcell2xy (h), hexID = h#cell, counts = h#count)
ggplot(hexdf, aes(x = x, y = y, fill = counts, hexID = hexID)) +
geom_hex(stat = "identity") +
coord_cartesian(xlim = c(maxRange[1], maxRange[2]), ylim = c(maxRange[1], maxRange[2]))
This creates a graphic where one hexagon is cut off at the top and one hexagon is cut off at the bottom:
Another approach I can try is to hard-code a value (here 1.5) to be added to the limits of the x and y axis. Doing so does seem to solve the problem in that no hexagons are cut off anymore.
ggplot(hexdf, aes(x = x, y = y, fill = counts, hexID = hexID)) +
geom_hex(stat = "identity") +
scale_x_continuous(limits = maxRange * 1.5) +
scale_y_continuous(limits = maxRange * 1.5)
However, even though the second approach solves the problem in this instance, the value of 1.5 is arbitrary. I am trying to automate this process for a variety of data and variety of bin sizes and hexagon sizes that could be used. Is there a solution to keeping all hexagons fully visible in the plot without having to hard-code an arbitrary value that may be too large or too small for certain instances?

Consider that you can skip the computation of hexbin, and let ggplot do the job.
Then, if you prefer to manually set the width of the bins you can set the binwidth and modify the limits:
bwd = 1
ggplot(data, aes(x = x, y = y)) +
geom_hex(binwidth = bwd) +
coord_cartesian(xlim = c(min(x) - bwd, max(x) + bwd),
ylim = c(min(y) - bwd, max(y) + bwd),
expand = T) +
geom_point(color = "red") +
theme_bw()
this way, hexagons should never be truncated (though you may end up with some "empty" space.
Result with bwd = 1:
Result with bwd = 3:
If instead you prefer to programmatically set the number of the bins, you can use:
nbins_x <- 4
nbins_y <- 6
range_x <- range(data$A, na.rm = T)
range_y <- range(data$E, na.rm = T)
bwd_x <- (range_x[2] - range_x[1])/nbins_x
bwd_y <- (range_y[2] - range_y[1])/nbins_y
ggplot(data, aes(x = A, y = E)) +
geom_hex(bins = c(nbins_x,nbins_y)) +
coord_cartesian(xlim = c(range_x[1] - bwd_x, range_x[2] + bwd_x),
ylim = c(range_y[1] - bwd_y, range_y[2] + bwd_y),
expand = T) +
geom_point(color = "red")+
theme_bw()

Related

Is there an equivalent to points() on ggplot2

I'm working with stock prices and trying to plot the price difference.
I created one using autoplot.zoo(), my question is, how can I manage to change the point shapes to triangles when they are above the upper threshold and to circles when they are below the lower threshold. I understand that when using the basic plot() function you can do these by calling the points() function, wondering how I can do this but with ggplot2.
Here is the code for the plot:
p<-autoplot.zoo(data, geom = "line")+
geom_hline(yintercept = threshold, color="red")+
geom_hline(yintercept = -threshold, color="red")+
ggtitle("AAPL vs. SPY out of sample")
p+geom_point()
We can't fully replicate without your data, but here's an attempt with some sample generated data that should be similar enough that you can adapt for your purposes.
# Sample data
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
# Upper and lower threshold
thresh <- 4
You can create an additional variable that determines the shape, based on the relationship in the data itself, and pass that as an argument into ggplot.
# Create conditional data
data$outlier[data$spread > thresh] <- "Above"
data$outlier[data$spread < -thresh] <- "Below"
data$outlier[is.na(data$outlier)] <- "In Range"
library(ggplot2)
ggplot(data, aes(x = date, y = spread, shape = outlier, group = 1)) +
geom_line() +
geom_point() +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16,15))
# If you want points just above and below# Sample data
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
thresh <- 4
data$outlier[data$spread > thresh] <- "Above"
data$outlier[data$spread < -thresh] <- "Below"
ggplot(data, aes(x = date, y = spread, shape = outlier, group = 1)) +
geom_line() +
geom_point() +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16))
Alternatively, you can just add the points above and below the threshold as individual layers with manually specified shapes, like this. The pch argument points to shape type.
# Another way of doing this
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
# Upper and lower threshold
thresh <- 4
ggplot(data, aes(x = date, y = spread, group = 1)) +
geom_line() +
geom_point(data = data[data$spread>thresh,], pch = 17) +
geom_point(data = data[data$spread< (-thresh),], pch = 16) +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16))

Extending ggplot geom_polygon to end of plot

How can I make a confidence interval band that extends to the end of the plot in ggplot?
I can do it if the plotted band is entirely within the plot, for example
limits <- c(1e2, 1e7)
confPolygon <- tibble(
x = c(limits[1], limits[1]*10, limits[2], limits[2], limits[2]/10, limits[1], limits[1]),
y = c(limits[1], limits[1], limits[2]/10, limits[2], limits[2], limits[1]*10, limits[1])
)
plot <- ggplot() +
geom_polygon(data = confPolygon, aes(x = x, y = y), fill = "grey", alpha = .25) +
scale_x_log10(limits = limits) +
scale_y_log10(limits = limits)
works. However, if I try any shape that extends the polygon to the edges
confPolygon <- tibble(
x = c(limits[1], limits[2]*10, limits[2]*10, limits[1], limits[1]),
y = c(limits[1], limits[1], limits[2]*10, limits[2]*10, limits[1])
)
then it doesn't plot the polygon.
The reason is because the method you are using to zoom in to the plot (setting limits within the x or y scales) isn't meant to zoom in; it actually subsets the data, accidentally creating missing values on the way. Use coord_cartesian(xlim = c(0,5), ylim = c(0,5)), or in your case, coord_cartesian(xlim = limits, ylim = limits) instead, as this step does not subset the data.
One way to do this is with oob=scales::squish().
plot2 <- ggplot() +
geom_polygon(data = confPolygon, aes(x = x, y = y), fill = "grey", alpha = .25) +
scale_x_log10(limits = limits, oob=scales::squish) +
scale_y_log10(limits = limits, oob=scales::squish)
If you really want the polygon to extend all the way to the edge, you should also add expand=c(0,0) to each of the scale_*_log10() argument lists.

ggplot2 plots more points than asked

I am trying to fill a square region with non-overlapping squares with different colors and ggplot2 is plotting more points than those in the dataframe at the higher x and y limits. Here is the code
l = 1000
a=seq(0,1, 1/(l-1))
x=rep(a, each=length(a))
y=rep(a, length(a))
k = length(x)
c=sample(1:10, k, replace = TRUE)
data <- data.frame(x, y, c)
ggplot(data, aes(x=x, y=y)) + geom_point(shape=15, color=c)
ggsave('k.jpg', width=10, height=10)
The result I am getting with RStudio is this. Notice the extra points on the right and top of the image.
How can I get ggplot to plot exactly one square exclusively for those points in the dataframe and not more?
As a second related question, this is what happens if l is changed from 1000 to l=100
My problem is now that the squares are not perfectly stacked, leaving empty space between them. I would like to know how can I compute from the number of points in each dimension of the array (l), the correct value for size inside geom_point so that the squares are perfectly stacked.
Many thanks
You might be better off with geom_tile, rather than geom_point, as this will allow more control over the size of the rectangles and the border width. See ?geom_tile for details.
Providing a couple of alternatives using OP's example, reducing the data frame dimension to increase the size of the tile:
Data
library(ggplot2)
l = 100
a = seq(0, 1, 1 / (l - 1))
x = rep(a, each = length(a))
y = rep(a, length(a))
k = length(x)
c = sample(1:10, k, replace = TRUE)
data <- data.frame(x, y, c)
Example 1
Very simple, just pasing "white" as colour to make the tiles more distinctive.
ggplot(data, aes(x = x, y = y, fill = c)) + geom_tile(colour = "white")
Example 2
Creating manually a palette, and coord_equal to force a specified ratio (default 1) so tiles are squares:
colors<-c("peachpuff", "yellow", "orange", "orangered", "red",
"darkred","firebrick", "royalblue", "darkslategrey", "black")
ggplot(data, aes(x = x, y = y)) +
geom_tile(aes(fill = factor(c)), colour = "white") +
scale_fill_manual(values = colors, name = "Colours") +
coord_equal()
Comparing geom_point and geom_tile
Creating small data frame (10 x 10, l = 10) to observe closer what happens when using geom_point instead of geom_tile.
Original OP code
ggplot(data, aes(x = x, y = y)) + geom_point(shape = 15, color = c)
Example 1
ggplot(data, ae(x = x, y = y, fill = c)) + geom_tile(colour = "white")
Example 2
colors<-c("peachpuff", "yellow", "orange", "orangered", "red",
"darkred","firebrick", "royalblue", "darkslategrey", "black")
ggplot(data, aes(x = x, y = y)) +
geom_tile(aes(fill = factor(c)), colour = "white") +
scale_fill_manual(values = colors, name = "Colours") +
coord_equal()

Know proportions of ggplot2 plot

I usually save the plots from ggplot2 using the the png device. The width and the height of the output are set by the arguments of the function. Blank zones are drawn when the "natural proportions" of the graph dont't suit the proportions of the device. In order to avoid this and use the whole defined canvas, the proportions of the plot must be known. ¿Is there a way to find out this value without trial and error?
This code can be used as an example:
x <- seq(from = 0, to = 1, by = 0.1)
y <- seq(from = 1, to = 2, by = 0.1)
df <- expand.grid(x = x, y = y)
df <- cbind(df, z = rnorm(ncol(df), 0, 1))
p <- ggplot(df, aes(x,y, fill = z)) + geom_raster() + coord_fixed()
ppi <- 300
#Value 0.4 is used to change inches into milimeters
png("plot.png", width = 16*0.4*ppi, height = 20*0.4*ppi, res = ppi)
print(p)
dev.off()
It can be seen that some blank space is added at the top and at the bottom to fill the png file. This could be easily corrected by using a proportion different from 20/16, which is not optimal.
You can modify the ratio arg inside coord_fixed():
p <- ggplot(df, aes(x,y, fill = z)) +
geom_raster() +
coord_fixed(ratio = 20/16)
Alteratively you can specify the aspect.ratio inside the theme():
p <- ggplot(df, aes(x,y, fill = z)) +
geom_raster() +
theme(aspect.ratio = 20/16)
The result is the same:

Create a colour blind test with ggplot

I would like to create a colour blind test, similar to that below, using ggplot.
The basic idea is to use geom_hex (or perhaps a voronoi diagram, or possibly even circles as in the figure above) as the starting point, and define a dataframe that, when plotted in ggplot, produces the image.
We would start by creating a dataset, such as:
df <- data.frame(x = rnorm(10000), y = rnorm(10000))
then plot this:
ggplot(df, aes(x, y)) +
geom_hex() +
coord_equal() +
scale_fill_gradient(low = "red", high = "green", guide = FALSE) +
theme_void()
which gives the image below:
The main missing step is to create a dataset that actually plots a meaningful symbol (letter or number), and I'm not sure how best to go about this without painstakingly mapping the coordinates. Ideally one would be able to read in the coordinates perhaps from an image file.
Finally, a bit of tidying up could round the plot edges by removing the outlying points.
All suggestions are very welcome!
EDIT
Getting a little closer to what I'm after, we can use the image below of the letter 'e':
Using the imager package, we can read this in and convert it to a dataframe:
img <- imager::load.image("e.png")
df <- as.data.frame(img)
then plot that dataframe using geom_raster:
ggplot(df, aes(x, y)) +
geom_raster(aes(fill = value)) +
coord_equal() +
scale_y_continuous(trans = scales::reverse_trans()) +
scale_fill_gradient(low = "red", high = "green", guide = FALSE) +
theme_void()
If we use geom_hex instead of geom_raster, we can get the following plot:
ggplot(df %>% filter(value %in% 1), aes(x, y)) +
geom_hex() +
coord_equal() +
scale_y_continuous(trans = scales::reverse_trans()) +
scale_fill_gradient(low = "red", high = "green", guide = FALSE) +
theme_void()
so, getting there but clearly still a long way off...
Here's an approach for creating this plot:
Packages you need:
library(tidyverse)
library(packcircles)
Get image into a 2D matrix (x and y coordinates) of values. To do this, I downloaded the .png file of the e as "e.png" and saved in my working directory. Then some processing:
img <- png::readPNG("e.png")
# From http://stackoverflow.com/questions/16496210/rotate-a-matrix-in-r
rotate <- function(x) t(apply(x, 2, rev))
# Convert to one colour layer and rotate it to be in right direction
img <- rotate(img[,,1])
# Check that matrix makes sense:
image(img)
Next, create a whole lot of circles! I did this based on this post.
# Create random "circles"
# *** THESE VALUES WAY NEED ADJUSTING
ncircles <- 1200
offset <- 100
rmax <- 80
x_limits <- c(-offset, ncol(img) + offset)
y_limits <- c(-offset, nrow(img) + offset)
xyr <- data.frame(
x = runif(ncircles, min(x_limits), max(x_limits)),
y = runif(ncircles, min(y_limits), max(y_limits)),
r = rbeta(ncircles, 1, 10) * rmax)
# Find non-overlapping arrangement
res <- circleLayout(xyr, x_limits, y_limits, maxiter = 1000)
cat(res$niter, "iterations performed")
#> 1000 iterations performed
# Convert to data for plotting (just circles for now)
plot_d <- circlePlotData(res$layout)
# Check circle arrangement
ggplot(plot_d) +
geom_polygon(aes(x, y, group=id), colour = "white", fill = "skyblue") +
coord_fixed() +
theme_minimal()
Finally, interpolate the image pixel values for the centre of each circle. This will indicate whether a circle is centered over the shape or not. Add some noise to get variance in colour and plot.
# Get x,y positions of centre of each circle
circle_positions <- plot_d %>%
group_by(id) %>%
summarise(x = min(x) + (diff(range(x)) / 2),
y = min(y) + (diff(range(y)) / 2))
# Interpolate on original image to get z value for each circle
circle_positions <- circle_positions %>%
mutate(
z = fields::interp.surface(
list(x = seq(nrow(img)), y = seq(ncol(img)), z = img),
as.matrix(.[, c("x", "y")])),
z = ifelse(is.na(z), 1, round(z)) # 1 is the "empty" area shown earlier
)
# Add a little noise to the z values
set.seed(070516)
circle_positions <- circle_positions %>%
mutate(z = z + rnorm(n(), sd = .1))
# Bind z value to data for plotting and use as fill
plot_d %>%
left_join(select(circle_positions, id, z)) %>%
ggplot(aes(x, y, group = id, fill = z)) +
geom_polygon(colour = "white", show.legend = FALSE) +
scale_fill_gradient(low = "#008000", high = "#ff4040") +
coord_fixed() +
theme_void()
#> Joining, by = "id"
To get colours right, tweak them in scale_fill_gradient

Resources