Related
I would like to overlay two ggplots from different data sources. I don't think a left_join will work because the dataframes are of two different lengths and would potential change the underlying plots.[Maybe?]
library(tidyverse)
set.seed(123)
player_df <- tibble(name = rep(c("A","B","C","D"), each = 10, times = 1),
pos = rep(c("DEF","DEF","MID","MID"), each = 10, times = 1),
load = c(rnorm(10, mean = 200, sd = 100),
rnorm(10, mean = 300, sd = 50),
rnorm(10, mean = 400, sd = 100),
rnorm(10, mean = 500, sd = 50)))
p1 <- player_df %>%
ggplot(aes(x = load, y = name)) +
geom_point()
pos_df <- tibble(pos = rep(c("DEF","MID"), each = 30, times = 1),
load = (c(rnorm(30, mean = 250, sd = 100),
rnorm(30, mean = 350, sd = 100))))
p2 <- pos_df %>%
ggplot(aes(x = load, y = pos)) +
geom_boxplot()
p1
p2
# add p2 to every p1 player plot by pos
I would like p1 to have the corresponding p2 - by pos - appear behind it. So... add the matching p2 boxplot to each p1 scatterplot.
p1:
p2:
It's not really advisable to attempt to superimpose two plots on each other. A ggplot is made of layers already, so usually it's just a case of superimposing one geom on another. This can be difficult if (as in your case) one of the axes has different labels. However, with a little work it is possible to wrangle your data so that it all sits on a single plot. In your case, you could do something like:
levs <- c("A", "DEF", "B", "C", "MID", "D")
ggplot(within(pos_df, pos <- factor(pos, levs)), aes(x = load, y = pos)) +
geom_boxplot(width = 2.3) +
geom_point(data = within(player_df, pos <- factor(name, levs))) +
scale_y_discrete(limits = c("A", "DEF", "B", " ", "C", "MID", "D"))
Dug into ggplot a bit and re-engineered a boxplot bit by bit.
# manually calculate stats that are used in boxplots
pos_df_summary <- pos_df %>%
group_by(pos, .drop = FALSE) %>%
summarise(min = fivenum(load)[1],
Q1 = fivenum(load)[2],
median = fivenum(load)[3],
Q3 = fivenum(load)[4],
max = fivenum(load)[5]
)
# add the boxplot data to each player
joined_df <- player_df %>%
left_join(., pos_df_summary, by = "pos") %>%
distinct(name, .keep_all = TRUE)
# plot
ggplot(data = NULL, aes(group = name)) +
# create the line from min to max
geom_segment(data = joined_df, aes(y = name, yend = name, x=min, xend=max), color="black") +
#create the box with median line
geom_crossbar(data = joined_df,
aes(y = name, xmin = Q1, xmax = Q3, x = median, fill = "NA"),
color = "black",
fatten = 1) +
scale_fill_manual(values = "white") +
# add the points from the player_df
geom_point(data = player_df,
aes(x = load, y = name, group=name),
color = "red",
show.legend=FALSE) +
theme(legend.position = "none")
There may be some extraneous code in here as I cobbled it from some other resources. Specifically, I'm not sure what the aes(group = name) in the ggplot() call does exactly.
I have measurements of a quantity (value) at specific points (lon and lat), like the example data below:
library(ggplot2)
set.seed(1)
dat <- data.frame(lon = runif(1000, 1, 15),
lat = runif(1000, 40, 60),
value = rnorm(1000))
I want to make a 2D summary (e.g. mean) of the measured values with color in space and on top of that I want to show the counts as labels.
I can plot the labels and to the summary plot
## Left plot
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_summary_hex(bins = 5, fun = "mean", geom = "hex")
## Right plot
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_binhex(aes(label = ..count..), bins = 5, geom = "text")
But when I combine both I loose the summary:
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_summary_hex(bins = 5, fun = "mean", geom = "hex") +
stat_binhex(aes(label = ..count..), bins = 5, geom = "text")
I can achieve the opposite, counts as color and summary as labels:
ggplot(dat, aes(lon, lat, z = value)) +
geom_hex(bins = 5) +
stat_summary_hex(aes(label=..value..), bins = 5,
fun = function(x) round(mean(x), 3),
geom = "text")
While writing the question, which took some hours of testing, I found a solution: adding a fill=NULL, or fill=mean(value) in the text one gives me what I want. Below the code and their resulting plots; the only difference is the label of the legend.
But it feels very hacky, so I would appreciate a better solution.
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_summary_hex(bins = 5, fun = "mean", geom = "hex") +
stat_binhex(aes(label = ..count.., fill = NULL), bins = 5, geom = "text") +
theme_bw()
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_summary_hex(bins = 5, fun = "mean", geom = "hex") +
stat_binhex(aes(label = ..count.., fill = mean(value)), bins = 5, geom = "text") +
theme_bw()
I propose a completely different approach to this problem. However, it needs to be clarified a bit first. You write "I have measurements of a quantity (value) at specific points (lon and lat)" but you do not specify these points exactly. Your data (generated) contains 1000 lon points and the same number of lat points.
Anyway, see for yourself.
library(tidyverse)
set.seed(1)
dat <-
tibble(
lon = runif(1000, 1, 15),
lat = runif(1000, 40, 60),
value = rnorm(1000)
)
dat %>% distinct(lon) %>% nrow() #1000
dat %>% distinct(lat) %>% nrow() #1000
My guess is that for real data you have a much smaller set of values for lon and lat.
Let me break it down to an accuracy of 2.
grid = 2
dat %>% mutate(
lon = round(lon/grid)*grid,
lat = round(lat/grid)*grid,
) %>%
group_by(lon, lat) %>%
summarise(
mean = mean(value),
label = n()
)
As you can see after rounding, the data was grouped according to these two variables and then I calculated the statistics you are interested in (mean and number of observations).
Also note that these statistics are generated at the intersection of lon and lat, so we have a square grid. In your solution, this is not the case at all. You are not getting the number of observations at these points and your grid is not square.
So let's make a graph.
dat %>% ggplot(aes(lon,lat,z=mean)) +
geom_contour_filled(binwidth = 0.25) +
geom_text(aes(label = label)) +
theme_bw()
Nothing stands in the way of increasing your grid a bit, let's say 4.
grid = 4
datg = dat %>% mutate(
lon = round(lon/grid)*grid,
lat = round(lat/grid)*grid,
) %>%
group_by(lon, lat) %>%
summarise(
mean = mean(value),
label = n()
)
datg %>% ggplot(aes(lon,lat,z=mean)) +
geom_contour_filled(binwidth = 0.25) +
geom_text(aes(label = label)) +
theme_bw()
Using such a solution, we can easily supplement the labels in the points of interest to us, e.g. with the average value. This time we will use grid = 1.5.
grid = 1.5
datg = dat %>% mutate(
lon = round(lon/grid)*grid,
lat = round(lat/grid)*grid,
) %>%
group_by(lon, lat) %>%
summarise(
mean = mean(value),
label = n(),
lab2 = paste0("(", round(mean, 2), ")")
)
datg %>% ggplot(aes(lon,lat,z=mean)) +
geom_contour_filled(binwidth = 0.25) +
geom_text(aes(label = label)) +
geom_text(aes(label = lab2), nudge_y = -.5, size = 3) +
theme_bw()
Hope this solution fits your needs much better than the stat_binhex based solution.
The problem here is that both plots share the same legend scale.
As the scales ranges are different : 0-40 vs -1.5 - 0.5, the biggest range makes values of the smallest range appear with (almost) the same color.
This is why displaying count as color works, but the opposite doesn't seem to work.
As an illustration, if you rescale the mean calculation, colors variations are visible:
rescaled_mean <- function(x) mean(x)*40
ggplot(dat) +
aes(x = lon, y = lat, z = value) +
stat_summary_hex(bins = 5, fun = "rescaled_mean", geom = "hex")+
stat_binhex(aes(label = ..count..), bins = 5, geom = "text") +
theme_bw()
To be fair, I find this a very strange behaviour. I like your solution though - I really don't find it very hacky to add fill = NULL. In contrary, I find this very elegant. Here a more hacky approach, basically resulting the same, but with one more line. It's using ggnewscale.
library(ggplot2)
set.seed(1)
dat <- data.frame(lon = runif(1000, 1, 15),
lat = runif(1000, 40, 60),
value = rnorm(1000))
ggplot(dat) +
aes(x = lon, y = lat,z = value) +
stat_summary_hex(bins = 5, fun = "mean", geom = "hex") +
ggnewscale::new_scale_fill() +
stat_binhex(aes(label = ..count..), bins = 5, geom = "text")
Created on 2022-02-17 by the reprex package (v2.0.1)
I'm wanting to use stat_difference() from the ggh4x package to add a shaded area between two lines in my plot. I have melted my example dataset below as I thought this was the correct approach to facet_wrap all the variables in my dataset, but I'm unsure how to use stat_difference() with the categorical variable team. I essentially want the line corresponding to Team A or Team B shaded depending on which one has a higher value, similar to the example here. Any suggestions will be great! Thanks.
library(tidyverse)
library(ggh4x)
library(reshape2)
set.seed(100)
team <- rep(rep(paste("Team", LETTERS[1:2]), each = 20))
week <- rep(c(1:20), times = 2)
var_1 <- rnorm(n = 40, mean = 20, sd = 5)
var_2 <- rnorm(n = 40, mean = 20, sd = 5)
var_3 <- rnorm(n = 40, mean = 250, sd = 50)
var_4 <- rnorm(n = 40, mean = 100, sd = 50)
dat <- data.frame(team, week, var_1, var_2, var_3, var_4)
plot_dat <- melt(dat, id.vars = c("team", "week"))
ggplot(plot_dat, aes(x = week)) +
geom_line(aes(y = value, color = team)) +
facet_wrap(~variable, scales = "free_y")
Following the post you referenced you could achieve your desired result by making separate columns with the values for each team using e.g. pivot_wider, add the lines via two geom_line and then apply stat_difference:
library(tidyverse)
library(ggh4x)
library(reshape2)
plot_dat <- pivot_wider(plot_dat, names_from = team, values_from = value)
ggplot(plot_dat, aes(x = week)) +
geom_line(aes(y = `Team A`, color = "Team A")) +
geom_line(aes(y = `Team B`, color = "Team B")) +
facet_wrap(~variable, scales = "free_y") +
stat_difference(aes(ymin = `Team B`, ymax = `Team A`), alpha = 0.3)
How do I pre-define bins for the histogram? For e.g.
predefine_bin_edges <- seq(0, 10, 1)
Can someone please use this predefine bin edges to update the example.
Edited later -
Also, Is there a way to include the extreme points, that are outside the binrange, in the corner bins for a complete picture of data distribution.
library(tidyverse)
# data
x <- rnorm(n = 1000, mean = 5, sd = 3)
tbl <- tibble(x)
# geom_histogram()
ggplot(data = tbl,
aes(x = x)) +
geom_histogram()
Do you mean like this?
library(tidyverse)
# data
x <- rnorm(n = 1000, mean = 5, sd = 3)
tbl <- tibble(x)
# geom_histogram()
ggplot(data = tbl,
aes(x = x)) +
geom_histogram(breaks = seq(0, 10, 1))
I don't think there is an argument to geom histogram that can include the values outside the range in the corner bins. However, you can squish the data at the aes() level.
ggplot(data = tbl,
aes(x = pmax(pmin(x, 10), 0))) +
geom_histogram(breaks = seq(0, 10, 1))
Or if you're uncomfortable with that, you can set the limits + oob arguments in the scale.
ggplot(data = tbl,
aes(x = x)) +
geom_histogram(breaks = seq(0, 10, 1)) +
scale_x_continuous(limits = c(0, 10),
oob = scales::oob_squish)
I'd like to plot a horizontal facet-wide line with the population median of that facet.
I tried the approach without creating a dummy summary table with the following code:
require(ggplot2)
dt = data.frame(gr = rep(1:2, each = 500),
id = rep(1:5, 2, each = 100),
y = c(rnorm(500, mean = 0, sd = 1), rnorm(500, mean = 1, sd = 2)))
ggplot(dt, aes(x = as.factor(id), y = y)) +
geom_boxplot() +
facet_wrap(~ gr) +
geom_hline(aes(yintercept = median(y), group = gr), colour = 'red')
However, the line is drawn for the median of the entire dataset instead of the median separately for each facet:
In the past, a solution has been suggested to use
geom_line(stat = "hline", yintercept = "median")
but it's been discontinued (produces the error "No stat called StatHline").
Another solution suggested
geom_errorbar(aes(ymax=..y.., ymin=..y.., y = mean))
but it generates
Error in data.frame(y = function (x, ...) :
arguments imply differing number of rows: 0, 1000
Finally, there's a way to plot the median by creating a dummy table with the desired stats but I'd like to avoid it.
You could create an extra column in dt for median per facet.
library(dplyr) # With dplyr for example
dt <- dt %>% group_by(gr) %>%
mutate(med = median(y))
# Rerun ggplot line with yintercept = med
ggplot(dt, aes(x = as.factor(id), y = y)) +
geom_boxplot() +
facet_wrap(~ gr) +
geom_hline(aes(yintercept = med, group = gr), colour = 'red')
If you don't want to add a new column with the computed median, you can add a geom_smooth using a quantile regression :
library(ggplot2)
library(quantreg)
set.seed(1234)
dt <- data.frame(gr = rep(1:2, each = 500),
id = rep(1:5, 2, each = 100),
y = c(rnorm(500, mean = 0, sd = 1),
rnorm(500, mean = 1, sd = 2)))
ggplot(dt, aes(y = y)) +
geom_boxplot(aes(x = as.factor(id))) +
geom_smooth(aes(x = id), method = "rq", formula = y ~ 1, se = FALSE) +
facet_wrap(~ gr)