A stacked range plot using ggplot - r

Suppose I wish to make a range plot with the design below using ggplot with the following dummy data:
with following legend.
set.seed(1)
test.dat <- data.frame(
yval = sample(1:100, 40),
xcat = rep(LETTERS[1:4], 10),
base = sample(c(1, 0),40, replace=T),
col = rep(c("red", "blue"), 40)
)
> head(test.dat)
yval xcat base col
1 68 A 0 red
2 39 B 0 blue
3 1 C 0 red
4 34 D 1 blue
5 87 A 0 red
6 43 B 0 blue
The gray portion shows the range of the data where base == 1 and the whisker-like line (that resembles errorbar) shows the range of the data where base == 0 using the respective color designed for each xcat.
So using this dummy data, I would expect:
minmax <- function(x){
return(
c(min(x),max(x))
)
}
> minmax(test.dat[test.dat$xcat == "D" & test.dat$base == 1,]$yval)
[1] 24 99
> minmax(test.dat[test.dat$xcat == "D" & test.dat$base == 0,]$yval)
[1] 21 82
> unique(test.dat[test.dat$xcat == "D",]$col)
[1] "blue"
for xcat == "D", a gray bar to range from 24 to 99, and a blue whisker line to range from 21 to 82.
How can I achieve this? It looks like there is no straightforward ggplot function to create a range plot.
My approach idea was to adjust geom_boxplot's hinges and whisper definition for gray part, and use geom_line or geom_linerange to create the whisker-line part, but I am unsure how to do that.
Thank you.

You first create a dataframe where you have min and max for each combination of (xcat, base and col)
data2 <- test.dat %>% group_by(xcat, base, col) %>% summarise(min = min(yval), max=max(yval))
Then you use geom_linerange for the gray "bars" and geom_errorbar for the whisker line:
ggplot()+
geom_linerange(data= data2 %>% filter(base==1), aes(x= xcat, ymin=min, ymax=max), size=12, alpha=0.5)+
geom_errorbar(data= data2 %>% filter(base==0), aes(x= xcat, ymin=min, ymax=max), colour=data2[data2$base==1,]$col, width=.2)
And this is the
Plot

I would suggest doing some reshaping first using dplyr/tidyr, and then geom_tile:
library(tidyverse)
test.dat %>%
group_by(xcat, base, col) %>%
summarize(mid = mean(range(yval)),
range = diff(range(yval)), .groups = "drop") %>%
pivot_wider(names_from = base, values_from = mid:range) %>%
ggplot(aes(x = xcat)) +
geom_tile(aes(y = mid_0, height = range_0), fill = "gray70", color = "black") +
geom_tile(aes(y = mid_1, height = range_1, fill = col), color = "black") +
scale_fill_identity()

Related

gganimate does not stop at the defined value

Update: Sorry I forgot to add the vector:
time=1:100
value = 1:67
fill = rep(max(value), 100-max(value))
I want to make an animated stacked bar, Here is my example:
library(tidyverse)
library(gganimate)
#example data frame
df <- tibble(time = time,
value = c(value, fill),
x = "A") %>%
mutate(fill_color = "gold") %>%
mutate(gold_nr = value) %>%
mutate(blue_nr = rev(gold_nr)) %>%
pivot_longer(c(gold_nr, blue_nr),
names_to = "color_group",
values_to = "value_group")
# the code:
p <- df %>%
ggplot(aes("", value_group, fill=color_group)) +
geom_col(width = 0.3, position = position_fill())+
scale_fill_manual(values = c("gold", "steelblue"))+
theme_minimal()+
# theme(legend.position="none")+
transition_manual(value)+
coord_flip()
animate(p, fps=24, renderer = gifski_renderer(loop = FALSE))
My question is: Why does the bar not stop at 67 and jumps over 75?
I think I have to organize the data in a other way?
You should use time column to group the frames.
tail(df, 4)
# time fill_color color_group value_group
# 197 99 gold gold_nr 67
# 198 99 gold blue_nr 2
# 199 100 gold gold_nr 67
# 200 100 gold blue_nr 1
The plot now shows correctly the proportions in the last time frame.
proportions(df[df$time == 100, ]$value_group)
# [1] 0.98529412 0.01470588
library('magrittr')
p <- df %>%
ggplot2::ggplot(ggplot2::aes("", value_group, fill=color_group)) +
ggplot2::geom_col(width=0.3, position=position_fill()) +
ggplot2::scale_fill_manual(values=c("steelblue", "gold")) +
ggplot2::theme_minimal() +
ggplot2::coord_flip() +
gganimate::transition_manual(time)
a <- gganimate::animate(p, fps=24, renderer=gifski_renderer(loop=TRUE))
gganimate::anim_save('a.gif', a)
Note, that your colors were flipped.

Plot a horizontal bar chart with default values in R

I have a DF like this:
df_test <- data.frame (ID = c(88888, 99999),
Cluster1 = c(0, 1),Cluster2 = c(0, 2),Cluster3 = c(1, 3)
)
ID Cluster1 Cluster2 Cluster3
1 88888 0 0 1
2 99999 1 2 3
Now I want a horizontal bar graph with the clusters on the y axis. All bars should go from 0-3 (min - max), because this is the range of the clusters. As color I want to have three gradations, 0-1 red, 1-2 yellow and 2-3 green. The values from the DF should then be shown as an arrow or line on the overall bar. Is this somehow possible with ggplot2?
You can use the geom_col following example here: https://ggplot2.tidyverse.org/reference/geom_bar.html
library(dplyr)
library(tidyr)
library(ggplot2)
First, make the data tidy:
df <- df_test %>% pivot_longer(cols = 2:4,
names_to = "Cluster",
values_to = "value")
Keep the largest of each cluster for making a bar chart:
df <- df %>% group_by(Cluster) %>%
filter(value == max(value)) %>%
ungroup() %>%
# identify color scheme:
mutate(cols = case_when(value <=1 ~ "red",
value > 1 & value <= 2 ~ "yellow",
value > 2 ~ "green"))
ggplot(df) + geom_col(aes(x = value, y=Cluster, fill = Cluster)) +
scale_colour_manual(
values = df$cols,
aesthetics = c("colour", "fill")
)

Geom_freqpoly with Predefined Count

I can plot geom_freqpoly without problems using the number of observation
ggplot(data=demo) +
geom_freqpoly(mapping=aes(x = value))
But I'd like to use the precalculated obeservation count contained in the data.
I tried using stat = "identity" but it apparently doesn't work.
ggplot(data=demo) +
geom_freqpoly(mapping=aes(x = value, y = cnt), stat = "identity")
This is my sample data
demo <- tribble(
~value, ~cnt,
.25, 20,
.25, 30,
.1, 40
)
TL;DR: You didn't get the graph you want, because the data of pre-calculated counts you passed to ggplot was NOTHING like what was used to produce the freqpoly graph.
Since you didn't include code for the original demo used to generate graph 1, I'll venture a guess:
demo.orig <- data.frame(value = c(0.25, 0.25, 0.1))
p <- ggplot(demo.orig, aes(x = value)) +
geom_freqpoly()
p # show plot to verify its appearance, which matches the graph in the question
layer_data(p) # look at the calculated data used by geom_freqpoly
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
y count x xmin xmax width density ncount ndensity PANEL group colour size linetype alpha
1 0 0 0.09310345 0.09051724 0.09568966 0.005172414 0.00000 0.0 0.0 1 -1 black 0.5 1 NA
2 1 1 0.09827586 0.09568966 0.10086207 0.005172414 64.44444 0.5 0.5 1 -1 black 0.5 1 NA
3 0 0 0.10344828 0.10086207 0.10603448 0.005172414 0.00000 0.0 0.0 1 -1 black 0.5 1 NA
... (omitted to conserve space)
30 0 0 0.24310345 0.24051724 0.24568966 0.005172414 0.00000 0.0 0.0 1 -1 black 0.5 1 NA
31 2 2 0.24827586 0.24568966 0.25086207 0.005172414 128.88889 1.0 1.0 1 -1 black 0.5 1 NA
32 0 0 0.25344828 0.25086207 0.25603448 0.005172414 0.00000 0.0 0.0 1 -1 black 0.5 1 NA
From a small dataframe with only two unique values, stat_bin generated a much larger dataframe with the x-axis split into 30 bins (the default number), and count / y = 0 everywhere except for the two bins containing the original values.
> geom_freqpoly
function (mapping = NULL, data = NULL, stat = "bin", position = "identity",
..., na.rm = FALSE, show.legend = NA, inherit.aes = TRUE)
{
params <- list(na.rm = na.rm, ...)
if (identical(stat, "bin")) {
params$pad <- TRUE
}
layer(data = data, mapping = mapping, stat = stat, geom = GeomPath,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = params)
}
A quick check by printing geom_freqpoly to console shows that its underlying geom is simply GeomPath, which plots x/y pairs in sequential order.
In other words, if you want to get the peaks from graph 1, you need to provide a similar dataset, with rows indicating where y should drop to 0. While it's certainly possible to calculate this by digging into the code for StatBin$compute_group, I'd think it's simpler to expand from the data of pre-calculated counts and let ggplot do its normal job:
demo %>%
tidyr::uncount(cnt) %>%
ggplot(aes(x = value)) +
geom_freqpoly() +
theme_minimal()
Edit: solution without fully expanding dataframe of aggregated counts
Sample dataset with 2 groups:
demo <- data.frame(value = c(0.25, 0.5, 0.1, 0.25, 0.75, 0.1),
cnt = c(5, 2, 4, 3, 8, 7) * 10e8,
group = rep(c("a", "b"), each = 3))
Code:
library(ggplot2)
library(dplyr)
demo %>%
rename(x = value, y = cnt) %>% # rename here so approach below can be easily applied
# to other datasets with different column names
tidyr::nest(data = c(x, y)) %>% # nest to apply same approach for each group
mutate(data = purrr::map(
data,
function(d) ggplot2:::bin_vector( # cut x's range into appropriate bins
x = d$x,
bins = ggplot2:::bin_breaks_bins(
x_range = range(d$x),
bins = 30), # default bin count is 30; change if desired
pad = TRUE) %>%
select(x, xmin, xmax) %>%
# place y counts into the corresponding x bins (this is probably similar
# to interval join, but I don't have that package installed on my machine)
tidyr::crossing(d %>% rename(x2 = x)) %>%
mutate(y = ifelse(x2 >= xmin & x2 < xmax, y, 0)) %>%
select(-x2) %>%
group_by(x) %>%
filter(y == max(y)) %>%
ungroup() %>%
unique())) %>%
tidyr::unnest(cols = c(data)) %>% # unnest to get one flat dataframe back
ggplot(aes(x = x, y = y, colour = group)) + # plot as per normal
geom_path() +
theme_bw()
# package versions used: dplyr 1.0.0, ggplot2 3.3.1, tidyr 1.1.0, purrr 0.3.4
Based on the similar problem for histograms the solution seems to be as simple as to use the weight parameter in the aesthetics.
The solution using the sample data from the other answer would be
demo <- data.frame(value = c(0.25, 0.5, 0.1, 0.25, 0.75, 0.1),
cnt = c(5, 2, 4, 3, 8, 7) * 10e8,
group = rep(c("a", "b"), each = 3))
ggplot(demo, aes(value, weight = cnt, color = group)) + geom_freqpoly()

R - Plot the rolling mean of different time series in a lineplot with ggplot2

I want to plot the rolling mean of data of different time series with ggplot2. My data have the following structure:
library(dplyr)
library(ggplot2)
library(zoo)
library(tidyr)
df <- data.frame(episode=seq(1:1000),
t_0 = runif(1000),
t_1 = 1 + runif(1000),
t_2 = 2 + runif(1000))
df.tidy <- gather(df, "time", "value", -episode) %>%
separate("time", c("t", "time"), sep = "_") %>%
subset(select = -t)
> head(df.tidy)
# episode time value
#1 1 0 0.7466480
#2 2 0 0.7238865
#3 3 0 0.9024454
#4 4 0 0.7274303
#5 5 0 0.1932375
#6 6 0 0.1826925
Now, the code below creates a plot where the lines for time = 1 and time = 2 towards the beginning of the episodes do not represent the data because value is filled with NAs and the first numeric entry in value is for time = 0.
ggplot(df.tidy, aes(x = episode, y = value, col = time)) +
geom_point(alpha = 0.2) +
geom_line(aes(y = rollmean(value, 10, align = "right", fill = NA)))
How do I have to adapt my code such that the rolling-mean lines are representative of my data?
Your issue is you are applying a moving average over the whole column, which makes data "leak" from one value of time to another.
You could group_by first to apply the rollmean to each time separately:
ggplot(df.tidy, aes(x = episode, y = value, col = time)) +
geom_point(alpha = 0.2) +
geom_line(data = df.tidy %>%
group_by(time) %>%
mutate(value = rollmean(value, 10, align = "right", fill = NA)))

how to automate the legend in a ggplot chart?

Consider this simple example
library(dplyr)
library(forcats)
library(ggplot2)
mydata <- data_frame(cat1 = c(1,1,2,2),
cat2 = c('a','b','a','b'),
value = c(10,20,-10,-20),
time = c(1,2,1,2))
mydata <- mydata %>% mutate(cat1 = factor(cat1),
cat2 = factor(cat2))
> mydata
# A tibble: 4 x 4
cat1 cat2 value time
<fct> <fct> <dbl> <dbl>
1 1 a 10.0 1.00
2 1 b 20.0 2.00
3 2 a -10.0 1.00
4 2 b -20.0 2.00
Now, I want to create a chart where I interact the two factor variables.
I know I can use interact in ggplot2 (see below).
My big problem is that I do not know how to automate the labeling (and the colouring) of the interactions so that I can avoid any manual error using scale_colour_manual.
For instance:
ggplot(mydata,
aes(x = time, y = value, col = interaction(cat1, cat2) )) +
geom_point(size=15) + theme(legend.position="bottom")+
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(legend.position="bottom",
legend.text=element_text(size=12, face = "bold")) +
scale_colour_manual(name = ""
, values=c("red","red4","royalblue","royalblue4")
, labels=c("1-b","1-a"
,"2-a","2-b"))
shows:
which has the wrong labels because of a (voluntarily) mistake I made in scale_colour_manual(). Indeed, the bright red dot is 1-a and not 1-b (note how the labels are simply the concatenation of the variable names). The idea is that with more factor levels, guessing the right order can be tricky.
Is there a way to automate this labeling (even better: labeling AND coloring)? Perhaps using forcats? Perhaps creating the labels as strings in the dataframe beforehand?
Thanks!
If the number of factor levels for cat1 / cat2 are not fixed (but could potentially be much larger than 2), I would try to calculate the appropriate colours with hsv(), rather than assign them manually.
The colour cheatsheet here summarise the HSV colour model rather nicely:
Hue (h) is essentially your rainbow colour wheel, Saturation (s) determines how intense the colour is, and Value (v) how dark it is. Each parameter accepts values in the range [0, 1].
Here's how I would adapt it for this use case:
mydata2 <- mydata %>%
# use "-" instead of the default "." since we are using that for the labels anyway
mutate(interacted.variable = interaction(cat1, cat2, sep = "-")) %>%
# cat1: assign hue evenly across the whole wheel,
# cat2: restrict both saturation & value to the [0.3, 1], as it can look too
# faint / dark otherwise
mutate(colour = hsv(h = as.integer(cat1) / length(levels(cat1)),
s = 0.3 + 0.7 * as.integer(cat2) / length(levels(cat2)),
v = 0.3 + 0.7 * as.integer(cat2) / length(levels(cat2))))
# create the vector of colours for scale_colour_manual()
manual.colour <- mydata2 %>% select(interacted.variable, colour) %>% unique()
colour.vector <- manual.colour$colour
names(colour.vector) <- manual.colour$interacted.variable
rm(manual.colour)
> colour.vector
1-a 1-b 2-a 2-b
"#3AA6A6" "#00FFFF" "#A63A3A" "#FF0000"
With the colours calculated automatically for any number of factors, plotting becomes quite straightforward:
ggplot(mydata2,
aes(x = time, y = value, colour = interacted.variable)) +
geom_point(size = 15) +
scale_colour_manual(name = "",
values = colour.vector,
breaks = names(colour.vector)) +
theme(legend.position = "bottom")
An illustration with more factor levels (code is the same except for the addition of specifying guide_legend(byrow = TRUE) in the colour scale:
mydata3 <- data.frame(
cat1 = factor(rep(1:3, times = 5)),
cat2 = rep(LETTERS[1:5], each = 3),
value = 1:15,
time = 15:1
) %>%
mutate(interacted.variable = interaction(cat1, cat2, sep = "-"),
colour = hsv(h = as.integer(cat1) / length(levels(cat1)),
s = 0.3 + 0.7 * as.integer(cat2) / length(levels(cat2)),
v = 0.3 + 0.7 * as.integer(cat2) / length(levels(cat2))))
manual.colour <- mydata3 %>% arrange(cat1, cat2) %>%
select(interacted.variable, colour) %>% unique()
colour.vector <- manual.colour$colour
names(colour.vector) <- manual.colour$interacted.variable
rm(manual.colour)
ggplot(mydata3,
aes(x = time, y = value, colour = interacted.variable)) +
geom_point(size = 15) +
scale_colour_manual(name = "",
values = colour.vector,
breaks = names(colour.vector),
guide = guide_legend(byrow = TRUE)) +
theme(legend.position = "bottom")

Resources