Annotate several regression lines produced with geom_smooth - r

I have a figure with 16 regression lines and I need to be able to identify them. Using a color gradient or symbols or different line types do not really help.
My idea therefore is, to just (haha) annotate every line.
Therefore, I build a dataset (hpAnnotatedLines) with the different maximum x values. This is the position the text should start. However, I have no idea how to automatically extract the respective y values of the predicted regression lines at the maximum x-axis values, which is different for each line.
Please find a smaller data set using mtcars as an example
library(ggplot2)
library(dplyr)
library(ggrepel)
#just select the data I need
mtcars1 <- select(mtcars, disp,cyl,hp)
mtcars1$cyl <- as.factor(mtcars1$cyl)
#extract max values
mtcars2 <- mtcars1 %>%
group_by(cyl) %>%
summarise(Max.disp= max(disp))
#build dataset for the annotation layer
#note that hp was done by hand. Here I need help
hpAnnotatedLines <- data.frame(cyl=levels(mtcars2$cyl),
disp=mtcars2$Max.disp,
hp=c(90,100,210))
#example plot
ggplot(mtcars, aes(x=disp, y=hp, color = factor(cyl))) +
geom_point() +
geom_smooth(method=lm)+
coord_cartesian(xlim = c(min(mtcars$disp), max(mtcars$disp) + 50)) +
geom_text_repel(
data = hpAnnotatedLines,
aes(label = cyl),
size = 3,
nudge_x = 1)

Instead of extracting the fitted values you could add the labels via geom_text by switching the stat to smooth and setting the label aesthetic via after_stat such that only the last point of each regression line gets labelled:
library(ggplot2)
library(dplyr)
myfun <- function(x, color) {
data.frame(x = x, color = color) %>%
group_by(color) %>%
mutate(label = ifelse(x %in% max(x), as.character(color), "")) %>%
pull(label)
}
ggplot(mtcars, aes(x=disp, y=hp, color = factor(cyl))) +
geom_point() +
geom_smooth(method=lm) +
geom_text(aes(label = after_stat(myfun(x, color))),
stat = "smooth", method = "lm", hjust = 0, size = 3, nudge_x = 1, show.legend = FALSE) +
coord_cartesian(xlim = c(min(mtcars$disp), max(mtcars$disp) + 50))

It's a bit of a hack, but you can extract the data from the compiled plot object. For example first make the plot without the labels,
myplot <- ggplot(mtcars, aes(x=disp, y=hp, color = factor(cyl))) +
geom_point() +
geom_smooth(method=lm)+
coord_cartesian(xlim = c(min(mtcars$disp), max(mtcars$disp) + 50))
Then use ggplot_build to get the data from the second layer (The geom_smooth layer) and transform it back into the names used by your data. Here we find the largest x value per group, and then take that y value.
pobj <- ggplot_build(myplot)
hpAnnotatedLines <- pobj$data[[2]] %>% group_by(group) %>%
top_n(1, x) %>%
transmute(disp=x, hp=y, cyl=levels(mtcars$cyl)[group])
Then add an additional layer to your plot
myplot +
geom_text_repel(
data = hpAnnotatedLines,
aes(label = cyl),
size = 3,
nudge_x = 1)

If your data is not that huge, you can extract the predictions out using augment() from broom and take that with the largest value:
library(broom)
library(dplyr)
library(ggplot2)
hpAnn = mtcars %>% group_by(cyl) %>%
do(augment(lm(hp ~ disp,data=.))) %>%
top_n(1,disp) %>%
select(cyl,disp,.fitted) %>%
rename(hp = .fitted)
# A tibble: 3 x 3
# Groups: cyl [3]
cyl disp hp
<dbl> <dbl> <dbl>
1 4 147. 96.7
2 6 258 99.9
3 8 472 220.
Then plot:
ggplot(mtcars, aes(x=disp, y=hp, color = factor(cyl))) +
geom_point() +
geom_smooth(method=lm)+
coord_cartesian(xlim = c(min(mtcars$disp), max(mtcars$disp) + 50))+
geom_text_repel(
data = hpAnn,
aes(label = cyl),
size = 3,
nudge_x = 1)

Related

How to connect means per group in ggplot?

I can do a scatterplot of two continuous variables like this:
mtcars %>%
ggplot(aes(x=mpg, y = disp)) + geom_point() +
geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95)
I use cut to create 5 groups of mpg intervals for cars (any better command would do as well). I like to see the intervals in the graph, thus they are easy to understand.
mtcars %>%
mutate(mpg_groups = cut(mpg, 5)) %>%
group_by(mpg_groups) %>%
mutate(mean_disp = mean(disp)) %>%
ggplot(aes(x=mpg_groups, y = mean_disp)) + geom_point()
mpg_groups is a factor variable and can no longer be connected via geom_smooth().
# not working
mtcars %>%
mutate(mpg_groups = cut(mpg, 5)) %>%
group_by(mpg_groups) %>%
mutate(mean_disp = mean(disp)) %>%
ggplot(aes(x=mpg_groups, y = mean_disp)) + geom_point() +
geom_smooth(method="auto", se=TRUE, fullrange=FALSE, level=0.95)
What can I do with easy (tidyverse) code in order to create the mean values per group and connect them via line?
As a more or less general rule, when drawing a line via ggplot2 you have to explicitly set the group aesthetic in all cases where the variable mapped on x isn't a numeric, i.e. use e.g. group=1 to assign all observations to one group which I call 1 for simplicity:
library(ggplot2)
library(dplyr, warn=FALSE)
mtcars %>%
mutate(mpg_groups = cut(mpg, 5)) %>%
group_by(mpg_groups) %>%
mutate(mean_disp = mean(disp)) %>%
ggplot(aes(x = mpg_groups, y = mean_disp, group = 1)) +
geom_point() +
geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95)

Detecting programmatically whether axis labels overlap

Is there a way to detect whether axis labels overlap in ggplot2 programmatically?
Suppose I create the following graph:
library(dplyr)
library(tibble)
library(ggplot2)
dt <- mtcars %>% rownames_to_column("name") %>%
dplyr::filter(cyl == 8)
ggplot(dt, aes(x = name, y = mpg)) + geom_point()
I want to programmatically detect whether x-axis labels are overlapping and apply the following first remedy:
ggplot(dt, aes(x = name, y = mpg)) + geom_point() +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
Here is the tricky part. Say the dimensions are different and first remedy also overlaps like this:
I want to apply a second remedy like this:
ggplot(dt, aes(x = name, y = mpg)) + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust = 1, vjust = 1))
Is it possible without visually inspecting the graph?
Not a definitive solution, but if we consider the margins constant, we can do some simple subtraction:
library(dplyr)
library(tibble)
library(ggplot2)
dt <- mtcars %>% rownames_to_column("name") %>%
dplyr::filter(cyl == 8)
p <- ggplot(dt, aes(x = name, y = mpg)) +
geom_point()
# variable part
font_size <- 9 #points, the ggplot default
full_width <- 15 #cm
full_height <- 10 #cm
cm_to_pt <- 28.35 # 1 cm = 28.35 points
# try varying width
for(full_width in c(30, 40, 45, 50)){
axis_text_length_pt <- ceiling(max(nchar(dt$name))/2)*font_size
axis_available_pt <- full_width/n_distinct(dt$name)*cm_to_pt
do_not_touch <- axis_text_length_pt <= axis_available_pt
p +
theme(axis.text.x = element_text(size=font_size)) +
geom_text(aes(x=5,y=15, label=do_not_touch))
ggsave(paste0("tmp_",full_width,".png"),
width = full_width, height = full_height, unit = "cm")
}
At 40 cm we still have the Hornet Sportabout and the Lincoln Continental touching, at 45 cm they separate.

Adding total histogram count to facets in ggplot in R

I would like to take a faceted histogram and add text on each plot indicating the total number of observations in that facet. So for carb = 1 the total count would be 7, carb = 2 the total count would be 10 etc.
p <- ggplot(mtcars, aes(x = mpg, stat = "count",fill=as.factor(carb))) + geom_histogram(bins = 8)
p <- p + facet_grid(as.factor(carb) ~ .)
p
I can do this with the table function but for more complex faceting how can I do it quickly?
You can try this. Maybe is not the most optimal because you have to define the x and y position for the label (this is done in Labels for x and in geom_text() for y with 3). But it can help you:
#Other
library(tidyverse)
#Create similar data for labels
Labels <- mtcars %>% group_by(carb) %>% summarise(N=paste0('Number is: ',n()))
#X position
Labels$mpg <- 25
#Plot
ggplot(mtcars, aes(x = mpg, stat = "count",fill=as.factor(carb))) + geom_histogram(bins = 8)+
geom_text(data = Labels,aes(x=mpg,y=3,label=N))+facet_grid(as.factor(carb) ~ .)

gghighlight in clustered (grouped) bar chart in R

I need to use gghighlight in a clustered bar chart in R in order to highlight only one single bar. My code and sample data looks like this:
library(tidyr)
library(ggplot2)
dat <- data.frame(country=c('USA','Brazil','Ghana','England','Australia'), Stabbing=c(15,10,9,6,7), Accidents=c(20,25,21,28,15), Suicide=c(3,10,7,8,6))
dat.m <- melt(dat, id.vars='country')
dat.g <- gather(dat, type, value, -country)
ggplot(dat.g, aes(type, value)) +
geom_bar(aes(fill = country), stat = "identity", position = "dodge") +
gghighlight(type == "Accidents" & country == "Brazil")
But this gives me this awkward
How can I get gghighlight to highlight only one single bar of one group (so combining two conditions for two discrete variables)?
Here are two alternative options for highlighting a single column in this type of plot:
1) make a new variable (named highlight below) and fill by that (and, if you like, use the line colors to color by country)
2) manually annotate the one column you want to highlight with an arrow and/or text (or work out how to automate the positioning, but that would be more involved) - could be an option for one final figure
library(tidyr)
library(ggplot2)
dat <- data.frame(country=c('USA','Brazil','Ghana','England','Australia'),
Stabbing=c(15,10,9,6,7),
Accidents=c(20,25,21,28,15), Suicide=c(3,10,7,8,6))
dat.m <- reshape2::melt(dat, id.vars='country')
dat.g <- gather(dat, type, value, -country)
## set highlighted bar
dat.g$highlight <- ifelse(dat.g$type == "Accidents" & dat.g$country == "Brazil", TRUE, FALSE)
## option 1: use fill to highlight, colour for country
ggplot(dat.g, aes(type, value, fill = highlight, colour=country), alpha=.6) +
geom_bar(stat = "identity", position = "dodge2", size=1) +
scale_fill_manual(values = c("grey20", "red"))+
guides(fill = FALSE) +
## option 2: use annotate to manually label a specific column:
annotate(geom = "curve", x = 1.15, y = 30, xend = 1.35, yend = 26,
curvature = .2, arrow = arrow(length = unit(2, "mm"))) +
annotate(geom = "text", x = 1, y = 31, label = "Highlight", hjust = "left")
Created on 2020-03-10 by the reprex package (v0.3.0)
I think gghighlight is not built for this kind of plot - not yet! You could file a feature request ? It is a bit unclear though if this visualisation is very helpful. Gghighlight always draws everything - this makes the "weird" shadows when dodging.
If you want to keep using gghightlight, maybe try faceting, which they suggest in their vignette
A suggestion - Use facets:
(using mtcars as example)
library(tidyverse)
library(gghighlight)
mtcars2 <- mtcars %>% mutate(cyl = as.character(cyl), gear = as.character(gear))
ggplot(mtcars2, aes(cyl, disp, fill = gear)) +
geom_col() + #no dodge
gghighlight(cyl == "4") + #only one variable
facet_grid(~ gear) #the other variable is here
#> Warning: Tried to calculate with group_by(), but the calculation failed.
#> Falling back to ungrouped filter operation...
Created on 2020-03-09 by the reprex package (v0.3.0)
Or, here without gghighlight, in a more traditional subsetting approach.
You need to make a subset of data which contains rows for each group you want to dodge by, in this case "cyl" and "gear". I replace the irrelevant data with "NA", you could also use "0".
library(tidyverse)
mtcars2 <- mtcars %>%
mutate(cyl = as.character(cyl), gear = as.character(gear)) %>%
group_by(cyl, gear) %>%
summarise(disp = mean(disp))
subset_mt <- mtcars2 %>% mutate(highlight = if_else(cyl == '4' & gear == '3', disp, NA_real_))
ggplot() +
geom_col(data = mtcars2, aes(cyl, disp, group = gear), fill = 'grey', alpha = 0.6, position = 'dodge') +
geom_col(data = subset_mt, aes(cyl, highlight, fill = gear), position = 'dodge')
#> Warning: Removed 7 rows containing missing values (geom_col).
Created on 2020-03-10 by the reprex package (v0.3.0)

R: ggplot2, how to annotate summary statistics on each panel of a panel plot

How would I add a text annotation (eg. sd = sd_value) of the standard deviation in each panel of the following plot using ggplot2 in R?
library(datasets)
data(mtcars)
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
theme_bw()
I'd post an image of the plot, but I don't have enough rep.
I think "geom_text" or "annotate" might be useful but I'm not sure quite sure how.
If you want to vary the text label in each facet, you will want to use geom_text. If you want the same text to appear in each facet, you can use annotate.
p <- ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl)
mylabels <- data.frame(cyl = c(4, 6, 8),
label = c("first label", "seond label different", "and another"))
p + geom_text(x = 200, y = 0.75, aes(label = label), data = my labels)
### compare that to this way with annotate
p + annotate("text", x = 200, y = 0.75, label = "same label everywhere")
Now, if you really want standard deviation by cyl in this example, I'd probably use dplyr to do the calculation first and then complete this with geom_text like so:
library(ggplot2)
library(dplyr)
df.sd.hp <- mtcars %>%
group_by(cyl) %>%
summarise(hp.sd = round(sd(hp), 2))
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
geom_text(x = 200, y = 0.75,
aes(label = paste0("SD: ", hp.sd)),
data = df.sd.hp)
I prefer the appearance of the graph when the statistic appears within the facet label itself. I made the following script, which allows the choice of displaying the standard deviation, mean or count. Essentially it calculates the summary statistic then merges this with the name so that you have the format CATEGORY (SUMMARY STAT = VALUE).
#' Function will update the name with the statistic of your choice
AddNameStat <- function(df, category, count_col, stat = c("sd","mean","count"), dp= 0){
# Create temporary data frame for analysis
temp <- data.frame(ref = df[[category]], comp = df[[count_col]])
# Aggregate the variables and calculate statistics
agg_stats <- plyr::ddply(temp, .(ref), summarize,
sd = sd(comp),
mean = mean(comp),
count = length(comp))
# Dictionary used to replace stat name with correct symbol for plot
labelName <- mapvalues(stat, from=c("sd","mean","count"), to=c("\u03C3", "x", "n"))
# Updates the name based on the selected variable
agg_stats$join <- paste0(agg_stats$ref, " \n (", labelName," = ",
round(agg_stats[[stat]], dp), ")")
# Map the names
name_map <- setNames(agg_stats$join, as.factor(agg_stats$ref))
return(name_map[as.character(df[[category]])])
}
Using this script with your original question:
library(datasets)
data(mtcars)
# Update the variable name
mtcars$cyl <- AddNameStat(mtcars, "cyl", "hp", stat = "sd")
ggplot(data = mtcars, aes(x = hp)) +
geom_dotplot(binwidth = 1) +
geom_density() +
facet_grid(. ~ cyl) +
theme_bw()
The script should be easy to alter to include other summary statistics. I am also sure it could be rewritten in parts to make it a bit cleaner!

Resources