Mark in ggplot when the observations go below a certain number - r

I have a dataset where
# Toy data: two countries with twelve (IP, Mean_st) observations each.
example <- data.frame(
  Country = c(rep("A", 12), rep("B", 12)),
  IP = c(
    55, 56, 59, 63, 67, 69, 69, 73, 74, 74, 79, 87, # country A
    0, 22, 24, 26, 26, 31, 37, 41, 43, 46, 46, 47   # country B
  ),
  Mean_st = c(
    46, 47, 49, 50, 53, 55, 53, 57, 60, 57, 58, 63, # country A
    0, 19, 20, 21, 22, 25, 26, 28, 29, 30, 31, 31   # country B
  )
)

# One line per country, plus two vertical markers at hard-coded x positions.
ggplot(data = example) +
  geom_line(mapping = aes(x = IP, y = Mean_st, colour = Country), size = 2) +
  geom_vline(xintercept = 73) +
  geom_vline(xintercept = 42)
I need to mark where the number of observations is below a certain number (let's say less than 5). I can find that point in my spreadsheet for each of the countries (73 and 42), and use geom_vline like in the example, but is there a way of finding that point directly in ggplot without the need of checking the spreadsheet?

You could do something like:
# Rank of the observation (within each country) at which the marker is drawn.
n_below <- 5

# Sort each country's IP values first and only then pick the n_below-th one.
# Indexing before sorting would take whatever value happens to sit in row
# n_below and sort that single number, so the marker would depend on the row
# order of `example`.
# NOTE(review): if the cut-off should be counted from the largest values
# instead, use sort(..., decreasing = TRUE) -- confirm the intended definition.
ggplot(example) +
  geom_line(aes(x = IP, y = Mean_st, color = Country), size = 2) +
  geom_vline(xintercept = sort(example$IP[example$Country == "A"])[n_below]) +
  geom_vline(xintercept = sort(example$IP[example$Country == "B"])[n_below])

Related

Adding geom_line between data points with different geom_boxplot fill variable

Hi I have a much larger data frame but a sample dummy df is as follows:
# Dummy exam scores: every combination of 2 names, 2 exams, 2 subjects and
# 2 topics, i.e. 16 rows.
set.seed(23) # fix the RNG so that `score` is reproducible
df <- data.frame(
  name = c(rep("Bob", 8), rep("Tom", 8)),
  topic = c(rep(c("Reading", "Writing"), 8)),
  subject = c(rep(c("English", "English", "Spanish", "Spanish"), 4)),
  exam = c(rep("First", 4), rep("Second", 4), rep("First", 4), rep("Second", 4)),
  score = sample(1:100, 16)
)
I have to plot it in the way shown in the picture below (for my original data frame) but with lines connecting the scores corresponding to each name between the first and second class in the exam variable, I tried geom_line(aes(group=name)) but the lines are not connected in the right way. Is there any way to connect the points that also respects the grouping by the fill variable similar to how the position_dodge() helps separate the points by their fill grouping? Thanks a lot!
library(ggplot2)
# Boxplots of score per topic, filled by exam and faceted by subject, with the
# raw points dodged so they sit on top of their box.
# The data frame is passed to ggplot() directly: only ggplot2 is attached
# here, and ggplot2 does not export the %>% pipe.
ggplot(df, aes(x = topic, y = score, fill = exam)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(size = 1.75, position = position_dodge(width = 0.75)) +
  facet_grid(~subject, switch = "y")
One option to achieve your desired result would be to group the lines by name and topic and do the dodging of lines manually instead of relying on position_dodge. To this end convert topic to a numeric for the geom_line and shift the position by the necessary amount to align the lines with the dodged points:
# Same dummy data as in the question, seeded for reproducible scores.
set.seed(23)
df <- data.frame(
  name = rep(c("Bob", "Tom"), each = 8),
  topic = rep(c("Reading", "Writing"), times = 8),
  subject = rep(rep(c("English", "Spanish"), each = 2), times = 4),
  exam = rep(rep(c("First", "Second"), each = 4), times = 2),
  score = sample(1:100, 16)
)
library(ggplot2)
# Boxplots and dodged points as before, plus hand-dodged lines that join the
# scores of the same name and topic across the two exams.
ggplot(data = df, mapping = aes(x = topic, y = score, fill = exam)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = 0.75), size = 1.75) +
  geom_line(
    mapping = aes(
      # topic as a number, shifted by a quarter of the 0.75 dodge width:
      # to the left for the "First" exam, to the right otherwise
      x = as.numeric(factor(topic)) +
        ifelse(exam == "First", -1, 1) * (0.75 / 4),
      group = interaction(name, topic)
    )
  ) +
  facet_grid(~subject, switch = "y")

Plotting range for same variable across two conditions

I have an input matrix consisting of 5 columns and 12 rows.
I am trying to plot a range for same variables (lets say width) across two methods/conditions (Paper, estimated). I am able to plot range across one methods/condition using code:
# Read the tab-separated input. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
Input <- read.table("File.txt", header = TRUE, sep = "\t")

# One dotted min-max range per trait, with a point at each end.
ggplot(Input, aes(x = Trait)) +
  geom_linerange(aes(ymin = min, ymax = max), linetype = 3, color = "Black") +
  geom_point(aes(y = min), size = 3, color = "darkgreen") +
  geom_point(aes(y = max), size = 3, color = "darkgreen") +
  labs(y = "-log10(P)", x = "Traits") +
  theme_bw()
But I want to plot each variable across methods together in the same plot. I can do this by adding an extra suffix to each variable. Is there a nicer way to do this? I have tried shape=Method but it's not working for me. Any help will be highly appreciated.
I would suggest mapping Method on color instead of shape. But hey. It's your plot. (; To achieve your desired result without adding a suffix you could make use of position_dodge like so:
library(tibble)
library(ggplot2)
# Min-max range per trait, drawn once per Method; all three layers use the
# same 0.6 dodge so the ranges and their end points line up side by side.
ggplot(data = Input, mapping = aes(x = Trait, shape = Method)) +
  geom_linerange(
    mapping = aes(ymin = min, ymax = max, group = Method),
    position = position_dodge(.6),
    color = "Black",
    linetype = 3
  ) +
  geom_point(
    mapping = aes(y = min),
    position = position_dodge(.6),
    color = "darkgreen",
    size = 3
  ) +
  geom_point(
    mapping = aes(y = max),
    position = position_dodge(.6),
    color = "darkgreen",
    size = 3
  ) +
  labs(x = "Traits", y = "-log10(P)") +
  theme_bw()
DATA
# Example data: three traits for each of two methods, with random ranges.
set.seed(42)
Input <- tibble(
  Method = c(rep("Paper", 3), rep("Estimated", 3)),
  Trait = rep(c("Width", "Density", "Lenght"), times = 2),
  Count = rep(c(2, 4, 10), times = 2),
  min = runif(6, 5, 7),
  # `min` here is the column created on the previous line
  max = min + runif(6, 0, 10)
)

Fill aesthetic used twice with continuous and discrete scales

I've got a data like below:
structure(list(bucket = structure(1:23, .Label = c("(1.23,6.1]",
"(6.1,10.9]", "(10.9,15.6]", "(15.6,20.4]", "(20.4,25.1]", "(25.1,29.9]",
"(29.9,34.6]", "(34.6,39.4]", "(39.4,44.2]", "(44.2,48.9]", "(48.9,53.7]",
"(53.7,58.4]", "(58.4,63.2]", "(63.2,68]", "(68,72.7]", "(72.7,77.5]",
"(77.5,82.2]", "(82.2,87]", "(87,91.7]", "(91.7,96.5]", "(96.5,101]",
"(101,106]", "(106,111]"), class = "factor"), value = c(0.996156321090158, 0.968144290236367, 0.882793110384066, 0.719390676388129, 0.497759597498133,
0.311721580067415, 0.181244079443301, 0.0988516758834657, 0.0527504526341006,
0.0278716018561911, 0.0145107725175315, 0.00785033086321829,
0.00405759957072942, 0.00213190168252939, 0.00109610249274952,
0.000578154695264754, 0.000301095727545301, 0.000155696457494707,
8.2897211122996e-05, 4.09225082176349e-05, 2.33782236798641e-05,
1.21665352966827e-05, 6.87373003802479e-06), bucket_id = 1:23), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -23L))
Which I want to visualise as a circular stacked bar plot:
library(tidyverse)

# Ring boundaries: the 24 cut-offs from 0 to 115 delimit 23 rings.
cutoff_values <- seq(from = 0, to = 115, by = 5)

# Give every row the boundaries of its ring and a common centre, then draw
# each row as a full ring (0 to 2 * pi) filled by `value`.
ex %>%
  mutate(
    r0 = head(cutoff_values, -1),
    r = tail(cutoff_values, -1),
    x0 = 100,
    y0 = 50
  ) %>%
  ggplot(aes(x0 = x0, y0 = y0, r0 = r0, r = r)) +
  ggforce::geom_arc_bar(
    aes(start = 0, end = 2 * pi, fill = value),
    colour = NA
  ) +
  theme_void() +
  labs(fill = "colour")
But I also need to be able to mark out some particular bucket with different filling at best. So I need to be able to preserve filling using value with continuous scale, but also fill one particular stratum (let's say bucket == 15) with another colour, leaving the other strata (buckets) as they are. Is it possible? What are the alternatives to mark out bucket 15th?
I believe that this can be done with the relayer package, which is still highly experimental. You can copy a subset of your data into a separate geom and give it another fill aesthetic. This separate geom can then be piped into rename_geom_aes(), and you would have to set the scale_fill_*() for your renamed aesthetic. You'd probably get a warning that the geom is ignoring unknown aesthetics, but I don't know if that can be helped.
Below is an example for making bucket 15 red.
library(tidyverse)
library(relayer) # https://github.com/clauswilke/relayer
# Attach the ring boundaries and a common centre to every bucket.
ex <- df %>%
  mutate(
    r0 = head(cutoff_values, -1),
    r = tail(cutoff_values, -1),
    x0 = 100,
    y0 = 50
  )

# First layer: all rings, filled by `value`.
# Second layer: only the highlighted bucket, mapped to the placeholder
# aesthetic `fill2`; rename_geom_aes() is applied to that layer alone, and
# `fill2` gets its own manual scale and legend title.
ggplot(ex, aes(x0 = x0, y0 = y0, r0 = r0, r = r)) +
  ggforce::geom_arc_bar(
    aes(start = 0, end = 2 * pi, fill = value),
    colour = NA
  ) +
  rename_geom_aes(
    ggforce::geom_arc_bar(
      data = ex[ex$bucket_id == 15, ], # Whatever bucket you want
      aes(start = 0, end = 2 * pi, fill2 = as.factor(bucket_id))
    ),
    new_aes = c("fill" = "fill2")
  ) +
  scale_fill_manual(aesthetics = "fill2", values = "red", guide = "legend") +
  theme_void() +
  labs(fill = "colour", fill2 = "highlight")

Using geom_segment to create a timeline visualization

I am trying to create a chart like this one produced in the NYTimes using ggplot:
I think I'm getting close, but I'm not quite sure how to separate out some of my data so I get the right view. My data is political office holders that appear something like this:
name,year_elected,year_left,years_in_office,type,party
Person 1,1969,1969,1,Candidate,Unknown
Person 2,1969,1971,2,Candidate,Unknown
Person 3,1969,1973,4,Candidate,Unknown
Person 4,1969,1973,4,Candidate,Unknown
Person 5,1971,1974,3,Candidate,Unknown
Person 1,1971,1976,5,Candidate,Unknown
Person 2,1971,1980,9,Candidate,Unknown
Person 6,1973,1978,5,Candidate,Unknown
Person 7,1973,1980,7,Candidate,Unknown
Person 8,1975,1980,5,Candidate,Unknown
Person 9,1977,1978,1,Candidate,Unknown
And I've used the below code to get very close to this view, but I think an issue I'm running into is either drawing segments incorrectly (e.g., I don't seem to have a single segment for each candidate), or segments are overlapping/stacking. The key issue I'm running into is my list of office holders is around 60, but my chart is only drawing around 28 lines.
library(googlesheets)
library(tidyverse)
# I'm reading from a Google Spreadsheet
data <- gs_title("Council Members")
data_sj <- gs_read(ss = data, ws = "Sheet1")

# Plot the worksheet contents read into `data_sj`; `data` is only the object
# returned by gs_title() that gs_read() uses to locate the sheet.
# One segment per row, from (year_elected, 0) to (year_left, years_in_office).
ggplot(data_sj, aes(year_elected, years_in_office)) +
  geom_segment(aes(x = year_elected, y = 0,
                   xend = year_left, yend = years_in_office)) +
  theme_minimal()
The above code gives me:
Thanks ahead of time for any pointers!
If your data frame is called d, then:
Transform it to data.table
Add jitter to year_elected
Add equivalent jitter to year_left
Add group (as an example) to color your samples
Use ggrepel to add text if there are many points.
Code:
library(data.table)
library(ggplot2)
library(ggrepel)
# `:=` and d[i, j] below are data.table syntax, so make sure `d` is a
# data.table first (converted by reference; harmless if it already is one).
setDT(d)

# Jitter the start year, then move the end year by the same offset (plus
# 0.01) so each segment keeps its horizontal extent.
d[, year_elected2 := jitter(year_elected)]
d[, year_left2 := year_left + year_elected2 - year_elected + 0.01]

# Example grouping used for colour and linetype: FALSE where
# years_in_office %/% 9 equals 1, TRUE everywhere else.
d[, group := TRUE]
d[factor(years_in_office %/% 9) == 1, group := FALSE]

ggplot(d, aes(year_elected2, years_in_office)) +
  geom_segment(aes(x = year_elected2, xend = year_left2,
                   y = 0, yend = years_in_office, linetype = group),
               alpha = 0.8, size = 1, color = "grey") +
  # a larger black point under a smaller coloured one gives an outlined dot
  geom_point(aes(year_left2), color = "black", size = 3.3) +
  geom_point(aes(year_left2, color = group), size = 2.3) +
  geom_text_repel(aes(year_left2, label = name)) +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_colour_brewer(guide = "none", palette = "Dark2") +
  scale_linetype_manual(guide = "none", values = c(2, 1)) +
  labs(x = "Year elected",
       y = "Years on office") +
  theme_minimal(base_size = 10)
Result:
For the record, and to address my comment on @PoGibas's answer above, here's my tidyverse version:
# Jitter the election year and move the leaving year by the same offset
# (plus 0.01).
data_transform <- data_sj %>%
  mutate(
    year_elected_jitter = jitter(year_elected),
    year_left_jitter = year_left + year_elected_jitter - year_elected + 0.01
  )

# One segment per row, labelled with the name at its right-hand end.
# NOTE(review): `gender` is not among the columns of the sample data shown in
# the question -- presumably it exists in the full sheet; confirm.
ggplot(
  data = data_transform,
  mapping = aes(x = year_elected, y = years_in_office, label = name)
) +
  geom_segment(
    mapping = aes(
      x = year_elected_jitter, y = 0,
      xend = year_left_jitter, yend = years_in_office,
      color = gender
    ),
    size = 0.3
  ) +
  geom_text_repel(mapping = aes(x = year_left_jitter, label = name)) +
  theme_minimal()

Subset/filter in dplyr chain with ggplot2

I'd like to make a slopegraph, along the lines (no pun intended) of this. Ideally, I'd like to do it all in a dplyr-style chain, but I hit a snag when I try to subset the data to add specific geom_text labels. Here's a toy example:
# Reprex for the question: this code is intentionally left as posted, because
# it reproduces the "object '.' not found" error being asked about.
# make tbl:
df <- tibble(
area = rep(c("Health", "Education"), 6),
sub_area = rep(c("Staff", "Projects", "Activities"), 4),
year = c(rep(2016, 6), rep(2017, 6)),
value = rep(c(15000, 12000, 18000), 4)
) %>% arrange(area)
# plot:
# The filtered rows are piped into ggplot() as its first argument.
df %>% filter(area == "Health") %>%
ggplot() +
geom_line(aes(x = as.factor(year), y = value,
group = sub_area, color = sub_area), size = 2) +
geom_point(aes(x = as.factor(year), y = value,
group = sub_area, color = sub_area), size = 2) +
theme_minimal(base_size = 18) +
# This is the failing layer: `.` is not available inside this nested call,
# so evaluating the `data` argument raises "object '.' not found".
geom_text(data = dplyr::filter(., year == 2016 & sub_area == "Activities"),
aes(x = as.factor(year), y = value,
color = sub_area, label = area), size = 6, hjust = 1)
But this gives me Error in filter_(.data, .dots = lazyeval::lazy_dots(...)) :
object '.' not found. Using subset instead of dplyr::filter gives me a similar error. What I've found on SO/Google is this question, which addresses a slightly different problem.
What is the correct way to subset the data in a chain like this?
Edit: My reprex is a simplified example, in the real work I have one long chain. Mike's comment below works for the first case, but not the second.
If you wrap the plotting code in {...}, you can use . to specify exactly where the previously calculated results are inserted:
library(tidyverse)
# Toy data for 2016 and 2017, sorted by area.
df <- arrange(
  tibble(
    area = rep(c("Health", "Education"), 6),
    sub_area = rep(c("Staff", "Projects", "Activities"), 4),
    year = rep(c(2016, 2017), each = 6),
    value = rep(c(15000, 12000, 18000), 4)
  ),
  area
)

df %>%
  filter(area == "Health") %>%
  {
    # Inside the braces the piped-in data is not inserted automatically; it is
    # used only where `.` is written: once as the plot data and once as the
    # input to the label subset.
    ggplot(data = .) +
      geom_line(
        aes(x = as.factor(year), y = value, group = sub_area, color = sub_area),
        size = 2
      ) +
      geom_point(
        aes(x = as.factor(year), y = value, group = sub_area, color = sub_area),
        size = 2
      ) +
      theme_minimal(base_size = 18) +
      geom_text(
        data = dplyr::filter(., year == 2016, sub_area == "Activities"),
        aes(x = as.factor(year), y = value, color = sub_area, label = area),
        size = 6,
        hjust = 1
      )
  }
While that plot is probably not what you really want, at least it runs so you can edit it.
What's happening: Normally %>% passes the results of the left-hand side (LHS) to the first parameter of the right-hand side (RHS). However, if you wrap the RHS in braces, %>% will only pass the results in to wherever you explicitly put a .. This formulation is useful for nested sub-pipelines or otherwise complicated calls (like a ggplot chain) that can't otherwise be sorted out just by redirecting with a .. See help('%>%', 'magrittr') for more details and options.
Writing:
geom_text(data = df[df$year == 2016 & df$sub_area == "Activities",],...
instead of
geom_text(data = dplyr::filter(., year == 2016 & sub_area == "Activities"),...
makes it work but you still have issues about the position of the text (you should be able to easily find help on SO for that issue).

Resources