ggplot bar chart limits fix - r

I am trying to fix the limits of a bar chart so that the horizontal bars and their labels don't run past the plot area. I could set the limit manually using limits=c(0,3000000), but I guess there is a way to make it scale automatically. The code:
# Download the confirmed-cases time series; check.names=FALSE keeps the raw
# column headers (e.g. `Country/Region` and the date columns) unchanged.
corona.conf <- read.csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",header = TRUE,check.names=FALSE)
# Drop columns 1, 3 and 4, then reshape to long format with the former
# column names stored in `day`.
corona.conf %>% .[,c(-1,-3,-4)] %>% melt(.,variable.name="day") %>%
# One total per country and day.
group_by(`Country/Region`,day) %>% summarize(value=sum(value)) %>%
# Parse the date headers and take the difference to the previous row.
mutate(day=as.Date(day,format='%m/%d/%y')) %>% mutate(count=value-lag(value)) %>%
# Replace the NA produced by lag() with 0, then sum the differences per country.
replace(is.na(.),0) %>% group_by(`Country/Region`) %>% summarize(count=sum(count)) %>%
# Keep the 20 largest totals and draw them as bars ordered by count.
top_n(20) %>% arrange(desc(count)) %>% ggplot(.,aes(x=reorder(`Country/Region`,count),y=count,fill=count)) +
# hjust=-0.1 places each text label just past the end of its bar.
geom_bar(stat = "identity") + coord_flip() + geom_text(aes(label=format(count,big.mark = ",")),hjust=-0.1,size=4) +
# expand = c(0,0) removes the padding around the data range, which leaves no
# room for the labels drawn beyond the longest bar.
scale_y_continuous(expand = c(0,0))
I thought something like:
scale_y_continuous(expand = c(0,0),limits=c(0,max(count))
Appreciate any suggestions on the fix.

I think it would be easier to read and run the code by splitting it into several parts.
We can use layer_data to get the information from a ggplot object, and then calculate the maximum from that. Based on your example, I would also suggest you multiply the maximum by 1.7 to leave room for the text.
library(tidyverse)
library(data.table)
# Download the confirmed-cases time series. check.names = FALSE keeps the raw
# column headers (`Country/Region` and the m/d/y date columns) unchanged.
corona.conf <- read.csv(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
  header = TRUE,
  check.names = FALSE
)

# Total of the day-over-day increases per country, top 20 countries only.
dat <- corona.conf %>%
  .[, c(-1, -3, -4)] %>%
  # pivot_longer() replaces melt(): data.table's melt() on a plain data.frame
  # only works through a deprecated redirect to reshape2.
  pivot_longer(-`Country/Region`, names_to = "day", values_to = "value") %>%
  # Parse the dates before grouping so the rows sort chronologically rather
  # than alphabetically by the header text.
  mutate(day = as.Date(day, format = "%m/%d/%y")) %>%
  group_by(`Country/Region`, day) %>%
  summarize(value = sum(value), .groups = "drop_last") %>%
  # Still grouped by country here, so lag() never crosses country boundaries.
  mutate(count = value - lag(value)) %>%
  # The first day of each country has no previous value (NA); skip it.
  summarize(count = sum(count, na.rm = TRUE)) %>%
  # Name the ranking column explicitly instead of relying on top_n()'s
  # "last column" default.
  slice_max(count, n = 20) %>%
  arrange(desc(count))

# Horizontal bars ordered by count, with the formatted count printed just
# past the end of each bar (hjust = -0.1).
p <- ggplot(dat, aes(x = reorder(`Country/Region`, count), y = count, fill = count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = format(count, big.mark = ",")), hjust = -0.1, size = 4)

# Read the tallest bar back from the built plot and stretch the upper limit
# by a factor of 1.7 so the text labels stay inside the panel.
p +
  scale_y_continuous(expand = c(0, 1), limits = c(0, max(layer_data(p)$y) * 1.7))

Related

How to display variable and value labels in ggplot bar chart?

I'm trying to get the variable labels and value labels to be displayed on a stacked bar chart.
library(tidyverse)
# Read the SPSS file; haven keeps the variable and value labels on the columns.
data <- haven::read_spss("http://staff.bath.ac.uk/pssiw/stats2/SAQ.sav")
data %>%
# Keep the four items Q01..Q04 and stack them into key/value pairs.
select(Q01:Q04) %>%
gather %>%
# Count each answer code within each item.
group_by(key, value) %>%
tally %>%
# Convert counts to percentages.
# NOTE(review): `round = 1` adds a constant column named `round`; it does not
# round `n` (the rounding happens in the next step).
mutate(n = n/sum(n)*100, round = 1) %>%
mutate(n = round(n, 2)) %>%
# Stacked bars, one per item, filled by answer code.
ggplot(aes(x=key, y=n, fill=factor(value))) +
geom_col() +
# Print each percentage in the middle of its bar segment.
geom_text(aes(label=as_factor(n)), position=position_stack(.5)) +
coord_flip() +
theme(aspect.ratio = 1/3) + scale_fill_brewer(palette = "Set2")
Instead of Q01, Q02, Q03, Q04, I would like to use the variable labels.
library(labelled)
var_label(data$Q01)
Statistics makes me cry
var_label(data$Q02)
My friends will think Im stupid for not being able to cope with SPSS
var_label(data$Q03)
Standard deviations excite me
var_label(data$Q04)
I dream that . . .
along with associated value labels
val_labels(data$Q01)
Strongly agree Agree Neither Disagree Strongly disagree Not answered
1 2 3 4 5 9
I tried using label = as_factor(n) but that didn't work.
We may extract the labels and then do a join
library(forcats)
library(haven)
library(dplyr)
library(tidyr)
library(labelled)
# Keep only the four questionnaire items of interest.
subdat <- data %>%
  select(Q01:Q04)

# Lookup table: one row per item (`name`) holding its variable label (`value`).
d1 <- subdat %>%
  summarise(across(everything(), var_label)) %>%
  pivot_longer(everything())

subdat %>%
  pivot_longer(everything(), values_to = 'val') %>%
  left_join(d1, by = 'name') %>%
  # Replace the item code (Q01, ...) with its variable label.
  mutate(name = value, value = NULL) %>%
  count(name, val) %>%
  # count() returns ungrouped data, so group by item first: the percentages
  # must sum to 100 within each bar, not across all bars together.
  group_by(name) %>%
  mutate(n = round(n / sum(n) * 100, 2)) %>%
  ungroup() %>%
  # Look up the value label for each answer code.
  # NOTE(review): this indexes the label vector by position, which only matches
  # codes equal to their position (1..5); confirm the handling of code 9.
  mutate(labels = names(val_labels(val)[val])) %>%
  ggplot(aes(x = name, y = n, fill = labels)) +
  geom_col() +
  # Print each percentage in the middle of its bar segment.
  geom_text(aes(label = as_factor(n)),
            position = position_stack(.5)) +
  coord_flip() +
  theme(aspect.ratio = 1/3) +
  scale_fill_brewer(palette = "Set2")
-output

x-axis starting value for diverging plot

How can I change the "x-axis starting value" from the diverging bar chart below (extracted from here), so that the vertical axis is set at 25 instead of 0. And therefore the bars are drawn from 25 and not 0.
For instance, I want this chart:
To look like this:
EDIT
It is not the label I want to change; it is how the data is plotted. My apologies if I wasn't clear. See the example below:
Another example to make it clear:
You can provide computed labels to an (x-)scale via scale_x_continuous(labels = function (x) x + 25).
If you also want to change the data, you’ll first need to offset the x-values by the equivalent amount (in the opposite direction):
Example:
# Toy data: one divergence value per colour.
df <- tibble(
  Color = c('red', 'green', 'blue'),
  Divergence = c(5, 10, -5)
)
# Value at which the bars should start.
offset <- 2

# Shift the data down by the offset, then add it back in the axis labels so
# the axis still shows the original values.
df %>%
  mutate(Divergence = Divergence - offset) %>%
  ggplot(aes(x = Divergence, y = Color)) +
  geom_col() +
  scale_x_continuous(labels = function(value) value + offset)
I'm still not 100% clear on your intended outcome but you can "shift" your data by adding/subtracting 25 from each value, e.g.
Original plot:
library(tidyverse)
library(gapminder)
# Fix the random seed so sample_n() draws the same 15 countries every run.
set.seed(123)

# Percentage change in GDP per capita between 1997 and 2007, per country.
gapminder_subset <- gapminder %>%
  pivot_longer(-c(country, continent, year)) %>%
  filter(year %in% c("1997", "2007"), name == "gdpPercap") %>%
  select(-continent, -name) %>%
  # One column per year, so the two years can be compared row-wise.
  pivot_wider(names_from = year) %>%
  mutate(gdp_change = ((`2007` - `1997`) / `1997`) * 100) %>%
  sample_n(15)

# Horizontal bars of the unshifted change (bars start at 0).
ggplot(gapminder_subset, aes(x = country, y = gdp_change)) +
  geom_col() +
  coord_flip()
subtract 25:
library(tidyverse)
library(gapminder)
# Fix the random seed so sample_n() draws the same 15 countries every run.
set.seed(123)

# Percentage change in GDP per capita between 1997 and 2007, per country.
gapminder_subset <- gapminder %>%
  pivot_longer(-c(country, continent, year)) %>%
  filter(year == "1997" | year == "2007") %>%
  select(-continent) %>%
  filter(name == "gdpPercap") %>%
  pivot_wider(names_from = year) %>%
  select(-name) %>%
  mutate(gdp_change = ((`2007` - `1997`) / `1997`) * 100) %>%
  sample_n(15)

# Shift every value down by 25 so that a change of 25 sits on the zero line;
# without this subtraction the plot is identical to the original one.
ggplot(data = gapminder_subset,
       aes(x = country, y = gdp_change - 25)) +
  geom_bar(stat = "identity") +
  coord_flip()
If you combine that with my original relabelling I think that's the solution:
# Bars are drawn from the shifted values, while the axis labels are moved back
# up by 25 so they read as the original percentages.
ggplot(gapminder_subset, aes(x = country, y = gdp_change - 25)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(
    breaks = seq(-25, 50, by = 25),
    labels = c(0, 25, 50, 75)
  )
The answers that existed at the time that I'm writing this are suggesting to change the data or to change the label. Here, I'm proposing to change neither the data nor the labels, and instead just change where the starting position of a bar is.
First, for reproducibility, I took #jared_mamrot's approach for the data subset.
library(gapminder)
library(tidyverse)
# Seed the RNG so the 15 sampled countries are reproducible.
set.seed(123)

# Relative GDP-per-capita change (in percent) from 1997 to 2007.
gapminder_subset <- gapminder %>%
  select(-continent) %>%
  pivot_longer(-c(country, year)) %>%
  filter(name == "gdpPercap", year == "1997" | year == "2007") %>%
  # Spread the two years into their own columns.
  pivot_wider(names_from = year) %>%
  select(-name) %>%
  mutate(gdp_change = ((`2007` - `1997`) / `1997`) * 100) %>%
  sample_n(15)
Then, you can set xmin = after_scale(25). You'll get a warning that xmin doesn't exist, but it does exist after the bars are reparameterised to rectangles in the ggplot2 internals (which happens after the x-scale has seen the data to determine its limits). This effectively changes the position where the bars start.
# Horizontal bars whose left edge is overridden to 25 via after_scale().
ggplot(gapminder_subset, aes(x = gdp_change, y = country)) +
  geom_col(mapping = aes(xmin = after_scale(25)))
#> Warning: Ignoring unknown aesthetics: xmin
Created on 2021-06-28 by the reprex package (v1.0.0)

how to control the color of ggrepel segments

Going to try this again with a better MRE...for context, here's the product I'm currently trying to improve
What I'm trying to do is get the lines from the endpoints to the labels to be the same color as the data lines.
For purposes of this question we can work with this script
library(ggplot2)
library(babynames)
library(dplyr)
library(ggrepel)
library(ggsci)
# Yearly counts for four female names.
data <- babynames %>%
filter(name %in% c("Ashley", "Patricia", "Mary", "Minnie")) %>%
filter(sex=="F")
# Year-over-year change per name and its mean (NA from lag() is ignored).
data <- data %>% group_by(name) %>%
mutate(change = n - lag(n)) %>%
mutate(meanC = mean(change, na.rm = TRUE)) %>%
ungroup()
# Label text: the name and the rounded mean change on two lines.
data$label <- paste(data$name,"\n",round(data$meanC,0),sep="" )
minYear = min(data$year)
maxYear = max(data$year)
#endpoint layer
# Last observed year of every name; used for the end points and the labels.
Endpoints <- data %>%
group_by(name) %>%
filter(year == max(year)) %>%
select(year, name, n, label) %>%
ungroup()
namePlot <- data %>%
ggplot(mapping = aes(x=year, y=n)) +
geom_line(aes(color=name), show.legend = FALSE) +
# Extend the x range by 10 years to make room for the labels.
coord_cartesian(xlim = c(minYear, maxYear+10)) +
scale_color_ucscgb() +
geom_point(data = Endpoints, size=1.5, shape=21,
aes(color=name, fill=name), show.legend=FALSE) +
# Label colour is set outside aes(): red when the label contains a minus sign
# followed by a digit, forestgreen otherwise.
geom_label_repel(data=Endpoints, aes(label=label),
color = c("forestgreen","red")[1+grepl("\\-\\d",Endpoints$label)],
show.legend = FALSE,
vjust = 0, xlim=c(maxYear+3,maxYear+10), size=3, direction='y')
print(namePlot)
which produces this plot
The color of the labels is controlled by color = c("forestgreen","red")[1+grepl("\\-\\d",Endpoints$label)], so that, in this case, data with a positive value in the label is green and data with a negative value is red. What I'd like to do is make the connecting lines from the endpoints to the label boxes the same color as the data lines, which are controlled by geom_line(aes(color=name),show.legend = FALSE).
In the ggrepel docs there is a segment.color parameter that can control the color of the line segment, but it is not an aesthetic. So it appears it has to be "hard-coded" like segment.color="red" which doesn't really help me. I also found this discussion about the issue that seemed to present a solution, but I have been unable to get it to work. Part of the issue there is that it involves scale_color_discrete(aesthetics = c("color", "segment.color")) and I already have scale_color_ucscgb() so I get a warning about replacing scales...
Any guidance would be most appreciated.
Working version based on guidance from #aosmith
library(ggplot2)
library(babynames)
library(dplyr)
library(ggrepel)
library(ggsci)
# Yearly counts for four female names.
data <- babynames %>%
  filter(sex == "F", name %in% c("Ashley", "Patricia", "Mary", "Minnie"))

# Year-over-year change per name and its mean (the NA from lag() is dropped).
data <- data %>%
  group_by(name) %>%
  mutate(
    change = n - lag(n),
    meanC = mean(change, na.rm = TRUE)
  ) %>%
  ungroup()

# Label text: the name and the rounded mean change on two lines.
data$label <- paste0(data$name, "\n", round(data$meanC, 0))
minYear <- min(data$year)
maxYear <- max(data$year)

# Endpoint layer: the last observed year of every name.
Endpoints <- data %>%
  group_by(name) %>%
  filter(year == max(year)) %>%
  ungroup() %>%
  select(year, name, n, label)

namePlot <- ggplot(data, aes(x = year, y = n)) +
  geom_line(aes(color = name), show.legend = FALSE) +
  # Extend the x range by 15 years to make room for the labels.
  coord_cartesian(xlim = c(minYear, maxYear + 15)) +
  geom_point(
    data = Endpoints,
    aes(color = name, fill = name),
    size = 1.5, shape = 21, show.legend = FALSE
  ) +
  # segment.color is mapped to the name, so the connector lines follow the
  # colour scale; the label colour itself stays fixed (red when the label
  # contains a minus sign followed by a digit, forestgreen otherwise).
  geom_label_repel(
    data = Endpoints,
    aes(label = label, segment.color = name),
    color = ifelse(grepl("\\-\\d", Endpoints$label), "red", "forestgreen"),
    show.legend = FALSE,
    force = 50,
    vjust = 0, xlim = c(maxYear + 5, maxYear + 12), size = 3, direction = 'y'
  ) +
  # One discrete scale drives both the line colour and the segment colour.
  scale_color_discrete(aesthetics = c("color", "segment.color"))
print(namePlot)
produces

Add ylim to geom_col

I would like to see the y-axis (in the plot is flipped) starting at some arbitrary value, like 7.5
After a little bit of researching, I came across ylim, but in this case is giving me some
errors:
Scale for 'y' is already present. Adding another scale for 'y', which will
replace the existing scale.
Warning message:
Removed 10 rows containing missing values (geom_col).
This is my code, and a way to download the data I'm using:
install.packages("remotes")
remotes::install_github("tweed1e/werfriends")
library(werfriends)
friends_raw <- werfriends::friends_episodes
library(tidytext)
library(tidyverse)
#"best" writers with at least 10 episodes
# One row per (episode, writer), then the mean rating and episode count per writer.
friends_raw %>%
unnest(writers) %>%
group_by(writers) %>%
summarize(mean_rating = mean(rating),
n = n()) %>%
# Highest-rated first; keep writers with more than 10 episodes, top 10 only.
arrange(desc(mean_rating)) %>%
filter(n > 10) %>%
head(10) %>%
# Order the factor levels by rating so the bars are sorted.
mutate(writers = fct_reorder(writers, mean_rating)) %>%
ggplot(aes(x = writers, y = mean_rating, fill = writers)) + geom_col() +
coord_flip() + theme(legend.position = "None") + scale_y_continuous(breaks = seq(7.5,10,0.5)) +
# NOTE(review): ylim() adds a second y scale, which replaces the
# scale_y_continuous() above (hence the "Scale for 'y' is already present"
# message) and is the source of the "Removed 10 rows" warning.
ylim(7.5,10)
You should use coord_cartesian for zoom in a particular location (here the official documentation: https://ggplot2.tidyverse.org/reference/coord_cartesian.html).
With your example, your code should be something like that:
# One row per (episode, writer), then the mean rating and episode count per writer.
friends_raw %>%
unnest(writers) %>%
group_by(writers) %>%
summarize(mean_rating = mean(rating),
n = n()) %>%
# Highest-rated first; keep writers with more than 10 episodes, top 10 only.
arrange(desc(mean_rating)) %>%
filter(n > 10) %>%
head(10) %>%
# Order the factor levels by rating so the bars are sorted.
mutate(writers = fct_reorder(writers, mean_rating)) %>%
ggplot(aes(x = writers, y = mean_rating, fill = writers)) + geom_col() +
coord_flip() + theme(legend.position = "None") + scale_y_continuous(breaks = seq(7.5,10,0.5)) +
# NOTE(review): a plot has a single coordinate system, so this coord_cartesian()
# replaces the coord_flip() added above: the zoom is applied, but the plot is
# no longer flipped. coord_flip(ylim = ...) does both at once.
coord_cartesian(ylim = c(7.5,10))
If this is not working please provide a reproducible example of your dataset (see: How to make a great R reproducible example)
I found out the solution. With my actual plot, the answer submitted by #dc37 didn't work because coord_flip() and coord_cartesian() exclude each other. So the way to do this is:
# Mean rating and episode count per writer, one row per (episode, writer) first.
friends_raw %>%
  unnest(writers) %>%
  group_by(writers) %>%
  summarize(mean_rating = mean(rating), n = n()) %>%
  # Only writers with more than 10 episodes, sorted by ascending rating.
  filter(n > 10) %>%
  arrange(mean_rating) %>%
  slice_head(n = 10) %>%
  # Order the factor levels by rating so the bars are sorted.
  mutate(writers = fct_reorder(writers, mean_rating)) %>%
  ggplot(aes(x = writers, y = mean_rating, fill = writers)) +
  geom_col() +
  theme(legend.position = "None") +
  # Flip and zoom in a single coordinate system; zooming does not drop the bars.
  coord_flip(ylim = c(8, 8.8))

make geom_bar show values in the ascending order

Although my query shows me values in descending order, ggplot then displays them alphabetically instead of ascending order.
Known solutions to this problem haven't seemed to work. They suggest using reorder or factor on the values, which didn't work in this case.
This is my code:
# Number of movies per studio.
boxoffice %>%
group_by(studio) %>%
summarise(movies_made = n()) %>%
arrange(desc(movies_made)) %>%
# Keep the 10 largest studios (top_n() without `wt` ranks on the last column).
top_n(10) %>%
arrange(desc(movies_made)) %>%
# `studio` is mapped to x as it is, so the row order set by arrange() has no
# effect on the order of the bars.
ggplot(aes(x = studio, y = movies_made, fill = studio, label = as.character(movies_made))) +
geom_bar(stat = 'identity') +
geom_label(label.size = 1, size = 5, color = "white") +
theme(legend.position = "none") +
ylab("Movies Made") +
xlab("Studio")
for those wanting a more complete example, here's where I got:
library(dplyr)
library(ggplot2)
# get some dummy data
boxoffice <- boxoffice::boxoffice(dates = as.Date("2017-1-1"))

# Movies per distributor, top 10 only, with a factor ordered by movie count.
df <- boxoffice %>%
  group_by(distributor) %>%
  summarise(movies_made = n()) %>%
  # Rank explicitly on movies_made and do it before adding the factor:
  # a bare top_n(10) ranks on the last column, which would be `studio`.
  top_n(10, movies_made) %>%
  mutate(studio = reorder(distributor, -movies_made))

# Plot the reordered factor; mapping `distributor` would fall back to
# alphabetical order.
ggplot(df, aes(x = studio, y = movies_made)) + geom_col()
You'll need to convert boxoffice$studio to an ordered factor. ggplot will then respect the order of rows in the data set, rather than alphabetizing. Your dplyr chain will look like this:
# Number of movies per studio, largest first.
boxoffice %>%
  group_by(studio) %>%
  summarise(movies_made = n()) %>%
  arrange(desc(movies_made)) %>%
  ungroup() %>% # ungroup
  # Use the current (sorted) row order as the factor levels so ggplot keeps it;
  # TRUE is spelled out because T can be reassigned.
  mutate(studio = factor(studio, studio, ordered = TRUE)) %>% # convert variable
  top_n(10) %>%
  arrange(desc(movies_made)) %>%
ggplot(aes(x = studio, y... (rest of plotting code)

Resources