Text mining frequency with ggplot - r

I am working with a dataset called HappyDB for a class presentation and analyzing demographic differences in word frequency. I'm using tidytext for most of the analyses, and using their online guide to create most of my visuals. However, I'm running into a problem with the code to create the frequency plot of words with labels. My dataset is structured differently from theirs, and I thought I was accounting for it but I evidently was not. This is their sample code to generate the graph (comparing Jane Austen with the Bronte sisters and H.G. Wells)
# Reference example: compare word proportions between Jane Austen and two
# other author groups, then plot them against each other on log scales.
library(tidyr)
# Stack the three tidied corpora, tagging every row with its author group.
frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"),
mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
# Keep only the first run of lowercase letters/apostrophes in each token.
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
# Turn counts into each word's share of its author's total.
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
# Widen to one column per author, then gather the two comparison authors
# back into long form; `Jane Austen` stays as its own column.
spread(author, proportion) %>%
gather(author, proportion, `Brontë Sisters`:`H.G. Wells`)
library(scales)
# expect a warning about rows with missing values being removed
# Note the backticks around `Jane Austen`: they refer to the column.
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
# Dashed reference line through the origin with slope 1 (geom_abline defaults).
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
# One panel per comparison author.
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
And that code generates this plot:
I'm hoping to emulate this with demographics in my dataset, but keep getting errors. Here is my code, which uses a dataset that I have already tidied:
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidytext)
library(stringr)
windowsFonts(Franklin=windowsFont("Franklin Gothic Demi"))
# Split the tidied HappyDB tokens into three demographic subsets.
marriedmen <- tidy_hm[which(tidy_hm$marital =="married" &
tidy_hm$gender == "m"),]
marriedwomen <- tidy_hm[which(tidy_hm$marital =="married" &
tidy_hm$gender == "f"),]
singlemen <- tidy_hm[which(tidy_hm$marital =="single" &
tidy_hm$gender == "m"),]
# Stack the subsets with a status label and compute each word's share of
# its group's total word count.
frequency <- bind_rows(mutate(marriedmen, status = "Married men"),
mutate(marriedwomen, status = "Married women"),
mutate(singlemen, status = "Single men")) %>%
count(status, word) %>%
group_by(status) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
# Widen to one column per status, then gather the two comparison groups
# back into long form; `Married men` stays as its own column.
spread(status, proportion) %>%
gather(status, proportion, `Married women`:`Single men`)
library(scales)
# expect a warning about rows with missing values being removed
# NOTE(review): y = 'Married men' below is a quoted string constant, not the
# `Married men` column (the color aesthetic on the same line uses backticks).
# A character y passed to scale_y_log10() is the likely source of the
# "non-numeric argument to mathematical function" error; writing
# y = `Married men` should resolve it.
ggplot(frequency, aes(x = proportion, y = 'Married men', color = abs(`Married men` - proportion)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
# One panel per comparison group.
facet_wrap(~status, ncol = 2) +
theme(legend.position="none") +
labs(y = NULL, x = NULL)
But I keep getting this error:
Error in log(x, base) : non-numeric argument to mathematical function
I tried removing the scale rows, but that caused a bunch of data to be eliminated, and the plot didn't look anything like it was supposed to: it had no line, labels, or colors. I'm pretty new to R and coding in general, so any help is appreciated.

Related

3 layer donut chart in R

I am trying to recreate this image in R, however I am unable to work out how to have 3 layers to a donut chart - everything I find (for instance, webr::PieDonut) only allows 2. Using ggplot I am also unable to re-create it.
A MRE is:
library(ggplot2)
library(webr)
library(dplyr)
# Example data: 15 descriptors arranged in three nested levels.
lexicon <- data.frame("Level1" = c(rep("Flavour", 11), rep("Appearance", 4)),
"Level2" = c(rep("Misc", 6), rep("Pungent", 5), rep("Colour", 4)),
"Level3" = c("Fresh", "Refreshing", "Soapy", "Minty", "Nutty", "Milky", "Peppery", "Sharp", "Horseradish", "Mustard hot", "Spicy", "Colourful"," Fresh Green", "Dark Green", "Bright Green")
)
# Attempt 1: webr's PieDonut, given only the first two levels.
PieDonut(lexicon, aes(Level1, Level2), title = "Salad Lexicon", showRatioDonut =FALSE, showRatioPie = FALSE)
# Attempt 2: stacked columns wrapped into polar coordinates along y.
ggplot(lexicon, aes(Level2, Level3, fill = Level1)) +
geom_col() +
scale_fill_viridis_d() +
coord_polar("y")
While the PieDonut works for 2 levels (not shown), it doesn't allow the final level to be included. The ggplot approach also does not work, as seen in the figure below.
How can I get this style of chart in R? Either with ggplot or base plotting.
I think a nice alternative is to use geom_rect here after some data manipulation. Using the fill, color, and alpha scales can help improve the differentiation of categories. I would also use geom_textpath here, though I might go for circumferential labels if there is room to do so:
# Three-ring donut: build one rectangle per category at each hierarchy level,
# then bend the rectangles into rings with polar coordinates.
lexicon %>%
  # Keep a copy of the top level so every ring can be coloured by it.
  mutate(top_level = Level1) %>%
  # tidyr is not attached by the question's MRE, so call it by namespace.
  tidyr::pivot_longer(Level1:Level3) %>%
  group_by(name, value) %>%
  # width = number of original rows that fall under this category.
  mutate(width = n()) %>%
  unique() %>%
  arrange(name) %>%
  group_by(name) %>%
  mutate(
    # Ring index taken from the digit in "Level1" / "Level2" / "Level3".
    ymid = as.numeric(sub("\\D+", "", name)),
    ymax = ymid + 0.5,
    ymin = ymid - 0.5,
    # Lay categories end to end along x within each ring.
    xmin = c(0, head(cumsum(width), -1)),
    xmax = cumsum(width),
    xmid = (xmax + xmin) / 2
  ) %>%
  # Grouping is no longer needed once the positions are computed.
  ungroup() %>%
  ggplot(aes(xmid, ymid, fill = top_level)) +
  geom_rect(aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax,
                alpha = name, color = top_level)) +
  geomtextpath::geom_textpath(aes(y = ymid + 0.25, label = value,
                                  group = value)) +
  # Outer rings are drawn progressively more transparent.
  scale_alpha_manual(values = c(1, 0.3, 0.1)) +
  scale_fill_manual(values = c("#cd9900", "#00817e")) +
  scale_colour_manual(values = c("#cd9900", "#00817e")) +
  # Lower limit below the first ring leaves a hole in the middle.
  scale_y_continuous(limits = c(-0.5, 3.6)) +
  coord_polar() +
  theme_void() +
  theme(legend.position = "none")
One option would be to reshape your data to long format and do some manual aggregation before passing it to ggplot. Additionally, I use geomtextpath::geom_textpath to add the labels:
library(ggplot2)
library(dplyr)
library(geomtextpath)
# Example data: 15 descriptors arranged in three nested levels.
lexicon <- data.frame(
  Level1 = c(rep("Flavour", 11), rep("Appearance", 4)),
  Level2 = c(rep("Misc", 6), rep("Pungent", 5), rep("Colour", 4)),
  Level3 = c(
    "Fresh", "Refreshing", "Soapy", "Minty", "Nutty", "Milky", "Peppery",
    "Sharp", "Horseradish", "Mustard hot", "Spicy", "Colourful",
    " Fresh Green", "Dark Green", "Bright Green"
  )
)

# Long format: one row per (level, label) with its count and its share of
# that level, carrying the top-level category along as the fill colour.
lexicon_long <- lexicon |>
  mutate(fill = Level1) |>
  tidyr::pivot_longer(cols = -fill, names_to = "level", values_to = "label") |>
  # Preserve first-appearance order so the stacking follows the data order.
  mutate(label = forcats::fct_inorder(label)) |>
  count(fill, level, label) |>
  group_by(level) |>
  mutate(pct = n / sum(n))

# One stacked column per level; coord_polar("y") wraps the columns into rings.
ggplot(lexicon_long, aes(x = level, y = pct, fill = fill)) +
  geom_col(color = "white") +
  geom_textpath(
    aes(label = label, group = label),
    position = position_stack(vjust = 0.5),
    upright = TRUE, hjust = 0.5, size = 3
  ) +
  scale_fill_viridis_d() +
  coord_polar("y") +
  theme_void() +
  guides(fill = "none")

R label with commas but no decimals

My goal is to produce labels with commas, but no decimals. Let's say I have a ggplot with the following section:
# Text layer placed at each value; the label is the value rounded and then
# formatted by scales::comma().
geom_text(aes(y = var,
label = scales::comma(round(var))), hjust = 0, nudge_y = 300 )
This is almost what I need. It gives me the commas, but has a decimal. I have seen here (axis labels with comma but no decimals ggplot) that comma_format() could be good, but I think the label in my case needs a data argument, which comma_format() does not take. What can I do?
Update:
As an example of when this problem occurs, see the following, which uses gganimate and has a lot more going on. Code derived from Jon Spring's answer at Animated sorted bar chart with bars overtaking each other
library(gapminder)
library(gganimate)
library(tidyverse)
# Interpolate Asian GDP per capita in two passes: yearly (to compute ranks),
# then half-yearly (for smoother animation frames).
gap_smoother <- gapminder %>%
  filter(continent == "Asia") %>%
  group_by(country) %>%
  # Pass 1: add every missing year and spline-interpolate GDP per capita.
  complete(year = full_seq(year, 1)) %>%
  mutate(gdpPercap = spline(x = year, y = gdpPercap, xout = year)$y) %>%
  group_by(year) %>%
  # Multiplying by 1 turns the integer rank into a double so that it can
  # take fractional values when interpolated below.
  mutate(rank = min_rank(-gdpPercap) * 1) %>%
  ungroup() %>%
  group_by(country) %>%
  # Pass 2: add half-year rows, re-interpolate GDP, and linearly
  # interpolate the ranks computed in pass 1.
  complete(year = full_seq(year, .5)) %>%
  mutate(gdpPercap = spline(x = year, y = gdpPercap, xout = year)$y) %>%
  mutate(rank = approx(x = year, y = rank, xout = year)$y) %>%
  ungroup() %>%
  arrange(country, year)

# Restrict to 1999-2007.
gap_smoother2 <- gap_smoother %>% filter(year <= 2007 & year >= 1999)
# Keep the top eight ranks. A plain pipe is used here: magrittr's `%<>%` is
# not attached by library(tidyverse) and would also overwrite gap_smoother2.
gap_smoother3 <- gap_smoother2 %>% filter(rank <= 8)
# Animated horizontal bar chart: one tile per country, positioned by rank.
p <- ggplot(gap_smoother3, aes(rank, group = country,
                               fill = as.factor(country),
                               color = as.factor(country))) +
  # Tiles are centred at half the value so each bar starts at zero.
  geom_tile(aes(y = gdpPercap / 2,
                height = gdpPercap,
                width = 0.9), alpha = 0.8, color = NA) +
  # Country names to the left of the bars (needs clip = "off" below).
  geom_text(aes(y = 0, label = paste(country, " ")), vjust = 0.2, hjust = 1) +
  # Value labels at the bar ends; this is the label call the question is about.
  geom_text(aes(y = gdpPercap,
                label = scales::comma(round(gdpPercap))),
            hjust = 0, nudge_y = 300) +
  coord_flip(clip = "off", expand = FALSE) +
  scale_x_reverse() +
  # "none" replaces the deprecated FALSE for switching guides off.
  guides(color = "none", fill = "none") +
  labs(title = '{closest_state %>% as.numeric %>% floor}',
       x = "", y = "GDP per capita") +
  theme(plot.title = element_text(hjust = 0, size = 22),
        axis.ticks.y = element_blank(), # These relate to the axes post-flip
        axis.text.y = element_blank(),  # These relate to the axes post-flip
        plot.margin = margin(1, 1, 1, 4, "cm")) +
  transition_states(year, transition_length = 1, state_length = 0) +
  enter_grow() +
  exit_shrink() +
  ease_aes('linear')

animate(p, fps = 2, duration = 5, width = 600, height = 500)
In addition to the solution provided by @drf, you need to add scale_y_continuous(labels = scales::comma) to your ggplot commands, and it must come before the coord_flip() call.
# Abridged version of the question's plot; only the changed lines are shown.
p <- ggplot(gap_smoother3, aes(rank, group = country,
fill = as.factor(country), color = as.factor(country))) +
geom_tile(aes(y = gdpPercap/2,
height = gdpPercap,
width = 0.9), alpha = 0.8, color = NA) +
# accuracy = 1 asks scales::comma() to format to whole numbers.
geom_text(aes(y = gdpPercap,
label = scales::comma(round(gdpPercap), accuracy=1)),
hjust = 0, nudge_y = 300 ) +
# Comma-formatted axis labels for the value axis.
scale_y_continuous(labels = scales::comma) +
... etc.

sf map add missing values with inter_join

I have a technical question for you please.
# Read the shapefile, make Groups a factor with levels "1".."23", and attach
# the observation data by city code.
read_sf("map.shp") %>% mutate(Groups = as.factor(Groups)) %>%
mutate(Groups = factor(Groups, levels = c(paste0(1:23)))) %>%
left_join(data, by = "cities_code") %>%
# Draw the city polygons with their borders, filled by Groups
# (23 levels, hence 23 colours).
ggplot() +
geom_sf(aes(fill = Groups), size = 0.4) +
# Add one point per city, sized by its number of observations.
stat_sf_coordinates(aes(size = observation)) +
scale_radius(range = c(1, 6)) +
# Outline each Group: the layer data is the plot data summarised per Group.
geom_sf(fill = "transparent", color = "gray20", size = 1, data = . %>% group_by(Groups) %>% summarise()) +
theme_bw()
This map represents exactly what I want. It represent cities of one state subdivided by district ("Groups"). But between my map.shp and my data I have a difference of 50 cities, because there is no observation in these cities (so no point of "stat_sf_coordinates(aes(size = observation))").
I can find the difference with anti_join(data, by = "cities_code").
I would like to have the same map, but with the missing cities colored in red, please.
Thank you
It was simple :
# Same map as in the question, plus a red layer for the cities that have no
# matching row in `data`.
read_sf("map.shp") %>%
  mutate(Groups = as.factor(Groups)) %>%
  mutate(Groups = factor(Groups, levels = as.character(1:23))) %>%
  left_join(data, by = "cities_code") %>%
  ggplot() +
  geom_sf(aes(fill = Groups), size = 0.4) +
  stat_sf_coordinates(aes(size = observation)) +
  scale_radius(range = c(1, 6)) +
  # Layer data given as a function of the plot data: keep only the cities
  # whose code does not appear in `data`, and paint them red.
  geom_sf(
    data = function(cities) anti_join(cities, data, by = "cities_code"),
    fill = "red", color = "gray40", size = 0.4
  ) +
  # Group outlines: the plot data summarised per Group.
  geom_sf(
    data = function(cities) summarise(group_by(cities, Groups)),
    fill = "transparent", color = "gray20", size = 1
  ) +
  theme_bw()

Animated sorted bar chart with bars overtaking each other

Edit: keyword is 'bar chart race'
How would you go about reproducing this chart from Jaime Albella in R?
See the animation on visualcapitalist.com or on twitter (giving several references in case one breaks).
I'm tagging this as ggplot2 and gganimate but anything that can be produced from R is relevant.
data (thanks to https://github.com/datasets/gdp )
# Download the GDP table.
gdp <- read.csv("https://raw.github.com/datasets/gdp/master/data/gdp.csv")

# remove irrelevant aggregated values
# Keywords that mark aggregate rows rather than individual countries.
words <- scan(
  text = "world income only total dividend asia euro america africa oecd",
  what = character()
)
# Build one alternation pattern: "(world)|(income)|...".
pattern <- paste(sprintf("(%s)", words), collapse = "|")
# Drop every row whose country name matches any keyword (case-insensitive).
is_aggregate <- grepl(pattern, gdp$Country.Name, ignore.case = TRUE)
gdp <- gdp[!is_aggregate, ]
Edit:
Another cool example from John Murdoch :
Most populous cities from 1500 to 2018
Edit: added spline interpolation for smoother transitions, without making rank changes happen too fast. Code at bottom.
I've adapted an answer of mine to a related question. I like to use geom_tile for animated bars, since it allows you to slide positions.
I worked on this prior to your addition of data, but as it happens, the gapminder data I used is closely related.
library(tidyverse)
library(gganimate)
library(gapminder)
theme_set(theme_classic())

# Rank Asian countries by GDP per capita within each year.
gap <- gapminder %>%
  filter(continent == "Asia") %>%
  group_by(year) %>%
  # The * 1 makes it possible to have non-integer ranks while sliding
  mutate(rank = min_rank(-gdpPercap) * 1) %>%
  ungroup()

# One tile per country, positioned by rank; geom_tile lets bars slide
# between positions during transitions.
p <- ggplot(gap, aes(rank, group = country,
                     fill = as.factor(country), color = as.factor(country))) +
  # Tiles are centred at half the value so each bar starts at zero.
  geom_tile(aes(y = gdpPercap / 2,
                height = gdpPercap,
                width = 0.9), alpha = 0.8, color = NA) +
  # text in x-axis (requires clip = "off" in coord_*)
  # paste(country, " ") is a hack to make pretty spacing, since hjust > 1
  # leads to weird artifacts in text spacing.
  geom_text(aes(y = 0, label = paste(country, " ")), vjust = 0.2, hjust = 1) +
  coord_flip(clip = "off", expand = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_reverse() +
  # "none" replaces the deprecated FALSE for switching guides off.
  guides(color = "none", fill = "none") +
  labs(title = '{closest_state}', x = "", y = "GDP per capita") +
  theme(plot.title = element_text(hjust = 0, size = 22),
        axis.ticks.y = element_blank(), # These relate to the axes post-flip
        axis.text.y = element_blank(),  # These relate to the axes post-flip
        plot.margin = margin(1, 1, 1, 4, "cm")) +
  transition_states(year, transition_length = 4, state_length = 1) +
  ease_aes('cubic-in-out')

animate(p, fps = 25, duration = 20, width = 800, height = 600)
For the smoother version at the top, we can add a step to interpolate the data further before the plotting step. It can be useful to interpolate twice, once at rough granularity to determine the ranking, and another time for finer detail. If the ranking is calculated too finely, the bars will swap position too quickly.
# Two-pass interpolation of Asian GDP per capita: a coarse yearly pass that
# fixes the ranking, then a finer pass that fills in values between years.
gap_smoother <- gapminder %>%
  filter(continent == "Asia") %>%
  group_by(country) %>%
  # Coarse pass: one row per year. Ranking on a finer grid would make the
  # bars swap places unpleasantly fast.
  complete(year = full_seq(year, 1)) %>%
  mutate(gdpPercap = spline(x = year, y = gdpPercap, xout = year)$y) %>%
  group_by(year) %>%
  mutate(rank = min_rank(-gdpPercap) * 1) %>%
  ungroup() %>%
  # Fine pass: half-year steps, so the value labels tick quickly.
  group_by(country) %>%
  complete(year = full_seq(year, .5)) %>%
  mutate(
    gdpPercap = spline(x = year, y = gdpPercap, xout = year)$y,
    # Ranks from the coarse pass are interpolated linearly with approx();
    # spline() here would give a bouncy effect.
    rank = approx(x = year, y = rank, xout = year)$y
  ) %>%
  ungroup() %>%
  arrange(country, year)
Then the plot uses a few modified lines, otherwise the same:
# Abridged: only the lines that differ from the first plot are shown;
# "..." stands for the unchanged parts.
p <- ggplot(gap_smoother, ...
  # This line for the numbers that tick up
  # accuracy = 1 keeps the interpolated values from printing decimals.
  geom_text(aes(y = gdpPercap,
                label = scales::comma(gdpPercap, accuracy = 1)),
            hjust = 0, nudge_y = 300) +
  ...
  # Show only the whole year in the title while stepping through half years.
  labs(title = '{closest_state %>% as.numeric %>% floor}',
       x = "", y = "GDP per capita") +
  ...
  transition_states(year, transition_length = 1, state_length = 0) +
  enter_grow() +
  exit_shrink() +
  ease_aes('linear')

animate(p, fps = 20, duration = 5, width = 400, height = 600, end_pause = 10)
This is what I came up with so far, based in good part on @Jon's answer.
# Bar race of the ten highest-GDP rows per year, with bar lengths expressed
# relative to that year's leader.
p <- gdp %>%
  # build rank, labels and relative values
  group_by(Year) %>%
  mutate(
    Rank = rank(-Value),
    Value_rel = Value / Value[Rank == 1],
    # Label in billions, with leading padding to offset it from the bar end.
    Value_lbl = paste0(" ", round(Value / 1e9))
  ) %>%
  group_by(Country.Name) %>%
  # keep top 10
  filter(Rank <= 10) %>%
  # plot
  ggplot(aes(-Rank, Value_rel, fill = Country.Name)) +
  geom_col(width = 0.8, position = "identity") +
  coord_flip() +
  # country label
  geom_text(aes(-Rank, y = 0, label = Country.Name, hjust = 0)) +
  # value label
  geom_text(aes(-Rank, y = Value_rel, label = Value_lbl, hjust = 0)) +
  theme_minimal() +
  theme(legend.position = "none", axis.title = element_blank()) +
  # animate along Year
  transition_states(Year, transition_length = 4, state_length = 1)

animate(p, 100, fps = 25, duration = 20, width = 800, height = 600)
I might come back to improve it.
The moving grid could be simulated by removing the actual grid and having geom_segment lines moving and fading out thanks to an alpha parameter changing when it approaches 100 billion.
To have labels changing values between years (which gives a nice feeling of urgency in the original chart) I think we have no choice but multiplying the rows while interpolating labels, we'll need to interpolate Rank too.
Then with a few minor cosmetic changes we should be pretty close.
This is what I came up with; I just used Jon's and Moody's code as a template and made a few changes.
library(tidyverse)
library(gganimate)
library(gapminder)
theme_set(theme_classic())

# Download the GDP table and drop the aggregate (non-country) rows.
gdp <- read.csv("https://raw.github.com/datasets/gdp/master/data/gdp.csv")
words <- scan(
  text = "world income only total dividend asia euro america africa oecd",
  what = character())
pattern <- paste0("(", words, ")", collapse = "|")
gdp <- subset(gdp, !grepl(pattern, Country.Name, ignore.case = TRUE))

# Rename the columns to the lowercase names used below.
colnames(gdp) <- gsub("Country.Name", "country", colnames(gdp))
colnames(gdp) <- gsub("Country.Code", "code", colnames(gdp))
colnames(gdp) <- gsub("Value", "value", colnames(gdp))
colnames(gdp) <- gsub("Year", "year", colnames(gdp))

# Express GDP in whole billions.
gdp$value <- round(gdp$value / 1e9)

# Rank within each year and keep the top ten.
gap <- gdp %>%
  group_by(year) %>%
  # The * 1 makes it possible to have non-integer ranks while sliding
  mutate(rank = min_rank(-value) * 1,
         Value_rel = value / value[rank == 1],
         Value_lbl = paste0(" ", value)) %>%
  filter(rank <= 10) %>%
  ungroup()

# One tile per country, positioned by rank.
p <- ggplot(gap, aes(rank, group = country,
                     fill = as.factor(country), color = as.factor(country))) +
  # Tiles are centred at half the value so each bar starts at zero.
  geom_tile(aes(y = value / 2,
                height = value,
                width = 0.9), alpha = 0.8, color = NA) +
  geom_text(aes(y = 0, label = paste(country, " ")), vjust = 0.2, hjust = 1) +
  geom_text(aes(y = value, label = Value_lbl, hjust = 0)) +
  coord_flip(clip = "off", expand = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_reverse() +
  # "none" replaces the deprecated FALSE for switching guides off.
  guides(color = "none", fill = "none") +
  labs(title = '{closest_state}', x = "", y = "GDP in billion USD",
       caption = "Sources: World Bank | Plot generated by Nitish K. Mishra #nitishimtech") +
  theme(plot.title = element_text(hjust = 0, size = 22),
        axis.ticks.y = element_blank(), # These relate to the axes post-flip
        axis.text.y = element_blank(),  # These relate to the axes post-flip
        plot.margin = margin(1, 1, 1, 4, "cm")) +
  transition_states(year, transition_length = 4, state_length = 1) +
  ease_aes('cubic-in-out')

# Render to a GIF file via gifski.
animate(p, 200, fps = 10, duration = 40, width = 800, height = 600, renderer = gifski_renderer("gganim.gif"))
Here I am using a duration of 40 seconds, which is slow. You can change the duration to make the animation faster or slower as needed.

R - How can I add a bivariate legend to my ggplot2 chart?

I'm trying to add a bivariate legend to my ggplot2 chart but I don't know whether (a) this is possible through some guides options and (b) how to achieve it.
The only way I've managed to produce something close to the desired outcome was by specifically creating a new chart which resembles a legend (named p.legend below) and inserting it, via the cowplot package, somewhere in the original chart (named p.chart below). But surely there must be a better way than this, given that this approach requires creating the legend in the first place and fiddling with its size/location to fit it in the original chart.
Here's code for a dummy example of my approach:
library(tidyverse)
# Create Dummy Data #
set.seed(876)
n <- 2
# One row per Area x Period, then long format with one row per
# (Area, Period, Type), where Type is "Objective" or "Performance".
df <- expand.grid(Area = LETTERS[1:n],
                  Period = c("Summer", "Winter"),
                  stringsAsFactors = FALSE) %>%
  # Performance is a random fraction (0-1) of Objective.
  mutate(Objective = runif(2 * n, min = 0, max = 2),
         Performance = runif(2 * n) * Objective) %>%
  gather(Type, Value, Objective:Performance)

# Original chart without legend #
# Two dodged column layers share the fill scale: Objective is drawn
# semi-transparent (alpha = 0.6) with the opaque Performance bars on top.
p.chart <- df %>%
  ggplot(., aes(x = Area)) +
  geom_col(data = . %>% filter(Type == "Objective"),
           aes(y = Value, fill = Period),
           position = "dodge", width = 0.7, alpha = 0.6) +
  geom_col(data = . %>% filter(Type == "Performance"),
           aes(y = Value, fill = Period),
           position = "dodge", width = 0.7) +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_fill_manual(values = c("Summer" = "#ff7f00", "Winter" = "#1f78b4"),
                    guide = "none") +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank())
# Create a chart resembling a legend #
# A 2 x 2 tile grid: Period on x (colour), Type on y (transparency).
p.legend <- expand.grid(Period = c("Summer", "Winter"),
                        Type = c("Objective", "Performance"),
                        stringsAsFactors = FALSE) %>%
  ggplot(., aes(x = Period,
                y = factor(Type, levels = c("Performance", "Objective")),
                fill = Period, alpha = Type)) +
  geom_tile() +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_fill_manual(values = c("Summer" = "#ff7f00", "Winter" = "#1f78b4"),
                    guide = "none") +
  # Objective alpha matches the 0.6 used for the Objective bars in p.chart,
  # so the legend shows the same shade as the chart.
  scale_alpha_manual(values = c("Objective" = 0.6, "Performance" = 1),
                     guide = "none") +
  ggtitle("Legend") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        rect = element_rect(fill = "transparent"),
        axis.title = element_blank(),
        panel.grid.major = element_blank())

# Add legend to original chart #
# Draw the chart on the full canvas, then overlay the legend plot in the
# upper-right area.
p.final <- cowplot::ggdraw() +
  cowplot::draw_plot(plot = p.chart) +
  cowplot::draw_plot(plot = p.legend, x = 0.5, y = 0.65,
                     width = 0.4, height = 0.28, scale = 0.7)

# Save chart #
# cowplot no longer exports ggsave(); use ggplot2's.
ggplot2::ggsave("Bivariate Legend.png", p.final, width = 8, height = 6, dpi = 500)
... and the resulting chart:
Is there an easier way of doing this?
This might work at some point, but right now the colorbox seems to ignore all breaks, names and labels (@ClausWilke?), probably because the multiscales package is in its very early stages.
Posting since it might work when future readers are here.
library(multiscales)
# Experimental: map Period and Type jointly onto one bivariate fill scale
# from the multiscales package.
df %>%
  mutate(
    # Numeric codes for the two variables that are zipped into the fill.
    period = as.numeric(factor(Period)),
    type = as.numeric(factor(Type))
  ) %>%
  ggplot(., aes(x = Area, y = Value, fill = zip(period, type),
                group = interaction(Area, Period))) +
  geom_col(width = 0.7, position = 'dodge') +
  bivariate_scale(
    "fill",
    pal_hue_sat(c(0.07, 0.6), c(0.4, 0.8)),
    guide = guide_colorbox(
      nbin = 2,
      name = c("Period", "Type"), #ignored
      breaks = list(1:2, 1:2), #ignored
      labels = list(levels(.$Period), levels(.$Type)) #ignored
    )
  ) # closes bivariate_scale(); this parenthesis was missing

Resources