sf map add missing values with inter_join - r

I have a technical question for you please.
read_sf("map.shp") %>% mutate(Groups = as.factor(Groups)) %>%
mutate(Groups = factor(Groups, levels = c(paste0(1:23)))) %>%
left_join(data, by = "cities_code") %>%
# Show map with cities border
ggplot() +
geom_sf(aes(fill = Groups), size = 0.4) +
# Color the different Groups, here 23 colors
stat_sf_coordinates(aes(size = observation)) +
# Put point with the size of my number of observations
scale_radius(range = c(1, 6)) +
geom_sf(fill = "transparent", color = "gray20", size = 1, data = . %>% group_by(Groups) %>% summarise()) +
# Show the border of my Groups
theme_bw()
This map represents exactly what I want. It represent cities of one state subdivided by district ("Groups"). But between my map.shp and my data I have a difference of 50 cities, because there is no observation in these cities (so no point of "stat_sf_coordinates(aes(size = observation))").
I can find the difference with anti_join(data, by = "cities_code").
I would like to have the same map but with the missing cities colored in red please please.
Thank you

It was simple :
read_sf("map.shp") %>% mutate(Groups = as.factor(Groups)) %>%
mutate(Groups = factor(Groups, levels = c(paste0(1:23)))) %>%
left_join(data, by = "cities_code") %>%
ggplot() +
geom_sf(aes(fill = Groups), size = 0.4) +
stat_sf_coordinates(aes(size = observation)) +
scale_radius(range = c(1, 6)) +
##
geom_sf(fill = "red", color = "gray40", size = 0.4, data = . %>% anti_join(data, by = "cities_code")) +
##
geom_sf(fill = "transparent", color = "gray20", size = 1, data = . %>% group_by(Groups) %>% summarise()) +
theme_bw()

Related

ggplot2 geom_segment by group

I am trying to draw separate line segments for each of the countries (A, B, C) in the plot.
I used the variable country for the group argument (as the docs suggest), but that does not work. The line is still a continuous line connecting all the text labels, but I need 3 separate lines to be drawn, one for each country, connecting the 3 text labels across the years.
library(dplyr)
library(ggplot2)
df_p <- data.frame(
year = rep(2019:2021, each = 3),
country = rep(LETTERS[1:3], 3),
var_a = c(1,6,10,2,5,7,3,7,9),
var_b = c(2,8,14,4,9,15,2,9,19)
)
df_p %>% arrange(country, year) %>%
ggplot(aes(x = var_a, y = var_b, color = country)) +
geom_text(aes(label = year)) +
geom_segment(
aes(
xend = c(tail(var_a, n = -1), NA),
yend = c(tail(var_b, n = -1), NA),
group = country
),
arrow = arrow(type = "open", length = unit(0.15, "inches"))
)
I think you just need geom_path instead of geom_segment.
Try this:
df_p %>% arrange(country, year) %>%
ggplot(aes(x = var_a, y = var_b, color = country)) +
geom_text(aes(label = year)) +
geom_path(
aes(
group = country
),
arrow = arrow(type = "open", length = unit(0.15, "inches"))
)
Another possible solution with geom_polygon() without showing the direction of the connections:
Sample data:
df_p <- data.frame(
year = rep(2019:2021, each = 3),
country = rep(LETTERS[1:3], 3),
var_a = c(1,6,10,2,5,7,3,7,9),
var_b = c(2,8,14,4,9,15,2,9,19)
)
Sample code:
library(dplyr)
library(ggplot2)
df_p %>%
arrange(country, year) %>%
ggplot(aes(x = var_a, y = var_b, group = country)) +
geom_point(aes(colour = country, shape = country), size = 4) +
geom_line(aes(colour = country), size = 1)+
geom_text(aes(label = year)) +
geom_polygon(
aes(
fill= country), alpha = .4)+
labs(x="Variable B",y="Variable A")+
theme_bw()
Output:

How to adjust ggrepel label on pie chart?

I am trying to create a pie chart to visualize percent abundance of 9 genera. However, the labels are all clumping together. How do I remedy this? Code included below:
generaabundance2020 <- c(883, 464, 1948, 1177, 2607, 962, 2073, 620, 2670)
genera2020 <- c("Andrena", "Ceratina", "Halictus",
"Hesperapis", "Lasioglossum", "Melissodes",
"Osmia", "Panurginus", "Other")
generabreakdown2020 <- data.frame(group = genera2020, value = generaabundance2020)
gb2020label <- generabreakdown2020 %>%
group_by(value) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `value` / sum(`value`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
generabreakdown2020 %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar("y", start = 0) +
theme_void() +
geom_label_repel(aes(label = gb2020label$labels), position = position_fill(vjust = 0.5),
size = 5, show.legend = F, max.overlaps = 50) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
Which produces the following:
Thanks for adding your data.
There are a few errors in your code. The main one is that you didn't precalculate where to place the labels (done here in the text_y variable). That variable needs to be passed as the y aesthetic for geom_label_repel.
The second is that you no longer need
group_by(value) %>% count() %>% ungroup() because the data you provided is already aggregated.
library(tidyverse)
library(ggrepel)
generaabundance2020 <- c(883, 464, 1948, 1177, 2607, 962, 2073, 620, 2670)
genera2020 <- c("Andrena", "Ceratina", "Halictus", "Hesperapis", "Lasioglossum", "Melissodes", "Osmia", "Panurginus", "Other")
generabreakdown2020 <- data.frame(group = genera2020, value = generaabundance2020)
gb2020label <-
generabreakdown2020 %>%
mutate(perc = value/ sum(value)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(group)) %>% ## arrange in the order of the legend
mutate(text_y = cumsum(value) - value/2) ### calculate where to place the text labels
gb2020label %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar(theta = "y") +
geom_label_repel(aes(label = labels, y = text_y),
nudge_x = 0.6, nudge_y = 0.6,
size = 5, show.legend = F) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
If you want to arrange in descending order of frequency, you should remember to also set the factor levels of the group variable to the same order.
gb2020label <-
generabreakdown2020 %>%
mutate(perc = value/ sum(value)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(perc)) %>% ## arrange in descending order of frequency
mutate(group = fct_rev(fct_inorder(group))) %>% ## also arrange the groups in descending order of freq
mutate(text_y = cumsum(value) - value/2) ### calculate where to place the text labels
gb2020label %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar(theta = "y") +
geom_label_repel(aes(label = labels, y = text_y),
nudge_x = 0.6, nudge_y = 0.6,
size = 5, show.legend = F) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
Created on 2021-10-27 by the reprex package (v2.0.1)
You didn't provide us with your data to work with so I'm using ggplot2::mpg here.
library(tidyverse)
library(ggrepel)
mpg_2 <-
mpg %>%
slice_sample(n = 20) %>%
count(manufacturer) %>%
mutate(perc = n / sum(n)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(manufacturer)) %>%
mutate(text_y = cumsum(n) - n/2)
Chart without polar coordinates
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label(aes(label = labels, y = text_y))
Chart with polar coordinates and geom_label_repel
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label_repel(aes(label = labels, y = text_y),
force = 0.5,nudge_x = 0.6, nudge_y = 0.6) +
coord_polar(theta = "y")
But maybe your data isn’t dense enough to need repelling?
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label(aes(label = labels, y = text_y), nudge_x = 0.6) +
coord_polar(theta = "y")
Created on 2021-10-26 by the reprex package (v2.0.1)

How to use crosstalk with bar + line plot in r?

I am new to crosstalk & trying to make rmarkdown file more interactive by using on bar+line plot but it is not giving line on the plot and also gets weird when I change country.
library(tidyverse)
library(plotly)
library(crosstalk)
library(glue)
library(scales)
library(tidytext)
load data:
file_url <- "https://raw.githubusercontent.com/johnsnow09/covid19-df_stack-code/main/test_crosswalk.csv"
test_df <- read.csv(url(file_url))
Country_selected = c("Brazil")
selected_case_type = c("Confirmed_daily")
trend_sd <- test_df %>%
filter(Daily_Cases_type %in% selected_case_type
# Country.Region %in% Country_selected,
) %>%
select(Country.Region, date, Cases_count)%>%
arrange(date) %>%
group_by(Country.Region) %>%
mutate(new_avg = cumsum(Cases_count)/ seq_len(length(Cases_count))) %>%
ungroup() %>%
SharedData$new()
bscols(widths = c(9, 3),
list(
filter_select(id = "country", label = "Country:", sharedData = trend_sd, group = ~ Country.Region),
ggplotly(ggplot(data = trend_sd) +
geom_col(aes(x = date, y = Cases_count), fill = "turquoise", alpha = .3) +
geom_point(aes(x = date, y = new_avg), col = "tomato") +
geom_line(aes(x = date, y = new_avg), col = "tomato", size = .9, alpha = .3) +
scale_y_continuous(labels = comma) +
# expand_limits(y = 100000) +
labs(title = glue("{Country_selected}'s {selected_case_type} Cases {date_from} onwards"),
caption = "Data source: covid19.analytics")
))
)
This doesn't give correct line plot & even when I change country to some other then bars gets distorted.
Code & Plot Result below without crosstalk & plotly:
Country_selected = c("India") # can be selective
selected_case_type = c("Confirmed_daily")
test_df %>%
filter(Daily_Cases_type %in% selected_case_type,
Country.Region %in% Country_selected,
) %>%
select(Country.Region, date, Cases_count)%>%
arrange(date) %>%
group_by(Country.Region) %>%
mutate(new_avg = cumsum(Cases_count)/ seq_len(length(Cases_count))) %>%
ungroup() %>%
ggplot() +
geom_col(aes(x = date, y = Cases_count), fill = "turquoise", alpha = .3) +
geom_point(aes(x = date, y = new_avg), col = "tomato") +
geom_line(aes(x = date, y = new_avg), col = "tomato", size = .9, alpha = .3) +
scale_y_continuous(labels = comma) +
labs(title = glue("{Country_selected}'s {selected_case_type} Cases {date_from} onwards"),
subtitle = "With Average Daily Cases Trend line",
caption = "Data source: covid19.analytics")

ggplot monthly date scale on x axis uses days as units

When plotting a bar chart with monthly data, ggplot shortens the distance between February and March, making the chart look inconsistent
require(dplyr)
require(ggplot2)
require(lubridate)
## simulating sample data
set.seed(.1073)
my_df <- data.frame(my_dates = sample(seq(as.Date('2010-01-01'), as.Date('2016-12-31'), 1), 1000, replace = TRUE))
### aggregating + visualizing counts per month
my_df %>%
mutate(my_dates = round_date(my_dates, 'month')) %>%
group_by(my_dates) %>%
summarise(n_row = n()) %>%
ggplot(aes(x = my_dates, y = n_row))+
geom_bar(stat = 'identity', color = 'black',fill = 'slateblue', alpha = .5)+
scale_x_date(date_breaks = 'months', date_labels = '%y-%b') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
I would keep the dates as dates rather than factors. Yes, factors will keep the bars uniform in size but you'll have to remember to join in any months that are missing so that blank months aren't skipped and factors are easy to get out of order. I would recommend adjusting your aesthetics to reduce the effect that the black outline has on the gap between February and March.
Here are two examples:
Adjust the outline color to be white. This will reduce the contrast and makes the gap less noticible.
Set the width to 20 (days).
As an aside, you don't need to summarize the data, you can use floor_date() or round_date() in an earlier step and go straight into geom_bar().
dates <- seq(as.Date("2010-01-01"), as.Date("2016-12-31"), 1)
set.seed(.1073)
my_df <-
tibble(
my_dates = sample(dates, 1000, replace = TRUE),
floor_dates = floor_date(my_dates, "month")
)
ggplot(my_df, aes(x = floor_dates)) +
geom_bar(color = "white", fill = "slateblue", alpha = .5)
ggplot(my_df, aes(x = floor_dates)) +
geom_bar(color = "black", fill = "slateblue", alpha = .5, width = 20)
using some parts from IceCream's answer you can try this.
Of note, geom_col is now recommended to use in this case.
my_df %>%
mutate(my_dates = factor(round_date(my_dates, 'month'))) %>%
group_by(my_dates) %>%
summarise(n_row = n()) %>%
ungroup() %>%
mutate(my_dates_x = as.numeric(my_dates)) %>%
mutate(my_dates_label = paste(month(my_dates,label = T), year(my_dates))) %>%
{ggplot(.,aes(x = my_dates_x, y = n_row))+
geom_col(color = 'black',width = 0.8, fill = 'slateblue', alpha = .5) +
scale_x_continuous(breaks = .$my_dates_x, labels = .$my_dates_label) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))}
You can convert it to a factor variable to use as the axis, and fix the formatting with a label argument to scale_x_discrete.
library(dplyr)
library(ggplot2)
my_df %>%
mutate(my_dates = factor(round_date(my_dates, 'month'))) %>%
group_by(my_dates) %>%
summarise(n_row = n()) %>%
ggplot(aes(x = my_dates, y = n_row))+
geom_bar(stat = 'identity', color = 'black',fill = 'slateblue', alpha = .5)+
scale_x_discrete(labels = function(x) format(as.Date(x), '%Y-%b'))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
Edit: Alternate method to account for possibly missing months which should be represented as blank spaces in the plot.
library(dplyr)
library(ggplot2)
library(lubridate)
to_plot <-
my_df %>%
mutate(my_dates = round_date(my_dates, 'month'),
my_dates_ticks = interval(min(my_dates), my_dates) %/% months(1))
to_plot %>%
group_by(my_dates_ticks) %>%
summarise(n_row = n()) %>%
ggplot(aes(x = my_dates_ticks, y = n_row))+
geom_bar(stat = 'identity', color = 'black',fill = 'slateblue', alpha = .5)+
scale_x_continuous(
breaks = unique(to_plot$my_dates_ticks),
labels = function(x) format(min(to_plot$my_dates) + months(x), '%y-%b'))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))

Text mining frequency with ggplot

I am working with a dataset called HappyDB for a class presentation and analyzing demographic differences in word frequency. I'm using tidytext for most of the analyses, and using their online guide to create most of my visuals. However, I'm running into a problem with the code to create the frequency plot of words with labels. My dataset is structured differently from theirs, and I thought I was accounting for it but I evidently was not. This is their sample code to generate the graph (comparing Jane Austen with the Bronte sisters and H.G. Wells)
library(tidyr)
frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"),
mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(author, proportion) %>%
gather(author, proportion, `Brontë Sisters`:`H.G. Wells`)
library(scales)
# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
And that code generates this plot:
I'm hoping to emulate this with demographics in my dataset, but keep getting errors. Here is my code, which uses a dataset that I have already tidied:
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidytext)
library(stringr)
windowsFonts(Franklin=windowsFont("Franklin Gothic Demi"))
marriedmen <- tidy_hm[which(tidy_hm$marital =="married" &
tidy_hm$gender == "m"),]
marriedwomen <- tidy_hm[which(tidy_hm$marital =="married" &
tidy_hm$gender == "f"),]
singlemen <- tidy_hm[which(tidy_hm$marital =="single" &
tidy_hm$gender == "m"),]
frequency <- bind_rows(mutate(marriedmen, status = "Married men"),
mutate(marriedwomen, status = "Married women"),
mutate(singlemen, status = "Single men")) %>%
count(status, word) %>%
group_by(status) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(status, proportion) %>%
gather(status, proportion, `Married women`:`Single men`)
library(scales)
# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = 'Married men', color = abs(`Married men` - proportion)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~status, ncol = 2) +
theme(legend.position="none") +
labs(y = NULL, x = NULL)
But I keep getting this error:
Error in log(x, base) : non-numeric argument to mathematical function
I tried removing the scale rows, but that caused a bunch of data to get eliminated and the plot didn't look anything like it was supposed to, and had no line, labels, or colors. I'm pretty new to r and coding in general so any help is appreciated.

Resources