how to remove NA in r - r

I want to separate the hashtags in one column into different columns. After I use "separate" function, I have a lot of NA's when I do a ggplot. How can I remove the NA's in my ggplot? my code is like this:
df %>%
separate(terms, into = paste0("t", 1:5), sep = ";") %>%
pivot_longer(-year) %>%
group_by(year, value) %>%
count(value) %>%
ggplot(aes(x = factor(year), y = n, fill = value, label = NA)) +
geom_col(position = position_dodge()) +
geom_text(position = position_dodge(1))
my data is like this:
terms year
1 #A;#B;#C;#D;E 2017
2 #B;#C;#D 2016
3 #C;#D;#E#G 2021
4 #D;#E;#F 2020
...

Try tidyr::separate_rows instead:
library(tidyverse)
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ggplot(aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge()) +
geom_text(aes(label = terms), position = position_dodge(1))
You might also want to include tidyr::complete:
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ungroup() %>%
complete(year, terms, fill = list(n = 0)) %>%
ggplot(aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge(preserve = "single")) +
scale_fill_discrete(drop = FALSE) +
scale_x_discrete(drop = FALSE) +
geom_text(aes(label = n), size = 3, position = position_dodge(width = 1))
Or with only the top 3 terms labeled:
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ungroup() %>%
complete(year, terms, fill = list(n = 0)) -> new_df
ggplot(new_df, aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge(preserve = "single")) +
scale_fill_discrete(drop = FALSE) +
scale_x_discrete(drop = FALSE) +
geom_text(data = new_df %>%
group_by(year) %>%
mutate(n = case_when(rank(-n,ties.method = "random") <= 3 ~ n,
TRUE ~ NA_real_)),
aes(label = terms), size = 3, position = position_dodge(width = 1))
Sample Data:
df <- structure(list(terms = c("#A;#B;#C;#D;#E", "#C;#D;#E", "#B;#C;#D",
"#A", "#C;#D;#E;#G", "#D;#E;#F", "#D"), year = c(2017L, 2017L,
2016L, 2016L, 2021L, 2020L, 2020L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7"))

Related

Change order of segments in stacked barplot

How can I change the order of the segments in the following stacked barplot
from -- (left) to ++ (right) and NA on the right?
library(dplyr)
library(ggplot2)
library(tidyr)
size <- 100
df <- data.frame(item1 = sample(c(1:5, 1:5, 99), size = size, replace = TRUE),
item2 = sample(c(1:5, 1:4, 99), size = size, replace = TRUE),
item3 = sample(c(1:5, 1:3, 99), size = size, replace = TRUE),
item4 = sample(c(1:5, 1:5, 99), size = size, replace = TRUE),
item5 = sample(c(1:5, 1:4, 99), size = size, replace = TRUE))
# Stacked barplot
df %>% pivot_longer(cols = starts_with("item")) %>%
group_by(name) %>%
count(value) %>%
mutate(perc = 100 / sum(n) * n) %>%
ungroup() %>%
mutate(value = factor(value, levels = c(1:5, 99), labels = c("--", "-", "0", "+", "++", "NA"))) %>%
ggplot(aes(x = name, y = perc, fill = value)) + geom_col() +
coord_flip()
Created on 2022-12-12 with reprex v2.0.2
You can reverse the order of the stack by using position_stack(reverse = TRUE):
set.seed(123)
library(tidyr)
library(ggplot2)
library(dplyr)
df %>% pivot_longer(cols = starts_with("item")) %>%
group_by(name) %>%
count(value) %>%
mutate(perc = 100 / sum(n) * n) %>%
ungroup() %>%
mutate(value = factor(value, levels = c(1:5, 99), labels = c("--", "-", "0", "+", "++", "NA"))) %>%
ggplot(aes(x = name, y = perc, fill = value)) +
geom_col(position = position_stack(reverse = TRUE)) +
coord_flip()

"Crossing off" tiles on a heatmap

For a heatmap made using ggplot and geom_tile, how would you "cross off" a tile based on a conditional value?
The heatmap shows counts of the number of times an animal performed a behavior between 1990-2020.
Rows are animal IDs, columns are years.
Years go from 1990-2020 but not all animals are alive throughout that time frame (ie, some born later than 1990 or die earlier than 2020)
So I want to cross off any tiles where an animal isn't alive, or before it was born.
Data look like this (shortened to 5 rows for brevity):
data <- data.frame(date = structure(c(8243, 8243, 8243, 8248, 8947), class = "Date"),
year = c("1992", "1992", "1992", "1992", "1994"),
event.id = c(8L, 8L, 8L, 10L, 11L),
id = c("L5", "L58", "L73", "L21", "L5"),
birth = c(1964L, 1980L, 1986L, 1950L, 1964L),
death = c(2012L, 2003L, NA, NA, 2012L))
NA means the animal is still alive and it wouldn't be crossed off since before it was born.
Any help to create this is greatly appreciated!
Code looks like this:
heatmap <- data %>%
mutate(x = case_when(year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
arrange(y)
ggplot(data = heatmap, aes(x, y, fill = count)) +
geom_tile()
EDIT
Current heat map.
Here's how you could use color to indicate NA, like suggested by #Gregor Thomas.
Transforming your data to "complete":
library(dplyr)
library(tidyr)
library(ggplot2)
hm <- dat %>%
mutate(x = case_when(year < 1960 ~ "Pre-1960",
year %in% 1960:1969 ~ "1960-1969",
year %in% 1970:1979 ~ "1970-1979",
year %in% 1980:1989 ~ "1980-1989",
year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(matriline, id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
ungroup() %>%
tidyr::complete(x, y) %>%
arrange(y) %>%
tidyr::separate(y, into = c("ym", "yid"), sep = " ", remove = FALSE)
Then define a color for NA:
ggplot(data = hm, aes(x, yid, fill = count)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red", na.value = "grey50") +
scale_x_discrete(position = "top", drop = FALSE) +
scale_y_discrete(limits=rev) +
labs(x = NULL, y = NULL) +
facet_wrap( ~ ym, strip.position = "left", dir = "v", ncol = 1) +
theme(panel.spacing = unit(0.2, "lines"),
strip.background = element_blank(),
strip.placement = "outside",
axis.text.x = element_text(angle = 45, hjust = -0.02))
Data:
ids <- c("J11", "J16", "J17", "J02", "J22", "J26", "J27", "J30")
matrilines <- c("J02","J04", "K11", "L20", "P90", "K100", "R22")
dat <- data.frame(year = as.character(sample(1960:2018, 1000, replace = TRUE)),
id = sample(ids, 1000, replace = TRUE),
matriline = sample(matrilines, 1000, replace = TRUE))

How to adjust ggrepel label on pie chart?

I am trying to create a pie chart to visualize percent abundance of 9 genera. However, the labels are all clumping together. How do I remedy this? Code included below:
generaabundance2020 <- c(883, 464, 1948, 1177, 2607, 962, 2073, 620, 2670)
genera2020 <- c("Andrena", "Ceratina", "Halictus",
"Hesperapis", "Lasioglossum", "Melissodes",
"Osmia", "Panurginus", "Other")
generabreakdown2020 <- data.frame(group = genera2020, value = generaabundance2020)
gb2020label <- generabreakdown2020 %>%
group_by(value) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `value` / sum(`value`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
generabreakdown2020 %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar("y", start = 0) +
theme_void() +
geom_label_repel(aes(label = gb2020label$labels), position = position_fill(vjust = 0.5),
size = 5, show.legend = F, max.overlaps = 50) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
Which produces the following:
Thanks for adding your data.
There are a few errors in your code. The main one is that you didn't precalculate where to place the labels (done here in the text_y variable). That variable needs to be passed as the y aesthetic for geom_label_repel.
The second is that you no longer need
group_by(value) %>% count() %>% ungroup() because the data you provided is already aggregated.
library(tidyverse)
library(ggrepel)
generaabundance2020 <- c(883, 464, 1948, 1177, 2607, 962, 2073, 620, 2670)
genera2020 <- c("Andrena", "Ceratina", "Halictus", "Hesperapis", "Lasioglossum", "Melissodes", "Osmia", "Panurginus", "Other")
generabreakdown2020 <- data.frame(group = genera2020, value = generaabundance2020)
gb2020label <-
generabreakdown2020 %>%
mutate(perc = value/ sum(value)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(group)) %>% ## arrange in the order of the legend
mutate(text_y = cumsum(value) - value/2) ### calculate where to place the text labels
gb2020label %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar(theta = "y") +
geom_label_repel(aes(label = labels, y = text_y),
nudge_x = 0.6, nudge_y = 0.6,
size = 5, show.legend = F) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
If you want to arrange in descending order of frequency, you should remember to also set the factor levels of the group variable to the same order.
gb2020label <-
generabreakdown2020 %>%
mutate(perc = value/ sum(value)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(perc)) %>% ## arrange in descending order of frequency
mutate(group = fct_rev(fct_inorder(group))) %>% ## also arrange the groups in descending order of freq
mutate(text_y = cumsum(value) - value/2) ### calculate where to place the text labels
gb2020label %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
coord_polar(theta = "y") +
geom_label_repel(aes(label = labels, y = text_y),
nudge_x = 0.6, nudge_y = 0.6,
size = 5, show.legend = F) +
guides(fill = guide_legend(title = "Genera")) +
scale_fill_manual(values = c("brown1", "chocolate1",
"darkgoldenrod1", "darkgreen",
"deepskyblue", "darkslateblue",
"darkorchid4", "hotpink1",
"lightpink"))
Created on 2021-10-27 by the reprex package (v2.0.1)
You didn't provide us with your data to work with so I'm using ggplot2::mpg here.
library(tidyverse)
library(ggrepel)
mpg_2 <-
mpg %>%
slice_sample(n = 20) %>%
count(manufacturer) %>%
mutate(perc = n / sum(n)) %>%
mutate(labels = scales::percent(perc)) %>%
arrange(desc(manufacturer)) %>%
mutate(text_y = cumsum(n) - n/2)
Chart without polar coordinates
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label(aes(label = labels, y = text_y))
Chart with polar coordinates and geom_label_repel
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label_repel(aes(label = labels, y = text_y),
force = 0.5,nudge_x = 0.6, nudge_y = 0.6) +
coord_polar(theta = "y")
But maybe your data isn’t dense enough to need repelling?
mpg_2 %>%
ggplot(aes(x = "", y = n, fill = manufacturer)) +
geom_col() +
geom_label(aes(label = labels, y = text_y), nudge_x = 0.6) +
coord_polar(theta = "y")
Created on 2021-10-26 by the reprex package (v2.0.1)

R: how to use geom_point to add points to side-by-side error bars

library(tidyverse)
library(ggplot2)
x <- c(3.409091, 4.090909, 4.772728, 5.454546)
data <- structure(list(approach1 = c(1.02967633502518, 1.01580726843304, 1.04284139608584,
1.04357840118423), approach2 = c(1.0173503641109, 1.00394712634612,
1.02773624467658, 1.02863476832808)), class = "data.frame", row.names = c(NA,
-4L))
data.lower <- structure(list(approach1 = c(1.0115541354025, 0.993078788685045, 1.01974099190034,
1.02080115708828), approach2 = c(0.997978808683001, 0.982131487818724,
1.00420304483585, 1.00652657076137)), row.names = c(NA, -4L), class = "data.frame")
data.upper <- structure(list(approach1 = c(1.05177371364311, 1.03851495380357, 1.06108886027293,
1.06774282552092), approach2 = c(1.03841840431302, 1.0260370212124,
1.04663363856828, 1.0525347857539)), row.names = c(NA, -4L), class = "data.frame")
bind_cols(
data %>% mutate(x = x) %>% pivot_longer(-x, values_to = "data"),
data.lower %>% pivot_longer(everything()) %>% select(lower = value),
data.upper %>% pivot_longer(everything()) %>% select(upper = value)
) %>%
ggplot(aes(x, data, color = name, ymin = lower, ymax = upper)) +
geom_errorbar(position = "dodge")
The error bars correspond to the data in data.lower and data.upper. I would also like to add a point to each error bar. The data for this is stored in data. I've tried the following, but it did not seem to work, as the points are not on the actual error bars:
bind_cols(
data %>% mutate(x = x) %>% pivot_longer(-x, values_to = "data"),
data.lower %>% pivot_longer(everything()) %>% select(lower = value),
data.upper %>% pivot_longer(everything()) %>% select(upper = value)
) %>%
ggplot(aes(x, data, color = name, ymin = lower, ymax = upper)) +
geom_errorbar(position = "dodge") + geom_point(aes(x = x, y = data))
In your case you don't need to specify the aesthetics again in geom_point. And if so you have to wrap them in aes(). That's the main issue with your code. Additionally to make sure that the points align with the error bars you have to set the same positioning (and width) via position_dodge.
library(tidyverse)
bind_cols(
data %>% mutate(x = x) %>% pivot_longer(-x, values_to = "data"),
data.lower %>% pivot_longer(everything()) %>% select(lower = value),
data.upper %>% pivot_longer(everything()) %>% select(upper = value)
) %>%
ggplot(aes(x, data, color = name, ymin = lower, ymax = upper)) +
geom_errorbar(position = position_dodge(width = 0.6)) +
geom_point(position = position_dodge(width = 0.6))

facet_wrap and assign colors to categorical variables in ggplot2

I am trying to reproduce this graphic below on the COVID19 (first plot) using facet_wrap() but I cannot make the other background series visible in gray (second plot).
Second plot
library(dplyr)
library(httr)
library(readxl)
library(ggplot2)
library(ggrepel)
library(scales)
library(forcats)
url <- paste("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-",format(Sys.time(), "%Y-%m-%d"), ".xlsx", sep = "")
GET(url, authenticate(":", ":", type="ntlm"), write_disk(tf <- tempfile(fileext = ".xlsx")))
data <- read_excel(tf)
data$`Countries and territories` = fct_recode( data$`Countries and territories`, "Canada" ="CANADA")
days100 = data %>%
rename(country = `Countries and territories`) %>%
select(-Day, -Month, -Year) %>%
arrange(country, DateRep) %>%
group_by(country) %>%
mutate(test = if_else(Cases >= 1,
cumsum(Cases),0),
logtest = if_else(test > 0,
log10(test),0),
dummy100 = if_else(test >= 100,
1,0),
num100 = if_else(dummy100 == 1,
cumsum(dummy100),0),
selec_count = if_else(country == "Ecuador",
1,
if_else(country == "Italy",
2,
if_else(country == "US",
3,
if_else(country == "China",
4,
0))))) %>%
filter(country != 'Cases_on_an_international_conveyance_Japan',
test >=100)
days100 = days100 %>%
mutate(fil_count = if_else(GeoId == "CL" | GeoId == "IT" | GeoId == "CN" | GeoId == "FR", 1, 0))
ggplot(data = days100, aes(x = num100,
y = test,
color = selec_count,
group = country)) +
geom_line() +
guides(color = F) +
#scale_color_manual(values = c("1"="#5aae61", "2"="#7b3294", "3" = "red", "4" = "blue", "0"= "black")) +
facet_wrap(~ country) +
scale_x_continuous(expand = c(0, -1)) +
scale_y_continuous(trans="log10",
labels = scales::comma,
limits = c(100, NA),
expand = expand_scale(mult = c(0, 0.05))) +
theme_bw() +
ggrepel::geom_text_repel(data = days100 %>%
filter(fil_count==1 &
DateRep == last(DateRep)),
aes(label = country))
Also I want to add manual colors for selec_count category so that each series can be better visualized using scale_color_manual().
Without facet_wrap()
The only way I can think of is to duplicate (with crossing or similar) the data across all available countries.
library(dplyr)
library(tidyr)
library(ggplot2)
# helpful to find the most-impacted countries with over 1000 cases
topdat <- dat %>%
group_by(GeoId) %>%
summarize(n=max(Cases)) %>%
filter(n > 1000) %>%
arrange(desc(n))
plotdat <- dat %>%
mutate(
`Countries and territories` =
gsub("_", " ",
if_else(`Countries and territories` == "CANADA",
"Canada", `Countries and territories`))) %>%
inner_join(., topdat, by = "GeoId") %>%
arrange(DateRep) %>%
group_by(GeoId) %>%
filter(cumany(Cases > 100)) %>%
mutate(
ndays = as.numeric(difftime(DateRep, min(DateRep), units = "days")),
ncases = cumsum(Cases),
ndeaths = cumsum(Deaths),
ismax = ncases == max(ncases)
) %>%
crossing(., Country = unique(.$`Countries and territories`)) %>%
mutate(
col = case_when(
`Countries and territories` == Country ~ 1L,
GeoId %in% c("CN", "IT", "UK") ~ 2L,
TRUE ~ 3L
)
)
firstpane <- plotdat %>%
select(-Country) %>%
filter(GeoId %in% c("CN", "IT", "UK")) %>%
group_by(GeoId) %>%
slice(which.max(ncases)) %>%
crossing(., Country = unique(plotdat$`Countries and territories`))
ggplot(plotdat, mapping = aes(x = ndays, y = ncases, group = GeoId)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 3L)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 2L)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 1L)) +
geom_text(aes(label = `Countries and territories`),
hjust = 0, vjust = 1.2,
data = subset(firstpane, Country == min(Country))) +
geom_point(data = firstpane) +
geom_point(color = "red", data = ~ subset(., ismax & col == 1L)) +
facet_wrap(~ Country) +
scale_y_continuous(trans = "log10", labels = scales::comma) +
scale_color_manual(values = c("red", "gray50", "#bbbbbb88"), guide = FALSE) +
labs(x = "Days since 100th case", y = NULL) +
lims(x = c(1, 100))
I did three geom_line to manually control the layering, so the red line is always on top. Otherwise, replace all three with geom_line(aes(color = factor(col))).

Resources