"Crossing off" tiles on a heatmap - r

For a heatmap made using ggplot and geom_tile, how would you "cross off" a tile based on a conditional value?
The heatmap shows counts of the number of times an animal performed a behavior between 1990-2020.
Rows are animal IDs, columns are years.
Years go from 1990-2020 but not all animals are alive throughout that time frame (ie, some born later than 1990 or die earlier than 2020)
So I want to cross off any tiles where an animal isn't alive, or before it was born.
Data look like this (shortened to 5 rows for brevity):
data <- data.frame(date = structure(c(8243, 8243, 8243, 8248, 8947), class = "Date"),
year = c("1992", "1992", "1992", "1992", "1994"),
event.id = c(8L, 8L, 8L, 10L, 11L),
id = c("L5", "L58", "L73", "L21", "L5"),
birth = c(1964L, 1980L, 1986L, 1950L, 1964L),
death = c(2012L, 2003L, NA, NA, 2012L))
NA means the animal is still alive and it wouldn't be crossed off since before it was born.
Any help to create this is greatly appreciated!
Code looks like this:
heatmap <- data %>%
mutate(x = case_when(year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
arrange(y)
ggplot(data = heatmap, aes(x, y, fill = count)) +
geom_tile()
EDIT
Current heat map.

Here's how you could use color to indicate NA, like suggested by #Gregor Thomas.
Transforming your data to "complete":
library(dplyr)
library(tidyr)
library(ggplot2)
hm <- dat %>%
mutate(x = case_when(year < 1960 ~ "Pre-1960",
year %in% 1960:1969 ~ "1960-1969",
year %in% 1970:1979 ~ "1970-1979",
year %in% 1980:1989 ~ "1980-1989",
year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(matriline, id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
ungroup() %>%
tidyr::complete(x, y) %>%
arrange(y) %>%
tidyr::separate(y, into = c("ym", "yid"), sep = " ", remove = FALSE)
Then define a color for NA:
ggplot(data = hm, aes(x, yid, fill = count)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red", na.value = "grey50") +
scale_x_discrete(position = "top", drop = FALSE) +
scale_y_discrete(limits=rev) +
labs(x = NULL, y = NULL) +
facet_wrap( ~ ym, strip.position = "left", dir = "v", ncol = 1) +
theme(panel.spacing = unit(0.2, "lines"),
strip.background = element_blank(),
strip.placement = "outside",
axis.text.x = element_text(angle = 45, hjust = -0.02))
Data:
ids <- c("J11", "J16", "J17", "J02", "J22", "J26", "J27", "J30")
matrilines <- c("J02","J04", "K11", "L20", "P90", "K100", "R22")
dat <- data.frame(year = as.character(sample(1960:2018, 1000, replace = TRUE)),
id = sample(ids, 1000, replace = TRUE),
matriline = sample(matrilines, 1000, replace = TRUE))

Related

ggplot2: add two groups to one facet

I have a dataframe with of annual temperature time series from two locations (i.e., Site 1 & Site 2). The three temperature variables are:
Air temperature
Water temperature
Difference = Air - Water
I would like to produce a four-panel figure where the top and bottom rows are Site 1 and Site 2 respectively, the left column displays Air and Water and the right column shows Difference. Is there a way to do this using facet_wrap() or facet_grid()?
Example Data
library(data.table)
library(dplyr)
library(ggplot2)
set.seed(321)
# Create the example air and water temperature time series
df1 <- data.frame(matrix(ncol = 4, nrow = 365*4))
colnames(df1)[1:4] <- c("Location","Variable", "Date", "Temperature")
df1[1:730,1] <- "Site 1"
df1[731:NROW(df1),1] <- "Site 2"
df1[c(1:365,731:1095),2] <- "Air"
df1[c(366:730,1096:NROW(df1)),2] <- "Water"
df1$Date <- rep(seq.Date(as.Date("2021-01-01"),as.Date("2021-12-31"),"1 day"),4)
df1$noise <- rep(runif(365),4)
df1$t <- rep(seq(0,1*pi,,365),4)
for (i in 1:NROW(df1)) {
df1$Temperature[1:365] <- 20*sin(df1$t)+df1$noise*8
df1$Temperature[365:730] <- 17*sin(df1$t)+df1$noise*2
df1$Temperature[731:1095] <- 25*sin(df1$t)+df1$noise*6
df1$Temperature[1096:NROW(df1)] <- 18*sin(df1$t)+df1$noise*1.5
}
# Take the difference between air and water temperature
df1 <- df1[,1:4]
site1 <- df1[df1$Location == 'Site 1',]
site1 <- site1 %>%
tidyr::pivot_wider(names_from = Variable, values_from = Temperature) %>%
mutate(Difference = Air - Water) %>%
tidyr::pivot_longer(cols = c('Water','Air','Difference'),
names_to = 'Variable',
values_to = 'Temperature')
site2 <- df1[df1$Location == 'Site 2',]
site2 <- site2 %>%
tidyr::pivot_wider(names_from = Variable, values_from = Temperature) %>%
mutate(Difference = Air - Water) %>%
tidyr::pivot_longer(cols = c('Water','Air','Difference'),
names_to = 'Variable',
values_to = 'Temperature')
# Recombine data from site 1 and site 2 for final dataset
df1 <- rbind(site1,site2)
This is an example of what I am looking for, however instead of having a six-panel figure, I would like Air and Water displayed together, creating a four-panel figure.
df1 %>%
ggplot() +
geom_line(aes(x = Date, y = Temperature, group = Variable, color = Variable)) +
theme_bw() +
theme(panel.grid = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(size = 14, color = "black", angle = 90, vjust = 0.5, hjust = 1),
axis.text.y = element_text(size = 14, color = "black")) +
facet_grid(Location~Variable)
Try creating a new variable that groups "Air" and "Water" observatiopns, and specifying it to the facet:
df1 %>%
mutate(var_air_water = ## Here is the new variable
if_else(Variable %in% c("Air", "Water"),
true = "Air & Water",
false = Variable)) %>%
ggplot() +
geom_line(aes(x = Date, y = Temperature, group = Variable, color = Variable)) +
theme_bw() +
theme(panel.grid = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(size = 14, color = "black", angle = 90, vjust = 0.5, hjust = 1),
axis.text.y = element_text(size = 14, color = "black")) +
facet_grid(Location~var_air_water)

Combining two heatmaps with the variables next to each other

I'm trying to combine two heatmaps. I want var_a and var_x on the y axis with for example: var_a first and then var_x. I don't know if I should do this by changing the dataframe or combining them, or if I can do this in ggplot.
Below I have some example code and a drawing of what I want (since I don't know if I explained it right).
I hope someone has ideas how I can do this either in the dataframe or in ggplot!
Example code:
df_one <- data.frame(
vars = c("var_a", "var_b", "var_c"),
corresponding_vars = c("var_x", "var_y", "var_z"),
expression_organ_1_vars = c(5, 10, 20),
expression_organ_2_vars = c(50, 2, 10),
expression_organ_3_vars = c(5, 10, 3)
)
df_one_long <- pivot_longer(df_one,
cols=3:5,
names_to = "tissueType",
values_to = "Expression")
expression.df_one <- ggplot(df_one_long,
mapping = aes(y=tissueType, x=vars, fill = Expression)) +
geom_tile()
expression.df_one
df_two <- data.frame(
corresponding_vars = c("var_x", "var_y", "var_z"),
expression_organ_1_corresponding_vars = c(100, 320, 120),
expression_organ_2_corresponding_vars = c(23, 30, 150),
expression_organ_3_corresponding_vars = c(89, 7, 200)
)
df_two_long <- pivot_longer(df_one,
cols=3:5,
names_to = "tissueType",
values_to = "Expression")
expression.df_two <- ggplot(df_two_long,
mapping = aes(y=tissueType, x=vars, fill = Expression)) +
geom_tile()
expression.df_two
Drawing:
You can bind your data frames together and pivot into a longer format so that vars and corresponding vars are in the same column, but retain a grouping variable to facet by:
df_two %>%
mutate(cor = corresponding_vars) %>%
rename_with(~sub('corresponding_', '', .x)) %>%
bind_rows(df_one %>% rename(cor = corresponding_vars)) %>%
pivot_longer(contains('expression'), names_to = 'organ') %>%
mutate(organ = gsub('expression_|_vars', '', organ)) %>%
group_by(cor) %>%
summarize(vars = vars, organ = organ, value = value,
cor = paste(sort(unique(vars)), collapse = ' cor ')) %>%
ggplot(aes(vars, organ, fill = value)) +
geom_tile(color = 'white', linewidth = 1) +
facet_grid(.~cor, scales = 'free_x', switch = 'x') +
scale_fill_viridis_c() +
coord_cartesian(clip = 'off') +
scale_x_discrete(expand = c(0, 0)) +
theme_minimal(base_size = 16) +
theme(strip.placement = 'outside',
axis.text.x = element_blank(),
axis.ticks.x.bottom = element_line(),
panel.spacing.x = unit(3, 'mm'))
Okay, so I solved the issue for my own project, which is to convert it to a scatter plot. I combined both datasets and then used a simple scatterplot.
df.combined <- dplyr::full_join(df_two_long, df_one_long,
by = c("vars", "corresponding_vars", "tissueType"))
ggplot(df.combined,
aes(x=vars, y=tissueType, colour=Expression.x, size = Expression.y)) +
geom_point()
It's not a solution with heatmaps, but I don't know how to do that at the moment.

Change `ggplot` label color by conditions

I have a question about changing color for geom_text & geom_label base on a condition.
Expected out put
Data label has color = blue for 2021, color = grey for 2020
Gap label is red when negative and green when positive
Data
data <- tibble(Factor = c("A", "B", "A", "B"),
Score = c(90, 85, 85, 90),
Year = c("2020", "2020", "2021", "2021"))
Data transform
df_2 = data %>%
pivot_wider(names_from = Year, names_prefix = "Year", values_from = Score) %>%
mutate(gap = Year2021 - Year2020) %>%
select(Factor, gap)
Plot
data %>%
left_join(df_2) %>%
ggplot(aes(x = Factor, y = Score, fill = Year)) +
geom_col(position='dodge') +
geom_text(aes(label=Score),
position=position_dodge(width = 0.9),
vjust=-0.40) +
geom_label(aes(y = 100, label = ifelse(Year == 2021, gap, NA_character_)), na.rm = TRUE) +
scale_y_continuous(limits = c(0,105))
theme_minimal()
Expected out put
85 & 90 = blue for 2021, 90 & 85 = grey for 2020
-5 = red and 5 = green
Second option would be to use a manual scale for the text and bar colors. For the labels we could use the ggnewscale package which allows for multiple scales for the same aesthetic. Doing so we could add a second fill scale:
library(ggplot2)
library(ggnewscale)
ggplot(dat, aes(x = Factor, y = Score, fill = Year)) +
geom_col(position = "dodge") +
geom_text(aes(label = Score, color = Year),
position = position_dodge(width = 0.9),
vjust = -0.40
) +
scale_color_manual(aesthetics = c("fill", "color"), values = c("2021" = "blue", "2020" = "grey")) +
ggnewscale::new_scale_fill() +
geom_label(aes(y = 100, label = ifelse(Year == 2021, gap, NA_character_), fill = gap > 0), na.rm = TRUE) +
scale_fill_manual(values = c("FALSE" = "red", "TRUE" = "green")) +
scale_y_continuous(limits = c(0, 105)) +
theme_minimal()
DATA
data <- data.frame(
Factor = c("A", "B", "A", "B"),
Score = c(90, 85, 85, 90),
Year = c("2020", "2020", "2021", "2021")
)
library(dplyr)
library(tidyr)
df_2 <- data %>%
pivot_wider(names_from = Year, names_prefix = "Year", values_from = Score) %>%
mutate(gap = Year2021 - Year2020) %>%
select(Factor, gap)
dat <- data %>%
left_join(df_2)
You can conditionally assign the colors to the aesthetics using case_when. You can use the following code:
library(tibble)
library(ggplot2)
library(dplyr)
library(tidyr)
data <- tibble(Factor = c("A", "B", "A", "B"),
Score = c(90, 85, 85, 90),
Year = c("2020", "2020", "2021", "2021"))
df_2 = data %>%
pivot_wider(names_from = Year, names_prefix = "Year", values_from = Score) %>%
mutate(gap = Year2021 - Year2020) %>%
select(Factor, gap)
df <- data %>% left_join(df_2)
#> Joining, by = "Factor"
ggplot(df, mapping = aes(x = Factor, y = Score, fill = Year)) +
geom_col(position='dodge') +
geom_text(df, mapping = aes(label=Score, color = Year),
color = case_when(
df$Score == 85 & df$Year == 2021 | df$Score == 90 & df$Year == 2021 ~ "blue",
df$Score == 85 & df$Year == 2020 | df$Score == 90 & df$Year == 2020 ~ "grey",
),
position=position_dodge(width = 0.9),
vjust=-0.40) +
geom_label(data = df, mapping = aes(y = 100, label = ifelse(Year == 2021, gap, NA_character_)),
fill = case_when(
df$gap == -5 ~ "red",
df$gap == 5 ~ "green"
), na.rm = TRUE) +
scale_y_continuous(limits = c(0,105)) +
theme_minimal()
Created on 2022-09-24 with reprex v2.0.2

how to remove NA in r

I want to separate the hashtags in one column into different columns. After I use "separate" function, I have a lot of NA's when I do a ggplot. How can I remove the NA's in my ggplot? my code is like this:
df %>%
separate(terms, into = paste0("t", 1:5), sep = ";") %>%
pivot_longer(-year) %>%
group_by(year, value) %>%
count(value) %>%
ggplot(aes(x = factor(year), y = n, fill = value, label = NA)) +
geom_col(position = position_dodge()) +
geom_text(position = position_dodge(1))
my data is like this:
terms year
1 #A;#B;#C;#D;E 2017
2 #B;#C;#D 2016
3 #C;#D;#E#G 2021
4 #D;#E;#F 2020
...
Try tidyr::separate_rows instead:
library(tidyverse)
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ggplot(aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge()) +
geom_text(aes(label = terms), position = position_dodge(1))
You might also want to include tidyr::complete:
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ungroup() %>%
complete(year, terms, fill = list(n = 0)) %>%
ggplot(aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge(preserve = "single")) +
scale_fill_discrete(drop = FALSE) +
scale_x_discrete(drop = FALSE) +
geom_text(aes(label = n), size = 3, position = position_dodge(width = 1))
Or with only the top 3 terms labeled:
df %>%
separate_rows(terms, sep = ";") %>%
group_by(year, terms) %>%
count(terms) %>%
ungroup() %>%
complete(year, terms, fill = list(n = 0)) -> new_df
ggplot(new_df, aes(x = factor(year), y = n, fill = terms, label = NA)) +
geom_col(position = position_dodge(preserve = "single")) +
scale_fill_discrete(drop = FALSE) +
scale_x_discrete(drop = FALSE) +
geom_text(data = new_df %>%
group_by(year) %>%
mutate(n = case_when(rank(-n,ties.method = "random") <= 3 ~ n,
TRUE ~ NA_real_)),
aes(label = terms), size = 3, position = position_dodge(width = 1))
Sample Data:
df <- structure(list(terms = c("#A;#B;#C;#D;#E", "#C;#D;#E", "#B;#C;#D",
"#A", "#C;#D;#E;#G", "#D;#E;#F", "#D"), year = c(2017L, 2017L,
2016L, 2016L, 2021L, 2020L, 2020L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7"))

facet_wrap and assign colors to categorical variables in ggplot2

I am trying to reproduce this graphic below on the COVID19 (first plot) using facet_wrap() but I cannot make the other background series visible in gray (second plot).
Second plot
library(dplyr)
library(httr)
library(readxl)
library(ggplot2)
library(ggrepel)
library(scales)
library(forcats)
url <- paste("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-",format(Sys.time(), "%Y-%m-%d"), ".xlsx", sep = "")
GET(url, authenticate(":", ":", type="ntlm"), write_disk(tf <- tempfile(fileext = ".xlsx")))
data <- read_excel(tf)
data$`Countries and territories` = fct_recode( data$`Countries and territories`, "Canada" ="CANADA")
days100 = data %>%
rename(country = `Countries and territories`) %>%
select(-Day, -Month, -Year) %>%
arrange(country, DateRep) %>%
group_by(country) %>%
mutate(test = if_else(Cases >= 1,
cumsum(Cases),0),
logtest = if_else(test > 0,
log10(test),0),
dummy100 = if_else(test >= 100,
1,0),
num100 = if_else(dummy100 == 1,
cumsum(dummy100),0),
selec_count = if_else(country == "Ecuador",
1,
if_else(country == "Italy",
2,
if_else(country == "US",
3,
if_else(country == "China",
4,
0))))) %>%
filter(country != 'Cases_on_an_international_conveyance_Japan',
test >=100)
days100 = days100 %>%
mutate(fil_count = if_else(GeoId == "CL" | GeoId == "IT" | GeoId == "CN" | GeoId == "FR", 1, 0))
ggplot(data = days100, aes(x = num100,
y = test,
color = selec_count,
group = country)) +
geom_line() +
guides(color = F) +
#scale_color_manual(values = c("1"="#5aae61", "2"="#7b3294", "3" = "red", "4" = "blue", "0"= "black")) +
facet_wrap(~ country) +
scale_x_continuous(expand = c(0, -1)) +
scale_y_continuous(trans="log10",
labels = scales::comma,
limits = c(100, NA),
expand = expand_scale(mult = c(0, 0.05))) +
theme_bw() +
ggrepel::geom_text_repel(data = days100 %>%
filter(fil_count==1 &
DateRep == last(DateRep)),
aes(label = country))
Also I want to add manual colors for selec_count category so that each series can be better visualized using scale_color_manual().
Without facet_wrap()
The only way I can think of is to duplicate (with crossing or similar) the data across all available countries.
library(dplyr)
library(tidyr)
library(ggplot2)
# helpful to find the most-impacted countries with over 1000 cases
topdat <- dat %>%
group_by(GeoId) %>%
summarize(n=max(Cases)) %>%
filter(n > 1000) %>%
arrange(desc(n))
plotdat <- dat %>%
mutate(
`Countries and territories` =
gsub("_", " ",
if_else(`Countries and territories` == "CANADA",
"Canada", `Countries and territories`))) %>%
inner_join(., topdat, by = "GeoId") %>%
arrange(DateRep) %>%
group_by(GeoId) %>%
filter(cumany(Cases > 100)) %>%
mutate(
ndays = as.numeric(difftime(DateRep, min(DateRep), units = "days")),
ncases = cumsum(Cases),
ndeaths = cumsum(Deaths),
ismax = ncases == max(ncases)
) %>%
crossing(., Country = unique(.$`Countries and territories`)) %>%
mutate(
col = case_when(
`Countries and territories` == Country ~ 1L,
GeoId %in% c("CN", "IT", "UK") ~ 2L,
TRUE ~ 3L
)
)
firstpane <- plotdat %>%
select(-Country) %>%
filter(GeoId %in% c("CN", "IT", "UK")) %>%
group_by(GeoId) %>%
slice(which.max(ncases)) %>%
crossing(., Country = unique(plotdat$`Countries and territories`))
ggplot(plotdat, mapping = aes(x = ndays, y = ncases, group = GeoId)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 3L)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 2L)) +
geom_line(aes(color = factor(col)), data = ~ subset(., col == 1L)) +
geom_text(aes(label = `Countries and territories`),
hjust = 0, vjust = 1.2,
data = subset(firstpane, Country == min(Country))) +
geom_point(data = firstpane) +
geom_point(color = "red", data = ~ subset(., ismax & col == 1L)) +
facet_wrap(~ Country) +
scale_y_continuous(trans = "log10", labels = scales::comma) +
scale_color_manual(values = c("red", "gray50", "#bbbbbb88"), guide = FALSE) +
labs(x = "Days since 100th case", y = NULL) +
lims(x = c(1, 100))
I did three geom_line to manually control the layering, so the red line is always on top. Otherwise, replace all three with geom_line(aes(color = factor(col))).

Resources