Manual fill geom_tile with multiple scales by group - r

I have the following current output:
And I am aiming for a colouring like this, but only filled until the maximum level (e.g the fill stops at the level present):
The data to create this, is:
df <- tribble(~Question_Code, ~RespondentLevel,
"Engagement - Inclusion", 5,
"External engagement - policies", 2,
"External engagement - technology", 5,
"Community data ", 5,
"Internal engagement", 5,
"Internal use of technology", 4,
"Familiarity/Alignment", 5,
"Environmental impacts", 5,
"Innovation", 2,
"Use of open-source technology", 2,
"Regulation of hardware & software", 5,
"In-house technical capacity", 5,
"Infrastructure procurement", 5,
"Algorithmic Error & Bias", 2,
"Control: Privacy", 5,
"Accountability in Governance Structures", 3,
"Open procurement", 5,
"Use in decision-making", 1,
"Accountability", 1,
"External Control", 4,
"Internal Control", 2,
"Open Data", 2)
levels <- c("Open Data","Internal Control","External Control","Accountability",
"Use in decision-making","Open procurement","Accountability in Governance Structures","Control: Privacy",
"Algorithmic Error & Bias","Infrastructure procurement","In-house technical capacity",
"Regulation of hardware & software","Use of open-source technology","Innovation",
"Environmental impacts","Familiarity/Alignment",
"Internal use of technology","Internal engagement","Community data",
"External engagement - technology","External engagement - policies","Engagement - Inclusion")
df <- df %>% mutate(Domain = c(as.character((rep("Domain 1", 5))),
as.character(rep("Domain 2", 4)),
as.character(rep("Domain 3", 6)),
as.character(rep("Domain 4", 7))))
And for the ggplot:
df %>%
ggplot(aes(x = RespondentLevel, y = fct_rev(Question_Code))) +
geom_tile() +
theme_minimal(16)
The colours to fill, I'm using:
with each colour corresponding to a domain, and each shade to a level:
Greens <- c("#edf8e9", "#bae4b3", "#74c476", "#31a354", "#006d2c")
Reds <- c("#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15")
Yellows <- c("#ffffeb","#ffff9d","#ffff89", "#ffff4e", "#ffff14")
Blues <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd", "#08519c")
EDIT: geom_bar also does the trick, but not broken down by gradient. Trying to use this function:
ColourPalleteMulti <- function(df, group, subgroup){
# Find how many colour categories to create and the number of colours in each
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )), df, function(x) length(unique(x)))
category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
# Build Colour pallette
colours <- unlist(lapply(1:nrow(categories),
function(i){
colorRampPalette(colors = c(category.start[i], category.end[i]))(categories[i,2])}))
return(colours)
}
colours <- ColourPalleteMulti(df, "Domain", "RespondentLevel")
df %>%
ggplot(aes(x = fct_rev(Question_Code), y = RespondentLevel))+
geom_bar(stat = "identity", aes(fill = Domain), alpha = .9) +
coord_flip() +
theme_minimal(16)+
xlab(" ") +
ggtitle("Baseline Report Card Sample Community")+
scale_fill_manual("RespondentLevel", values = colours)+
theme(legend.title = element_text(size = 14),
legend.position = "none",
legend.text = element_text(size = 14),
plot.title = element_text(size=18, hjust = 0.5),
plot.caption = element_text(size = 12, hjust = 1),
axis.text.y = element_text(hjust = 0),
panel.grid = element_line(colour = "#F0F0F0"),
plot.margin = unit(c(1,1,0.5,1), "cm"))
Sorry for the long reprex, can adjust if possible

Here are a few options for tricks. First off, to get the full set of levels for each question so you don't have gaps in your data, I used tidyr::complete. That's the data frame I'll be working with.
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(patchwork)
df_full <- df %>%
complete(nesting(Domain, Question_Code), RespondentLevel) %>%
mutate(RespondentLevel = as.character(RespondentLevel))
The easier option is to approximate the gradients with changing the alpha, and setting the hue (red, green, etc) based on domain. This forfeits the other colors you've chosen, and just uses the last, darkest color of each palette.
To do this, I made a list of all your palettes. In setting the fill, map_chr(palettes, 5) extracts the 5th element of each list, which is the darkest color of each. You'll probably want to adjust or remove one or both of the legends.
palettes <- list(Greens, Reds, Yellows, Blues)
ggplot(df_full, aes(x = RespondentLevel, y = Question_Code, fill = Domain, alpha = RespondentLevel)) +
geom_tile() +
theme_minimal() +
facet_grid(rows = vars(Domain), scales = "free", space = "free") +
scale_fill_manual(values = map_chr(palettes, 5))
#> Warning: Using alpha for a discrete variable is not advised.
The more difficult way splits the data by domain and makes a list of plots, then puts them together with the patchwork package. The benefit is that you can keep the full color palettes, but the downside is that it's more difficult to control things like sizing that you get from facet_grid, which adjusts for the fact that there are more questions listed in some domains than in others. You could resize these by hand in plot_layout if you think this approach is worthwhile. You'll also need to adjust some theme elements to mimic what facet_grid would do.
plot_list <- df_full %>%
split(.$Domain) %>%
map2(palettes, function(domain_df, pal) {
ggplot(domain_df, aes(x = RespondentLevel, y = Question_Code, fill = RespondentLevel)) +
geom_tile() +
theme_minimal() +
scale_fill_manual(values = pal) +
theme(legend.position = "none") +
labs(x = NULL, y = NULL)
})
reduce(plot_list, `+`) +
plot_layout(ncol = 1)
Note that normally, patchwork puts plots together like plot1 + plot2 to mimic ggplot layering. Since I had the plots in a list, I did this with purrr::reduce.

Related

Ggplot - always place 'Total' bar as the farthest right bar using geom_col

I am creating a chart that looks like the below. Problem is that I'd like the grey 'total' bar to always be on the far right hand side.
Current code is below, can anyone please amend/provide any additional code to create this effect?
#plot with reorder
PrevalencePlot <- ggplot(ICSTable4, aes(x = reorder(value, Area), y = value, fill = Statistical_Significance)) +
geom_col() +
scale_fill_manual(values = colours)+
geom_errorbar(aes(ymin=errorbarlowerplot, ymax=errorbarhigherplot),
width=.2, # Width of the error bars
position=position_dodge(.9)) +
theme_bw() +
geom_text(aes(label = valuelabel), vjust = 2.5, colour = "black")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
If anyone is able to help then the below data frame could be used to generate the principle I think? Thank you!
df <- data.frame(Area = c("Area1", "Area2", "Area3", "Area4", "Total"),
Value = c(1, 3, 7, 5, 4)
)
Building on the minimal example data, we can make a spartanic version of the plot that addresses the question of ordering the values, and placing a selected column at the end.
df <- data.frame(Area = c("Area1", "Area2", "Area3", "Area4", "Total"),
value = c(1, 3, 7, 5, 4),
Statistical_Significance = c("higher", "lower", "lower", "higher", NA))
It's easier to create the order of the columns before plotting, as we need to create the factors based on the order of value and then reposition the target column ("Total").
df <- df %>%
dplyr::arrange(desc(value)) %>% #arrange by value
dplyr::mutate(Area = forcats::as_factor(Area)) %>% # factor that defines order on x-axis
dplyr::mutate(Area = forcats::fct_relevel(Area, "Total", after = Inf)) # reposition "Total" column
ggplot(df, aes(x = Area, y = value, fill = Statistical_Significance)) +
geom_col() +
theme_bw()

ggplot2 code runs and updates plot but no I see a bunch of numbers instead of data

I used the same code to plot my data last week, and got it all right.
I updated the data with new rows, so this is the only thing I changed, but then ggplot does not plot the data but a bunch of numbers as shown in the pictures I attached.
I'm sure it is something about how the data is coded, but when it happened last week, what I did was to use as.numeric, and that's it.
**# Read the data
experiment_6 <- read.csv("Data_Experiments.xlsx - experiment_6.csv", header=TRUE)
#clean the irrelevant rows
experiment_6_clean <- experiment_6[-c(1,2), ]
#check if it data.frame
is.data.frame(experiment_6_clean)
#remove Nas from dataset
#experiment_6_clean %>% drop_na()
#select relevant data
experiment_6_clean<- experiment_6_clean[1:517,]
#rename columns
experiment_6_clean<- dplyr::rename(experiment_6_clean,
mean_S_both_Par= X, plusSD_S_both_Par= X.1, minusSD_S_both_Par=X.2,
mean_S_both_Agl=X.3,plusSD_S_both_Agl= X.4, minusSD_S_both_Agl= X.5,
mean_M_both_Par= X.6,plusSD_M_both_Par= X.7, minusSD_M_both_Par= X.8,
mean_M_both_Agl= X.9, plusSD_M_both_Agl= X.10,minusSD_M_both_Agl= X.11,
mean_L_water_Sch= X.12, plusSD_L_water_Sch= X.13, minusSD_L_water_Sch= X.14,
mean_L_both_Sch= X.15, plusSD_L_both_Sch=X.16, minusSD_L_both_Sch= X.17,
Time= X1)
#make all relevant data numeric
Time<- as.numeric(experiment_6_clean$Time)
#small_pots_both_par------4"_2mm_BothSides_Par
mean_S_both_Par<- as.numeric(experiment_6_clean$mean_S_both_Par)
plusSD_S_both_Par<- as.numeric(experiment_6_clean$plusSD_S_both_Par)
minusSD_S_both_Par<- as.numeric(experiment_6_clean$minusSD_S_both_Par)
#small_pots_both_agl---4"_2mm_BothSides_Agl
mean_S_both_Agl<- as.numeric(experiment_6_clean$mean_S_both_Agl)
plusSD_S_both_Agl<- as.numeric(experiment_6_clean$plusSD_S_both_Agl)
minusSD_S_both_Agl<- as.numeric(experiment_6_clean$minusSD_S_both_Agl)
#medium_pots_both_par---6"_2mm_BothSides_Par
mean_M_both_Par<- as.numeric(experiment_6_clean$mean_M_both_Par)
plusSD_M_both_Par<- as.numeric(experiment_6_clean$plusSD_M_both_Par)
minusSD_M_both_Par<- as.numeric(experiment_6_clean$minusSD_M_both_Par)
#medium_pots_both_agl---6"_2mm_BothSides_Agl
mean_M_both_Agl<- as.numeric(experiment_6_clean$mean_M_both_Agl)
plusSD_M_both_Agl<- as.numeric(experiment_6_clean$plusSD_M_both_Agl)
minusSD_M_both_Agl<- as.numeric(experiment_6_clean$minusSD_M_both_Agl)
#large_pots_water_Sch---10"_4mm_WaterSide_Sch
mean_L_water_Sch<- as.numeric(experiment_6_clean$mean_L_water_Sch)
plusSD_L_water_Sch<- as.numeric(experiment_6_clean$plusSD_L_water_Sch)
minusSD_L_water_Sch<- as.numeric(experiment_6_clean$minusSD_L_water_Sch)
#large_pots_both_Sch---10"_5mm_BothSides_Sch
mean_L_both_Sch<- as.numeric(experiment_6_clean$mean_L_both_Sch)
plusSD_L_both_Sch<- as.numeric(experiment_6_clean$plusSD_L_both_Sch)
minusSD_L_both_Sch<- as.numeric(experiment_6_clean$minusSD_L_both_Sch)
experiment_6_clean<- as.data.frame(experiment_6_clean)
#plot
#create key
colors <- c(mean_S_both_Par = "light blue", mean_S_both_Agl = "red", mean_M_both_Par = "orange",
mean_M_both_Agl = "violet", mean_L_water_Sch = "pink", mean_L_both_Sch = "yellow")
#compare all six pots
all_six<- ggplot(experiment_6_clean, aes(x=Time))+
geom_smooth(aes(y=mean_S_both_Par, colour = "mean_S_both_Par"), size = 2, se=TRUE ) +
geom_smooth(aes(y=mean_S_both_Agl, colour = "mean_S_both_Agl"), size = 2, se=TRUE)+
geom_smooth(aes(y=mean_M_both_Par, colour = "mean_M_both_Par"), size = 2, se=TRUE)+
geom_smooth(aes(y=mean_M_both_Agl, colour = "mean_M_both_Agl"),size = 2, se=TRUE )+
geom_smooth(aes(y=mean_L_water_Sch, colour = "mean_L_water_Sch"),size = 2, se=TRUE)+
geom_smooth(aes(y=mean_L_both_Sch,colour = "mean_L_both_Sch"),size = 2, se=TRUE)+
labs(title="Experiment 6", subtitle="All Plants", caption="20 days",
y="Mositure Level", x="Time", color = "Group") +
scale_color_manual(values = colors)+
theme(plot.title=element_text(size=20, face="bold"), axis.text.x=element_text(size=15),
axis.text.y=element_text(size=15))+
coord_cartesian(ylim=c(-100, 150), xlim=c(0, 25))+
theme_bw()
all_six**
what I got this weekwhat I had last week

Difficulty positioning heatmap.2 components

I have been really struggling to position the components of my heatmap.2 output.
I found this old answer explaining how the element positioning worked from #IanSudbery which seemed really clear and I thought it had given me the understanding I need, but I'm still not grasping something.
I understand that the elements are all essentially put in a lattice of windows but they aren't behaving in a way I understand.
Here is my code and the current output (at the very bottom is the bit of interest which orders the figure elements):
for(i in 1:length(ConditionsAbbr)) {
# creates its own colour palette
my_palette <- colorRampPalette(c("snow", "yellow", "darkorange", "red"))(n = 399)
# (optional) defines the colour breaks manually for a "skewed" colour transition
col_breaks = c(seq(0,0.09,length=100), #white 'snow'
seq(0.1,0.19,length=100), # for yellow
seq(0.2,0.29,length=100), # for orange 'darkorange'
seq(0.3,1,length=100)) # for red
# creates a 5 x 5 inch image
png(paste(SourceDir, "Heatmap_", ConditionsAbbr[i], "XYZ.png"), # create PNG for the heat map
width = 5*600, # 5 x 600 pixels
height = 5*600,
res = 300, # 300 pixels per inch
pointsize = 8) # smaller font size
heatmap.2(ConditionsMtx[[ConditionsAbbr[i]]],
cellnote = ConditionsMtx[[ConditionsAbbr[i]]], # same data set for cell labels
main = paste(ConditionsAbbr[i], "XYZ"), # heat map title
notecol="black", # change font color of cell labels to black
density.info="none", # turns off density plot inside color legend
trace="none", # turns off trace lines inside the heat map
margins =c(12,9), # widens margins around plot
col=my_palette, # use on color palette defined earlier
breaks=col_breaks, # enable color transition at specified limits
dendrogram="none", # No dendogram
srtCol = 0 , #correct angle of label numbers
asp = 1 , #this overrides layout methinks and for some reason makes it square
adjCol = c(NA, -35) ,
adjRow = c(53, NA) ,
keysize = 1.2 ,
Colv = FALSE , #turn off column clustering
Rowv = FALSE , # turn off row clustering
key.xlab = paste("Correlation") ,
lmat = rbind( c(0, 3), c(2,1), c(0,4) ),
lhei = c(0.9, 4, 0.5) )
dev.off() # close the PNG device
}
This gives:
As you can see, the key is right of the matrix, there are huge amounts of white space between the matrix, the title above and key below, and it's not even as if the title and matrix are centred in the PNG?
I think to myself "well I'll just create a 3x3 that is easy to understand and edit" e.g.
| |
| | (3)
| |
--------------------------
| (1) |
(2) | Matrix |
| |
--------------------------
| (4) |
| Key |
| |
And then I can get rid of the white space so it's more like this.
| |(3)
------------------
| (1) |
(2)| Matrix |
| |
------------------
|(4) Key |
I do this using:
lmat = rbind( c(0, 0, 3), c(2, 1, 0), c(0, 4, 0) ),
lhei = c(0.9, 4, 0.5) ,
lwid = c(1, 4, 1))
This is what it looks like:
As great as it is to see my matrix in the centre, my key is still aligned to the right of my matrix and my title is taking the Silk Road East? Not to mention all the excess white space?
How do I get these to align and to all move together so the figure components fit snugly together?
EDIT: reducing my margins helped to reduce the whitespace but it's still excessive.
Here are the final changes I made to get my results, however, I would recommend using the advice of Maurits Evers if you aren't too invested in heatmap.2. Don't overlook the changes I made to the image dimensions.
# creates my own colour palette
my_palette <- colorRampPalette(c("snow", "yellow", "darkorange", "red"))(n = 399)
# (optional) defines the colour breaks manually for a "skewed" colour transition
col_breaks = c(seq(0,0.09,length=100), #white 'snow'
seq(0.1,0.19,length=100), # for yellow
seq(0.2,0.29,length=100), # for orange 'darkorange'
seq(0.3,1,length=100)) # for red
# creates an image
png(paste(SourceDir, "Heatmap_XYZ.png" )
# create PNG for the heat map
width = 5*580, # 5 x 580 pixels
height = 5*420, # 5 x 420 pixels
res = 300, # 300 pixels per inch
pointsize =11) # smaller font size
heatmap.2(ConditionsMtx[[ConditionsAbbr[i]]],
cellnote = ConditionsMtx[[ConditionsAbbr[i]]], # same data set for cell labels
main = "XYZ", # heat map title
notecol="black", # change font color of cell labels to black
density.info="none", # turns off density plot inside color legend
trace="none", # turns off trace lines inside the heat map
margins=c(0,0), # widens margins around plot
col=my_palette, # use on color palette defined earlier
breaks=col_breaks, # enable color transition at specified limits
dendrogram="none", # only draw a row dendrogram
srtCol = 0 , #correct angle of label numbers
asp = 1 , #this overrides layout methinks and for some reason makes it square
adjCol = c(NA, -38.3) , #shift column labels
adjRow = c(77.5, NA) , #shift row labels
keysize = 2 , #alter key size
Colv = FALSE , #turn off column clustering
Rowv = FALSE , # turn off row clustering
key.xlab = paste("Correlation") , #add label to key
cexRow = (1.8) , # alter row label font size
cexCol = (1.8) , # alter column label font size
notecex = (1.5) , # Alter cell font size
lmat = rbind( c(0, 3, 0), c(2, 1, 0), c(0, 4, 0) ) ,
lhei = c(0.43, 2.6, 0.6) , # Alter dimensions of display array cell heighs
lwid = c(0.6, 4, 0.6) , # Alter dimensions of display array cell widths
key.par=list(mar=c(4.5,0, 1.8,0) ) ) #tweak specific key paramters
dev.off()
Here is the output, which I will continue to refine until all spacing and font sizes suit my aesthetic preference. I would tell you exactly what I've done but I'm not 100% sure, frankly it all feels like it's held together with old gum and bailer twine, but don't kick a gift horse in the code, as they say.
I don't know if you're open to non-heatmap.2-based solutions. In my opinion ggplot offers greater flexibility and with a bit of tweaking you can reproduce a heatmap similar to the one you're showing quite comfortably while maximising plotting "real-estate" and avoiding excessive whitespace.
I'm happy to remove this post if you're only looking for heatmap.2 solutions.
That aside, a ggplot2 solution may look like this:
First off, let's generate some sample data
set.seed(2018)
df <- as_tibble(matrix(runif(7*10), ncol = 10), .name_repair = ~seq(1:10))
Prior to plotting we need to reshape df from wide to long
library(tidyverse)
df <- df %>%
rowid_to_column("row") %>%
gather(col, Correlation, -row) %>%
mutate(col = as.integer(col))
Then to plot
ggplot(df, aes(row, col, fill = Correlation)) +
geom_tile() +
scale_fill_gradientn(colours = my_palette) + # Use your custom colour palette
theme_void() + # Minimal theme
labs(title = "Main title") +
geom_text(aes(label = sprintf("%2.1f", Correlation)), size = 2) +
theme(
plot.title = element_text(hjust = 1), # Right-aligned text
legend.position="bottom") + # Legend at the bottom
guides(fill = guide_colourbar(
title.position = "bottom", # Legend title below bar
barwidth = 25, # Extend bar length
title.hjust = 0.5))
An example with multiple heatmaps in a grid layout via facet_wrap
First off, let's generate more complex data.
set.seed(2018)
df <- replicate(
4,
as_tibble(matrix(runif(7*10), ncol = 10), .name_repair = ~seq(1:10)), simplify = F) %>%
setNames(., paste("data", 1:4, sep = "")) %>%
map(~ .x %>% rowid_to_column("row") %>%
gather(col, Correlation, -row) %>%
mutate(col = as.integer(col))) %>%
bind_rows(.id = "data")
Then the plotting is identical to what we did before plus an additional facet_wrap(~data, ncol = 2) statement
ggplot(df, aes(row, col, fill = Correlation)) +
geom_tile() +
scale_fill_gradientn(colours = my_palette) + # Use your custom colour palette
theme_void() + # Minimal theme
labs(title = "Main title") +
geom_text(aes(label = sprintf("%2.1f", Correlation)), size = 2) +
facet_wrap(~ data, ncol = 2) +
theme(
plot.title = element_text(hjust = 1), # Right-aligned text
legend.position="bottom") + # Legend at the bottom
guides(fill = guide_colourbar(
title.position = "bottom", # Legend title below bar
barwidth = 25, # Extend bar length
title.hjust = 0.5))
One final update
I thought it'd be fun/interesting to see how far we can get towards a complex heatmap similar to the one you link to from the paper.
The sample data is included at the end, as this takes up a bit of space.
We first construct three different ggplot2 plot objects that show the main heatmap (gg3), an additional smaller heatmap with missing values (gg2), and a bar denoting group labels for every row (gg1).
gg3 <- ggplot(df.cor, aes(col, row, fill = Correlation)) +
geom_tile() +
scale_fill_distiller(palette = "RdYlBu") +
theme_void() +
labs(title = "Main title") +
geom_text(aes(label = sprintf("%2.1f", Correlation)), size = 2) +
scale_y_discrete(position = "right") +
theme(
plot.title = element_text(hjust = 1),
legend.position="bottom",
axis.text.y = element_text(color = "black", size = 10)) +
guides(fill = guide_colourbar(
title.position = "bottom",
barwidth = 10,
title.hjust = 0.5))
gg2 <- ggplot(df.flag, aes(col, row, fill = Correlation)) +
geom_tile(colour = "grey") +
scale_fill_distiller(palette = "RdYlBu", guide = F, na.value = "white") +
theme_void() +
scale_x_discrete(position = "top") +
theme(
axis.text.x = element_text(color = "black", size = 10, angle = 90, hjust = 1, vjust = 0.5))
gg1 <- ggplot(df.bar, aes(1, row, fill = grp)) +
geom_tile() +
scale_fill_manual(values = c("grp1" = "orange", "grp2" = "green")) +
theme_void() +
theme(legend.position = "left")
We can now use egg::ggarrange to position all three plots such that the y axis ranges are aligned.
library(egg)
ggarrange(gg1, gg2, gg3, ncol = 3, widths = c(0.1, 1, 3))
Sample data
library(tidyverse)
set.seed(2018)
nrow <- 7
ncol <- 20
df.cor <- matrix(runif(nrow * ncol, min = -1, max = 1), nrow = nrow) %>%
as_tibble(.name_repair = ~seq(1:ncol)) %>%
rowid_to_column("row") %>%
gather(col, Correlation, -row) %>%
mutate(
row = factor(
paste("row", row, sep = ""),
levels = paste("row", 1:nrow, sep = "")),
col = factor(
paste("col", col, sep = ""),
levels = paste("col", 1:ncol, sep = "")))
nrow <- 7
ncol <- 10
df.flag <- matrix(runif(nrow * ncol, min = -1, max = 1), nrow = nrow) %>%
as_tibble(.name_repair = ~seq(1:ncol)) %>%
rowid_to_column("row") %>%
gather(col, Correlation, -row) %>%
mutate(
row = factor(
paste("row", row, sep = ""),
levels = paste("row", 1:nrow, sep = "")),
col = factor(
paste("col", col, sep = ""),
levels = paste("col", 1:ncol, sep = ""))) %>%
mutate(Correlation = ifelse(abs(Correlation) < 0.5, NA, Correlation))
df.bar <- data.frame(
row = 1:nrow,
grp = paste("grp", c(rep(1, nrow - 3), rep(2, 3)), sep = "")) %>%
mutate(
row = factor(
paste("row", row, sep = ""),
levels = paste("row", 1:nrow, sep = "")))

Split dataframe and Create multipanel scatterplots from list of data frames

I have a dataframe like so:
set.seed(453)
year= as.factor(c(rep("1998", 20), rep("1999", 16)))
lepsp= c(letters[seq(from = 1, to = 20 )], c('a','b','c'),letters[seq(from =8, to = 20 )])
freq= c(sample(1:15, 20, replace=T), sample(1:18, 16,replace=T))
df<-data.frame(year, lepsp, freq)
df<-
df %>%
group_by(year) %>%
mutate(rank = dense_rank(-freq))
Frequencies freq of each lepsp within each year are ranked in the rank column. Larger freq values correspond to the smallest rank value and smaller freq values have the largest rank values. Some rankings are repeated if levels of lepsp have the same abundance.
I would like to split the df into multiple subsets by year. Then I would like to plot each subsetted dataframe in a multipanel figure. Essentially this is to create species abundance curves. The x-axis would be rank and the yaxis needs to be freq.
In my real dataframe I have 22 years of data. I would prefer the graphs to be displayed as 2 columns of 4 rows for a total of 8 graphs per page. Essentially I would have to repeat the solution offered here 3 times.
I also need to demarcate the 25%, 50% and 75% quartiles with vertical lines to look like this (desired result):
It would be great if each graph specified the year to which it belonged, but since all axis are the same name, I do not want x and y labels to be repeated for each graph.
I have tried to plot multiple lines on the same graph but it gets messy.
year.vec<-unique(df$year)
plot(sort(df$freq[df$year==year.vec[1]],
decreasing=TRUE),bg=1,type="b", ylab="Abundance", xlab="Rank",
pch=21, ylim=c(0, max(df$freq)))
for (i in 2:22){
points(sort(df$freq[df$year==year.vec[i]], decreasing=TRUE), bg=i,
type="b", pch=21)
}
legend("topright", legend=year.vec, pt.bg=1:22, pch=21)
I have also tried a loop, however it does not produce an output and is missing some of the arguments I would like to include:
jpeg('pract.jpg')
par(mfrow = c(6, 4)) # 4 rows and 2 columns
for (i in unique(levels(year))) {
plot(df$rank,df$freq, type="p", main = i)
}
dev.off()
Update
(Attempted result)
I found the following code after my post which gets me a little closer, but is still missing all the features I would like:
library(reshape2)
library(ggplot2)
library (ggthemes)
x <- ggplot(data = df2, aes(x = rank, y = rabun)) +
geom_point(aes(fill = "dodgerblue4")) +
theme_few() +
ylab("Abundance") + xlab("Rank") +
theme(axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15),
plot.title = element_blank(), # we don't want individual plot titles as the facet "strip" will give us this
legend.position = "none", # we don't want a legend either
panel.border = element_rect(fill = NA, color = "darkgrey", size = 1.25, linetype = "solid"),
axis.ticks = element_line(colour = 'darkgrey', size = 1.25, linetype = 'solid')) # here, I just alter to colour and thickness of the plot outline and tick marks. You generally have to do this when faceting, as well as alter the text sizes (= element_text() in theme also)
x
x <- x + facet_wrap( ~ year, ncol = 4)
x
I prefer base R to modify graph features, and have not been able to find a method using base R that meets all my criteria above. Any help is appreciated.
Here's a ggplot approach. First off, I made some more data to get the 3x2 layout:
df = rbind(df, mutate(df, year = year + 4), mutate(df, year = year + 8))
Then We do a little manipulation to generate the quantiles and labels by group:
df_summ =
df %>% group_by(year) %>%
do(as.data.frame(t(quantile(.$rank, probs = c(0, 0.25, 0.5, 0.75)))))
names(df_summ)[2:5] = paste0("q", 0:3)
df_summ_long = gather(df_summ, key = "q", value = "value", -year) %>%
inner_join(data.frame(q = paste0("q", 0:3), lab = c("Common", "Rare-75% -->", "Rare-50% -->", "Rare-25% -->"), stringsAsFactors = FALSE))
With the data in good shape, plotting is fairly simple:
library(ggthemes)
library(ggplot2)
ggplot(df, aes(x = rank, y = freq)) +
geom_point() +
theme_few() +
labs(y = "Abundance (% of total)", x = "Rank") +
geom_vline(data = df_summ_long[df_summ_long$q != "q0", ], aes(xintercept = value), linetype = 4, size = 0.2) +
geom_text(data = df_summ_long, aes(x = value, y = Inf, label = lab), size = 3, vjust = 1.2, hjust = 0) +
facet_wrap(~ year, ncol = 2)
There's some work left to do - mostly in the rarity text overlapping. It might not be such an issue with your actual data, but if it is you could pull the max y values into df_summ_long and stagger them a little bit, actually using y coordinates instead of just Inf to get it at the top like I did.

R plot: Uniform distance between ticks for non-uniform numbers

I am trying to recreate the basic temperature trend of this Paleotemperature figure in R. (Original image and data.)
The scale interval of the x-axis changes from 100s of millions of years to 10s of millions to millions, and then to 100s of thousands, and so on, but the ticks marks are evenly spaced. The original figure was carefully laid out in five separate graphs in Excel to achieve the spacing. I am trying to get the same x-axis layout in R.
I have tried two basic approaches. The first approach was to use par(fig=c(x1,x2,y1,y2)) to make five separate graphs placed side by side. The problem is that the intervals among tick marks is not uniform and labels overlap.
#1
par(fig=c(0,0.2,0,0.5), mar=c(3,4,0,0))
plot(paleo1$T ~ paleo1$Years, col='red3', xlim=c(540,60), bty='l',type='l', ylim=c(-6,15), ylab='Temperature Anomaly (°C)')
abline(0,0,col='gray')
#2
par(fig=c(0.185,0.4,0,0.5), mar=c(3,0,0,0), new=TRUE)
plot(paleo2$T ~ paleo2$Year, col='forestgreen', axes=F, type='l', xlim=c(60,5), ylab='', ylim=c(-6,15))
axis(1, xlim=c(60,5))
abline(0,0,col='gray')
#etc.
The second approach (and my preferred approach, if possible) is to plot the data in a single graph. This causes non-uniform distance among tick marks because they follow their "natural" order. (Edit: example data added as well as link to full data set.).
years <- c(500,400,300,200,100,60,50,40,30,20,10,5,4,3,2,1)
temps <- c(13.66, 8.6, -2.16, 3.94, 8.44, 5.28, 12.98, 8.6, 5, 5.34, 3.66, 2.65, 0.78, 0.25, -1.51, -1.77)
test <- data.frame(years, temps)
names(test) <- c('Year','T')
# The full csv file can be used with this line instead of the above.
# test <- read.csv('https://www.dropbox.com/s/u0dfmlvzk0ztpkv/paleo_test.csv?dl=1')
plot(test$T ~ test$Year, type='l', xaxt='n', xlim=c(520,1), bty='l', ylim=c(-5,15), xlab="", ylab='Temperature Anomaly (°C)')
ticklabels = c(500,400,300,200,100,60,50,40,30,20,10,5,4,3,2,1)
axis(1, at=ticklabels)
Adding log='x' to plot comes closest but the intervals between ticks are still not even and the actual scale is, of course, not a log scale.
My examples only go down to 1 million years because I am trying to solve the problem first but my the goal is to match the original figure above. I am open to ggplot solutions although I am only fleetingly familiar with it.
I will strike a different note by saying: don't. In my experience, the harder something is to do in ggplot2 (and to a lesser extent, base graphics), the less likely it is to be a good idea. Here, the problem is that consistently changing the scales like is more likely to lead the viewer astray.
Instead, I recommend using a log scale and manually setting your cutoffs.
First, here is some longer data, just to cover the full likely scale of your question:
longerTest <-
data.frame(
Year = rep(1:9, times = 6) * rep(10^(3:8), each = 9)
, T = rnorm(6*9))
Then, I picked some cutoffs to place the labels at in the plot. These can be adjusted to whatever you want, but are at least a starting point for reasonably spaced ticks:
forLabels <-
rep(c(1,2,5), times = 6) * rep(10^(3:8), each = 3)
Then, I manually set some things to append to the labels. Thus, instead of having to say "Thousands of years" under part of the axis, you can just label those with a "k". Each order of magnitude gets a value. Nnote that the names are just to help keep things straight: below I just use the index to match. So, if you skip the first two, you will need to adjust the indexing below.
toAppend <-
c("1" = "0"
, "2" = "00"
, "3" = "k"
, "4" = "0k"
, "5" = "00k"
, "6" = "M"
, "7" = "0M"
, "8" = "00M")
Then, I change my forLabels into the text versions I want to use by grabbing the first digit, and concatenating with the correct suffix from above.
myLabels <-
paste0(
substr(as.character(forLabels), 1, 1)
, toAppend[floor(log10(forLabels))]
)
This gives:
[1] "1k" "2k" "5k" "10k" "20k" "50k" "100k" "200k" "500k" "1M" "2M"
[12] "5M" "10M" "20M" "50M" "100M" "200M" "500M"
You could likely use these for base graphics, but getting the log scale to do what you want is sometimes tricky. Instead, since you said you are open to a ggplot2 solution, I grabbed this modified log scale from this answer to get a log scale that runs from big to small:
library("scales")
reverselog_trans <- function(base = exp(1)) {
trans <- function(x) -log(x, base)
inv <- function(x) base^(-x)
trans_new(paste0("reverselog-", format(base)), trans, inv,
log_breaks(base = base),
domain = c(1e-100, Inf))
}
Then, just pass in the data, and set the scale with the desired breaks:
ggplot(longerTest
, aes(x = Year
, y = T)) +
geom_line() +
scale_x_continuous(
breaks = forLabels
, labels = myLabels
, trans=reverselog_trans(10)
)
Gives:
Which has a consistent scale, but is labelled far more uniformly.
If you want colors, you can do that using cut:
ggplot(longerTest
, aes(x = Year
, y = T
, col = cut(log10(Year)
, breaks = c(3,6,9)
, labels = c("Thousands", "Millions")
, include.lowest = TRUE)
, group = 1
)) +
geom_line() +
scale_x_continuous(
breaks = forLabels
, labels = myLabels
, trans=reverselog_trans(10)
) +
scale_color_brewer(palette = "Set1"
, name = "How long ago?")
Here is a version using facet_wrap to create different scales. I used 6 here, but you can set whatever thresholds you want instead.
longerTest$Period <-
cut(log10(longerTest$Year)
, breaks = c(3, 4, 5, 6, 7, 8, 9)
, labels = paste(rep(c("", "Ten", "Hundred"), times = 2)
, rep(c("Thousands", "Millions"), each = 3) )
, include.lowest = TRUE)
longerTest$Period <-
factor(longerTest$Period
, levels = rev(levels(longerTest$Period)))
newBreaks <-
rep(c(2,4,6,8, 10), times = 6) * rep(10^(3:8), each = 5)
newLabels <-
paste0(
substr(as.character(newBreaks), 1, 1)
, toAppend[floor(log10(newBreaks))]
)
ggplot(longerTest
, aes(x = Year
, y = T
)) +
geom_line() +
facet_wrap(~Period, scales = "free_x") +
scale_x_reverse(
breaks = newBreaks
, labels = newLabels
)
gives:
Here is a start:
#define the panels
breaks <- c(-Inf, 8, 80, Inf)
test$panel <- cut(test$Year, breaks, labels = FALSE)
test$panel <- ordered(test$panel, levels = unique(test$panel))
#for correct scales
dummydat <- data.frame(Year = c(0, 8, 8, 80, 80, max(test$Year)),
T = mean(test$T),
panel = ordered(rep(1:3, each = 2), levels = levels(test$panel)))
library(ggplot2)
ggplot(test, aes(x = Year, y = T, color = panel)) +
geom_line() +
geom_blank(data = dummydat) + #for correct scales
facet_wrap(~ panel, nrow = 1, scales = "free_x") +
theme_minimal() + #choose a theme you like
theme(legend.position = "none", #and customize it
panel.spacing.x = unit(0, "cm"),
strip.text = element_blank() ,
strip.background = element_blank()) +
scale_x_reverse(expand = c(0, 0))
Here's a basic example of doing it with separate plots using gridExtra. This may be useful to combine with extra grobs, for instance to create the epoch boxes across the top (not done here). If so desired, this might be best combined with Roland's solution.
# ggplot with gridExtra
library('ggplot2')
library('gridExtra')
library('grid')
d1 <- test[1:5, ]
d2 <- test[6:11, ]
d3 <- test[12:16, ]
plot1 <- ggplot(d1, aes(y = T, x = seq(1:nrow(d1)))) +
geom_line() +
ylim(c(-5, 15)) +
theme_minimal() +
theme(axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,1), "cm")) +
scale_x_continuous(breaks=)
plot2 <- ggplot(d2, aes(y = T, x = seq(1:nrow(d2)))) +
geom_line() +
ylim(c(-5, 15)) +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,0), "cm"))
plot3 <- ggplot(d3, aes(y = T, x = seq(1:nrow(d3)))) +
geom_line() +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(1,0,1,0), "cm")) +
ylim(c(-5, 15))
# put together
grid.arrange(plot1, plot2, plot3, nrow = 1,
widths = c(1.5,1,1)) # allow extra width for first plot which has y axis

Resources