Create custom (equally spaced) bins in ggplot for data with gaps - r

I have data with large degrees of separation between "clusters/groups" of values that I hope to make a histogram with, but dividing the bins into equal sized groups has been difficult. I'd like for zero (0) to have it's own bin, the total number of equally spaced bins be < 8 (ideally, to avoid crowding the plot) with an extra empty bin for "..." signifying the large gaps in-between the data values. The actual dataset has 800+ zeros with maybe 5% data >0. Naturally the zeros will over-shadow the rest of the data, but a log transform will fix that. I just can't figure out the best way to break-up the data...
Data looks like this:
set.seed(123)
zero <- runif(50, min=0, max=0)
small <- runif(7, min=0, max=0.1)
medium <- runif(5, min=0, max=0.5)
high <- runif(3, min=1.5, max=2.5)
f <- function(x){
return(data.frame(ID=deparse(substitute(x)), value=x))
}
all <- bind_rows(f(zero), f(small), f(medium), f(high))
all <- as.data.frame(all[,-1])
names(all)[1] <- "value"
My attempt:
bins <- all %>% mutate(bin = cut(all$value, breaks = c(0, seq(0.01:0.4), Inf), right = FALSE)) %>%
count(bin, name = "freq") %>%
add_row(bin = "...", freq = NA_integer_) %>%
mutate(bin = fct_relevel(bin, "...", after = 0.4))
But I get this error:
Error in `mutate()`:
! Problem while computing `bin = fct_relevel(bin, "...", after = 0.5)`.
Caused by error:
! `idx` must contain one integer for each level of `f`
This is not equally spaced, but I'm looking for something like this as labels for my plot:
levels(bins$bin) <- c("0", "0.01-0.05", "0.05-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "...", "2.0+")
ggplot(bins, aes(x = bin, y = freq, fill = bin)) +
geom_histogram(stat = "identity", colour = "black")

You can use cut directly inside ggplot
ggplot(all, aes(cut(value, breaks = c(0, 0.25, 0.5, 3), inc = TRUE))) +
geom_bar() +
scale_y_log10() +
labs(x = "value")

This worked for me (using my own data):
bins <- WET %>% mutate(bin = cut(den, breaks = c(0, seq(0.001, 0.225, 0.15), 0.255, 0.3, Inf), right = FALSE)) %>%
count(bin, name = "freq") %>% # build frequency table, frequency = freq
add_row(bin = "...", freq = NA_integer_) %>% # add empty row for NA
mutate(bin = fct_relevel(bin, "...", after = 3)) # Put factor level "..." after 3! (the 3rd position)
levels(bins$bin) <- c("0", "0.001-0.15", "0.15-0.255", "...", "0.3+")
# fct_relevel(f, "a", after = 2), "..., after = x, x must be an integer! (2nd position)
ggplot(bins, aes(x = bin, y = freq, fill = bin)) +
geom_bar(stat = "identity", colour = "black") +
geom_text(aes(label = freq), vjust = -0.5) +
scale_y_continuous(limits = c(0, 800), expand = expansion(mult = c(0, 0.05))) +
scale_fill_brewer(name = "Density", palette="Greys", breaks = c("0", "0.001-0.15", "0.15-0.255", "0.3+")) +
# Only show these legend values (exclude "...")
labs(title = "Wet seasons - Pink shrimp density (no./m2)",x = "Density range", y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text = element_text(size = 9, face = "bold")) +
theme(axis.title = element_text(size = 13, face = "bold")) + # Axis titles
theme(axis.title.x = element_text(vjust = -3)) +
theme(panel.border = element_rect(color = "black", fill = NA, size = 1)) +
# Adjust distance of x-axis title from plot
theme(plot.margin = margin(t = 20, # Top margin
r = 50, # Right margin
b = 40, # Bottom margin
l = 10)) # Left margin

Related

Position stacked identity data sample size as geom_text directly over a bar using geom_bar from ggplot2

In this experiment, we tracked presence or absence of bacterial infection in our subject animals. We were able to isolate which type of bacteria was present in our animals and created a plot that has Week Since Experiment Start on the X axis, and Percentage of Animals Positive for bacterial infection on the Y axis. This is a stacked identity ggplot where each geom_bar contains the different identities of the bacteria that were in the infected animals each week. Here is a sample dataset with the corresponding ggplot code and result:
DummyData <- data.frame(matrix(ncol = 5, nrow = 78))
colnames(DummyData) <- c('WeeksSinceStart','BacteriaType','PositiveOccurences','SampleSize','NewSampleSize')
DummyData$WeeksSinceStart <- c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,9,10,10,10,10)
DummyData$BacteriaType <- c("BactA","BactB","BactD","BactB","BactE","BactA","BactS","BactF","BactE","BactH","BactJ","BactK","BactE","BactB","BactS","BactF","BactL","BactE","BactW","BactH","BactS","BactJ","BactQ","BactN","BactW","BactA","BactD","BactE","BactA","BactC","BactD","BactK","BactL","BactE","BactD","BactA","BactS","BactK","BactB","BactE","BactF","BactH","BactN","BactE","BactL","BactZ","BactE","BactC","BactR","BactD","BactJ","BactN","BactK","BactW","BactR","BactE","BactW","BactA","BactM","BactG","BactO","BactI","BactE","BactD","BactM","BactH","BactC","BactM","BactW","BactA","BactL","BactB","BactE","BactA","BactS","BactH","BactQ","BactF")
PosOcc <- seq(from = 1, to = 2, by = 1)
DummyData$PositiveOccurences <- rep(PosOcc, times = 13)
DummyData$SampleSize <- c(78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,29,29,29,29,29,10,10,10,10)
DummyData$NewSampleSize <- c(78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,29,NA,NA,NA,NA,10,NA,NA,NA)
numcolor <- 20
plotcolors <- colorRampPalette(brewer.pal(8, "Set3"))(numcolor)
#GGplot for Dummy Data
DummyDataPlot <- ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences/SampleSize, fill = BacteriaType)) + geom_bar(position = "stack", stat = "identity") +
geom_text(label = DummyData$NewSampleSize, nudge_y = 0.1) +
scale_y_continuous(limits = c(0,0.6), breaks = seq(0, 1, by = 0.1)) + scale_x_continuous(limits = c(0.5,11), breaks = seq(0,10, by =1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive") +
scale_fill_manual(values = plotcolors)
The problem: I cannot seem to find a way to position the labels from geom_text directly over each bar. I would also love to add the text "n = " to the sample size value directly over each bar. Thank you for your help!
I have tried different values for position_dodge statement and nudge_y statement with no success.
Sometimes the easiest approach is to do some data wrangling, i.e. one option would be to create a separate dataframe for your labels:
library(ggplot2)
library(dplyr)
dat_label <- DummyData |>
group_by(WeeksSinceStart) |>
summarise(y = sum(PositiveOccurences / SampleSize), SampleSize = unique(SampleSize))
ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences / SampleSize, fill = BacteriaType)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(data = dat_label, aes(x = WeeksSinceStart, y = y, label = SampleSize), inherit.aes = FALSE, nudge_y = .01) +
#scale_y_continuous(limits = c(0, 0.6), breaks = seq(0, 1, by = 0.1)) +
scale_x_continuous(limits = c(0.5, 11), breaks = seq(0, 10, by = 1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive"
) +
scale_fill_manual(values = plotcolors)

Plot a heat map with a gradient for the tiles based on the proximity of a specific area of the plot

My goal is to use a heat map to plot the data below. I have obtained the layout of the heat map I wanted, but I am struggling to find a way to color the tiles as intended. My guess is that the code below colors the tiles proportional to the number inside the tile itself. However, this is not what I need. I would like to:
have the same color within each of the rectangles (yellow for the rectangle on the left and blue for the rectangle on the right);
a gradient from yellow to blue for the other tiles proportional to how far away the tiles are from the blue area.
Thanks to anyone who will help!
library(tidyverse)
# First, I create the simulated dataset with 200 individuals
set.seed(1243) # set seed for reproducibility
#I simulate test 1 scores
test1_score <- sample(c(3.5, 3.8, 4), 200, replace = TRUE)
#I simulate test 2 scores
test2_score <- round(runif(200, 0, 38))
#I create diagnostic classes based on test 2 scores
test2_class <- ifelse(test2_score < 20, "non meet",
ifelse(test2_score < 30, "below 40th",
ifelse(test2_score < 35, "above 40th",
ifelse(test2_score < 37, "meet", "master"))))
#I create exit_program variable based on test 1 scorea and test 2 classes
exit_program <- ifelse(test1_score == 4 & test2_class %in% c("above 40th", "meet", "master"), 1, 0)
#I create data frame with simulated data
df <- data.frame(ID = 1:200, test1_score, test2_class, exit_program)
#I calculate the group size for each combination of test 1 scores and test 2 classes
df2 <- df |>
group_by(test1_score, test2_class, exit_program) |>
summarize(n = n()) |>
mutate(test2_class = factor(test2_class, levels = c("non meet", "below 40th", "above 40th", "meet", "master"))) |> arrange(test2_class)
#plot data
df2 |>
ggplot(aes(x = test2_class, y = as.factor(test1_score))) +
theme(legend.position = "none", panel.background = element_rect(fill = "white"),
axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
geom_tile(aes(fill = n)) +
geom_text(aes(label = n), size = 7, family = "sans") +
labs(x = "classification", y = "scores", size = 10) +
scale_fill_gradient(low = "#56B4E9", high = "#F0E442", guide = "none") +
geom_rect(aes(xmin = 2.5, xmax = 5.5, ymin = 2.5, ymax = 3.5), linewidth = 2, color = "#0072B2", fill = NA) +
geom_rect(aes(xmin = 0.5, xmax = 1.5, ymin = 0.5, ymax = 3.5), linewidth = 2, color = "#E69F00", fill = NA)
Maybe this is what you are looking for. You are right. Mapping n on fill means to color the tiles by the value of n.
As I understand the question you want each "column" to have the same color and colored with a gradient from yellow on the left to blue on the right. To this end you could add a column with a measure of the distance from the left or the right end to your data which could then be mapped on the fill aes. One option would be to convert your test2_class to a numeric then compute the absolute distance from the maximum value (which corresponds to the "master" level).
library(tidyverse)
df2 <- df2 |>
ungroup() |>
mutate(
fill = as.numeric(test2_class),
fill = abs(fill - max(fill))
)
#plot data
df2 |>
ggplot(aes(x = test2_class, y = as.factor(test1_score))) +
theme(legend.position = "none", panel.background = element_rect(fill = "white"),
axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
geom_tile(aes(fill = fill)) +
geom_text(aes(label = n), size = 7, family = "sans") +
labs(x = "classification", y = "scores", size = 10) +
scale_fill_gradient(low = "#56B4E9", high = "#F0E442", guide = "none") +
geom_rect(aes(xmin = 2.5, xmax = 5.5, ymin = 2.5, ymax = 3.5), linewidth = 2, color = "#0072B2", fill = NA) +
geom_rect(aes(xmin = 0.5, xmax = 1.5, ymin = 0.5, ymax = 3.5), linewidth = 2, color = "#E69F00", fill = NA)

SHAP Summary Plot for XGBoost model in R without displaying Mean Absolute SHAP value on the plot

I don't want to display the Mean Absolute Values on my SHAP Summary Plot in R. I want an output similar to the one produced in python. What line of code will help remove the mean absolute values from the summary plot in R?
I'm currently using this line of code:
shap.plot.summary.wrap1(xgb_model, X = x, top_n = 10)
You can do this by sligtly modifying the source code of shap.plot.summary() as below:
shap.plot.summary.edited <- function(data_long,
x_bound = NULL,
dilute = FALSE,
scientific = FALSE,
my_format = NULL){
if (scientific){label_format = "%.1e"} else {label_format = "%.3f"}
if (!is.null(my_format)) label_format <- my_format
# check number of observations
N_features <- setDT(data_long)[,uniqueN(variable)]
if (is.null(dilute)) dilute = FALSE
nrow_X <- nrow(data_long)/N_features # n per feature
if (dilute!=0){
# if nrow_X <= 10, no dilute happens
dilute <- ceiling(min(nrow_X/10, abs(as.numeric(dilute)))) # not allowed to dilute to fewer than 10 obs/feature
set.seed(1234)
data_long <- data_long[sample(nrow(data_long),
min(nrow(data_long)/dilute, nrow(data_long)/2))] # dilute
}
x_bound <- if (is.null(x_bound)) max(abs(data_long$value))*1.1 else as.numeric(abs(x_bound))
plot1 <- ggplot(data = data_long) +
coord_flip(ylim = c(-x_bound, x_bound)) +
geom_hline(yintercept = 0) + # the y-axis beneath
# sina plot:
ggforce::geom_sina(aes(x = variable, y = value, color = stdfvalue),
method = "counts", maxwidth = 0.7, alpha = 0.7) +
# print the mean absolute value:
#geom_text(data = unique(data_long[, c("variable", "mean_value")]),
# aes(x = variable, y=-Inf, label = sprintf(label_format, mean_value)),
# size = 3, alpha = 0.7,
# hjust = -0.2,
# fontface = "bold") + # bold
# # add a "SHAP" bar notation
# annotate("text", x = -Inf, y = -Inf, vjust = -0.2, hjust = 0, size = 3,
# label = expression(group("|", bar(SHAP), "|"))) +
scale_color_gradient(low="#FFCC33", high="#6600CC",
breaks=c(0,1), labels=c(" Low","High "),
guide = guide_colorbar(barwidth = 12, barheight = 0.3)) +
theme_bw() +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(), # remove axis line
legend.position="bottom",
legend.title=element_text(size=10),
legend.text=element_text(size=8),
axis.title.x= element_text(size = 10)) +
# reverse the order of features, from high to low
# also relabel the feature using `label.feature`
scale_x_discrete(limits = rev(levels(data_long$variable))#,
#labels = label.feature(rev(levels(data_long$variable)))
)+
labs(y = "SHAP value (impact on model output)", x = "", color = "Feature value ")
return(plot1)
}

Reorder and split the ggplot heatmap based on the clusters in one of the columns

I generated a heatmap with ggplot, and order the samples by using hclust, However, I still need more reordering to get all the similar values corespondent with one of the samples in the ordered cluster. Here I generate a samples data to explain better.
set.seed(99)
M <- data.frame(names = paste0("g", seq(1,30)), S1 = runif(30, 0 , 8), S2 = runif(30, -4, 5), S3 = runif(30, -5, 5))
M.mat <- M %>%
tibble::column_to_rownames('names') %>%
as.matrix()
M.dendro <- as.dendrogram(hclust(d = dist(x = M.mat)))
dendro.plot <- ggdendrogram(data = M.dendro, rotate = TRUE) +
theme(axis.text.y = element_text(size = 6))
print(dendro.plot)
str(M.dendro)
dend.order <- order.dendrogram(M.dendro)
df <- melt(M, id.vars = "names")
df$names <- factor(x = df$names,
levels = M$names[dend.order],
ordered = TRUE)
ggplot(df, aes(x = names, y = variable, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = muted("steelblue"), mid = "white", high = muted("red3"),
midpoint = 0, space = "Lab", na.value = "grey50",
guide = "colourbar", aesthetics = "fill"
) +
theme(axis.text.x = element_text(angle = 90, hjust=1), legend.key.size = unit(0.4, "cm")) +
coord_fixed()
For the generated heatmap, I need reorder it such that all the dark blue be on the bottom, the middle color and then the red on the top based on samples S3. Thank you

How to draw both positive mirror bar graph in r?

I would like to draw a graph similar to the image here:
I tried to find similar mirror bar graphs on google, but I could not find similar graph to the image above.
Tricky parts of the graph are that 1) both +ve and -ve y axis have positive values, and 2) both +ve and -ve y axis have different y-axis labellings.
Thank you in advance for your help.
This is as close as I could get so far to that graph.
It's really tricky.
The Y axis has to be positive on the negative side
On the negative side numbers have to look 5 times smaller because of the number on the Y axis being 5 times smaller [from 1 to 5 instead of 1 to 25]
uncertainty bars need to drawn
X labels are doubled
What I couldn't do:
set up the Y axis names in a proper manner, [if anyone knows and can help..!]
understand what a and b are and with which logic to place them [you need to explain this one better]
library(dplyr)
library(ggplot2)
# your data
n <- 100
set.seed(42)
df <- tibble(var1 = factor(rep(c("Mamou", "Crowley"), each = 8 * n), levels = c("Mamou", "Crowley"), ordered = TRUE),
var2 = factor(rep(c("RWW-M1", "RWW-M2", "RWW-C1", "RWW-C2"), each = 4* n), levels = c("RWW-M1", "RWW-M2", "RWW-C1", "RWW-C2"), ordered = TRUE),
var3 = factor(rep(rep(c("Shoot dry weight (g)", "Root dry weight (g)"), each = 2*n), 4), levels = c("Shoot dry weight (g)", "Root dry weight (g)"), ordered = TRUE),
varc = rep(rep(c("white", "black"), each = n), 8),
value = abs(c(
rnorm(2*n, mean = 5 , sd = 0.2),
rnorm(2*n, mean = 3 , sd = 0.04),
rnorm(2*n, mean = 15 , sd = 0.2),
rnorm(2*n, mean = 4 , sd = 0.04),
rnorm(2*n, mean = 5 , sd = 0.2),
rnorm(2*n, mean = 2.5, sd = 0.04),
rnorm(2*n, mean = 5 , sd = 0.2),
rnorm(2*n, mean = 2.5, sd = 0.04))))
# edit your data this way [a little trick to set bars up and down the line and make them look like 5 times bigger]
df <- df %>% mutate(value = if_else(var3 == "Root dry weight (g)", -value*5, value))
# calculate statistics you want to plot
df <- df %>%
group_by(var1, var2, var3, varc) %>%
summarise(mean = mean(value), min = min(value), max = max(value)) %>%
ungroup()
df %>%
ggplot(aes(x = var2)) +
# plot dodged bars
geom_col(aes(y = mean, fill = varc),
position = position_dodge(width = 0.75),
colour = "black", width = 0.5) +
# plot dodged errorbars
geom_errorbar(aes(ymin = min, ymax = max, group = varc),
position = position_dodge(width = 0.75), width = 0.2, size = 1) +
# make line on zero more visible
geom_hline(aes(yintercept = 0)) +
# set up colour of the bars, don't show legend
scale_fill_manual(values = c("white", "gray75"), guide = FALSE) +
# set up labels of y axis
# dont change positive, make negative look positive and 5 times smaller
# set up breaks every 5 [ggplot will calc labels after breaks]
scale_y_continuous(labels = function(x) if_else(x<0, -x/5, x),
breaks = function(x) as.integer(seq(x[1]-x[1]%%5, x[2]-x[2]%%5, 5))) +
# put labels and x axis on top
scale_x_discrete(position = "top") +
# set up var1 labels on top
facet_grid( ~ var1, space = 'free', scales = 'free') +
# show proper axis names
labs(x = "", y = "Root dry weight (g) Shoot dry weight (g)") +
# set up theme
theme_classic() +
theme(axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid = element_blank(),
# this is to put names of facet grid on top
strip.placement = 'outside',
# this is to remove background from labels on facet grid
strip.background = element_blank(),
# this is to make facets close to each other
panel.spacing.x = unit(0,"line"))
Something like this perhaps?
library(ggplot2)
df <- data.frame(x = rep(letters[1:3], each = 4),
y = c(2, -2, 3, -3, 4, -4, 5, -5, 2, -2, 3, -3),
dodgegroup = factor(rep(rep(1:2, each = 2), 3)))
ggplot(df, aes(x, y, fill = dodgegroup)) +
geom_col(position = position_dodge(width = 0.75),
colour = "black", width = 0.5) +
geom_hline(aes(yintercept = 0)) +
scale_fill_manual(values = c("white", "gray75")) +
scale_y_continuous(breaks = 0:10 - 5,
labels = c(5:0, 5 * 1:5)) +
theme_classic()
Created on 2020-08-07 by the reprex package (v0.3.0)
Try this. While the answer by Edo looks most like what you have asked for, this method does not need you to transform your data. However, the scale on both sides of the axis are the same.
Call geom_col twice but with - before the values for Root, then we use labels=abs to make both sides of the y-axis positive numbers:
Edit - fixed the y-axis
library(ggplot2)
df <- data.frame(x = rep(c("RWW-M1", "RWW-M2", "RWW-C1", "RWW-C2"), each = 2),
Shoot = c(5, 6, 7, 8, 4, 5, 5, 7),
Root = c(1, 2, 3, 4, 2, 3, 1, 2),
Condition = rep(c("control", "test"), each = 1))
p <- ggplot(df, aes(x=x, fill=Condition)) +
geom_col(aes(y=Shoot), position = position_dodge(width = 0.75), width = 0.5, colour = "black")+
geom_col(aes(y=-Root), position = position_dodge(width = 0.75), width = 0.5, colour = "black")+
geom_hline(aes(yintercept = 0)) +
scale_fill_manual(values = c("white", "gray75")) +
ylab("Root weight (g) / Shoot weight (g)")+
xlab("")+
scale_y_continuous(breaks = 0:15 - 5, labels=abs) +
theme_bw()
p

Resources