ggplot2: How to get dots to group with violin plots? - r

I would like to plot violin plots where the x axis is the exon, but I want the plots to be grouped. This works if it's just violins; however, when I add the jittered points, for some reason they do not respect the grouping aesthetic and are plotted on their own. Here is reproducible code with a screenshot of the error. Thanks!
# Simulated data: 200 values per exon, split into groups of 75 / 75 / 50.
set.seed(1)
df <- data.frame(
  exons = rep(c("e1", "e2"), each = 200),
  values = rnorm(400, mean = 200, sd = 40),
  group = rep(rep(c("g1", "g2", "g3"), times = c(75, 75, 50)), times = 2)
)
# Violins filled by group, with the raw observations jittered on top.
ggplot(df, aes(x = exons, y = values, fill = group)) +
  geom_violin() +
  geom_jitter(shape = 16, position = position_jitter(width = 0.07))
So if the plot worked, the dots should have been plotted within each group for each exon; however, here they clearly are not.

You probably want both position_dodge() and position_jitterdodge()
library(ggplot2)
# Violins and points share the same dodge width (0.9);
# position_jitterdodge() jitters the points in addition to dodging them.
ggplot(df, aes(x = exons, y = values, fill = group)) +
  geom_violin(position = position_dodge(width = 0.9)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.9, seed = 1))
Another option worth mentioning is geom_quasirandom() function from the ggbeeswarm package
library(ggbeeswarm)
# Same dodged violins, with points from ggbeeswarm's geom_quasirandom()
# using the same dodge width.
ggplot(df, aes(x = exons, y = values, fill = group)) +
  geom_violin(position = position_dodge(width = 0.9)) +
  geom_quasirandom(varwidth = TRUE, dodge.width = 0.9)
Created on 2019-08-10 by the reprex package (v0.3.0)

Do you mean something like that?
# Same simulated data as in the question.
set.seed(1)
df <- data.frame(
  exons = rep(c("e1", "e2"), each = 200),
  values = rnorm(400, mean = 200, sd = 40),
  group = rep(rep(c("g1", "g2", "g3"), times = c(75, 75, 50)), times = 2)
)

# `width` and `height` set the amount of jitter in the x and y direction.
ggplot(df, aes(x = exons, y = values, fill = group)) +
  geom_violin() +
  geom_jitter(
    shape = 16,
    position = position_jitter(width = NULL, height = NULL)
  )
You can define the degree of jitter in x and y direction.

Related

Position stacked identity data sample size as geom_text directly over a bar using geom_bar from ggplot2

In this experiment, we tracked presence or absence of bacterial infection in our subject animals. We were able to isolate which type of bacteria was present in our animals and created a plot that has Week Since Experiment Start on the X axis, and Percentage of Animals Positive for bacterial infection on the Y axis. This is a stacked identity ggplot where each geom_bar contains the different identities of the bacteria that were in the infected animals each week. Here is a sample dataset with the corresponding ggplot code and result:
# Empty 78-row, 5-column frame that the column assignments below fill in.
DummyData <- data.frame(matrix(ncol = 5, nrow = 78))
colnames(DummyData) <- c('WeeksSinceStart','BacteriaType','PositiveOccurences','SampleSize','NewSampleSize')
# Week number (1-10) for each of the 78 rows.
DummyData$WeeksSinceStart <- c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,9,10,10,10,10)
# Bacteria label for each row.
DummyData$BacteriaType <- c("BactA","BactB","BactD","BactB","BactE","BactA","BactS","BactF","BactE","BactH","BactJ","BactK","BactE","BactB","BactS","BactF","BactL","BactE","BactW","BactH","BactS","BactJ","BactQ","BactN","BactW","BactA","BactD","BactE","BactA","BactC","BactD","BactK","BactL","BactE","BactD","BactA","BactS","BactK","BactB","BactE","BactF","BactH","BactN","BactE","BactL","BactZ","BactE","BactC","BactR","BactD","BactJ","BactN","BactK","BactW","BactR","BactE","BactW","BactA","BactM","BactG","BactO","BactI","BactE","BactD","BactM","BactH","BactC","BactM","BactW","BactA","BactL","BactB","BactE","BactA","BactS","BactH","BactQ","BactF")
# Alternating 1, 2 counts. rep() yields only 26 values here; the data-frame
# column assignment recycles them to fill all 78 rows (78 is a multiple of 26).
PosOcc <- seq(from = 1, to = 2, by = 1)
DummyData$PositiveOccurences <- rep(PosOcc, times = 13)
# Sample size repeated on every row of a week.
DummyData$SampleSize <- c(78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,29,29,29,29,29,10,10,10,10)
# Sample size given once per run of rows (NA elsewhere); appears intended as
# one label value per week.
DummyData$NewSampleSize <- c(78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,29,NA,NA,NA,NA,10,NA,NA,NA)
# Build a 20-colour palette by interpolating the 8-colour "Set3" palette.
# brewer.pal() is namespace-qualified so this runs without RColorBrewer
# being attached via library().
numcolor <- 20
plotcolors <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set3"))(numcolor)

# GGplot for Dummy Data: stacked bars of per-row proportions by week.
DummyDataPlot <- ggplot(
  DummyData,
  aes(x = WeeksSinceStart, y = PositiveOccurences / SampleSize, fill = BacteriaType)
) +
  geom_bar(position = "stack", stat = "identity") +
  # Labels are positioned at each row's own y value, not at the bar top.
  geom_text(label = DummyData$NewSampleSize, nudge_y = 0.1) +
  scale_y_continuous(limits = c(0, 0.6), breaks = seq(0, 1, by = 0.1)) +
  scale_x_continuous(limits = c(0.5, 11), breaks = seq(0, 10, by = 1)) +
  labs(
    x = "Weeks Since Start",
    y = "Proportion Positive"
  ) +
  scale_fill_manual(values = plotcolors)
The problem: I cannot seem to find a way to position the labels from geom_text directly over each bar. I would also love to add the text "n = " to the sample size value directly over each bar. Thank you for your help!
I have tried different values for position_dodge statement and nudge_y statement with no success.
Sometimes the easiest approach is to do some data wrangling, i.e. one option would be to create a separate dataframe for your labels:
library(ggplot2)
library(dplyr)
# One row per week: total bar height (sum of the stacked proportions) and
# that week's sample size, so each bar gets exactly one label.
dat_label <- DummyData |>
  group_by(WeeksSinceStart) |>
  summarise(
    y = sum(PositiveOccurences / SampleSize),
    SampleSize = unique(SampleSize)
  )

ggplot(
  DummyData,
  aes(x = WeeksSinceStart, y = PositiveOccurences / SampleSize, fill = BacteriaType)
) +
  geom_bar(position = "stack", stat = "identity") +
  # Labels come from the summary table and carry the requested "n = " prefix.
  geom_text(
    data = dat_label,
    aes(x = WeeksSinceStart, y = y, label = paste0("n = ", SampleSize)),
    inherit.aes = FALSE,
    nudge_y = .01
  ) +
  # scale_y_continuous(limits = c(0, 0.6), breaks = seq(0, 1, by = 0.1)) +
  scale_x_continuous(limits = c(0.5, 11), breaks = seq(0, 10, by = 1)) +
  labs(
    x = "Weeks Since Start",
    y = "Proportion Positive"
  ) +
  scale_fill_manual(values = plotcolors)

ggplot line plot with one group's lines on top

I am making a line plot of several groups and want to make a visualization where one group's lines are highlighted
# One line per participant, coloured by the literal colour stored in `color`.
ggplot(df) +
  geom_line(
    aes(x = timepoint, y = var, group = participant_id, color = color)
  ) +
  scale_color_identity(
    labels = c(red = "g1", gray90 = "Other"),
    guide = "legend"
  )
However, the group lines are partially obscured by the other groups lines
How can I make these lines always on top of other groups lines?
The simplest way to do this is to plot the gray and red groups on different layers.
First, let's try to replicate your problem with a dummy data set:
# Dummy data: 50 participants measured at 25 time points each (1250 rows).
set.seed(1)
df <- data.frame(
  participant_id = rep(1:50, each = 25),
  timepoint = factor(rep(0:24, 50)),
  # One 25-point curve per participant; replicate() returns a 25 x 50 matrix
  # that c() flattens participant by participant.
  var = c(replicate(50, runif(1, 50, 200) + runif(25, 0.3, 1.5) *
    sin(0:24 / (0.6 * pi))^2 / seq(0.002, 0.005, length.out = 25))),
  # One colour per participant, repeated for each of that participant's 25
  # rows so the colour column lines up with participant_id.
  color = rep(sample(c("red", "gray90"), 50, TRUE, prob = c(1, 9)), each = 25)
)
Now we apply your plotting code:
library(ggplot2)
# All lines in a single layer, coloured by the literal colour in `color`.
ggplot(df) +
  geom_line(
    aes(x = timepoint, y = var, group = participant_id, color = color)
  ) +
  scale_color_identity(
    labels = c(red = "g1", gray90 = "Other"),
    guide = "legend"
  ) +
  theme_classic()
This looks broadly similar to your plot. If instead we plot in different layers, we get:
# Draw the gray lines first and the red lines in a later layer, so the
# highlighted group is always painted on top.
ggplot(df, aes(timepoint, var, group = participant_id)) +
  geom_line(data = df[df$color == "gray90", ], aes(color = "Other")) +
  geom_line(data = df[df$color == "red", ], aes(color = "g1")) +
  # Named values tie each colour to its legend key regardless of how the
  # keys happen to sort.
  scale_color_manual(values = c(g1 = "red", Other = "gray90")) +
  theme_classic()
Created on 2022-06-20 by the reprex package (v2.0.1)
You can use factor releveling to bring the line (-s) of interest to front.
First, let's plot the data as is, with the red line partly hidden by others.
library(ggplot2)
library(dplyr)
# 20 participants ("p_1" ... "p_20") with 100 time points each; only the
# first participant's rows are red.
set.seed(13)
df <- data.frame(
  timepoint = rep(1:100, times = 20),
  participant_id = paste0("p_", rep(1:20, each = 100)),
  var = abs(rnorm(2000, mean = 200, sd = 50) - 200),
  color = rep(c("red", "gray90"), times = c(100, 1900))
)
# Data plotted as is, before any releveling of participant_id.
ggplot(df) +
  geom_line(
    aes(x = timepoint, y = var, group = participant_id, color = color)
  ) +
  scale_color_identity(
    labels = c(red = "g1", gray90 = "Other"),
    guide = "legend"
  )
Now let's bring p_1 to front by making it the last factor level.
# Move "p_1" to the first factor level, then reverse the levels so that it
# ends up last before plotting.
df %>%
  mutate(
    participant_id = factor(participant_id),
    participant_id = relevel(participant_id, ref = "p_1"),
    participant_id = factor(participant_id, levels = rev(levels(participant_id)))
  ) %>%
  ggplot() +
  geom_line(
    aes(x = timepoint, y = var, group = participant_id, color = color)
  ) +
  scale_color_identity(
    labels = c(red = "g1", gray90 = "Other"),
    guide = "legend"
  )

unable to show axis tick label in ggplot in R

I'm trying to plot a stacked bar chart with ggplot. Below is my code:
# X and Y are both numeric 0/1 columns; N holds the bar heights.
df <- data.frame(
  Y = c(0, 0, 1, 1),
  X = c(0, 1, 0, 1),
  N = c(200, 50, 300, 70)
)

ggplot(data = df, aes(x = X, y = N, fill = Y)) +
  geom_bar(stat = "identity", position = "stack", width = 0.7) +
  scale_x_discrete(name = "", breaks = c(0, 1), labels = c("No", "Yes")) +
  theme(legend.position = "none")
I want to show 'No' and 'Yes' as tick labels on the x axis, but nothing shows up. Does anyone know why my tick labels do not show up? I do not understand what I did wrong.
You just need to insert factor(X) to make X discrete rather than continuous:
library(tidyverse)
df <- data.frame(
  Y = c(0, 0, 1, 1),
  X = c(0, 1, 0, 1),
  N = c(200, 50, 300, 70)
)

# factor(X) turns the numeric X into a discrete variable for the x axis.
ggplot(data = df, aes(x = factor(X), y = N, fill = Y)) +
  geom_bar(stat = "identity", position = "stack", width = 0.7) +
  scale_x_discrete(name = "", breaks = c(0, 1), labels = c("No", "Yes")) +
  theme(legend.position = "none")
Created on 2022-06-14 by the reprex package (v2.0.1)
An easier way to do it may be to do the data transformation outside the ggplot function:
library(ggplot2)
library(dplyr)
df <- data.frame(
  Y = c(0, 0, 1, 1),
  X = c(0, 1, 0, 1),
  N = c(200, 50, 300, 70)
)

# Recode X to "No"/"Yes" before plotting, so no x scale is needed.
df %>%
  mutate(X = if_else(X == 0, "No", "Yes")) %>%
  ggplot(aes(x = X, y = N, fill = Y)) +
  geom_bar(stat = "identity", position = "stack", width = 0.7) +
  theme(legend.position = "none")

How to overlay facet labels according to y-axis value on plots made with ggplot2?

I created a heat map that represents the usage time of two products (A and B) that are available in colors C1 and C2. According to the time of use, it is possible to classify how the product was used (good, regular or bad). Within the usage classification there are categories that overlap as a function of time, as described below:
Good: use time greater than or equal to 280 minutes.
Regular: use time between 150 and 350 minutes.
Bad: use time less than or equal to 10 minutes.
I want to create facets for the categories good, regular and bad, without completely separating the facets but overlapping them as shown in the second image below. The attempts I've made have been unsatisfactory. The final aesthetics of the heat map need not be exactly the same as the one shown in the second image, what is necessary is to correctly indicate the classification.
library(ggplot2)
# Every combination of product, colour and usage time (2 x 2 x 8 = 32 rows).
Product <- c("A", "B")
Color <- c("C1", "C2")
Time <- seq(10, 430, by = 60)
df <- expand.grid(Time = Time, Color = Color, Product = Product)
df$Fill_factor <- seq(1, 32, by = 1)

# Usage class per row. Assignments run from lowest to highest priority, so a
# later one overrides an earlier one where ranges overlap:
# "Bad" beats "Regular", which beats "Good".
df$Usage <- "Without classification"
df$Usage[df$Time >= 280] <- "Good"
df$Usage[df$Time >= 150 & df$Time <= 350] <- "Regular"
df$Usage[df$Time <= 10] <- "Bad"
# Heat map of Fill_factor, one panel per colour.
ggplot(data = df, aes(x = Product, y = Time, fill = Fill_factor)) +
  geom_tile() +
  geom_text(aes(label = Fill_factor), size = 2.5) +
  facet_grid(~ Color) +
  scale_y_continuous(breaks = seq(10, 430, 60))

# Fail
# Same heat map, additionally faceted into rows by Usage.
ggplot(data = df, aes(x = Product, y = Time, fill = Fill_factor)) +
  geom_tile() +
  geom_text(aes(label = Fill_factor), size = 2.5) +
  facet_grid(Usage ~ Color) +
  scale_y_continuous(breaks = seq(10, 430, 60))
I don't think there is any way to solve this using facets as they exist now. However, you can annotate a few rectangles with text outside the plotting area by setting coord_cartesian(clip = "off") with the right out-of-bounds argument in the scales. Unfortunately, this works better with a continuous x-axis, but you can of course make 'pseudodiscrete' scales.
library(ggplot2)
# Same grid of product, colour and usage time as in the question.
Product <- c("A", "B")
Color <- c("C1", "C2")
Time <- seq(10, 430, by = 60)
df <- expand.grid(Time = Time, Color = Color, Product = Product)
df$Fill_factor <- seq(1, 32, by = 1)

# Make annotation data.frame
# One rectangle per usage class, starting at x = max_x (number of products
# plus 0.6) in the "C2" panel.
max_x <- length(unique(df$Product)) + 0.6
anno <- data.frame(
  Color = "C2",
  Usage = c("Good", "Regular", "Bad"),
  xmin = max_x,
  xmax = max_x + c(0.5, 0.25, 0.25),
  ymin = c(280, 160, -20),
  ymax = c(460, 340, 40)
)
# Product is mapped to its position among the sorted unique products, so the
# x axis is continuous and the annotations can sit beyond the last product.
ggplot(
  data = df,
  aes(
    x = match(Product, sort(unique(Product))),
    y = Time,
    fill = Fill_factor
  )
) +
  geom_tile() +
  geom_text(aes(label = Fill_factor), size = 2.5) +
  # Adding the rectangles from annotation data
  geom_rect(
    data = anno,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = c("limegreen", "gold", "tomato"),
    inherit.aes = FALSE
  ) +
  # Adding text from annotation data, centred in each rectangle
  geom_text(
    data = anno,
    aes(
      x = (xmin + xmax) / 2,
      y = (ymin + ymax) / 2,
      label = Usage
    ),
    inherit.aes = FALSE,
    angle = 270
  ) +
  facet_grid(~ Color) +
  scale_y_continuous(breaks = seq(10, 430, 60)) +
  # Set breaks and labels as they would be for discrete scale
  scale_x_continuous(
    limits = c(0.5, 2.5),
    breaks = seq_along(unique(df$Product)),
    labels = sort(unique(df$Product)),
    oob = scales::oob_keep
  ) +
  coord_cartesian(clip = "off") +
  theme(legend.box.margin = margin(l = 40))
Created on 2021-08-07 by the reprex package (v1.0.0)

R: ggplot customize transform axis label to reduce zeros

In my following code:
library(ggplot2)
library(scales)
myData <- data.frame(
  platform = c("Windows", "MacOs", "Linux"),
  number = c(27000, 16000, 9000)
)

# Bars ordered by decreasing `number`, each labelled with its value.
ggplot(myData, aes(x = reorder(platform, -number), y = number)) +
  geom_bar(stat = "identity", fill = "darkturquoise", width = 0.5) +
  geom_text(aes(label = number), vjust = -0.3) +
  xlab("Platform") +
  scale_y_continuous(breaks = round(seq(0, 40000, by = 5000), 1))
that produces this plot:
How do I change the param of scale_y_continuous to reduce the number of 000? i.e, the y-tick will show 5, 10, 15, 20, 25...
Divide the y-axis' labels by 1000 like so:
# Same bar chart; the y-axis labels show each break divided by 1000.
ggplot(myData, aes(x = reorder(platform, -number), y = number)) +
  geom_bar(stat = "identity", fill = "darkturquoise", width = 0.5) +
  geom_text(aes(label = number), vjust = -0.3) +
  xlab("Platform") +
  scale_y_continuous(
    breaks = seq(0, 40000, by = 5000),
    labels = function(brk) brk / 1000 # <- ! here !
  )

Resources