In this experiment, we tracked presence or absence of bacterial infection in our subject animals. We were able to isolate which type of bacteria was present in our animals and created a plot that has Week Since Experiment Start on the X axis, and Percentage of Animals Positive for bacterial infection on the Y axis. This is a stacked identity ggplot where each geom_bar contains the different identities of the bacteria that were in the infected animals each week. Here is a sample dataset with the corresponding ggplot code and result:
DummyData <- data.frame(matrix(ncol = 5, nrow = 78))
colnames(DummyData) <- c('WeeksSinceStart','BacteriaType','PositiveOccurences','SampleSize','NewSampleSize')
DummyData$WeeksSinceStart <- c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,9,10,10,10,10)
DummyData$BacteriaType <- c("BactA","BactB","BactD","BactB","BactE","BactA","BactS","BactF","BactE","BactH","BactJ","BactK","BactE","BactB","BactS","BactF","BactL","BactE","BactW","BactH","BactS","BactJ","BactQ","BactN","BactW","BactA","BactD","BactE","BactA","BactC","BactD","BactK","BactL","BactE","BactD","BactA","BactS","BactK","BactB","BactE","BactF","BactH","BactN","BactE","BactL","BactZ","BactE","BactC","BactR","BactD","BactJ","BactN","BactK","BactW","BactR","BactE","BactW","BactA","BactM","BactG","BactO","BactI","BactE","BactD","BactM","BactH","BactC","BactM","BactW","BactA","BactL","BactB","BactE","BactA","BactS","BactH","BactQ","BactF")
PosOcc <- seq(from = 1, to = 2, by = 1)
DummyData$PositiveOccurences <- rep(PosOcc, times = 13)
DummyData$SampleSize <- c(78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,29,29,29,29,29,10,10,10,10)
DummyData$NewSampleSize <- c(78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,29,NA,NA,NA,NA,10,NA,NA,NA)
numcolor <- 20
plotcolors <- colorRampPalette(brewer.pal(8, "Set3"))(numcolor)
#GGplot for Dummy Data
DummyDataPlot <- ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences/SampleSize, fill = BacteriaType)) + geom_bar(position = "stack", stat = "identity") +
geom_text(label = DummyData$NewSampleSize, nudge_y = 0.1) +
scale_y_continuous(limits = c(0,0.6), breaks = seq(0, 1, by = 0.1)) + scale_x_continuous(limits = c(0.5,11), breaks = seq(0,10, by =1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive") +
scale_fill_manual(values = plotcolors)
The problem: I cannot seem to find a way to position the labels from geom_text directly over each bar. I would also love to add the text "n = " to the sample size value directly over each bar. Thank you for your help!
I have tried different values for position_dodge statement and nudge_y statement with no success.
Sometimes the easiest approach is to do some data wrangling, i.e. one option would be to create a separate dataframe for your labels:
library(ggplot2)
library(dplyr)
dat_label <- DummyData |>
group_by(WeeksSinceStart) |>
summarise(y = sum(PositiveOccurences / SampleSize), SampleSize = unique(SampleSize))
ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences / SampleSize, fill = BacteriaType)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(data = dat_label, aes(x = WeeksSinceStart, y = y, label = SampleSize), inherit.aes = FALSE, nudge_y = .01) +
#scale_y_continuous(limits = c(0, 0.6), breaks = seq(0, 1, by = 0.1)) +
scale_x_continuous(limits = c(0.5, 11), breaks = seq(0, 10, by = 1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive"
) +
scale_fill_manual(values = plotcolors)
Suppose we want to plot this data:
library(ggplot2)
library(sf)
library(raster)
library(colorRamps)
min_lon <- 10
max_lon <- 17
min_lat <- 8
max_lat <- 17
grid_size <- 0.5
lon_grids <- 1 + ((max_lon - min_lon)/grid_size)
lat_grids <- 1 + ((max_lat - min_lat)/grid_size)
points <- data.frame(lon = rep(seq(min_lon, max_lon, grid_size), lat_grids), lat = rep(seq(min_lat, max_lat, grid_size), each = lon_grids))
points$Var <- runif(min= 10, max = 48, 285)
points$value <-cut(points$Var, breaks= seq(10.08, 47.80, length.out = 13), dig.lab = 1)
ggplot() +
coord_sf(xlim = c(min_lon, max_lon), ylim = c(min_lat, max_lat)) +
theme_bw()+
geom_raster(data = points, aes(x = lon, y = lat, fill = value), interpolate = FALSE) +
labs(x="Longitude", y="Latitude")+
scale_fill_manual(values = matlab.like(n = 13), name = "[m]",
labels = sprintf("%.2f", seq(10.08, 47.80, length.out = 13)),
guide = guide_legend(reverse = TRUE))+theme(legend.position = "bottom")
This code produces the following graph:
Two problems I am facing here:
To make it discrete, I used the cut function. I chose the breaks= seq(10.08, 47.80, length.out = 13) arbitrary based on the minimum and maximum values with a random length of 13. Is there any criteria to decide the correct range?
Is there any way to make the legend look like this?
One option would be to use e.g. scale_fill_stepsn with guide_binswhich does not require to manually discretize the variable mapped on fill. Additionally I use a custom function to set the breaks of the legend instead of the default mechanism to set the number of breaks.
set.seed(123)
library(ggplot2)
library(colorRamps)
base <- ggplot() +
coord_sf(xlim = c(min_lon, max_lon), ylim = c(min_lat, max_lat)) +
theme_bw() +
geom_raster(data = points, aes(x = lon, y = lat), interpolate = FALSE) +
labs(x = "Longitude", y = "Latitude") +
theme(legend.position = "bottom")
base +
aes(fill = Var) +
scale_fill_stepsn(colors = matlab.like(n = 13), name = "[m]",
breaks = function(x) seq(x[[1]], x[[2]], length.out = 13),
labels = ~ sprintf("%.0f", .x),
guide = guide_bins(axis = FALSE,
show.limits = TRUE))
I have a dataframe of single column with multiple values. I was using basic rplot function like plot() and points(). I successfully plotted the lineplot but I was unable to write point values from the dataframe onto the plot field. Is there anyway to add data values onto the plot?
Below is the following code for test
> x = data.frame(A = rnorm(10))
> plot(x$A, type = "o", pch = 20)**
Sorry, I made an edit to make my question clearer.
Here below is the example plot for 10 random numbers
Plot lines, then add text:
#data
set.seed(1); x = data.frame(A = rnorm(10))
#base plot
plot(x$A, type = "o", pch = 20, ylim = range(x$A * 1.3))
text(x = seq_along(x$A), y = x$A + 0.3, labels = round(x$A, 2), srt = 90)
Or using ggplot with ggrepel for pretty labels:
#ggplot
library(ggplot2)
library(ggrepel) # pretty labels, avoid overlap:
ggplot(cbind(x = seq_along(x$A), x),
aes(x = x, y = A, label = round(A, 2))) +
geom_line() +
geom_point() +
geom_label_repel()
#geom_text_repel()
Probably this is more than what you are asking, but you can add labels to the values you have in your line plot using ggplot:
library(ggplot2)
x = data.frame(A = rnorm(10),
pos = runif(10, 0.1, 0.7))
ggplot(x) +
geom_point(aes(x = A),
y = 0) +
geom_line(aes(x = A),
y = 0) +
geom_segment(aes(x = A,
xend = A,
y = 0,
yend = pos),
linetype = 2) +
geom_label(aes(x = A,
y = pos,
label = round(A, 2)),
size = 3) +
scale_y_continuous(name = "",
limits = c(0, 0.8)) +
guides(y = "none") +
theme_bw()
You could make a base R "type b" equivalent.
The OP hasn't specified that every y value should be set to zero.
library(ggh4x)
#> Loading required package: ggplot2
set.seed(1)
x = data.frame(A = rnorm(10))
ggplot(x, aes(1:10, A)) +
geom_pointpath(shape = NA) +
geom_text(aes(label = round(A,2))) +
labs(x= "Index")
Created on 2022-05-27 by the reprex package (v2.0.1)
I have been struggling with this for hours now. I have the following script:
library(ggplot2)
sims = replicate(1000, sample(c(0,0,0,0,1,1,1,2,2,2), size=3, replace=FALSE))
df = data.frame(x=colSums(sims == 0),
y=colSums(sims == 1))
df$count <- 1
total_counts = aggregate(count ~ ., df, FUN = sum)
min_count = min(total_counts$count)
max_count = max(total_counts$count)
p = (ggplot(df, aes(x=x, y=y))
+ geom_count(aes(color=..n.., size=..n..), alpha=0.8)
+ guides(color = 'legend', size=FALSE)
+ labs(color='Count')
+ scale_colour_gradient(limits = c(min_count, max_count),
breaks = round(seq(min_count, max_count, length.out=5)),
labels = round(seq(min_count, max_count, length.out=5)))
+ scale_size_continuous(range = c(3, 7.5))
)
So far so good. The problem is that I want to add two additional sets of points:
df2 = data.frame(x=c(0, 1, 2, 3),
y=c(1.5253165, 1.0291262, 0.4529617, 0))
df3 = data.frame(x=c(0, 1, 2, 3),
y=c(1.5, 1, 0.5, 0))
To get something like this:
p2 = (p
+ geom_point(data=df2, aes(x=x, y=y), alpha=0.4, color="red", size = 2.5)
+ geom_point(data=df3, aes(x=x, y=y), alpha=0.4, color="green", size = 2.5)
)
The problem is that I am not being capable of adding these new points to the legend. I would like the legend to be in a different "section". Namely, to have an empty string title (to differentiate these points from "Count" title), and to have strings instead of numbers in their labels ("Simulated means" and "Theoretical means", for example).
Is there any way to achieve this?
A trick I learned from #tjebo is that you can use the ggnewscale package to spawn additional legends. At what point in plot construction you call the new scale is important, so you first want to make a geom/stat layer and add the desired scale. Once these are declared, you can use new_scale_colour() and all subsequent geom/stat layers will use a new colour scale.
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.0.5
library(ggnewscale)
#> Warning: package 'ggnewscale' was built under R version 4.0.3
sims = replicate(1000, sample(c(0,0,0,0,1,1,1,2,2,2), size=3, replace=FALSE))
df = data.frame(x=colSums(sims == 0),
y=colSums(sims == 1))
df$count <- 1
total_counts = aggregate(count ~ ., df, FUN = sum)
min_count = min(total_counts$count)
max_count = max(total_counts$count)
df2 = data.frame(x=c(0, 1, 2, 3),
y=c(1.5253165, 1.0291262, 0.4529617, 0))
df3 = data.frame(x=c(0, 1, 2, 3),
y=c(1.5, 1, 0.5, 0))
ggplot(df, aes(x, y)) +
geom_count(aes(colour = after_stat(n), size = after_stat(n)),
alpha = 0.5) +
scale_colour_gradient(
limits = c(min_count, max_count),
breaks = round(seq(min_count, max_count, length.out = 5)),
labels = round(seq(min_count, max_count, length.out = 5)),
guide = "legend"
) +
new_scale_colour() +
geom_point(aes(colour = "Simulated means"),
data = df2, alpha = 0.4) +
geom_point(aes(colour = "Theoretical means"),
data = df3, alpha = 0.4) +
scale_colour_discrete(
name = ""
) +
scale_size_continuous(range = c(3, 7.5), guide = "none")
Created on 2021-04-22 by the reprex package (v1.0.0)
(P.S. sorry for reformatting your code, it just read more easily for myself this way)
I have a large number of variables and would like to create scatterplots comparing all variables to a single variable. I have been able to do this in base R using lapply, but I cannot complete the same task in ggplot2 using lapply.
Below is an example dataset.
df <- data.frame("ID" = 1:16)
df$A <- c(1,2,3,4,5,6,7,8,9,10,11,12,12,14,15,16)
df$B <- c(5,6,7,8,9,10,13,15,14,15,16,17,18,18,19,20)
df$C <- c(11,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
I define the variables I would like to generate scatterplots with, using the code below:
df_col_names <- df %>% select(A:C) %>% colnames(.)
Below is how I have been able to successfully complete the task of plotting all variables against variable A, using lapply in base R:
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
plot(df$A, df[[x]],
pch=19,
cex = 1.5,
ylab = x,
ylim = c(0, 20),
xlim = c(0, 20))
dev.off()
})
Below is my attempt at completing the task in ggplot2 without any success. It generates the tiff images, although they are empty.
lapply(df_col_names, function(x) {
tiff(filename=sprintf("C:\\Documents\\%s.tiff", x),
width = 1000, height = 1000, res=200)
ggplot(df) +
geom_point(data = df,
aes(x = A, y = df_col_names[[x]], size = 3)) +
geom_smooth(aes(x = A, y = df_col_names[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14)
dev.off()
})
It works for me with ggsave. Also note that you are passing string column names to ggplot so use .data to refer to actual column values.
library(ggplot2)
lapply(df_col_names, function(x) {
ggplot(df) +
geom_point( aes(x = A, y = .data[[x]], size = 3)) +
geom_smooth(aes(x = A, y = .data[[x]], size = 0), method = "lm", size=0.5) +
coord_fixed(ratio = 1, xlim = c(0, 20), ylim = c(0, 20)) +
guides(size = FALSE, color = FALSE) +
theme_bw(base_size = 14) -> plt
ggsave(sprintf("%s.tiff", x), plt)
})