Consider the MWE below. I would like to generate boxplots with these ideas in mind:
Food on the y-axix ordered according to Amot for Home, while Amt (1:40) on x-axis
show mean points overlaying the boxes
boxplots of Food to be ordered based on median of Home Site from dfsummary data
text annotations of N of observations (to be taken from dfsummary data)
MWE
df <- data.frame(
Site = sample(rep(c("Home", "Office"), size = 884)),
Food = sample(rep(c("Banana","Apple","Egg","Berry","Tomato","Potato","Bean","Pea","Nuts","Onion","Carrot","Cabbage","Eggplant"), size=884)),
Amt = sample(seq(1, 40, by = 0.25), size = 884, replace = TRUE)
)
random <- sample(seq(1, 884, by = 1), size = 100, replace = TRUE) # to randomly introduce 100 NAs to Amt vector
df$Amt[random] <- NA
Summary code
dfsummary <- df %>%
dplyr::group_by(Food, Site) %>%
dplyr::summarise(Median = round(median(Amt, na.rm=TRUE), digits=2), N = sum(!is.na(Amt))) %>%
ungroup()
ggplot code
p1 <- ggplot(df, aes(Amt, Food)) +
geom_boxplot() +
facet_grid(facets = . ~ Site)
Graph
I was expecting to see boxplots here.
Adding annotation
p2 <- p1 + geom_text(aes(y = 42, Food, label = paste("n=", N)), data = dfsummary, size = 3, nudge_x = 0.1) +
facet_grid(facets = . ~ Site)
Unfortunately, this doesn't work either.
Note
tidyverse version is 1.3.0
R version 3.6.2 (2019-12-12) -- "Dark and Stormy Night"
To work out the problem, you may want to generate a scatter plot, first:
library(ggplot2)
p1 <- ggplot(df, aes(Amt, Food)) +
geom_point() +
facet_grid(facets = . ~ Site)
p1
As you can see it is impossible to generate a boxplot.
However, if you switch x and y
ggplot(df, aes(Food, Amt)) +
geom_boxplot() +
facet_grid(facets = . ~ Site)
You get:
This works just fine in the current development version of ggplot2, to be released in January 2020.
# If your ggplot2 version is <= 3.2.1, do:
# remotes::install_github("tidyverse/ggplot2")
library(tidyverse)
df <- data.frame(
Site = sample(rep(c("Home", "Office"), size = 884)),
Food = sample(rep(c("Banana","Apple","Egg","Berry","Tomato","Potato","Bean","Pea","Nuts","Onion","Carrot","Cabbage","Eggplant"), size=884)),
Amt = sample(seq(1, 40, by = 0.25), size = 884, replace = TRUE)
)
random <- sample(seq(1, 884, by = 1), size = 100, replace = TRUE) # to randomly introduce 100 NAs to Amt vector
df$Amt[random] <- NA
ggplot(df, aes(Amt, Food)) +
geom_boxplot() +
facet_grid(facets = . ~ Site)
#> Warning: Removed 98 rows containing non-finite values (stat_boxplot).
Created on 2020-01-01 by the reprex package (v0.3.0)
With annotations:
library(tidyverse)
df <- data.frame(
Site = sample(rep(c("Home", "Office"), size = 884)),
Food = sample(rep(c("Banana","Apple","Egg","Berry","Tomato","Potato","Bean","Pea","Nuts","Onion","Carrot","Cabbage","Eggplant"), size=884)),
Amt = sample(seq(1, 40, by = 0.25), size = 884, replace = TRUE)
)
random <- sample(seq(1, 884, by = 1), size = 100, replace = TRUE) # to randomly introduce 100 NAs to Amt vector
df$Amt[random] <- NA
dfsummary <- df %>%
dplyr::group_by(Food, Site) %>%
dplyr::summarise(Median = round(median(Amt, na.rm=TRUE), digits=2), N = sum(!is.na(Amt))) %>%
ungroup()
ggplot(df, aes(Amt, Food)) +
geom_boxplot() +
geom_text(
aes(x = 42, Food, label = paste("n=", N)),
data = dfsummary,
size = 3, nudge_x = 0.1
) +
facet_grid(facets = . ~ Site)
#> Warning: Removed 95 rows containing non-finite values (stat_boxplot).
Created on 2020-01-01 by the reprex package (v0.3.0)
Related
I am attempting to make a ggplot2 scatter plot that is grouped by bins in R. I successfully made the first model, which I did not try to alter the fill for. But when I tried to have the fill of the scatter plot be based upon my variable (Miss.) ,which is a numeric value ranging from 0.00 to 0.46, it essentially ignores the heat map scale and turns everything gray.
ggplot(data = RightFB, mapping = aes(x = TMHrzBrk, y = TMIndVertBrk))+
geom_bin_2d(bins = 15)+
scale_fill_continuous(type = "viridis")+
ylim(5, 20)+
xlim(0,15)+
coord_fixed(1.3)
ggplot(data = RightFB, mapping = aes(x = TMHrzBrk, y = TMIndVertBrk, fill
=Miss.))+
geom_bin_2d(bins = 15)+
scale_fill_continuous(type = "viridis")+
ylim(5, 20)+
xlim(0,15)+
coord_fixed(1.3)
I appreciate any help! Thanks!
I think I understand your problem, so let's replicate it with a reproducible example. Obviously we don't have your data, but the following data frame has the same names, types and ranges as your own data, so this walk-through should work for you.
set.seed(1)
RightFB <- data.frame(TMHrzBrk = runif(1000, 0, 15),
TMIndVertBrk = runif(1000, 5, 20),
Miss. = runif(1000, 0, 0.46))
Your first plot will look something like this:
library(tidyverse)
ggplot(data = RightFB, mapping = aes(x = TMHrzBrk, y = TMIndVertBrk)) +
geom_bin_2d(bins = 15) +
scale_fill_continuous(type = "viridis") +
ylim(5, 20) +
xlim(0, 15) +
coord_fixed(1.3)
#> Warning: Removed 56 rows containing missing values (`geom_tile()`).
Here, the fill colors represent the counts of observations within each bin. But if you try to map the fill to Miss., you get all gray squares:
ggplot(data = RightFB, mapping = aes(x = TMHrzBrk, y = TMIndVertBrk,
fill = Miss.)) +
geom_bin_2d(bins = 15) +
scale_fill_continuous(type = "viridis") +
ylim(5, 20) +
xlim(0, 15) +
coord_fixed(1.3)
#> Warning: The following aesthetics were dropped during statistical transformation: fill
#> i This can happen when ggplot fails to infer the correct grouping structure in
#> the data.
#> i Did you forget to specify a `group` aesthetic or to convert a numerical
#> variable into a factor?
#> Removed 56 rows containing missing values (`geom_tile()`).
The reason this happens is that by default geom_bin_2d calculates the bins and the counts within each bin to get the fill variable. There are multiple observations within each bin, and they all have a different value of Miss. . Furthermore, geom_bin_2d doesn't know what you want to do with this variable. My guess is that you are looking for the average of Miss. within each bin, but this is difficult to achieve within the framework of geom_bin_2d.
The alternative is to calculate the bins yourself, get the average of Miss. in each bin, and plot as a geom_tile
RightFB %>%
mutate(TMHrzBrk = cut(TMHrzBrk, breaks = seq(0, 15, 1), seq(0.5, 14.5, 1)),
TMIndVertBrk = cut(TMIndVertBrk, seq(5, 20, 1), seq(5.5, 19.5, 1))) %>%
group_by(TMHrzBrk, TMIndVertBrk) %>%
summarize(Miss. = mean(Miss., na.rm = TRUE), .groups = "drop") %>%
mutate(across(TMHrzBrk:TMIndVertBrk, ~as.numeric(as.character(.x)))) %>%
ggplot(aes(x = TMHrzBrk, y = TMIndVertBrk, fill = Miss.)) +
geom_tile() +
scale_fill_continuous(type = "viridis") +
ylim(5, 20) +
xlim(0, 15) +
coord_fixed(1.3)
EDIT
With the link to the data in the comments, here is a full reprex:
library(tidyverse)
RightFB <- read.csv(paste0("https://raw.githubusercontent.com/rileyfeltner/",
"FB-Analysis/main/Right%20FB.csv"))
RightFB <- RightFB[c(2:6, 9, 11, 13, 18, 19)]
RightFB$Miss. <- as.numeric(as.character(RightFB$Miss.))
#> Warning: NAs introduced by coercion
RightFB$TMIndVertBrk <- as.numeric(as.character(RightFB$TMIndVertBrk))
#> Warning: NAs introduced by coercion
RightFB <- na.omit(RightFB)
RightFB1 <- filter(RightFB, P > 24)
RightFB %>%
mutate(TMHrzBrk = cut(TMHrzBrk, breaks = seq(0, 15, 1), seq(0.5, 14.5, 1)),
TMIndVertBrk = cut(TMIndVertBrk, seq(5, 20, 1), seq(5.5, 19.5, 1))) %>%
group_by(TMHrzBrk, TMIndVertBrk) %>%
summarize(Miss. = mean(Miss., na.rm = TRUE), .groups = "drop") %>%
mutate(across(TMHrzBrk:TMIndVertBrk, ~as.numeric(as.character(.x)))) %>%
ggplot(aes(x = TMHrzBrk, y = TMIndVertBrk, fill = Miss.)) +
geom_tile() +
scale_fill_continuous(type = "viridis") +
ylim(5, 20) +
xlim(0, 15) +
coord_fixed(1.3)
#> Warning: Removed 18 rows containing missing values (`geom_tile()`).
Created on 2022-11-23 with reprex v2.0.2
I have built a stacked bar chart showing the relative proportions of response to different questions. Now I want to show a particular response ontop of that barchart, to show how an individuals response relates to the overall proportions of responses.
I created a toy example here:
library(ggplot2)
n = 1000
n_groups = 5
overall_df = data.frame(
state = sample(letters[1:8], n, replace = TRUE),
frequency = runif(n, min = 0, max = 1),
var_id = rep(LETTERS[1:n_groups], each = 1000 / n_groups)
)
row = data.frame(
A = "a", B = "b", C = "c", D = "h", E = "b"
)
ggplot(overall_df,
aes(fill=state, y=frequency, x=var_id)) +
geom_bar(position="fill", stat="identity")
The goal here is to have the responses in the object row plotted as a point in the corresponding barchart box, with a line connecting the points.
Here is a (poorly drawn) example of the desired result. Thanks for your help.
This was trickier than I thought. I'm not sure there's any way round manually calculating the x/y co-ordinates of the line.
library(dplyr)
library(ggplot2)
df <- overall_df %>% group_by(state, var_id) %>%
summarize(frequency = sum(frequency))
freq <- unlist(Map(function(d, val) {
(sum(d$frequency[d$state > val]) + 0.5 * d$frequency[d$state == val]) /
sum(d$frequency)
}, d = split(df, df$var_id), val = row))
line_df <- data.frame(state = unlist(row),
frequency = freq,
var_id = names(row))
ggplot(df, aes(fill=state, y=frequency, x=var_id)) +
geom_col(position="fill") +
geom_line(data = line_df, aes(group = 1)) +
geom_point(data = line_df, aes(group = 1))
Created on 2022-03-08 by the reprex package (v2.0.1)
Here's an automated approach using dplyr. I prepare the summary by joining the label data to the original data, and then using group_by + summarize to get those.
library(dplyr)
row_df <- data.frame(state = letters[1:n_groups], var_id = LETTERS[1:n_groups])
line_df <- row_df %>%
left_join(overall_df, by = "var_id") %>%
group_by(var_id) %>%
summarize(state = last(state.x),
frequency = (sum(frequency[state.x < state.y]) +
sum(frequency[state.x == state.y])/2) / sum(frequency))
ggplot(overall_df, aes(fill=state, y=frequency, x=var_id)) +
geom_bar(position="fill", stat="identity") +
geom_point(data = line_df) +
geom_line(data = line_df, aes(group = 1))
Suppose a dataset containing count data per multiple time periods and per multiple groups in the following format:
set.seed(123)
df <- data.frame(group = as.factor(rep(1:3, each = 50)),
week = rep(1:50, 3),
rate = c(round(700 - rnorm(50, 100, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0)))
group week rate
1 1 1 604
2 1 2 598
3 1 3 578
4 1 4 591
5 1 5 589
6 1 6 571
7 1 7 581
8 1 8 597
9 1 9 589
10 1 10 584
I'm interested in fitting a model-based trend line per groups, however, I want this trend line to be displayed only from a certain x value. To visualize the trend line using all data points (requires ggplot2):
df %>%
ggplot(aes(x = week,
y = rate,
group = group,
lty = group)) +
geom_line() +
geom_point() +
geom_smooth(method = "glm",
method.args = list(family = "quasipoisson"),
se = FALSE)
Or to fit a model based on a specific range of values (requires ggplot2 and dplyr):
df %>%
group_by(group) %>%
mutate(rate2 = ifelse(week < 35, NA, rate)) %>%
ggplot(aes(x = week,
y = rate,
group = group,
lty = group)) +
geom_line() +
geom_point() +
geom_smooth(aes(y = rate2),
method = "glm",
method.args = list(family = "quasipoisson"),
se = FALSE)
However, I cannot find a way to fit the models using all data, but display the trend line only from a specific x value (let's say 35+). Thus, I essentially want the trend line as computed for plot one, but displaying it according the second plot, using ggplot2 and ideally only one pipeline.
I went to look at the after_stat function mentioned by #tjebo. See if the following works for you?
df %>%
ggplot(aes(x = week,
y = rate,
lty = group)) +
geom_line() +
geom_point() +
geom_smooth(method = "glm",
aes(group = after_stat(interaction(group, x > 35)),
colour = after_scale(alpha(colour, as.numeric(x > 35)))),
method.args = list(family = "quasipoisson"),
se = F)
This works by splitting the points associated with each line into two groups, those in the x <=35 region and those in the x >35 region, since a line's colour shouldn't vary, and defining a separate colour transparency for each new group. As a result, only the lines in the x > 35 region are visible.
When used, the code triggers a warning that the after_scale modification isn't applied to the legend. I don't think that's a problem though, since we don't need it to appear in the legend anyway.
If you can tolerate a warning, you can solve this with 1 line difference from the example code using stage().
library(tidyverse)
set.seed(123)
df <- data.frame(group = as.factor(rep(1:3, each = 50)),
week = rep(1:50, 3),
rate = c(round(700 - rnorm(50, 100, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0)))
df %>%
ggplot(aes(x = week,
y = rate,
group = group,
lty = group)) +
geom_line() +
geom_point() +
geom_smooth(method = "glm",
method.args = list(family = "quasipoisson"),
aes(x = stage(week, after_stat = ifelse(x > 35, x, NA))),
se = FALSE)
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 165 rows containing missing values (geom_smooth).
One way to do this is to construct the fitted values outside of ggplot so you have control over them:
df$fit <- glm(rate ~ week + group, data = df, family = "quasipoisson")$fitted.values
library(dplyr)
library(ggplot2)
ggplot(df, aes(x = week, group = group, lty = group)) +
geom_line(aes(y = rate)) +
geom_point(aes(y = rate)) +
geom_line(data = df %>% filter(week >= 35), aes(y = fit), color = "blue", size = 1.25)
I am not sure if it is generally correct to use a linear model in time series. The whole point about time series is that they require specific statistics because of their expected autocorrelation. You might want something like average rolling models instead.
I am not sure if your visualisation might not be quite confusing and, more dangerously, misleading.
Besides, an interesting problem. I thought the new after_stat might somehow help, but I couldn't get it working.
So, here a quick hack. Change the order of your geom-s and draw a rectangle in-between. I am cheekily using a different theme, but if you really want to use theme_grey(), you can fake the axis lines as well.
library(tidyverse)
set.seed(123)
df <- data.frame(group = as.factor(rep(1:3, each = 50)),
week = rep(1:50, 3),
rate = c(round(700 - rnorm(50, 100, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0),
round(1000 - rnorm(50, 200, 10) - 1:50 * 2, 0)))
df %>%
ggplot(aes(x = week, y = rate, group = group, lty = group)) +
stat_smooth(se = FALSE) +
geom_rect(xmin = -Inf, xmax = 35, ymin = -Inf, ymax = Inf,
fill = "white") +
geom_line() +
geom_point() +
theme_classic()
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Created on 2021-02-09 by the reprex package (v1.0.0)
P.S. I've removed a few of the unnecessary bits in the code to reproduce this, like the model specs.
You could use ggplot_build to get the structure of the plot :
p <- ggplot(df, aes(x = week,
y = rate,
group = group,
lty = group)) +
geom_line() +
geom_point() +
geom_smooth(method = "glm",
method.args = list(family = "quasipoisson"),
se = FALSE)
p_build <- ggplot_build(p)
You could then modify the internal data, here the third element of the data list (geom_smooth):
p_build$data[[3]]$x <- sapply(p_build$data[[3]]$x,function(x) {ifelse(x<35,NA,x)})
and use ggplot_gtable to regenerate the plot (the lm calculations still apply to the whole dataset):
plot(ggplot_gtable(p_build))
I have a question regarding the visualization of data using ggplot in R. Specifically, regarding the scaling of the y-axis in case of outliers.
Let's start with a sample dataset with observations from 31 IDs. 30 IDs are in an expected range and there is one outlier:
# Load libraries
library(tidyverse)
library(ggbeeswarm)
library(data.table)
# Set seed
set.seed(123)
# Create dataset
ID <- sprintf("ID-%s",seq(1:30))
baseline <- rnorm(30, mean = 50, sd = 3)
df <- data.frame(ID, baseline) %>%
mutate(`1` = baseline - rnorm(1, mean = 5, sd = 4),
`2` = `1` - rnorm(1, mean = 3, sd = 5),
`3` = `2` - rnorm(1, mean = 1, sd = 3))
# Add outlier
df <- as.data.frame(rbindlist(list(df, list("ID-31", 0.01, 0.02, 0.03 ,1))))
df <- df %>%
pivot_longer(-ID) %>%
rename(time = name) %>%
mutate(time = as.factor(time))
#Plot
ggplot(data = df, aes(x=time, y = value)) +
geom_quasirandom() +
theme_classic() +
scale_x_discrete(limits = c("baseline", "1", "2", "3") ) +
labs(x = "Time", y = "Value")
Expected output
Since the variation in the upper part of the graph is not well visible, I would like to scale the x-axis in a way that shows all values but focusses on a certain part of the plot (in this case values between 20 and 50).
Question
Is it possible to scale the x-axis in such a way?
Additional info
I am specifically not looking for a data transformation solution. Furthermore, I am aware of the scale_y_continuous function in ggplot and it limits argument, but this omits a part of the data.
I don''t know anything about having a broken y-axis with ggplot, but this achieves something similar if you can specify in advance which ID is going to be the outlier.
library(tidyverse)
library(ggbeeswarm)
library(data.table)
# Set seed
set.seed(123)
# Create dataset
ID <- sprintf("ID-%s",seq(1:30))
baseline <- rnorm(30, mean = 50, sd = 3)
df <- data.frame(ID, baseline) %>%
mutate(`1` = baseline - rnorm(1, mean = 5, sd = 4),
`2` = `1` - rnorm(1, mean = 3, sd = 5),
`3` = `2` - rnorm(1, mean = 1, sd = 3))
# Add outlier
df <- as.data.frame(rbindlist(list(df, list("ID-31", 0.01, 0.02, 0.03 ,1))))
df <- df %>%
pivot_longer(-ID) %>%
rename(time = name) %>%
mutate(time = as.factor(time),
is_outlier = (as.character(ID) == "ID-31"))
ggplot(data = df, aes(x=time, y = value)) +
geom_point() +
facet_grid(rows = vars(is_outlier),
scales = "free_y",
switch = "y") +
theme_classic() +
scale_x_discrete(limits = c("baseline", "1", "2", "3") ) +
labs(x = "Time", y = "Value")
I'm trying to re-create a plot like this in ggplot:.
This graph takes the residuals from a regression output, and plots them in order (with the X-axis being a rank of residuals).
My best attempt at this was something like the following:
library(ggplot2)
library(modelr)
d <- d %>% add_residuals(mod1, var = "resid")
d$resid_rank <- rank(d$resid)
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_bar(stat="identity") +
theme_bw()
However, this yields a completely blank graph. I tried something like this:
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_segment(yend = 0, aes(xend=resid)) +
theme_bw()
But this yields the segments that go in the wrong direction. What is the right way to do this, and to color those lines by a third factor?
FAKE DATASET:
library(estimatr)
library(fabricatr)
#simulation
dat <- fabricate(
N = 10000,
y = runif(N, 0, 10),
x = runif(N, 0, 100)
)
#add an outlier
dat <- rbind(dat, c(300, 5))
dat <- rbind(dat, c(500, 3))
dat$y_log <- log(dat$y)
dat$x_log <- log(dat$x)
dat$y_log_s <- scale(log(dat$y))
dat$x_log_s <- scale(log(dat$x))
mod1 <- lm(y_log ~ x_log, data = dat))
I used the build in dataset from the help page on lm() to create this example. I also just directly used resid() to get the residuals. It's unclear where / why the colored bars would be different, but basically you'd need to add a column to your data.frame that specificies why they are red or blue, then pass that to fill.
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 3.4.4
#example from lm
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
resids <- data.frame(resid = resid(lm.D9))
#why are some bars red and some blue? No clue - so I'll pick randomly
resids$group <- sample(c("group 1", "group 2"), nrow(resids), replace = TRUE)
#rank
resids$rank <- rank(-1 * resids$resid)
ggplot(resids, aes(rank, resid, fill = group)) +
geom_bar(stat = "identity", width = 1) +
geom_hline(yintercept = c(-1,1), colour = "darkgray", linetype = 2) +
geom_hline(yintercept = c(-2,2), colour = "lightgray", linetype = 1) +
theme_bw() +
theme(panel.grid = element_blank()) +
scale_fill_manual(values = c("group 1" = "red", "group 2" = "blue"))
Created on 2019-01-24 by the reprex package (v0.2.1)