position_dodge when using separate datasets - r

I am attempting to produce a graph that shows two groups of error bars, but the different error bars represent different estimates of central tendency/variability (e.g., mean with sd and median with quantiles). I'm trying to use position_dodge, but it's not working, and I suspect this is because I'm feeding it values from a different dataset. Here's a reproducible example:
# ---- simulate dosages ----
dose <- factor(rep(c("small", "medium", "large"), times = 10))
dose <- relevel(dose, "small")

# ---- simulate fevers whose level is shifted according to dosage ----
fever <- rnorm(length(dose), 100, 1)
betas <- matrix(c(0, -3, -6), nrow = 1)
fever <- fever + as.numeric(betas %*% t(model.matrix(fever ~ dose)))

# ---- put into data frame ----
d <- data.frame(dose = dose, fever = fever)

# ---- per-dose summaries: mean +/- 1 sd, and median with quartiles ----
means <- d %>%
  group_by(dose) %>%
  summarise(
    mean = mean(fever),
    lower = mean - sd(fever),
    upper = mean + sd(fever)
  )
medians <- d %>%
  group_by(dose) %>%
  summarise(
    median = median(fever),
    lower = quantile(fever, .25),
    upper = quantile(fever, .75)
  )

# ---- put all into a ggplot ----
# Each summary comes from its own data frame and is drawn in its own layers.
ggplot(d, aes(x = dose, y = fever)) +
  geom_jitter(alpha = .2, width = .2) +
  geom_point(data = means, aes(x = dose, y = mean)) +
  geom_point(data = medians, aes(x = dose, y = median), col = "red") +
  geom_errorbar(
    data = means,
    aes(y = mean, ymin = lower, ymax = upper),
    width = .2,
    position = position_dodge(width = .2)
  ) +
  geom_errorbar(
    data = medians,
    aes(y = median, ymin = lower, ymax = upper),
    width = .2,
    position = position_dodge(width = .2),
    col = "red"
  )
Which gives the results of the following image:
Notice dodging isn't working.
Let's assume I can't just use stat_summary (I can't...I'm actually comparing means with some robust estimates from another package). Is there any way to offset the error bars/dots so they can be better seen?

Combine the data frames for both statistics into a single one, so that you can map the kind of statistic to the group (and colour) aesthetic:
# Stack both sets of summary statistics into one long data frame so that the
# kind of statistic can be mapped to `group`/`color`; dodging only separates
# groups that are drawn by the same layer.
# The raw data frame built in the question is called `d`.
means <- d %>%
  group_by(dose) %>%
  summarise(
    Statistic = "Mean",
    Value = mean(fever),
    lower = mean(fever) - sd(fever),
    upper = mean(fever) + sd(fever)
  )
medians <- d %>%
  group_by(dose) %>%
  summarise(
    Statistic = "Median",
    Value = median(fever),
    lower = quantile(fever, 0.25),
    upper = quantile(fever, 0.75)
  )
df2 <- bind_rows(means, medians)

#### put all into a ggplot
# One shared dodge so the points sit on top of their own error bars.
dodge <- position_dodge(width = .2)
ggplot(d, aes(x = dose, y = fever)) +
  geom_jitter(alpha = .2, width = .2) +
  geom_point(
    data = df2,
    aes(x = dose, y = Value, group = Statistic, color = Statistic),
    position = dodge
  ) +
  geom_errorbar(
    data = df2,
    aes(y = Value, ymin = lower, ymax = upper,
        group = Statistic, color = Statistic),
    width = .2,
    position = dodge
  )

Related

Meansplot with Tukey HSD confidence intervals in R

I'm trying to make a meansplot with confidence intervals, but I would like the intervals to be Tukey HSD intervals after an ANOVA is computed.
I'll use the next example here to explain, in the dataframe there is a factor: poison {1,2,3}
library(magrittr)
library(ggplot2)
library(ggpubr)
library(dplyr)
library("agricolae")
# Location of the poisons example data (read over the network).
PATH <- "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/poisons.csv"

# Drop column X and store poison as an ordered factor.
df <- read.csv(PATH) %>%
  select(-X) %>%
  mutate(poison = factor(poison, ordered = TRUE))
glimpse(df)

# Box plot of time per poison level with the raw observations jittered on top.
ggplot(df, aes(x = poison, y = time, fill = poison)) +
  geom_boxplot() +
  geom_jitter(
    position = position_jitter(width = 0.21),
    color = "steelblue",
    shape = 15
  ) +
  theme_classic()

# One-way ANOVA of time on poison.
anova_one_way <- aov(time ~ poison, data = df)
summary(anova_one_way)

# Use TukeyHSD on the fitted ANOVA and draw its default interval plot.
tukeyHSD <- TukeyHSD(anova_one_way)
plot(tukeyHSD)
I would like the plot to be similar to the one from Statgraphics, where you can see the mean point and the length of the bars is given by the Tukey HSD intervals, so that at a glance you can appreciate which level is best and whether it is statistically significantly better.
I have seen some examples in more complex questions but is for boxplots and I dont understand it enough to adapt the solutions here.
Tukey's results on boxplot in R
example1
example1
TukeyHSD results on boxplot after two-way anova
example2
example2
Edit#############
The answer provided by Allan Cameron (@allan-cameron) is great; however, it doesn't work on my computer right now, probably because of package versions: the stat_summary argument names have changed a bit. I took his solution and made a couple of changes to make it work for me.
# Allan's original response: half of the gap between the first pairwise
# difference and its lower Tukey bound.
tukeyCI <- (tukeyHSD$poison[1, "diff"] - tukeyHSD$poison[1, "lwr"]) / 2

# fun.max / fun.min are spelled fun.ymax / fun.ymin here, and fun is spelled
# fun.y, which is what made Allan's solution work on the reported setup.
ggplot(df, aes(x = poison, y = time)) +
  stat_summary(
    geom = "errorbar",
    fun.ymin = function(x) mean(x) - tukeyCI,
    fun.ymax = function(x) mean(x) + tukeyCI,
    width = 0.25,
    size = 1,
    color = "gray50"
  ) +
  stat_summary(
    geom = "point",
    fun.y = mean,
    shape = 21,
    size = 4,
    fill = "white"
  ) +
  geom_point(
    position = position_jitter(width = 0.25),
    color = "deepskyblue4",
    alpha = 0.4
  ) +
  theme_minimal(base_size = 16)
Error response was:
Warning:Ignoring unknown parameters:fun.max, fun.min
Warning:Ignoring unknown parameters:fun
No summary function supplied, defaulting to `mean_se()
I'm currently using these versions:
version R version 3.5.2 (2018-12-20)
packageVersion("ggplot2") ‘3.1.0’
packageVersion("dplyr") ‘0.7.8’
The image from statgraphics shows error bars around the mean points, and if I understand you correctly then you want to be able to draw error bars around your mean points such that non-overlapping error bars mean there are significant differences between the variables. That being the case, we can extract the required confidence interval like this:
# Half of (difference - lower bound) for the first pairwise comparison.
tukeyCI <- (tukeyHSD$poison[1, "diff"] - tukeyHSD$poison[1, "lwr"]) / 2
And we can draw the result in ggplot like this:
# Mean of time per poison level with error bars of +/- tukeyCI around the
# mean, and the raw observations jittered on top.
ggplot(df, aes(x = poison, y = time)) +
  stat_summary(
    geom = "errorbar",
    fun.min = function(x) mean(x) - tukeyCI,
    fun.max = function(x) mean(x) + tukeyCI,
    width = 0.25,
    size = 1,
    color = "gray50"
  ) +
  stat_summary(
    geom = "point",
    fun = mean,
    shape = 21,
    size = 4,
    fill = "white"
  ) +
  geom_point(
    position = position_jitter(width = 0.25),
    color = "deepskyblue4",
    alpha = 0.4
  ) +
  theme_minimal(base_size = 16)
Here we can see that there are significant differences between 1 and 3, and between 2 and 3, but that the difference between 1 and 2 is non-significant.

How do you plot multiple columns of a data frame all within the same boxplot in r (using ggplot2)?

I have a data frame that looks like this:
# Example data: four uniformly random feature columns plus a TI label stored
# as text. TI is drawn once per row (100 draws); drawing only 10 values would
# be silently recycled by data.frame(), repeating the same 10 labels ten times.
Train_Table_Time_Power <- data.frame(
  Mean = runif(100),
  STD = runif(100),
  Kurt = runif(100),
  Skew = runif(100),
  TI = sample(c("0.05", "0.10", "0.15", "0.20"), 100, replace = TRUE)
)
I then created a box for the Skew Feature using the code below:
# Notched box plot of Skew for each TI level, with the group mean marked in
# red and the individual observations jittered on top.
# `notch = TRUE` is spelled out (T can be reassigned), and the `id` argument
# was removed because geom_boxplot() has no such parameter.
# NOTE(review): theme_Publication() is not defined in the code shown here;
# it is assumed to be available in the session.
Skew_BoxPlot <- ggplot(Train_Table_Time_Power, aes(x = TI, y = Skew, color = TI)) +
  geom_boxplot(notch = TRUE) +
  stat_summary(fun = mean, geom = "point", shape = 19, color = "red", size = 2) +
  geom_jitter(shape = 16, position = position_jitter(0.2), size = 0.3) +
  labs(title = "Crest_Time", x = "TI", y = "Normalized Magnitude") +
  theme_minimal() +
  theme_Publication()
The above box plot displays the different distributions of the Skew feature as the TI feature varies. However, I now want to create a new box plot that shows the distributions of all of the features (Mean, STD, Kurt, and Skew) for just one value of TI, say TI = 0.05, and I would like the figure to plot all of the box plot distributions on the same graph horizontally, next to each other. Can anyone direct me on how best to go about doing this?
You can convert your data into a long table and then plot. Using tidyverse this can be easily done
library(tidyverse)
# Box plots of all four features for a single TI level.
# TI is stored as text, so compare against the string: a numeric such as 0.10
# is converted to "0.1" and would never match the stored "0.10".
Train_Table_Time_Power %>%
  filter(TI == "0.05") %>%
  # name the feature columns instead of relying on their positions
  pivot_longer(cols = c(Mean, STD, Kurt, Skew)) %>%
  ggplot(aes(x = name, y = value)) +
  geom_boxplot()
You can change the TI value in the filter to any level that you want, or you can keep all TI values and use facet_grid() to split them out into individual panels:
# Box plots of the four features, one facet per TI level.
Train_Table_Time_Power %>%
  pivot_longer(cols = c(Mean, STD, Kurt, Skew)) %>%
  ggplot(aes(x = name, y = value)) +
  geom_boxplot() +
  facet_grid(~TI)

ggplot2 - a custom histogram with a rug plot

I am trying to create a custom histogram with a rug plot showing the original values on the X axis.
I am going to use the mtcars dataset to illustrate. It's not the best dataset for this question... but hopefully the reader will understand what I am trying to achieve...
Below shows the basic histogram, without any rug plot attempt.
I want to create the histogram using geom_bar as this allows for more flexibility with custom bins.
I also want a small gap between the histogram bars (i.e. width = 0.95)... which adds to this
problem's complexity.
library(dplyr)
library(ggplot2)
# custom bin edges: 10, 15, 20, 25 and a final edge at 34
vct_seq <- c(seq(from = 10, to = 25, by = 5), 34)
mtcars[["bin"]] <- cut(mtcars[["mpg"]], breaks = vct_seq)

# one row per bin holding the number of cars that fall into it
df_mtcars_count <- mtcars %>%
  group_by(bin) %>%
  summarise(count = n())

# indicative labels
vct_labels <- c("bin 1", "bin 2", "bin 3", "bin 4")

# attempt 1 - basic plot -- no rug plot
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count)) +
  geom_bar(stat = "identity", width = 0.95) +
  geom_text(aes(label = count), vjust = -0.5) +
  scale_x_discrete(
    "x title to go here",
    labels = df_mtcars_count$bin,
    breaks = df_mtcars_count$bin
  )
p
Next, try and add a basic rug plot on the X axis. This obviously doesn't work as the geom_bar and geom_rug have completely different scales.
# attempt 2 with no scaling.... doesn't work as x scale for ordinal (bins) and
# x scale for continuous (mpg) do not match
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count)) +
  geom_bar(stat = "identity", width = 0.95) +
  geom_text(aes(label = count), vjust = -0.5) +
  scale_x_discrete(
    "x title to go here",
    labels = df_mtcars_count$bin,
    breaks = df_mtcars_count$bin
  ) +
  # the rug takes its own data and x mapping; FALSE is spelled out because
  # F is an ordinary variable that can be reassigned
  geom_rug(data = mtcars, aes(x = mpg), inherit.aes = FALSE, alpha = 0.3)
p
Now, try and rescale the mpg column to match with the ordinal scale....
First define a linear mapping function...
# Linearly rescale a numeric vector onto a new range.
#
# The smallest non-missing value of `vct_existing_val` is mapped to
# min(vct_new_range) and the largest to max(vct_new_range); everything in
# between is interpolated linearly.
#
# vct_existing_val: numeric vector to rescale. Missing values stay missing
#   and are ignored when the existing range is determined.
# vct_new_range:    numeric vector whose min and max define the target range.
#
# Returns a numeric vector of the same length as `vct_existing_val`.
# Edge cases: zero-length or all-missing input yields an all-NA vector of the
# same length; input with no spread (all values equal) is mapped to
# min(vct_new_range) instead of NaN.
#
# example....converts 1:20 into the range 1 to 10 like this:
# fn_linear_map(1:20, c(1, 10))
fn_linear_map <- function(vct_existing_val, vct_new_range) {
  # all(logical(0)) is TRUE, so this guard also covers zero-length input
  if (all(is.na(vct_existing_val))) {
    return(rep(NA_real_, length(vct_existing_val)))
  }

  flt_old_min <- min(vct_existing_val, na.rm = TRUE)
  flt_old_span <- max(vct_existing_val, na.rm = TRUE) - flt_old_min
  flt_new_min <- min(vct_new_range)
  flt_new_span <- max(vct_new_range) - flt_new_min

  vct_old_min_offset <- vct_existing_val - flt_old_min

  # no spread in the input: dividing by the span would give 0 * Inf = NaN
  if (flt_old_span == 0) {
    return(vct_old_min_offset + flt_new_min)
  }

  vct_old_min_offset * (flt_new_span / flt_old_span) + flt_new_min
}
Now apply the function...we try and map mpg to the range 1 to 4 (which is an attempt to match
the ordinal scale)
# Rescale mpg onto 1..4 and store it as a new column of mtcars.
mtcars[["mpg_remap"]] <- fn_linear_map(mtcars[["mpg"]], c(1, 4))
Try the plot again.... getting closer ... but not really accurate...
# attempt 3: getting closer but doesn't really match the ordinal scale
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count)) +
  geom_bar(stat = "identity", width = 0.95) +
  geom_text(aes(label = count), vjust = -0.5) +
  scale_x_discrete(
    "x title to go here",
    labels = df_mtcars_count$bin,
    breaks = df_mtcars_count$bin
  ) +
  # rug drawn from the rescaled mpg values; FALSE is spelled out because
  # F is an ordinary variable that can be reassigned
  geom_rug(data = mtcars, aes(x = mpg_remap), inherit.aes = FALSE, alpha = 0.3)
p
The graph above is getting close to what I want....but rug plot does not line up
with the actual data ... example the max observation (33.9) should be displayed
almost aligning with the right hand side of the bar.. see below:
# Cars in the last bin, ordered by mpg, with their rescaled positions.
mtcars %>%
  filter(bin == "(25,34]") %>%
  arrange(mpg) %>%
  dplyr::select(mpg, mpg_remap)
Your scale makes no sense to me, as you are showing a bin that is twice as wide using the same bar width. Doing that in combination with a rug strikes me as confusing at best and misleading at worst. I suggest you plot the bars with their correct widths, after which the rug is trivial.
I think the best solution is to just use geom_histogram:
# Histogram on the custom breaks with a rug of the raw mpg values; the rug
# layer clears the y aesthetic.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(breaks = vct_seq, colour = "grey80") +
  geom_rug(aes(x = mpg, y = NULL))
If you really want the gaps between the bars you'll have to do more work:
library(tidyr)
# Rebuild the bars from the bin labels so that each bar is drawn at its true
# position and width, leaving a small gap between neighbouring bars.
d <- mtcars %>%
  count(bin) %>%
  # split a label such as "(25,34]" at the comma into its two edges
  separate(bin, c("min", "max"), sep = ",", remove = FALSE) %>%
  # across() replaces the superseded mutate_at(vars(...)); parse_number()
  # drops the bracket characters and returns the numeric edge
  mutate(across(c(min, max), readr::parse_number)) %>%
  mutate(
    middle = min + (max - min) / 2,
    width = 0.9 * (max - min)
  )
ggplot(d, aes(middle, n)) +
  geom_col(width = d$width) +
  geom_rug(aes(mpg, y = NULL), mtcars)

R geom_ribbon after specific value

I am trying to find a way to colour the background after a specific value.
Here in this example, I want to colour the spaces after the value 5 (here shown with a vertical line).
#
library(lme4)
library(tidyverse)
data("sleepstudy")

# Flag the observations taken after day 5 (1) versus up to day 5 (0).
sleepstudy <- sleepstudy %>%
  mutate(days = ifelse(Days > 5, 1, 0))

# Mean reaction time for every Days / days combination.
m1 <- sleepstudy %>%
  group_by(Days, days) %>%
  summarise(m = mean(Reaction))
m1

# Mean reaction per day with a vertical marker at day 6.
ggplot(m1, aes(Days, m)) +
  geom_point() +
  geom_vline(xintercept = 6) +
  theme_minimal()
I want to achieve something like this
However, when I use the following line, I get an error message.
# Attempt to shade the region after the vertical line with a ribbon.
# NOTE(review): this is the call reported to fail. The ribbon layer receives
# data = m1 while its x aesthetic is the two-element vector c(6, 9).
m1 %>% ggplot(aes(Days, m)) +
geom_point() +
geom_vline(xintercept = 6) +
theme_minimal() +
geom_ribbon(data = m1, aes(x = c(6,9), ymin=0, ymax = 400), fill = 'khaki', alpha = 0.2)
Maybe the following does what the question asks for.
First of all, if the error bars are to be plotted, the data preparation code must change.
There is no need to compute an extra variable, days, that tells whether Days is greater than 5.
The standard deviations, which are used as the half-lengths of the error bars, must be computed.
This can be all done in one pipe only.
library(lme4)
library(tidyverse)
data("sleepstudy")

# Mean (m) and standard deviation (s) of Reaction for each value of Days.
m1 <- summarise(
  group_by(sleepstudy, Days),
  m = mean(Reaction),
  s = sd(Reaction)
)
Now the plot.
I have changed the order of the geoms, to have the points, error bars and vertical line over the ribbon.
I have also increased the alpha level to 0.30.
There is no need to reset the x aesthetic, it is set since the beginning of the plot.
It's the latter point that caused the code error.
Error: Aesthetics must be either length 1 or the same as the data (10): x
# The ribbon is added first so that the vertical line, the points and the
# error bars are drawn on top of it. It only receives the rows with Days > 5
# and reuses the x aesthetic set in ggplot().
ggplot(m1, aes(Days, m)) +
  theme_minimal() +
  geom_ribbon(
    data = filter(m1, Days > 5),
    aes(ymin = 0, ymax = 400),
    alpha = 0.30,
    fill = "khaki"
  ) +
  geom_vline(xintercept = 6) +
  geom_point() +
  geom_errorbar(aes(ymin = m - s, ymax = m + s))

Removing lower and upper quartiles in boxplot, with connection between whiskers in R

So im trying to make some different Boxplots,
Completely normal boxplot
I can't figure out how to create the boxplot without the lower and upper quantile, which essentially would be the outliers and the median connected by the whiskers. So something which would look like this
My attempt
But I need the two whiskers to be fully connected by a single vertical line. How can I do that?
what i did for the second plot in R was the following
# Box plot of mpg by number of cylinders with the box outline suppressed
# (boxlty = 0), solid whiskers and a black median line.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
boxplot(
  mpg ~ cyl,
  data = mtcars,
  main = "Car Milage Data",
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  col = "white",
  frame = FALSE,
  medcol = "black",
  boxlty = 0,
  whisklty = 1,
  staplelwd = 1,
  boxwex = 0.4
)
Many Thanks.
Here is a way to get what you are looking for using a scatter plot and error bars:
library(tidyverse)
# Median, maximum and minimum of quant_var within each level of grouping_var.
data_summary <- summarize(
  group_by(data, grouping_var),
  median = median(quant_var),
  max = max(quant_var),
  min = min(quant_var)
)

# Median as a point, with an error bar running from the minimum to the maximum.
ggplot(data_summary, aes(x = grouping_var, y = median)) +
  geom_point() +
  geom_errorbar(aes(ymin = min, ymax = max))
Then if you need to overlay your old data you can just add a new geom like so:
# Same median / min-max plot, with the raw observations from `data` added as
# an extra point layer.
ggplot(data_summary, aes(x = grouping_var, y = median)) +
  geom_point() +
  geom_errorbar(aes(ymin = min, ymax = max)) +
  geom_point(
    data = data,
    aes(x = grouping_var, y = quant_var)
  )

Resources