How to assign standard deviation using facet_grid operator? - r

I made a plot with ggplot using the following script:
# Read the wide table: one row per sample, value columns UD..D16 and
# SD columns SD_UD..SD_D16.
data <- fread("Book1.txt")
names(data)
# Day labels; reused at the end as the x-axis order.
days <- c("UD","D4","D6","D10","D16")
sample <-c("H1","H2","H3")
# NOTE(review): 'SD_16' does not match the SD_D16 column name used below.
SD <- c('SD_UD', 'SD_D4', 'SD_D6', 'SD_D10','SD_16')
# Stack the value columns UD..D16 into key `days` and value `expression`.
vol <- data.table(gather(data, days, expression, UD:D16))
# Stack the SD columns SD_UD..SD_D16 as well.
# NOTE(review): nothing here ties an SD column to its matching day, so each
# expression row presumably ends up paired with every SD value - confirm.
volL <- data.table(gather(vol, expression, SD, SD_UD:SD_D16))
dodge <- position_dodge(width=0.9)
# One facet row per sample: bars for expression, error bars of +/- SD.
gg <-ggplot(volL, aes(days, expression, fill= sample)) + geom_bar(stat="identity", width = 0.9, position = dodge) +
geom_errorbar(aes(ymin=expression-SD, ymax=expression+SD), color= "grey", width=.1) +
facet_grid(sample~.) +
scale_x_discrete(limits=days, labels = days) +
ggtitle("Expression vs days")
Data table :
sample UD D4 D6 D10 D16 SD_UD SD_D4 SD_D6 SD_D10 SD_D16
H1 9.96113E-05 0.000276321 0.105211427 0.098271655 0.06978369 1.17174E-05 9.84763E-05 0.03589122 0.05107505 0.017763717
H2 7.9913E-06 4.43916E-05 0.040212602 0.106493626 0.162614138 9.31988E-07 4.28076E-05 0.012332228 0.0441571 0.063304324
H3 0.000233391 0.000382084 0.001415172 0.003544547 0.018624673 0.000103126 0.000110262 0.000612986 0.000572592 0.010861883
Using the above script I got the following graph;
Problem: It did not assign the standard deviations according to the data.frame. Could anyone help me work out how to assign the SD values correctly?

It did assign the standard deviation according to the data.frame. The problem is that the data.frame contains 4 different values of SD for each expression.
To solve, using dplyr:
library(dplyr)
library(tidyr)    # provides gather()
library(ggplot2)

# `df` is the wide table from the question: one row per sample, value
# columns UD..D16 and matching SD columns SD_UD..SD_D16.

# x-axis order and bar dodge; defined here so the snippet runs on its own.
days <- c("UD", "D4", "D6", "D10", "D16")
dodge <- position_dodge(width = 0.9)

# Long table of expression values: one row per sample and day.
expression <- df %>%
  select(sample, UD:D16) %>%
  gather(days, expr, UD:D16)

# Long table of SDs; strip the "SD_" prefix so `days` matches the table above.
SD <- df %>%
  select(sample, SD_UD:SD_D16) %>%
  gather(days, SD, SD_UD:SD_D16) %>%
  mutate(days = sub("^SD_", "", days))

# Exactly one SD per (sample, day); the join keys are spelled out.
df1 <- inner_join(expression, SD, by = c("sample", "days"))

gg <- ggplot(df1, aes(days, expr, fill = sample)) +
  geom_bar(stat = "identity", width = 0.9, position = dodge) +
  geom_errorbar(aes(ymin = expr - SD, ymax = expr + SD),
                color = "grey", width = .1) +
  facet_grid(sample ~ .) +
  scale_x_discrete(limits = days, labels = days) +
  ggtitle("Expression vs days")
Hope this helps.

Related

Annotate several regression lines produced with geom_smooth

I have a figure with 16 regression lines and I need to be able to identify them. Using a color gradient or symbols or different line types do not really help.
My idea therefore is, to just (haha) annotate every line.
Therefore, I build a dataset (hpAnnotatedLines) with the different maximum x values. This is the position the text should start. However, I have no idea how to automatically extract the respective y values of the predicted regression lines at the maximum x-axis values, which is different for each line.
Please find a smaller data set using mtcars as an example
library(ggplot2)
library(dplyr)
library(ggrepel)
# Keep only the three columns used below and turn cyl into a factor.
mtcars1 <- select(mtcars, disp,cyl,hp)
mtcars1$cyl <- as.factor(mtcars1$cyl)
# Largest disp per cyl group: the x position where each label should start.
mtcars2 <- mtcars1 %>%
group_by(cyl) %>%
summarise(Max.disp= max(disp))
# Build the dataset for the annotation layer.
# The hp values are typed in by hand; this is the part the question asks
# how to derive from the fitted regression lines.
hpAnnotatedLines <- data.frame(cyl=levels(mtcars2$cyl),
disp=mtcars2$Max.disp,
hp=c(90,100,210))
# Example plot: points, one lm fit per cyl, and the text labels.
ggplot(mtcars, aes(x=disp, y=hp, color = factor(cyl))) +
geom_point() +
geom_smooth(method=lm)+
coord_cartesian(xlim = c(min(mtcars$disp), max(mtcars$disp) + 50)) +
geom_text_repel(
data = hpAnnotatedLines,
aes(label = cyl),
size = 3,
nudge_x = 1)
Instead of extracting the fitted values you could add the labels via geom_text by switching the stat to smooth and setting the label aesthetic via after_stat such that only the last point of each regression line gets labelled:
library(ggplot2)
library(dplyr)
# Label vector for a set of points: within each `color` group, rows whose x
# equals the group maximum get the colour value as text; all others get "".
# The result is in the same row order as the inputs.
myfun <- function(x, color) {
  grouped <- group_by(data.frame(x = x, color = color), color)
  labelled <- mutate(
    grouped,
    label = ifelse(x %in% max(x), as.character(color), "")
  )
  labelled$label
}
# Points, one lm fit per cyl, and a text layer computed on the smoothed data
# whose label comes from myfun(); x range widened by 50 to leave room for text.
ggplot(mtcars, aes(x = disp, y = hp, color = factor(cyl))) +
  geom_point() +
  geom_smooth(method = lm) +
  geom_text(
    aes(label = after_stat(myfun(x, color))),
    stat = "smooth", method = "lm",
    hjust = 0, size = 3, nudge_x = 1, show.legend = FALSE
  ) +
  coord_cartesian(xlim = range(mtcars$disp) + c(0, 50))
It's a bit of a hack, but you can extract the data from the compiled plot object. For example first make the plot without the labels,
# Base plot without labels: layer 1 is the points, layer 2 the lm fits.
myplot <- ggplot(mtcars, aes(x = disp, y = hp, color = factor(cyl))) +
  geom_point() +
  geom_smooth(method = lm) +
  coord_cartesian(xlim = range(mtcars$disp) + c(0, 50))
Then use ggplot_build to get the data from the second layer (The geom_smooth layer) and transform it back into the names used by your data. Here we find the largest x value per group, and then take that y value.
# Build the plot so the computed data of every layer is available.
pobj <- ggplot_build(myplot)
# Layer 2 is geom_smooth: per group keep the row with the largest x and
# rename the columns back to the names used in mtcars.
# mtcars$cyl is numeric, so levels(mtcars$cyl) would be NULL; the labels are
# taken from factor(mtcars$cyl), the same factor the colour aesthetic uses.
hpAnnotatedLines <- pobj$data[[2]] %>%
  group_by(group) %>%
  top_n(1, x) %>%
  transmute(disp = x, hp = y, cyl = levels(factor(mtcars$cyl))[group]) %>%
  ungroup()
Then add an additional layer to your plot
# Draw the cyl labels at the extracted line ends.
myplot +
  geom_text_repel(
    aes(label = cyl),
    data = hpAnnotatedLines,
    nudge_x = 1,
    size = 3
  )
If your data is not that huge, you can extract the predictions out using augment() from broom and take that with the largest value:
library(broom)
library(dplyr)
library(ggplot2)
library(ggrepel)  # geom_text_repel() is used in the plot below

# Fit hp ~ disp separately per cyl, attach the fitted values to each row,
# keep the row with the largest disp per group and expose its fitted value
# as `hp` so it can be used as the label position.
hpAnn <- mtcars %>%
  group_by(cyl) %>%
  do(augment(lm(hp ~ disp, data = .))) %>%
  top_n(1, disp) %>%
  select(cyl, disp, .fitted) %>%
  rename(hp = .fitted)
# A tibble: 3 x 3
# Groups: cyl [3]
cyl disp hp
<dbl> <dbl> <dbl>
1 4 147. 96.7
2 6 258 99.9
3 8 472 220.
Then plot:
# Points, one lm fit per cyl, and the labels taken from hpAnn.
ggplot(mtcars, aes(x = disp, y = hp, color = factor(cyl))) +
  geom_point() +
  geom_smooth(method = lm) +
  coord_cartesian(xlim = range(mtcars$disp) + c(0, 50)) +
  geom_text_repel(
    aes(label = cyl),
    data = hpAnn,
    nudge_x = 1,
    size = 3
  )

R geom_ribbon after specific value

I am trying to find a way to colour the background after a specific value.
Here in this example, I want to colour the spaces after the value 5 (here shown with a vertical line).
# Packages and data.
library(lme4)
library(tidyverse)
data("sleepstudy")
# Flag rows with Days > 5 as 1, all others as 0.
sleepstudy = sleepstudy %>% mutate(days = ifelse(Days > 5, 1, 0))
# Mean Reaction for every combination of Days and the flag.
m1 = sleepstudy %>% group_by(Days, days) %>% summarise(m = mean(Reaction))
m1
# Means per day, with a vertical line at x = 6.
m1 %>% ggplot(aes(Days, m)) +
geom_point() +
geom_vline(xintercept = 6) +
theme_minimal()
I want to achieve something like this
However, when I use the following line, I get an error message.
# Failing attempt, kept verbatim: the ribbon layer passes a length-2 vector
# for x, which triggers the "Aesthetics must be either length 1 or the same
# as the data (10): x" error quoted in the answer below.
m1 %>% ggplot(aes(Days, m)) +
geom_point() +
geom_vline(xintercept = 6) +
theme_minimal() +
geom_ribbon(data = m1, aes(x = c(6,9), ymin=0, ymax = 400), fill = 'khaki', alpha = 0.2)
Maybe the following does what the question asks for.
First of all, if the error bars are to be plotted, the data preparation code must change.
There is no need to compute an extra variable, days that tells if Days are greater than 6.
The standard deviations, which are used for the error bars, must be computed.
This can be all done in one pipe only.
library(lme4)
library(tidyverse)
data("sleepstudy")

# Per-day mean (m) and standard deviation (s) of Reaction.
m1 <- summarise(
  group_by(sleepstudy, Days),
  m = mean(Reaction),
  s = sd(Reaction)
)
Now the plot.
I have changed the order of the geoms, to have the points, error bars and vertical line over the ribbon.
I have also increased the alpha level to 0.30.
There is no need to reset the x aesthetic, it is set since the beginning of the plot.
It's the latter point that caused the code error.
Error: Aesthetics must be either length 1 or the same as the data (10): x
# The ribbon is added first so the line, points and error bars sit on top of
# it; it only receives the rows with Days > 5 and inherits x from the plot.
ggplot(m1, aes(Days, m)) +
  theme_minimal() +
  geom_ribbon(
    data = filter(m1, Days > 5),
    aes(ymin = 0, ymax = 400),
    fill = 'khaki',
    alpha = 0.30
  ) +
  geom_vline(xintercept = 6) +
  geom_point() +
  geom_errorbar(aes(ymin = m - s, ymax = m + s))

position_dodge when using separate datasets

I am attempting to produce a graph that shows two groups of error bars, but the different error bars represent different estimates of central tendency/variability (e.g., mean with sd and median with quantiles). I'm trying to use position_dodge, but it's not working, and I suspect this is because I'm feeding it values from a different dataset. Here's a reproducible example:
#### simulate dosages: 10 repeats of each of the three levels, "small" as reference
dose = factor(rep(c("small", "medium", "large"), times=10))
dose = relevel(dose, "small")
#### simulate fevers: normal noise around 100, shifted per dose level by `betas`
fever = rnorm(length(dose), 100, 1)
betas = matrix(c(0, -3, -6), nrow=1)
fever = fever + as.numeric(betas%*%t(model.matrix(fever~dose)))
#### put into data frame
d = data.frame(dose=dose, fever=fever)
#### per-dose mean with +/- 1 SD bounds, and per-dose median with quartile bounds
#### (in `means`, lower/upper reuse the `mean` column defined just before them)
means = d %>% group_by(dose) %>% summarise(mean=mean(fever), lower=mean - sd(fever), upper = mean + sd(fever))
medians = d %>% group_by(dose) %>% summarise(median=median(fever), lower=quantile(fever, .25), upper = quantile(fever, .75))
#### put all into a ggplot: raw points, then mean (black) and median (red) with error bars
ggplot(d, aes(x=dose, y=fever)) +
geom_jitter(alpha=.2, width=.2) +
geom_point(data=means, aes(x=dose, y=mean)) +
geom_point(data=medians, aes(x=dose, y=median), col="red") +
geom_errorbar(data=means, aes(y=mean, ymin=lower, ymax=upper), width=.2, position=position_dodge(width=.2)) +
geom_errorbar(data= medians, aes(y=median, ymin=lower, ymax=upper), width=.2, position=position_dodge(width=.2), col="red")
Which gives the results of the following image:
Notice dodging isn't working.
Let's assume I can't just use stat_summary (I can't...I'm actually comparing means with some robust estimates from another package). Is there any way to offset the error bars/dots so they can be better seen?
Combine your dataframes for both statistics so you can map the kind of statistic on group:
# `d` is the data frame built in the question (columns dose, fever).
# Both summaries share the columns Statistic / Value / lower / upper so they
# can be stacked and told apart by `Statistic`.
means <- d %>%
  group_by(dose) %>%
  summarise(Statistic = "Mean", Value = mean(fever),
            lower = mean(fever) - sd(fever),
            upper = mean(fever) + sd(fever))
medians <- d %>%
  group_by(dose) %>%
  summarise(Statistic = "Median", Value = median(fever),
            lower = quantile(fever, 0.25),
            upper = quantile(fever, 0.75))
df2 <- bind_rows(means, medians)

#### put all into a ggplot
# Points and error bars use the same grouping and the same dodge width, so
# each point stays centred on its own error bar.
ggplot(d, aes(x = dose, y = fever)) +
  geom_jitter(alpha = .2, width = .2) +
  geom_point(data = df2,
             aes(y = Value, group = Statistic, color = Statistic),
             position = position_dodge(width = .2)) +
  geom_errorbar(data = df2,
                aes(y = Value, ymin = lower, ymax = upper,
                    group = Statistic, color = Statistic),
                width = .2, position = position_dodge(width = .2))

apply jittering to outliers data in a boxplot with ggplot2

do you have any idea of how to apply jittering just to the outliers data of a boxplot? This is the code:
# Single flipped boxplot of a$V8 with small outlier markers; the value in
# row 54 of V8 is additionally drawn as a large red point.
ggplot(data = a, aes(x = "", y = a$V8)) +
geom_boxplot(outlier.size = 0.5)+
geom_point(data=a, aes(x="", y=a$V8[54]), colour="red", size=3) +
theme_bw()+
coord_flip()
thank you!!
Add a column to your data set to indicate which points are and are not outliers. Then set geom_boxplot to not plot any outliers and use geom_point to plot the outliers explicitly.
I will use the diamonds data set from ggplot2 to illustrate.
library(ggplot2)
library(dplyr)

# Flag outliers per cut with the usual boxplot rule: more than 1.5 * IQR
# above the upper quartile or below the lower quartile.
diamonds2 <-
  diamonds %>%
  group_by(cut) %>%
  mutate(
    outlier = price > quantile(price, 0.75) + 1.5 * IQR(price) |
      price < quantile(price, 0.25) - 1.5 * IQR(price)
  ) %>%
  ungroup()

ggplot(diamonds2) +
  aes(x = cut, y = price) +
  geom_boxplot(outlier.shape = NA) +  # NO OUTLIERS
  # Outliers only, jittered horizontally; height = 0 keeps the prices exact.
  geom_point(data = function(x) dplyr::filter(x, outlier),
             position = position_jitter(height = 0))
This is a slightly different approach from the one above (it assigns a color variable with NA for non-outliers), and it includes a correction for the upper and lower bound calculations.
The default "outlier" definition is a point more than 1.5 x the interquartile range (IQR) below the 25th percentile or above the 75th percentile.
Generate some sample data:
# Reproducible sample: four groups of 1000 normal draws (sd 4) centred at
# 25, 50, 75 and 100. tibble::tibble() replaces the deprecated data_frame()
# and is namespace-qualified because no package has been attached yet.
set.seed(1)
a <- tibble::tibble(
  x = factor(rep(1:4, each = 1000)),
  V8 = c(rnorm(1000, 25, 4),
         rnorm(1000, 50, 4),
         rnorm(1000, 75, 4),
         rnorm(1000, 100, 4))
)
calculate the upper/lower limit outliers (uses dplyr/tidyverse functions):
library(tidyverse)
# Within each x group, flag values more than 1.5 * IQR above the upper
# quartile (outlier.high) or below the lower quartile (outlier.low).
a <- mutate(
  group_by(a, x),
  outlier.high = V8 > quantile(V8, .75) + 1.50 * IQR(V8),
  outlier.low = V8 < quantile(V8, .25) - 1.50 * IQR(V8)
)
Define a color for the upper/lower points:
# High outliers become "red", low outliers "steelblue"; rows matching neither
# condition get NA.
a <- mutate(
  a,
  outlier.color = case_when(
    outlier.high ~ "red",
    outlier.low ~ "steelblue"
  )
)
The unclassified cases will be coded as "NA" for color, and will not appear in the plot.
The dplyr::case_when() function is not completely stable yet (it may require the GitHub development version, > 0.5), so here is a base R alternative if that does not work:
# Base R version of the colour coding: start with NA everywhere, then
# overwrite the rows flagged as high / low outliers.
a$outlier.color <- NA
a$outlier.color[a$outlier.high] <- "red"
a$outlier.color[a$outlier.low] <- "steelblue"
Plot:
# Boxes without their own outlier markers, plus jittered points coloured by
# the per-row outlier colour.
ggplot(a, aes(x, V8)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(color = a$outlier.color, width = .2) +  # NA not plotted
  theme_bw() +
  coord_flip()

Combining 2 different graph outputs in R into one graph

So I used the following code, to generate graphs, where appl and apple generate 2 different graphs and now I want to combine them into a single graph
# First dataset: per Value, the count, mean, sd and standard error of
# `means` (this overwrites `data` with its own summary).
data <- ddply(data, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)) )
# Plot 1: mean per Value with +/- se shown as error bars and as a ribbon.
apple=ggplot(data, aes(x=Value, y=mean)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1) +
geom_ribbon(aes(ymin=mean-se, ymax=mean+se),alpha=0.5) +
geom_line() +
geom_point()
# Second dataset: the same summary, computed from dat1.
dat <- ddply(dat1, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)))
# Plot 2: same layers as plot 1, drawn from the second summary.
appl=ggplot(dat, aes(x=Value, y=mean)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1) +
geom_ribbon(aes(ymin=mean-se, ymax=mean+se),alpha=0.5) +
geom_line() +
geom_point()
The answer involves combining the datasets into one big one, with an additional column specifying to which dataset that subset belonged. There is no need for creating plots separately and combining them. Let's assume that column is named id, then you can use an additional argument in aes to get the plot to work, i.e. aes(x=Value, y=mean, color=id). Combining the datasets can be done using rbind.
A code example:
library(plyr)
library(ggplot2)

# Collapse a raw data frame (columns Value, means) to one row per Value with
# the count, mean, standard deviation and standard error of `means`.
summarise_by_value <- function(raw) {
  ddply(raw, .(Value), summarise,
        N = length(means),
        mean = mean(means),
        sd = sd(means),
        se = sd(means) / sqrt(length(means)))
}

# Two invented raw datasets; the second is shifted upwards by 0.5.
df1 <- data.frame(Value = sample(LETTERS[1:8], 1000, replace = TRUE),
                  means = runif(1000))
df2 <- data.frame(Value = sample(LETTERS[1:8], 1000, replace = TRUE),
                  means = runif(1000) + 0.5)

# Summarise each dataset and tag it with the id of its source.
df1 <- summarise_by_value(df1)
df1$id <- "ID1"
df2 <- summarise_by_value(df2)
df2$id <- "ID2"
df_all <- rbind(df1, df2)

# Value is discrete, so the default grouping would put every point in its
# own group and no line or ribbon could be drawn; group = id gives one
# line/ribbon per dataset. fill = id makes the two ribbons distinguishable.
ggplot(df_all, aes(x = Value, y = mean, color = id, fill = id, group = id)) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = .1) +
  geom_ribbon(aes(ymin = mean - se, ymax = mean + se), alpha = 0.5) +
  geom_line() +
  geom_point()
Which results in the following graph:
Note that I have had to invent some data due to the lack of example data from your side, so this might not entirely fit your situation. However, it nicely illustrates the approach.

Resources