how to colour a funnel plot in ggplot R - r

I have drawn the attached funnel plot in ggplot, But I have 2 questions:
Is there any way to make the coloured green dot bigger (only that one);
is there any way to colour the upper and lower part of the confidence intervals?
This is what I am able to make so far:
Thank you!
The data set I am working on:
df <-
read.table(text = "
school_id year sdq_emotional
1060 7 4
1060 7 5
1060 7 7
1060 7 6
1060 7 4
1060 7 7
1060 7 8
1115 7 5
1115 7 9
1115 7 3
1136 7 1
1136 7 8
1136 7 5
1136 7 9
1135 7 4
1139 7 7
1139 7 3
2371 7 6
2371 7 3
2372 7 4
2372 7 1
2378 7 6
2378 7 7
2378 7 5", header=TRUE)
My code as follows:
# Format the data
df1 <- plyr::count(df, c('school_id'))
df2 <- merge(df,df1, by= c("school_id"))
df <- df2
M3 <- aggregate(df$sdq_emotional[df$freq > 10], by=list(df$school_id[df$freq > 10]),mean,na.rm=T)
S3 <- aggregate(df$sdq_emotional[df$freq > 10], by=list(df$school_id[df$freq > 10]),nona)
CG_PLOT1 <- merge(M3,S3,by="Group.1")
names(CG_PLOT1) <- c("School","Mean","Size")
LINE3 <- data.frame(M3=rep(mean(df$sdq_emotional,na.rm=T),max(CG_PLOT1$Size)+25),
SD3=rep(sd(df$sdq_emotional,na.rm=T),max(CG_PLOT1$Size)+25),
N3=sqrt(1:(max(CG_PLOT1$Size)+25)))
ID <- 1060
filling3 <- rep("white",nrow(CG_PLOT1))
filling3[CG_PLOT1$School ==ID]<-"green"
# Build the graph
ggplot(data = CG_PLOT1) +
geom_line(data = LINE3, aes(x = 1:(max(CG_PLOT1$Size) + 25),
y = M3 + qnorm(0.975) * SD3 / N3), size = 1, colour = "steelblue2",
linetype = 5) +
geom_line(data = LINE3, aes(x = 1:(max(CG_PLOT1$Size) + 25),
y = M3 - qnorm(0.975) * SD3 / N3), size = 1, colour = "steelblue2",
linetype = 5) +
geom_segment(xend = max(CG_PLOT1$Size)+25,yend=mean(LINE3$M3,na.rm=T)),
aes(x = 1, y = mean(LINE3$M3,na.rm=T), size=1, colour="steelblue2") +
geom_point(data = CG_PLOT1, aes(x = Size, y = Mean), size = 2,
colour = "black", shape = 21,fill = filling3) +
ylim(0, 8)
thank you very much!

As you didn't provide a reproducible example, I have used this question as a template for your problem:
Creating a dataset here:
library(ggplot2)
set.seed(101)
x <- runif(100, min=1, max=10)
y <- rnorm(length(x), mean=5, sd=0.1*x)
df <- data.frame(x=x*70, y=y)
m <- lm(y ~ x, data=df)
fit95 <- predict(m, interval="conf", level=.95)
fit99 <- predict(m, interval="conf", level=.999)
df <- cbind.data.frame(df,
lwr95=fit95[,"lwr"], upr95=fit95[,"upr"],
lwr99=fit99[,"lwr"], upr99=fit99[,"upr"])
To add a colour background to the funnel plot, we can use the geom_ribbon function within ggplot to fill the area between a ymin and ymax. In this case, we will use the data used to construct each of the lines:
ggplot(df, aes(x, y)) +
# Add background
geom_ribbon(ymin= df$upr99, ymax = Inf, fill = "#e2a49a", alpha = 0.5) +
geom_ribbon(ymin = df$lwr99, ymax = df$upr99, fill = "#e0ba9d", alpha = 0.5 ) +
geom_ribbon(ymin = 0, ymax = df$lwr99, fill = "#8fd6c9", alpha = 0.5 ) +
# Overlay points and lines
geom_point() +
geom_smooth(method="lm", colour="black", lwd=1.1, se=FALSE) +
geom_line(aes(y = upr95), color="black", linetype=2) +
geom_line(aes(y = lwr95), color="black", linetype=2) +
geom_line(aes(y = upr99), color="red", linetype=3) +
geom_line(aes(y = lwr99), color="red", linetype=3)
labs(x="No. admissions...", y="Percentage of patients...")
As for changing the size of one point, you can check out the answer here. I would recommend subsetting the data to extract the one point, and then add another layer for the geom_point and then changing the size and colour argument of the new layer`

Related

geom_text() to label two separate points from different plots in ggplot

I am trying to create individual plots facetted by 'iid' using 'facet_multiple', in the following dataset (first 3 rows of data)
iid Age iop al baseIOP baseAGE baseAL agesurg
1 1 1189 20 27.9 21 336 24.9 336
2 2 877 11 21.5 16 98 20.3 98
3 2 1198 15 21.7 16 98 20.3 98
and wrote the following code:
# Install gg_plus from GitHub
remotes::install_github("guiastrennec/ggplus")
# Load libraries
library(ggplot2)
library(ggplus)
# Generate ggplot object
p <- ggplot(data_longF1, aes(x = Age, y = al)) +
geom_point(alpha = 0.5) +
geom_point(aes(x= baseAGE, y=baseAL)) +
labs(x = 'Age (days)',
y = 'Axial length (mm)',
title = 'Individual plots of Axial length v time')
p1 <- p+geom_vline(aes(xintercept = agesurg),
linetype = "dotted",
colour = "red",
size =1.0)
p2<- p1 + geom_text(aes(label=iop ,hjust=-1, vjust=-1))
p3 <- p2 + geom_text(aes(label = baseIOP, hjust=-1, vjust=-1))
# Plot on multiple pages (output plot to R/Rstudio)
facet_multiple(plot = p3,
facets = 'iid',
ncol = 1,
nrow = 1,
scales = 'free')
The main issue I am having is labeling the points. The points corresponding to (x=age, y=axl) get labelled fine, but labels for the second group of points (x=baseIOP, y=baseAL) gets put in the wrong place.individual plot sample
I have had a look at similar issues in Stack Overflow e.g. ggplot combining two plots from different data.frames
But not been able to correct my code.
Thanks for your help
You need to define the x and y coordinates for the labels or they will default to the last ones specified.
Thus the geom_text() definitions should look something like:
data_longF1 <-read.table(header=TRUE, text="iid Age iop al baseIOP baseAGE baseAL agesurg
1 1 1189 20 27.9 21 336 24.9 336
2 2 877 11 21.5 16 98 20.3 98
3 2 1198 15 21.7 16 98 20.3 98")
# Generate ggplot object
p <- ggplot(data_longF1, aes(x = Age, y = al)) +
geom_point(alpha = 0.5) +
geom_point(aes(x= baseAGE, y=baseAL)) +
labs(x = 'Age (days)',
y = 'Axial length (mm)',
title = 'Individual plots of Axial length v time')
p1 <- p+geom_vline(aes(xintercept = agesurg),
linetype = "dotted",
colour = "red",
size =1.0)
#Need to specify the x and y coordinates or will default to the last ones defined
p2<- p1 + geom_text(aes(x=Age, y= al, label=iop ,hjust=-1, vjust=-1))
p3 <- p2 + geom_text(aes(x=baseAGE, y= baseAL, label = baseIOP, hjust=-1, vjust=-1))
print(p3)

volcano plot error (using ggplot2): drawn without data

I'm here again with another problem.
I'm currently working with making a volcano plot of DEG data using ggplot2.
The thing is that I'm getting a result without data. weird.
for more accurate diagnosis, my data(volcano) is consist of 948 DEG data (|logFC|>1, FDR<0.05).
library(ggplot2)
volcano["group"] <- "NotSignificant"
volcano[which(volcano['FDR'] < 0.01 & abs(volcano['logFC']) > 2 ),"group"] <- "Increased"
volcano[which(volcano['FDR'] < 0.01 & abs(volcano['logFC']) < -2 ),"group"] <- "Decreased"
# creating color palette
cols <- c("red" = "red", "orange" = "orange", "NotSignificant" = "darkgrey",
"Increased" = "#00B2FF", "Decreased" = "#00B2FF")
##I didn't even get to use those beautiful colors.
FDR_threshold <- 0.01
logFC_threshold <- 2
deseq.threshold <- as.factor(abs(volcano$logFC) >= logFC_threshold &
volcano$FDR < FDR_threshold)
xi <- which(deseq.threshold == TRUE)
deseq.threshold <- as.factor(abs(volcano$logFC) > 2 & volcano$FDR < 0.05)
# Make a basic ggplot2 object
vol <- ggplot(volcano, aes(x = logFC, y =-log10(FDR), colour=deseq.threshold))
# inserting manual colours as per colour palette and more
vol +
scale_colour_manual(values = cols) +
ggtitle(label = "Volcano Plot", subtitle = "colon specific volcano plot") +
geom_point(size = 2.5, alpha = 1, na.rm = T) +
theme_bw(base_size = 14) +
theme(legend.position = "none") +
xlab(expression(log[2]("logFC"))) +
ylab(expression(-log[10]("FDR"))) +
geom_hline(yintercept = 1, colour="#990000", linetype="dashed") +
geom_vline(xintercept = 0.586, colour="#990000", linetype="dashed") +
geom_vline(xintercept = -0.586, colour="#990000", linetype="dashed")+
scale_y_continuous(trans = "log1p")
Here is the lil sample of my dataset, volcano
genes logFC FDR group
1 INHBA 6.271879 2.070000e-30 Increased
2 COL10A1 7.634386 1.820000e-23 Increased
3 WNT2 9.485133 6.470000e-20 Increased
4 COL8A1 3.974965 6.470000e-20 Increased
5 THBS2 4.104176 2.510000e-19 Increased
6 BGN 3.524484 5.930000e-18 Increased
7 COMP 11.916956 2.740000e-17 Increased
9 SULF1 3.540374 1.290000e-15 Increased
10 CTHRC1 3.937028 4.620000e-14 Increased
11 TRIM29 3.827088 1.460000e-11 Increased
12 SLC6A20 5.060538 5.820000e-11 Increased
13 SFRP4 5.924330 8.010000e-11 Increased
14 CDH3 5.330732 8.940000e-11 Increased
15 ESM1 6.491496 3.380000e-10 Increased
614 TDP2 -1.801368 0.002722461 NotSignificant
615 EPHX2 -1.721039 0.002722461 NotSignificant
616 RAVER2 -1.581812 0.002749728 NotSignificant
617 BMP6 -2.702780 0.002775460 Increased
619 SCNN1G -4.012111 0.002870500 Increased
620 SLC52A3 -1.868920 0.002931197 NotSignificant
621 VIPR1 -1.556238 0.002945578 NotSignificant
622 SUCLG2 -1.720993 0.003059717 NotSignificant
I think your issue is coming from the use of deseq.threshold in the color of aes. Instead, I think you should use group column to plot the color.
BTW, your threshold to define your significant genes has a mistake because you are looking for "Decreased" for genes with an absolute value of logFC inferior to -2 which is not possible.
Here, I used an example of an output of DEG:
library(data.table)
volcano = fread("https://gist.githubusercontent.com/stephenturner/806e31fce55a8b7175af/raw/1a507c4c3f9f1baaa3a69187223ff3d3050628d4/results.txt", header = TRUE)
colnames(volcano) <- c("Gene","logFC","pvalue","FDR")
# Adding group to decipher if the gene is significant or not:
volcano <- data.frame(volcano)
volcano["group"] <- "NotSignificant"
volcano[which(volcano['FDR'] < 0.01 & volcano['logFC'] > 1 ),"group"] <- "Increased"
volcano[which(volcano['FDR'] < 0.01 & volcano['logFC'] < -1 ),"group"] <- "Decreased"
So, my example dataframe looks like (I changed a little bit the threshold you are using to get more significant genes):
> head(volcano)
Gene logFC pvalue FDR group
1 DOK6 0.5100 1.861e-08 0.0003053 NotSignificant
2 TBX5 -2.1290 5.655e-08 0.0004191 Decreased
3 SLC32A1 0.9003 7.664e-08 0.0004191 NotSignificant
4 IFITM1 -1.6870 3.735e-06 0.0068090 Decreased
5 NUP93 0.3659 3.373e-06 0.0068090 NotSignificant
6 EMILIN2 1.5340 2.976e-06 0.0068090 Increased
Now, you can plot:
library(ggplot2)
ggplot(volcano, aes(x = logFC, y = -log10(FDR), color = group))+
scale_colour_manual(values = cols) +
ggtitle(label = "Volcano Plot", subtitle = "colon specific volcano plot") +
geom_point(size = 2.5, alpha = 1, na.rm = T) +
theme_bw(base_size = 14) +
theme(legend.position = "none") +
xlab(expression(log[2]("logFC"))) +
ylab(expression(-log[10]("FDR"))) +
geom_hline(yintercept = 1, colour="#990000", linetype="dashed") +
geom_vline(xintercept = 0.586, colour="#990000", linetype="dashed") +
geom_vline(xintercept = -0.586, colour="#990000", linetype="dashed")+
scale_y_continuous(trans = "log1p")

reorder ggplot with geom_bar(stat="identity")

I have prepare a dataframe and use a ggplot on him. But the initial order is not respected. How i can respect this order ?
Patient Nb_peptides Type_affinite
1 22 563 a
2 22 1040 b
3 22 11139 c
4 24 489 a
5 24 1120 b
6 24 11779 c
7 13 467 a
8 13 1239 b
9 13 14600 c
g_plot <- ggplot(data = nb_peptides_type,
aes(x = reorder(Patient, True_order),
y = Nb_peptides,
fill = Type_affinite)) +
geom_bar(stat = "identity")
print(g_plot)
Please provide stand-alone code to make it easier.
I would use levels outside of your plot to reorder factor levels : Is it what you're looking for ?
## fake dataframe
df <- data.frame(patient = as.factor(rep((21:30), each=3)),
nb = rpois(30, 1000),
type=sample(letters[1:3], replace =T, size =30))
## initial plot
ggplot(data = df,
aes(x = patient,
y = nb,
fill = type)) +
geom_bar(stat = "identity")
## adjust factors levels
True_order <- sample((21:30), 10)
levels(df$patient) <- True_order
## re-plot
ggplot(data = df,
aes(x = patient,
y = nb,
fill = type)) +
geom_bar(stat = "identity")

ggplot2 facets: Different annotation text for each plot

I have the following generated data frame called Raw_Data:
Time Velocity Type
1 10 1 a
2 20 2 a
3 30 3 a
4 40 4 a
5 50 5 a
6 10 2 b
7 20 4 b
8 30 6 b
9 40 8 b
10 50 9 b
11 10 3 c
12 20 6 c
13 30 9 c
14 40 11 c
15 50 13 c
I plotted this data with ggplot2:
ggplot(Raw_Data, aes(x=Time, y=Velocity))+geom_point() + facet_grid(Type ~.)
I have the objects: Regression_a, Regression_b, Regression_c. These are the linear regression equations for each plot. Each plot should display the corresponding equation.
Using annotate displays the particular equation on each plot:
annotate("text", x = 1.78, y = 5, label = Regression_a, color="black", size = 5, parse=FALSE)
I tried to overcome the issue with the following code:
Regression_a_eq <- data.frame(x = 1.78, y = 1,label = Regression_a,
Type = "a")
p <- x + geom_text(data = Raw_Data,label = Regression_a)
This did not solve the problem. Each plot still showed Regression_a, rather than just plot a
You can put the expressions as character values in a new dataframe with the same unique Type's as in your data-dataframe and add them with geom_text:
regrDF <- data.frame(Type = c('a','b','c'), lbl = c('Regression_a', 'Regression_b', 'Regression_c'))
ggplot(Raw_Data, aes(x = Time, y = Velocity)) +
geom_point() +
geom_text(data = regrDF, aes(x = 10, y = 10, label = lbl), hjust = 0) +
facet_grid(Type ~.)
which gives:
You can replace the text values in regrDF$lbl with the appropriate expressions.
Just a supplementary for the adopted answer if we have facets in both horizontal and vertical directions.
regrDF <- data.frame(Type1 = c('a','a','b','b'),
Type2 = c('c','d','c','d'),
lbl = c('Regression_ac', 'Regression_ad', 'Regression_bc', 'Regression_bd'))
ggplot(Raw_Data, aes(x = Time, y = Velocity)) +
geom_point() +
geom_text(data = regrDF, aes(x = 10, y = 10, label = lbl), hjust = 0) +
facet_grid(Type1 ~ Type2)
The answer is good but still imperfect as I do not know how to incorporate math expressions and newline simultaneously (Adding a newline in a substitute() expression).

Generate multiple x-y plots from the same data frame in the same plot using ggplot2 in R using a loop and display corresponding legend

I have a data frame which I generated using the following piece of code,
x <- c(1:10)
y <- x^3
z <- y-20
s <- z/3
t <- s*6
q <- s*y
x1 <- cbind(x,y,z,s,t,q)
x1 <- data.frame(x1)
The data frame x1 thus has the following data,
x y z s t q
1 1 1 -19 -6.333333 -38 -6.333333
2 2 8 -12 -4.000000 -24 -32.000000
3 3 27 7 2.333333 14 63.000000
4 4 64 44 14.666667 88 938.666667
5 5 125 105 35.000000 210 4375.000000
6 6 216 196 65.333333 392 14112.000000
7 7 343 323 107.666667 646 36929.666667
8 8 512 492 164.000000 984 83968.000000
9 9 729 709 236.333333 1418 172287.000000
10 10 1000 980 326.666667 1960 326666.666667
Now I want to plot columns x vs y, z vs s and t vs q in the same plot, so for this I use the following code,
p <- ggplot() +
geom_line(data = x1, aes(x = x1[,1], y = x1[,2], color = "red")) +
geom_line(data = x1, aes(x = x1[,3], y = x1[,4], color = "blue")) +
geom_line(data = x1, aes(x = x1[,5], y = x1[,6], color = "green")) +
xlab('x') +
ylab('y')
While the above piece of code works fine for a data frame of just 6 columns, I would like to perform the same operation for a data frame with many number of columns. For example if there are 20 columns in a data frame, there should be one single plot generated containing plot of col 1 vs 2, col 3 vs 4, col 5 vs 6 and so on until col 19 vs 20. To do this I use this following piece of code,
p <- ggplot() + geom_line(data = x1, aes(x = x1[,1], y = x1[,2], color = "red")) + xlab('x') + ylab('y')
ctr <- 1
for (iz in seq(3, ncol(x1), by = 2))
{
p$ctr <- p + geom_line(data = x1, aes(x = x1[,iz], y = x1[,iz+1], color = "green"))
ctr <- ctr+1
}
So the plots should be layered incrementally and the last object should contain the entire plot. Using the above code the plot gets overwritten every time when the loop runs, could some one point out how to capture the full data. I would like to display a legend for each of the plot as well.
Thanks
You don't need a loop if you put your data into the right format. You can create a long data frame based on your original data frame.
x1_long <- data.frame(x = unlist(x1[c(TRUE, FALSE)]),
y = unlist(x1[c(FALSE, TRUE)]),
ind = gl(ncol(x1) / 2, nrow(x1)))
Now, a single geom_line command is sufficient:
library(ggplot2)
ggplot(x1_long) +
geom_line(aes(x = x, y = y, colour = ind))
(Note. The red line is plotted too but its values are quite small.)
How about this?
ggplot() +
lapply(seq(1,ncol(x1),2), # every second col index
function(x){ # return the geom_line calls in a list
geom_line(aes_string(x=x1[x], # remember to use aes_string for x
y=x1[x+1]), # and y
color=factor(x), # then color
size=2) # and size
}) +
xlab('x') + ylab('y')

Resources