I'm here again with another problem.
I'm currently working with making a volcano plot of DEG data using ggplot2.
The problem is that the resulting plot contains no data points, which is weird.
For a more accurate diagnosis: my data (volcano) consists of 948 DEGs (|logFC| > 1, FDR < 0.05).
library(ggplot2)
# Label every gene "NotSignificant" first, then overwrite the rows that pass
# the FDR and fold-change cut-offs.
volcano["group"] <- "NotSignificant"
# abs() drops the sign, so strongly negative logFC rows are also labelled
# "Increased" here.
volcano[which(volcano['FDR'] < 0.01 & abs(volcano['logFC']) > 2 ),"group"] <- "Increased"
# NOTE(review): abs() is never negative, so `abs(...) < -2` matches no rows
# and "Decreased" is never assigned.
volcano[which(volcano['FDR'] < 0.01 & abs(volcano['logFC']) < -2 ),"group"] <- "Decreased"
# creating color palette (names are the values expected in the colour variable)
cols <- c("red" = "red", "orange" = "orange", "NotSignificant" = "darkgrey",
"Increased" = "#00B2FF", "Decreased" = "#00B2FF")
##I didn't even get to use those beautiful colors.
FDR_threshold <- 0.01
logFC_threshold <- 2
# TRUE/FALSE factor flagging genes that pass both thresholds.
deseq.threshold <- as.factor(abs(volcano$logFC) >= logFC_threshold &
volcano$FDR < FDR_threshold)
# Row indices of the flagged genes; not used again in this snippet.
xi <- which(deseq.threshold == TRUE)
# Overwrites the factor above, this time with a looser FDR cut-off (0.05).
deseq.threshold <- as.factor(abs(volcano$logFC) > 2 & volcano$FDR < 0.05)
# Make a basic ggplot2 object
# NOTE(review): colour is mapped to deseq.threshold, whose levels are
# "FALSE"/"TRUE"; neither is a name in `cols`, while the `group` column
# built above is not used.
vol <- ggplot(volcano, aes(x = logFC, y =-log10(FDR), colour=deseq.threshold))
# inserting manual colours as per colour palette and more
vol +
scale_colour_manual(values = cols) +
ggtitle(label = "Volcano Plot", subtitle = "colon specific volcano plot") +
geom_point(size = 2.5, alpha = 1, na.rm = T) +
theme_bw(base_size = 14) +
theme(legend.position = "none") +
xlab(expression(log[2]("logFC"))) +
ylab(expression(-log[10]("FDR"))) +
# Dashed reference lines at -log10(FDR) = 1 and logFC = +/-0.586.
geom_hline(yintercept = 1, colour="#990000", linetype="dashed") +
geom_vline(xintercept = 0.586, colour="#990000", linetype="dashed") +
geom_vline(xintercept = -0.586, colour="#990000", linetype="dashed")+
# y axis is additionally transformed with log1p.
scale_y_continuous(trans = "log1p")
Here is the lil sample of my dataset, volcano
genes logFC FDR group
1 INHBA 6.271879 2.070000e-30 Increased
2 COL10A1 7.634386 1.820000e-23 Increased
3 WNT2 9.485133 6.470000e-20 Increased
4 COL8A1 3.974965 6.470000e-20 Increased
5 THBS2 4.104176 2.510000e-19 Increased
6 BGN 3.524484 5.930000e-18 Increased
7 COMP 11.916956 2.740000e-17 Increased
9 SULF1 3.540374 1.290000e-15 Increased
10 CTHRC1 3.937028 4.620000e-14 Increased
11 TRIM29 3.827088 1.460000e-11 Increased
12 SLC6A20 5.060538 5.820000e-11 Increased
13 SFRP4 5.924330 8.010000e-11 Increased
14 CDH3 5.330732 8.940000e-11 Increased
15 ESM1 6.491496 3.380000e-10 Increased
614 TDP2 -1.801368 0.002722461 NotSignificant
615 EPHX2 -1.721039 0.002722461 NotSignificant
616 RAVER2 -1.581812 0.002749728 NotSignificant
617 BMP6 -2.702780 0.002775460 Increased
619 SCNN1G -4.012111 0.002870500 Increased
620 SLC52A3 -1.868920 0.002931197 NotSignificant
621 VIPR1 -1.556238 0.002945578 NotSignificant
622 SUCLG2 -1.720993 0.003059717 NotSignificant
I think your issue is coming from the use of deseq.threshold in the color of aes. Instead, I think you should use group column to plot the color.
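By the way, the thresholds that define your significant genes contain a mistake: "Decreased" is assigned to genes whose absolute logFC is less than -2, which is impossible because an absolute value is never negative. Compare logFC itself (not its absolute value) with the positive and negative cut-offs instead.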
Here, I used an example of an output of DEG:
library(data.table)
# Download an example table of differential-expression results and give the
# four columns short names.
volcano <- fread(
  "https://gist.githubusercontent.com/stephenturner/806e31fce55a8b7175af/raw/1a507c4c3f9f1baaa3a69187223ff3d3050628d4/results.txt",
  header = TRUE
)
colnames(volcano) <- c("Gene", "logFC", "pvalue", "FDR")

# Work with a plain data.frame from here on.
volcano <- data.frame(volcano)

# Classify each gene: "NotSignificant" by default, then "Increased" or
# "Decreased" when FDR < 0.01 and logFC is beyond +1 or -1 respectively.
# which() skips rows where the comparison is NA, so they keep the default.
volcano$group <- "NotSignificant"
volcano$group[which(volcano$FDR < 0.01 & volcano$logFC > 1)] <- "Increased"
volcano$group[which(volcano$FDR < 0.01 & volcano$logFC < -1)] <- "Decreased"
So, my example dataframe looks like (I changed a little bit the threshold you are using to get more significant genes):
> head(volcano)
Gene logFC pvalue FDR group
1 DOK6 0.5100 1.861e-08 0.0003053 NotSignificant
2 TBX5 -2.1290 5.655e-08 0.0004191 Decreased
3 SLC32A1 0.9003 7.664e-08 0.0004191 NotSignificant
4 IFITM1 -1.6870 3.735e-06 0.0068090 Decreased
5 NUP93 0.3659 3.373e-06 0.0068090 NotSignificant
6 EMILIN2 1.5340 2.976e-06 0.0068090 Increased
Now, you can plot:
library(ggplot2)
# Colour for each value of `group`. Defined here so this snippet runs on its
# own instead of relying on a `cols` object created elsewhere.
cols <- c(
  "red" = "red", "orange" = "orange", "NotSignificant" = "darkgrey",
  "Increased" = "#00B2FF", "Decreased" = "#00B2FF"
)

# Volcano plot: logFC on x, -log10(FDR) on y, points coloured by `group`.
ggplot(volcano, aes(x = logFC, y = -log10(FDR), colour = group)) +
  scale_colour_manual(values = cols) +
  ggtitle(label = "Volcano Plot", subtitle = "colon specific volcano plot") +
  geom_point(size = 2.5, alpha = 1, na.rm = TRUE) +
  theme_bw(base_size = 14) +
  theme(legend.position = "none") +
  xlab(expression(log[2]("logFC"))) +
  ylab(expression(-log[10]("FDR"))) +
  # Dashed reference lines at -log10(FDR) = 1 and logFC = +/-0.586.
  geom_hline(yintercept = 1, colour = "#990000", linetype = "dashed") +
  geom_vline(xintercept = 0.586, colour = "#990000", linetype = "dashed") +
  geom_vline(xintercept = -0.586, colour = "#990000", linetype = "dashed") +
  # Compress the y axis with a log1p transform.
  scale_y_continuous(trans = "log1p")
Related
I have density plots for each shift and year. The means are plotted by grouping in a df called mu. I also add vertical reference lines which I can label without issue but I cannot seem to get the labels on the grouped vertical lines. You will see my latest attempt which throws an error "Aesthetics must be either length 1 or the same as the data (134): x"
My code
library(ggplot2)
library(dplyr)
df <- read.csv("f4_bna_no_cup.csv")
head(df)
ï..n yr s ys x
1 1 2021 1 2021-1 116.83
2 2 2021 1 2021-1 114.83
3 3 2021 1 2021-1 115.50
4 4 2021 1 2021-1 115.42
5 5 2021 1 2021-1 115.58
6 6 2021 1 2021-1 115.58
# Summarise the mean of x for every year-shift (ys) and shift (s) combination.
# The result has one row per combination and no `yr` column.
mu <- df %>%
group_by(ys,s) %>%
summarise(grp.mean = mean(x))
mu
ys s grp.mean
<chr> <int> <dbl>
1 2021-1 1 116.
2 2021-2 2 117.
3 2022-1 1 114.
4 2022-2 2 115.
# llab is simply a copy of the per-group means in mu.
llab<-mu
# Shift labels; not referenced again in this snippet.
shift <- c("Shift 1", "Shift 2")
# Density curve of x for every year-shift, one facet per shift.
ggplot(data=df, aes(x=x,group =ys, fill = yr, color = yr)) +
geom_density(alpha = 0.4) +
scale_x_continuous(limits=c(112,120))+
# Dashed vertical line at each group mean, taken from mu.
geom_vline(aes(xintercept = grp.mean), data = mu, linetype = "dashed", size = 0.5) +
# NOTE(review): this layer inherits data = df, but x and label come from llab,
# which has one row per group; the lengths do not match the data.
geom_text(aes(x=llab$grp.mean, y=.6), label = llab$ys) + #this throws the error
# Fixed reference lines, each with a rotated text label.
geom_vline(aes(xintercept=114.8), linetype="dashed", size=0.5, color = 'green3') +
geom_text(aes(x=114.8, y=.6), label = "Target", angle = 90, color="black",size=3) +
geom_vline(aes(xintercept=114.1), linetype="solid", size=0.5, color = 'limegreen') +
geom_text(aes(x=114.1, y=.55), label = "Potential", angle = 90, color="black",size=3 ) +
geom_vline(aes(xintercept=113.4), linetype="solid", size=0.5, color = 'firebrick3') +
geom_text(aes(x=113.4, y=.62), label = "Label wt", angle = 90,
color="black",size=3, family = "Times New Roman", vjust=0) +
# One panel per shift, with readable strip labels.
facet_grid(
.~s,
labeller = labeller(
s = c(`1` = "Shift 1", `2` = "Shift 2")
))+
theme_light()+
theme(legend.position = "none")
Output so far...I'm so close.
Persistence pays off. I figured it out and thought I would share it in case someone else has a similar problem:
All code remains the same as in my question except a slight change to grouping for the mu df, AND replace the line that I noted as throwing the error as follows:
# Small change to group_by(): keep yr so it can be used as the label text.
# .groups = "drop" returns an ungrouped result, so later steps are not
# affected by leftover grouping.
mu <- df %>%
  group_by(yr, s, ys) %>%
  summarise(grp.mean = mean(x), .groups = "drop")
Replace: geom_text(aes(x=llab$grp.mean, y=.6), label = llab$ys), with
# Label each group-mean line with its year. x is mapped from the layer's own
# data (mu) inside aes(), so every label stays attached to its row and facet.
geom_text(data = mu, aes(x = grp.mean, label = yr), y = 0.60, color = "black", angle = 90, vjust = 0)
Warning, I am brand-new to R!
I have the R bug and having a play with the possibilities but getting very lost. I want to try and colour segments of a density plot with a condition '>' to indicate bins. In my head it look like:
...but not quartile or % change dependant.
My data shows; x = duration (number of days) and y = frequency. I would like the plot to colour split on 3 month intervals up to 12 months and one colour after (using working days i.e. 63 = 3 months).
I have had a go, but really not sure where to start!
# Attempt to colour a density curve by duration band.
# NOTE(review): the plot data is df3 but the fill values are taken from
# W.S_CleanNA; `fill` and `color` are also passed several times to one call,
# and R does not accept the same named argument more than once.
ggplot(df3, aes(x=Investigation.Duration))+
geom_density(fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>0],
fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>63], color = "white",
fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>127], color = "light Grey",
fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>190], color = "medium grey",
fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>253], color = "dark grey",
fill = W.S_CleanNA$Investigation.Duration[W.S_CleanNA$Investigation.Duration>506], color = "black")+
ggtitle ("Investigation duration distribution in 'Wales' complexity sample")+
# Text labels and vertical lines marking the mean (red) and median (blue).
geom_text(aes(x=175, label=paste0("Mean, 136"), y=0.0053))+
geom_vline(xintercept = c(136.5), color = "red")+
geom_text(aes(x=80, label=paste0("Median, 129"), y=0.0053))+
geom_vline(xintercept = c(129.5), color = "blue")
Any really simple help much appreciated.
Unfortunately, you can't do this directly with geom_density, as "under the hood" it is built with a single polygon, and a polygon can only have a single fill. The only way to do this is to have multiple polygons, and you need to build them yourself.
Fortunately, this is easier than it sounds.
There was no sample data in the question, so we will create a plausible distribution with the same median and mean:
# Simulate durations from a gamma distribution (fixed seed for repeatability)
set.seed(69)
df3 <- data.frame(Investigation.Duration = rgamma(1000, 5, 1/27.7))
round(median(df3$Investigation.Duration))
#> [1] 129
round(mean(df3$Investigation.Duration))
#> [1] 136

# Kernel density estimate as a two-column data frame (x = days, y = density)
dens <- with(density(df3$Investigation.Duration), data.frame(x = x, y = y))

# Drop the artefactual part of the curve at or below zero days
dens <- subset(dens, x > 0)

# Band index: one band per 63 working days (3 months); everything beyond
# 12 months is pooled into band 4
dens$band <- pmin(dens$x %/% 63, 4)

# This is the complex bit. Each band must become a closed polygon, so we
# repeat its first and last point and pull the copies down to the x axis
# (y = 0) at the band's lower and upper time limits.
dens <- do.call("rbind", lapply(split(dens, dens$band), function(piece) {
  closed <- rbind(piece[1, ], piece, piece[nrow(piece), ])
  closed$y[c(1, nrow(closed))] <- 0
  closed
}))
Now we have the polygons, it's just a case of drawing and labelling appropriately:
library(ggplot2)
# One colour and one legend label per band (band 0 to band 4). The same
# palette is used for fill and outline, so it is defined once.
band_colours <- c("#003f5c", "#58508d", "#bc5090", "#ff6361", "#ffa600")
band_labels <- c(
  "Less than 3 months",
  "3 to 6 months",
  "6 to 9 months",
  "9 to 12 months",
  "Over 12 months"
)

# Draw one filled polygon per band, then mark the mean and the median.
ggplot(dens, aes(x, y)) +
  geom_polygon(aes(fill = factor(band), colour = factor(band))) +
  theme_minimal() +
  scale_fill_manual(values = band_colours, name = "Time", labels = band_labels) +
  # The outline only repeats the fill colour, so its legend is hidden.
  scale_colour_manual(values = band_colours, guide = "none") +
  labs(x = "Days since investigation started", y = "Density") +
  ggtitle("Investigation duration distribution in 'Wales' complexity sample") +
  # annotate() draws each label once; geom_text() with constant aesthetics
  # would draw one copy per row of `dens`.
  annotate("text", x = 175, y = 0.0053, label = "Mean, 136") +
  geom_vline(xintercept = 136.5, linetype = 2) +
  annotate("text", x = 80, y = 0.0053, label = "Median, 129") +
  geom_vline(xintercept = 129.5, linetype = 2)
I have drawn the attached funnel plot in ggplot, But I have 2 questions:
Is there any way to make the coloured green dot bigger (only that one);
is there any way to colour the upper and lower part of the confidence intervals?
This is what I am able to make so far:
Thank you!
The data set I am working on:
df <-
read.table(text = "
school_id year sdq_emotional
1060 7 4
1060 7 5
1060 7 7
1060 7 6
1060 7 4
1060 7 7
1060 7 8
1115 7 5
1115 7 9
1115 7 3
1136 7 1
1136 7 8
1136 7 5
1136 7 9
1135 7 4
1139 7 7
1139 7 3
2371 7 6
2371 7 3
2372 7 4
2372 7 1
2378 7 6
2378 7 7
2378 7 5", header=TRUE)
My code as follows:
# Format the data
# Count the rows per school and attach that count (column `freq`) to every row.
df1 <- plyr::count(df, c('school_id'))
df2 <- merge(df,df1, by= c("school_id"))
df <- df2
# Per-school mean and per-school size, restricted to schools with freq > 10.
# NOTE(review): in the sample rows shown with the question, no school has more
# than 10 records - confirm against the full data.
M3 <- aggregate(df$sdq_emotional[df$freq > 10], by=list(df$school_id[df$freq > 10]),mean,na.rm=T)
# NOTE(review): `nona` is not defined anywhere in this snippet; presumably it
# counts non-missing values - verify.
S3 <- aggregate(df$sdq_emotional[df$freq > 10], by=list(df$school_id[df$freq > 10]),nona)
CG_PLOT1 <- merge(M3,S3,by="Group.1")
names(CG_PLOT1) <- c("School","Mean","Size")
# Overall mean (M3), overall SD (SD3) and sqrt(n) (N3) for n = 1 .. max size + 25;
# these feed the funnel limits drawn below.
LINE3 <- data.frame(M3=rep(mean(df$sdq_emotional,na.rm=T),max(CG_PLOT1$Size)+25),
SD3=rep(sd(df$sdq_emotional,na.rm=T),max(CG_PLOT1$Size)+25),
N3=sqrt(1:(max(CG_PLOT1$Size)+25)))
# School to highlight: its point is filled green, all others white.
ID <- 1060
filling3 <- rep("white",nrow(CG_PLOT1))
filling3[CG_PLOT1$School ==ID]<-"green"
# Build the graph
ggplot(data = CG_PLOT1) +
# Upper limit: mean + qnorm(0.975) * SD / sqrt(n)
geom_line(data = LINE3, aes(x = 1:(max(CG_PLOT1$Size) + 25),
y = M3 + qnorm(0.975) * SD3 / N3), size = 1, colour = "steelblue2",
linetype = 5) +
# Lower limit: mean - qnorm(0.975) * SD / sqrt(n)
geom_line(data = LINE3, aes(x = 1:(max(CG_PLOT1$Size) + 25),
y = M3 - qnorm(0.975) * SD3 / N3), size = 1, colour = "steelblue2",
linetype = 5) +
# NOTE(review): the geom_segment( call is closed at the end of the next line,
# so the aes(...) on the line after it is outside the call.
geom_segment(xend = max(CG_PLOT1$Size)+25,yend=mean(LINE3$M3,na.rm=T)),
aes(x = 1, y = mean(LINE3$M3,na.rm=T), size=1, colour="steelblue2") +
# One point per school: size on x, mean score on y.
geom_point(data = CG_PLOT1, aes(x = Size, y = Mean), size = 2,
colour = "black", shape = 21,fill = filling3) +
ylim(0, 8)
thank you very much!
As you didn't provide a reproducible example, I have used this question as a template for your problem:
Creating a dataset here:
library(ggplot2)
# Simulated data: the spread of y grows with x (fixed seed for repeatability).
set.seed(101)
x <- runif(100, min = 1, max = 10)
y <- rnorm(length(x), mean = 5, sd = 0.1 * x)
df <- data.frame(x = x * 70, y = y)

# Linear fit and its confidence limits at two levels (0.95 and 0.999).
m <- lm(y ~ x, data = df)
fit95 <- predict(m, interval = "confidence", level = 0.95)
fit99 <- predict(m, interval = "confidence", level = 0.999)

# Attach the lower/upper limits of both bands to the data as extra columns.
df <- data.frame(
  df,
  lwr95 = fit95[, "lwr"], upr95 = fit95[, "upr"],
  lwr99 = fit99[, "lwr"], upr99 = fit99[, "upr"],
  check.names = FALSE
)
To add a colour background to the funnel plot, we can use the geom_ribbon function within ggplot to fill the area between a ymin and ymax. In this case, we will use the data used to construct each of the lines:
# Funnel plot: coloured background bands, then the points, the fitted line
# and the dashed/dotted confidence limits.
ggplot(df, aes(x, y)) +
  # Add background. The band limits are mapped inside aes() so that each
  # limit stays paired with its own row of df.
  geom_ribbon(aes(ymin = upr99, ymax = Inf), fill = "#e2a49a", alpha = 0.5) +
  geom_ribbon(aes(ymin = lwr99, ymax = upr99), fill = "#e0ba9d", alpha = 0.5) +
  geom_ribbon(aes(ymin = 0, ymax = lwr99), fill = "#8fd6c9", alpha = 0.5) +
  # Overlay points and lines
  geom_point() +
  geom_smooth(method = "lm", colour = "black", lwd = 1.1, se = FALSE) +
  geom_line(aes(y = upr95), color = "black", linetype = 2) +
  geom_line(aes(y = lwr95), color = "black", linetype = 2) +
  geom_line(aes(y = upr99), color = "red", linetype = 3) +
  # The `+` at the end of the next line is required; without it labs() is a
  # separate statement and the axis labels are never added to the plot.
  geom_line(aes(y = lwr99), color = "red", linetype = 3) +
  labs(x = "No. admissions...", y = "Percentage of patients...")
As for changing the size of one point, you can check out the answer here. I would recommend subsetting the data to extract that one point, adding it as a second geom_point layer, and then setting the size and colour arguments of the new layer.
I am trying to plot a volcano plot with ggplot2. I would like to have three different colors based on the following criteria:
qvalue <0.05 and meth.diff > 25% = Red
qvalue <0.05 and meth.diff < -25% (minus 25%) = Green
qvalue <0.05 and meth.diff between +25 and -25 = Gray
Similar questions have been asked here before and I tried following them but keep getting error messages. Any suggestions would be highly appreciated.
Here is the raw data file:
chr start end strand pvalue qvalue meth.diff
16 chr1 37801 38100 * 2.246550e-05 4.487042e-04 -36.485769
17 chr1 38101 38400 * 5.699781e-06 1.376471e-04 55.755181
29 chr1 49501 49800 * 1.453030e-18 2.442391e-16 -18.381131
35 chr1 62701 63000 * 5.547627e-03 3.686303e-02 -31.871711
54 chr1 122401 122700 * 3.917230e-03 2.845933e-02 63.443366
57 chr1 130201 130500 * 8.941091e-04 9.253737e-03 -8.347167
# Intended coding: 1 = hypermethylated, -1 = hypomethylated, 0 = everything else
# (all conditioned on qvalue < 0.05).
# NOTE(review): `meth.diff<-25` has no space, so R reads `<-` as assignment;
# this overwrites myDiff1p$meth.diff instead of comparing it with -25.
myDiff1p$threshold = factor(ifelse(myDiff1p$meth.diff>25 & myDiff1p$qvalue< 0.05, 1,
ifelse(myDiff1p$meth.diff<-25 & myDiff1p$qvalue< 0.05,-1,0)))
# Volcano plot: methylation difference on x, -log10(qvalue) on y.
ggplot(data=myDiff1p, aes(x=meth.diff, y=-log10(qvalue))) +
geom_point(aes(color=myDiff1p$threshold), alpha=0.4, size=1.75)+
# Reference lines at +/-25 methylation difference and at -log10(qvalue) = 2.
geom_vline(xintercept=c(-25,25), color="red", alpha=1.0)+
geom_hline(yintercept=2, color="blue", alpha=1.0)+
xlab("Differential Methylation")+
ylab("-log10 (qvalue)")+
theme_bw()+
xlim(c(-75, 75)) +
ylim(c(0, 300))
Error: Discrete value supplied to continuous scale
You have an almost unnoticeable mistake in this line:
myDiff1p$threshold = factor(ifelse(myDiff1p$meth.diff>25 & myDiff1p$qvalue< 0.05, 1,
ifelse(myDiff1p$meth.diff<-25 & myDiff1p$qvalue< 0.05,-1,0)))
As there's no space in myDiff1p$meth.diff<-25, R reads <- as the assignment operator (myDiff1p$meth.diff <- 25 & ...) rather than as the comparison myDiff1p$meth.diff < -25. As a result, meth.diff is overwritten and no longer holds the original numeric values.
Here's what I recommend:
library(dplyr)
# Classify every region by methylation difference and q-value:
#   cond1: meth.diff >  25 and qvalue < 0.05
#   cond2: meth.diff < -25 and qvalue < 0.05
#   cond3: everything else
myDiff1p <- myDiff1p %>%
  mutate(threshold = factor(case_when(
    meth.diff > 25 & qvalue < 0.05 ~ "cond1",
    meth.diff < -25 & qvalue < 0.05 ~ "cond2",
    TRUE ~ "cond3"
  )))

# Volcano plot coloured by the class above. The colour aesthetic refers to the
# column name (`threshold`), not to `myDiff1p$threshold`, so the mapping is
# always taken from the data given to ggplot().
ggplot(data = myDiff1p, aes(x = meth.diff, y = -log10(qvalue))) +
  geom_point(aes(color = threshold), alpha = 0.4, size = 1.75) +
  # Reference lines at +/-25 methylation difference and at -log10(qvalue) = 2.
  geom_vline(xintercept = c(-25, 25), color = "red", alpha = 1.0) +
  geom_hline(yintercept = 2, color = "blue", alpha = 1.0) +
  xlab("Differential Methylation") +
  ylab("-log10 (qvalue)") +
  theme_bw() +
  xlim(c(-75, 75)) +
  ylim(c(0, 300)) +
  # Named vector: each class is tied to its colour explicitly.
  scale_color_manual(
    name = "Threshold",
    values = c("cond1" = "red", "cond2" = "green", "cond3" = "grey")
  )
I labelled the threshold factor by condition, & defined the mapping between condition & colour in a named vector in scale_color_manual(). Also, a matter of personal preference, but I think dplyr::case_when() looks neater than nested ifelse() statements.
I have a data with longitude, latitude and value at each grid. A grid may have more than one value so I set alpha to visualize multiple values. My aim is to fill grids with three different ranges. If the value is zero then that grid would be empty.
library(maps)
library(ggplot2)
data <- read.csv("G:/mydata.csv")
# Nine semi-transparent tile layers: for each of the columns A, B and C, the
# rows falling in a year range are drawn in that range's colour
# (black: 1971-1979, blue: 1980-1989, red: 1990 and later).
# The fixed `fill` given in every layer overrides the `fill = A` mapping.
g1 <- ggplot(aes(x=x, y=y, fill= A), data=data) +
geom_tile(data=subset(data, A > 1970 & A < 1980),fill = "black", alpha = 0.5)+
geom_tile(data=subset(data, B > 1970 & B < 1980),fill = "black", alpha = 0.5)+
geom_tile(data=subset(data, C > 1970 & C < 1980),fill = "black", alpha = 0.5)+
geom_tile(data=subset(data, A > 1979 & A < 1990),fill = "blue", alpha = 0.5)+
geom_tile(data=subset(data, B> 1979 & B < 1990), fill = "blue", alpha = 0.5)+
geom_tile(data=subset(data, C > 1979 & C < 1990),fill = "blue", alpha = 0.5)+
geom_tile(data=subset(data, A > 1989),fill = "red", alpha = 0.5)+
geom_tile(data=subset(data, B > 1989),fill = "red", alpha = 0.5)+
geom_tile(data=subset(data, C > 1989),fill = "red", alpha = 0.5)+
theme_classic()
The result is wrong, as the blue grids are bigger. I could not find the mistake. I followed the link but could not make it work. I guess there is something trivial that I am missing. My data can be accessed here. Many thanks in advance.
Sorry, I can't do it the way you envisioned it; there is not enough flexibility that I could see. But one can do this:
library(maps)
library(ggplot2)
ddf <- read.csv("mydata.csv")
# Tag a copy of the grid data for one facet.
#
# dddf    : data frame to annotate.
# zvek    : logical vector, one entry per row, TRUE where the condition holds.
# lev     : value stored in the `lev` column (used as the facet label).
# fillclr : fill colour for rows where zvek is TRUE; other rows get "gray".
#
# Returns dddf with four extra columns: z (zvek as 0/1), lev, color (always
# "white") and fill.
setz <- function(dddf, zvek, lev = 0, fillclr) {
  dddf[["z"]] <- as.numeric(zvek)
  dddf[["lev"]] <- lev
  dddf[["color"]] <- "white"
  dddf[["fill"]] <- ifelse(zvek, fillclr, "gray")
  dddf
}
# One tagged copy of the grid per column (A, B, C) and year range. The third
# argument becomes the facet label, the fourth the fill colour of matching cells.
df1 <- setz(ddf, ddf$A > 1970 & ddf$A < 1980, "A>1970 & A<1980", "black")
df2 <- setz(ddf, ddf$B > 1970 & ddf$B < 1980, "B>1970 & B<1980", "black")
df3 <- setz(ddf, ddf$C > 1970 & ddf$C < 1980, "C>1970 & C<1980", "black")
df4 <- setz(ddf, ddf$A > 1979 & ddf$A < 1990, "A>1979 & A<1990", "blue")
df5 <- setz(ddf, ddf$B > 1979 & ddf$B < 1990, "B>1979 & B<1990", "blue")
df6 <- setz(ddf, ddf$C > 1979 & ddf$C < 1990, "C>1979 & C<1990", "blue")
df7 <- setz(ddf, ddf$A > 1989, "A>1989", "red")
df8 <- setz(ddf, ddf$B > 1989, "B>1989", "red")
df9 <- setz(ddf, ddf$C > 1989, "C>1989", "red")

# Stack all copies so that facet_wrap() can split them by label.
ddg <- rbind(df1, df2, df3, df4, df5, df6, df7, df8, df9)

# The fill and outline columns already hold colour names, so identity scales
# are used. The `+` after facet_wrap() is required: without it theme_classic()
# is a separate statement and the theme is never applied to g1.
g1 <- ggplot(data = ddg, aes(x = x, y = y, fill = fill, color = color)) +
  geom_tile() +
  scale_color_identity() +
  scale_fill_identity() +
  facet_wrap(~lev) +
  theme_classic()
print(g1)
Which yields this: