I am using RNA seq data to analyze genes via a volcano plot (which compares differential gene expression of bacteria with and without antibiotic) in R. After having created my plot, I am unsure why some of my values which read "0" TPM in one condition and a TPM value that's not "0" in the other condition were not determined to be differentially expressed. Some of the genes on my volcano plot have this difference in TPM and show up as significant on my plot while others with this "0" value difference are not considered to be significant according to my plot.
Here is a sample of my data
(UO1_D712_##### represents the locus number, the top column represents the different replicates where "1" is untreated and "2" is treated with antibiotic, the letters next to the numbers represent a different technical replicate):
structure(list(`1` = c("U01_D712_00001",
"U01_D712_00002", "U01_D712_00003",
"U01_D712_00004", "U01_D712_00005", "U01_D712_00006", "U01_D712_00007",
"U01_D712_00008", "U01_D712_00009", "U01_D712_00010", "U01_D712_00011",
"U01_D712_00012", "U01_D712_00013", "U01_D712_00014", "U01_D712_00015",
"U01_D712_00016", "U01_D712_00017", "U01_D712_00018", "U01_D712_00019",
"U01_D712_00020", "U01_D712_00021", "U01_D712_00022", "U01_D712_00023",
"U01_D712_00024", "U01_D712_00025", "U01_D712_00026", "U01_D712_00027",
"U01_D712_00028", "U01_D712_00029", "U01_D712_00030"), `1a` = c("6.590456502",
"6.741694688", "7.110342585", "6.299482433", "2.77173982", "2.470330508",
"3.827125186", "6.267229842", "5.524708663", "2.657913228", "2.87209859",
"7.479820548", "4.980185572", "4.210955199", "0", "4.492492822",
"3.611091371", "3.714270433", "7.455036914", "7.045203025", "6.061860857",
"2.925268313", "6.544077039", "5.747318013", "9.97083444", "9.22523089",
"4.20205383", "6.097040679", "2.621192351", "0"), `1b` = c("6.544454427",
"6.601489488", "7.134224619", "5.814043553", "3.280958379", "2.649180803",
"3.860542083", "6.256648363", "5.380427766", "2.581027705", "3.016132165",
"7.405329447", "4.701503289", "4.073814818", "0", "4.304196924",
"3.515977329", "3.843535649", "7.342972625", "6.966606769", "6.122878624",
"3.007522306", "6.495797641", "5.965431621", "9.828050269", "9.219563915",
"4.065778989", "6.105331066", "3.061209408", "0"), `1c` = c("6.608196006",
"6.743010138", "7.102600793", "6.146518601", "3.555184202", "2.364971542",
"4.034053983", "6.158523627", "5.656051812", "2.658660735", "3.054717455",
"7.392164473", "4.950953264", "4.277770477", "0", "4.507936666",
"3.794842979", "3.610794578", "7.471646548", "7.104792624", "6.484767016",
"3.071205184", "6.584425715", "6.20466015", "9.986342122", "9.282943758",
"4.179958213", "6.219551653", "2.984738345", "0"), `1d` = c("6.547155382",
"6.558328892", "6.992501615", "6.449558793", "3.059801464", "2.418800257",
"3.96498952", "6.013208538", "5.279919645", "2.893295085", "1.750510471",
"7.408735671", "4.9425624", "3.804549986", "0", "4.174835979",
"3.806006888", "3.570390524", "7.641137006", "6.976672494", "6.363030106",
"3.083061726", "6.300910093", "6.007490342", "9.926316442", "9.09671588",
"4.320556917", "6.153860107", "2.877230446", "0"), `1e` = c("6.626724417",
"6.577345176", "7.156821278", "6.296873411", "3.618089702", "2.394444986",
"4.129376392", "6.011715246", "5.31197869", "2.00754706", "2.695493528",
"7.538910448", "4.606060035", "3.909472643", "0", "4.346616047",
"3.468681284", "3.338231445", "7.559599613", "7.1527452", "6.232923513",
"3.108624209", "6.535435309", "6.12922864", "10.13108497", "9.310331313",
"3.959568571", "6.182335537", "2.902736258", "0"), `1f` = c("6.419219179",
"6.650459302", "7.319125725", "5.570869357", "2.962845933", "2.55903176",
"4.087597573", "5.995610538", "5.386268651", "2.750800859", "2.416572678",
"7.579148955", "3.952633067", "3.615227674", "0", "4.562838935",
"3.76942104", "3.920096905", "7.935320749", "7.501470652", "6.13700099",
"3.123910608", "6.035971952", "5.706235015", "9.254751395", "8.379630979",
"4.51391973", "5.6890651", "2.43285316", "0"), `1g` = c("6.553221221",
"6.633949852", "7.182305386", "5.769365973", "2.721354972", "1.668390466",
"4.148367057", "6.240883382", "5.458877133", "1.733842637", "2.723803355",
"7.522249899", "4.149567197", "3.780763096", "0", "4.496306813",
"3.645643535", "3.851001768", "7.678552875", "7.283411279", "6.591585956",
"2.879345378", "6.389427003", "5.911222165", "9.851084493", "9.084575304",
"4.272587776", "5.974762147", "2.98852705", "0"), `2a` = c("9.769887737",
"4.550226652", "1.869021464", "4.944848987", "7.9678549", "2.682865013",
"8.495575559", "2.234521659", "3.667196316", "9.180445037", "3.210107621",
"7.21523691", "5.714579923", "5.423986751", "9.118981459", "8.635701597",
"2.742889473", "3.712618983", "8.006057144", "4.999541279", "10.54351774",
"5.880978085", "7.145433526", "6.982416661", "9.339651188", "5.360835327",
"4.699680905", "3.423826225", "6.408271885", "5.038170992"),
`2b` = c("10.26397519", "1.945664005", "2.086763158", "2.800904763",
"8.583418657", "1.536094563", "9.32057547", "3.10685839",
"4.224502319", "10.1252842", "1.811175407", "5.714439316",
"6.039142559", "3.833361174", "9.757360286", "9.565906731",
"1.523640473", "2.315033488", "6.312524363", "4.889986456",
"10.23020108", "4.848727685", "7.533256999", "7.138160378",
"10.30380331", "3.955469283", "3.167940742", "3.599655687",
"4.828945262", "3.701043054"), `2c` = c("9.478481216", "2.789289131",
"3.393949527", "3.754810933", "8.710777154", "1.806170784",
"9.150005253", "2.612275457", "4.961073313", "9.802701699",
"2.933183115", "6.532384958", "6.919449225", "4.432699799",
"9.715063475", "9.265691356", "1.412064593", "3.330131873",
"6.665979896", "5.158526421", "9.417365584", "4.899531204",
"8.173459354", "7.271400938", "9.813068613", "4.384622077",
"3.700645365", "4.457874829", "5.649440022", "3.531010379"
), `2d` = c("8.199795497", "2.565711524", "2.202287889",
"3.856354444", "7.380224849", "2.192476466", "8.14446837",
"1.144258862", "3.31447122", "8.713146629", "2.697890381",
"6.304428859", "5.745291803", "4.898396114", "9.173362747",
"8.339933849", "3.159678152", "4.094587234", "7.608649692",
"5.280206424", "10.34630403", "5.098585806", "7.262400625",
"6.150190905", "9.316698845", "5.073027993", "4.695003229",
"2.485847024", "5.545300465", "4.350571411"), `2e` = c("8.74935033",
"3.739489484", "0.217205045", "4.413657999", "8.745525588",
"3.657060086", "9.279834921", "2.898951179", "4.282874018",
"9.610485827", "3.561102455", "7.228334332", "6.388491443",
"4.389908652", "8.781086564", "9.178866581", "2.374603596",
"3.961037408", "7.864369809", "3.654728044", "10.15284858",
"5.894439123", "7.68020282", "7.12243523", "9.998637438",
"5.092174395", "4.111530392", "3.776835632", "5.624523213",
"4.095011377"), `2f` = c("7.648310926", "4.345557215", "1.986576876",
"4.99426288", "7.087937177", "2.810917253", "7.77706637",
"2.62822773", "3.581811188", "8.470225989", "3.335757437",
"7.416094847", "5.208841128", "5.536128034", "8.255571138",
"7.993319997", "1.9209089", "3.573861828", "7.318814519",
"5.233804806", "11.05855833", "6.247720809", "6.673407583",
"6.029960625", "8.806867591", "5.459208493", "4.001428729",
"3.609936979", "5.876522973", "4.652839671"), `2g` = c("7.555235468",
"3.892899549", "1.726443458", "5.304546796", "7.039588042",
"3.027235295", "7.703852207", "1.753183519", "2.909288568",
"8.385169315", "3.902707541", "7.523315081", "4.978364017",
"5.49103181", "8.096218606", "7.944822989", "2.352609608",
"4.155433517", "7.227355741", "5.532668321", "11.24946953",
"6.159185473", "6.443203375", "5.931761874", "8.7421732",
"5.502000205", "4.652883503", "3.458323017", "6.566487449",
"4.89790353")), row.names = c(NA, 30L), class = "data.frame")
The design matrix:
structure(list(SampleID = c("1a ", "1b", "1c",
"1d", "1e", "1f",
"1g", "2a", "2b", "2c", "2d", "2e", "2f", "2g"), Group = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Replicate = c("a",
"b", "c", "d", "e", "f", "g", "a", "b", "c", "d", "e", "f", "g"
)), row.names = c(NA, 14L), class = "data.frame")
I read through this edgeR user manual to write my code: http://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
Here is how I wrote my code to sample the RNA seq data (note ~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv refers to the the sample of data above and ~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv refers to the above design matrix):
R10LB_0vs0.5 <- read.csv2("~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv", sep=",", check.names = F)
R10LB_0vs0.5 <- janitor::remove_empty(R10LB_0vs0.5, which = "cols")
R10LB_0vs0.5_design <- read.csv2("~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv", sep=",")
#Set rownames
rownames(R10LB_0vs0.5) <- R10LB_0vs0.5$`1`
R10LB_0vs0.5 <- R10LB_0vs0.5[,-1]
#colnames(R10LB_0ugvs0.5ug) <- R10LB_0ugvs0.5ug[1,]
#R10LB_0ugvs0.5ug <- R10LB_0ugvs0.5ug[-1,]
#Convert to Matrix
R10LB_0vs0.5 <- data.matrix(R10LB_0vs0.5)
#Create DGEList
R10LB_0vs0.5 <- DGEList(counts = R10LB_0vs0.5, group = R10LB_0vs0.5_design$Group)
str(R10LB_0vs0.5)
#Design Matrix
Group <- as.vector(as.character(R10LB_0vs0.5_design$Group))
Replicate <- as.vector((R10LB_0vs0.5_design$Replicate))
R10LB_0vs0.5_designmatrix <- model.matrix(~0+Group+Replicate)
#Filter
keep <- filterByExpr(R10LB_0vs0.5)
R10LB_0vs0.5_filter <- R10LB_0vs0.5[keep, keep.lib.sizes = FALSE]
#Estimate Sample Dispersion
R10LB_0vs0.5_Disp <- estimateDisp(R10LB_0vs0.5_filter, R10LB_0vs0.5_designmatrix)
#Create Contrasts (Group Comparisons)
CONTRASTS <- makeContrasts( Group1vs2 = Group1-Group2,
levels = R10LB_0vs0.5_designmatrix)
#GLM Fit for Group 1 vs. Group 2
R10LB_0vs0.5fit <- glmFit(R10LB_0vs0.5_Disp, contrast = CONTRASTS[,1])
R10LB_0vs0.5lrt <- glmLRT(R10LB_0vs0.5fit)
R10LB_0vs0.5TT <- topTags(R10LB_0vs0.5lrt, n=nrow(R10LB_0vs0.5_Disp))
write.csv(R10LB_0vs0.5TT, file = "R10LB_0vs0.5_comparison.csv")
saveRDS(R10LB_0vs0.5TT, file = "R10LB_0vs0.5_comparison.RDS")
Here is my code for formatting the data into a volcano plot:
#Basic scatter plot: x is "logFC", y is "PValue"
ggplot(data = R10LB_0vs0.5_comparison, aes(x = logFC, y = PValue)) + geom_point()
#Doesn't look quite like a Volcano plot... convert the p-value into a -log10(p-value)
p4 <- ggplot(data = R10LB_0vs0.5_comparison, aes(x = logFC, y = -log10(PValue))) + geom_point() + theme_minimal()
#Add vertical lines for logFC thresholds, and one horizontal line for the p-value threshold
p5 <- p4 + geom_vline(xintercept = c(-0.6, 0.6), col = "red") + geom_hline(yintercept = -log10(0.05), col = "red")
# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)
# add a column of NAs
R10LB_0vs0.5_comparison$diffexpressed <- "NO"
# if logFC > 0.6 and PValue < 0.05, set as "UP"
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC > 0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "UP"
# if logFC < -0.6 and PValue < 0.05, set as "DOWN"
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC < -0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "DOWN"
# Re-plot but this time color the points with "diffexpressed"
p4 <- ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed)) + geom_point() + theme_minimal()
# Add lines as before...
p5 <- p4 + geom_vline(xintercept=c(-0.6, 0.6), col="red") + geom_hline(yintercept=-log10(0.05), col="red")
## Change point color
# 1. by default, it is assigned to the categories in an alphabetical order):
p6 <- p5 + scale_color_manual(values=c("blue", "black", "red"))
# 2. to automate a bit: create a named vector: the values are the colors to be used, the names are the categories they will be assigned to:
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p6 <- p5 + scale_colour_manual(values = mycolors)
# Now write down the name of genes beside the points...
# Create a new column "delabel" to de, that will contain the name of genes differentially expressed (NA in case they are not)
R10LB_0vs0.5_comparison$delabel <- NA
R10LB_0vs0.5_comparison$delabel[R10LB_0vs0.5_comparison$diffexpressed != "NO"] <- R10LB_0vs0.5_comparison$gene_symbol[R10LB_0vs0.5_comparison$diffexpressed != "NO"]
ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed, label=delabel)) + geom_point() + theme_minimal() +geom_text()
# Finally, we can organize the labels nicely using the "ggrepel" package and the geom_text_repel() function
# load library
library(ggrepel)
# plot adding up all layers we have seen so far
ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed, label=delabel)) +geom_point() + theme_minimal() +geom_text_repel() + scale_color_manual(values=c("blue", "black", "red")) + geom_vline(xintercept=c(-0.6, 0.6), col="red") +geom_hline(yintercept=-log10(0.05), col="red")
This is the volcano plot I currently have with all of my data:
Can you explain how some of these functions filtered my data (i.e. filterByExpr, estimateDisp, makeContrasts, glmFit, glmLRT)? Why are some of my data points that change from 0 TPM in one condition to some value in another condition showing up on my plot while others are not?
Are there other specific filtering processes you would recommend to change, fix, and/or improve my volcano plot?
I am trying to generate a series of plots that show the same patient taking drinks and urinating at different times. Each plot represents a single day. I want to compare the days and hence I need to ensure that all graphs plotted have the same x-axis. My code is below which I cribbed from How to specify the actual x axis values to plot as x axis ticks in R
### Data Input
time_Thurs <- c("01:10", "05:50", "06:00","06:15", "06:25", "09:35", "10:00", "12:40",
"14:00", "17:20", "18:50", "19:10", "20:10", "21:00", "22:05", "22:35")
event_Thurs <- c("u", "u", "T", "T", "u", "u", "T","T","u", "u", "T", "T", "T", "T", "u", "W")
volume_Thurs <- c(NA, NA, 0.25, 0.25, NA, NA, 0.125, 0.625, NA, NA, 0.25, 0.25, 0.25, 0.25,
NA, 0.25)
total_liquids_Thurs <- sum(volume_Thurs, na.rm=TRUE)
time_Thurs <- paste("04/04/2019", time_Thurs, sep=" ")
time_Fri <- c("01:15", "06:00", "06:10", "06:25", "06:30", "07:10", "08:40", "09:20",
"12:45", "13:45")
event_Fri <- c("u","u", "T","T","u","uu","T", "u", "T", "u")
volume_Fri <- c(NA, NA, 0.25, 0.25, NA, NA, 0.125, NA, 0.625, NA)
total_liquids_Fri <- sum(volume_Fri, na.rm=TRUE)
time_Fri <- paste("05/04/2019", time_Fri, sep=" ")
### Collect all data together
event <- c(event_Thurs, event_Fri)
Volume <- c(volume_Thurs, volume_Fri)
time_log <- c(time_Thurs, time_Fri)
time_log <- strptime(time_log, format = "%d/%m/%Y %H:%M")
time_view <- format(time_log, "%H:%M")
### Put into Dataframe
patient_data <- data.frame(time_log, time_view, event, Volume)
# write.csv(patient_data, file="patient_data.csv", row.names = FALSE)
daily_plot <- function(x, day) {
# x patient data - a data.frame with four columns:
# POSIXct time, time, event and Volume
# date number of day of month
# y volume of liquid
# TotVol total volume of intake over week
# Event - drink or otherwise
x <- x[as.numeric(format(x[,1], "%d")) == day, ]
TotVol <- sum(x[,4], na.rm = TRUE)
DayOfWeek <- weekdays(x[1,1], abbreviate = FALSE)
plot(x[,1],x[,4],
xlim = c(x[1,1],x[length(x[,1]),1]),
xlab="Hours of Study", ylab = "Volume of Liquid Drank /L",
main = paste("Total Liquids Drank = ", TotVol, " L on ", DayOfWeek, "Week 1, Apr 2019"),
sub = "dashed red line = urination", pch=16,
col = c("black", "yellow", "green", "blue")[as.numeric(x[,3])],
xaxt = 'n'
)
xAxis_hrs <- seq(as.POSIXct(x[1,1]), as.POSIXct(x[length(x[,1]),1]), by="hour")
axis(1, at = xAxis_hrs, las = 2)
abline( v = c(x[x[,3] == "u",1]), lty=3, col="red")
}
When I run the function,
daily_plot(patient_data, 4)
I want to print out my x-axis, as amended in the form of hours representing the events over the 24 hour period.
When I wrap my xAxis_hrs vector in strptime(xAxis_hrs, format = "%H") the code crashes - that is the x-axis doesn't print out and I see, Error in axis(1, at = xAxis_hrs, las = 2) : (list) object cannot be coerced to type 'double' . Any help?
The issue is that you pass the labels to the wrong named argument, namely at (which should be the numeric positions of the labels). Use the following instead:
axis(1, at = xAxis_hrs, labels = strptime(xAxis_hrs, format = "%H"), las = 2)
Unfortunately this doesn’t change the fact that the axis labels don’t fit into the plot, and collide with the axis title. The former can be fixed by adjusting the plot margins. I’m not aware of a good solution for the latter, although changing the time format might help: it’s probably not necessary/helpful to print the full minutes and seconds (which are always 0). In fact, did you mean to use format instead of strptime?
Apart from that I fundamentally agree with the other answer recommending ggplot2 in the long run. It makes this kind of stuff a lot less painful.
If you're open to a ggplot solution:
library(tidyverse)
library(lubridate)
daily_ggplot <- function(df, selected_day) {
df_day <- filter(df, day(time_log) == selected_day)
df_urine <- filter(df_day, event == "u")
df_drink <- filter(df_day, event != "u")
TotVol <- sum(df_day$Volume, na.rm = TRUE)
Date <- floor_date(df_day$time_log[1], 'days')
DayOfWeek <- weekdays(Date, abbreviate = F)
plot_title <- paste0("Total drank = ", TotVol, "L on ", DayOfWeek, " Week 1, Apr 2018")
ggplot(df_drink) +
aes(time_log, Volume, color = event) +
geom_point(size = 2) +
geom_vline(data = df_urine, aes(xintercept = time_log), color = "red", linetype = 3) +
labs(x = "Hours of Study", ylab = "Volume of Liquid Drank (L)",
title = plot_title, subtitle = "lines = urination") +
theme_bw() +
scale_x_datetime(date_labels = "%H:%M", limits = c(Date, Date + days(1)))
}
daily_ggplot(patient_data, 4)