Related
I am using RNA seq data to analyze genes via a volcano plot (which compares differential gene expression of bacteria with and without antibiotic) in R. After having created my plot, I am unsure why some of my values which read "0" TPM in one condition and a TPM value that's not "0" in the other condition were not determined to be differentially expressed. Some of the genes on my volcano plot have this difference in TPM and show up as significant on my plot while others with this "0" value difference are not considered to be significant according to my plot.
Here is a sample of my data
(UO1_D712_##### represents the locus number, the top column represents the different replicates where "1" is untreated and "2" is treated with antibiotic, the letters next to the numbers represent a different technical replicate):
structure(list(`1` = c("U01_D712_00001",
"U01_D712_00002", "U01_D712_00003",
"U01_D712_00004", "U01_D712_00005", "U01_D712_00006", "U01_D712_00007",
"U01_D712_00008", "U01_D712_00009", "U01_D712_00010", "U01_D712_00011",
"U01_D712_00012", "U01_D712_00013", "U01_D712_00014", "U01_D712_00015",
"U01_D712_00016", "U01_D712_00017", "U01_D712_00018", "U01_D712_00019",
"U01_D712_00020", "U01_D712_00021", "U01_D712_00022", "U01_D712_00023",
"U01_D712_00024", "U01_D712_00025", "U01_D712_00026", "U01_D712_00027",
"U01_D712_00028", "U01_D712_00029", "U01_D712_00030"), `1a` = c("6.590456502",
"6.741694688", "7.110342585", "6.299482433", "2.77173982", "2.470330508",
"3.827125186", "6.267229842", "5.524708663", "2.657913228", "2.87209859",
"7.479820548", "4.980185572", "4.210955199", "0", "4.492492822",
"3.611091371", "3.714270433", "7.455036914", "7.045203025", "6.061860857",
"2.925268313", "6.544077039", "5.747318013", "9.97083444", "9.22523089",
"4.20205383", "6.097040679", "2.621192351", "0"), `1b` = c("6.544454427",
"6.601489488", "7.134224619", "5.814043553", "3.280958379", "2.649180803",
"3.860542083", "6.256648363", "5.380427766", "2.581027705", "3.016132165",
"7.405329447", "4.701503289", "4.073814818", "0", "4.304196924",
"3.515977329", "3.843535649", "7.342972625", "6.966606769", "6.122878624",
"3.007522306", "6.495797641", "5.965431621", "9.828050269", "9.219563915",
"4.065778989", "6.105331066", "3.061209408", "0"), `1c` = c("6.608196006",
"6.743010138", "7.102600793", "6.146518601", "3.555184202", "2.364971542",
"4.034053983", "6.158523627", "5.656051812", "2.658660735", "3.054717455",
"7.392164473", "4.950953264", "4.277770477", "0", "4.507936666",
"3.794842979", "3.610794578", "7.471646548", "7.104792624", "6.484767016",
"3.071205184", "6.584425715", "6.20466015", "9.986342122", "9.282943758",
"4.179958213", "6.219551653", "2.984738345", "0"), `1d` = c("6.547155382",
"6.558328892", "6.992501615", "6.449558793", "3.059801464", "2.418800257",
"3.96498952", "6.013208538", "5.279919645", "2.893295085", "1.750510471",
"7.408735671", "4.9425624", "3.804549986", "0", "4.174835979",
"3.806006888", "3.570390524", "7.641137006", "6.976672494", "6.363030106",
"3.083061726", "6.300910093", "6.007490342", "9.926316442", "9.09671588",
"4.320556917", "6.153860107", "2.877230446", "0"), `1e` = c("6.626724417",
"6.577345176", "7.156821278", "6.296873411", "3.618089702", "2.394444986",
"4.129376392", "6.011715246", "5.31197869", "2.00754706", "2.695493528",
"7.538910448", "4.606060035", "3.909472643", "0", "4.346616047",
"3.468681284", "3.338231445", "7.559599613", "7.1527452", "6.232923513",
"3.108624209", "6.535435309", "6.12922864", "10.13108497", "9.310331313",
"3.959568571", "6.182335537", "2.902736258", "0"), `1f` = c("6.419219179",
"6.650459302", "7.319125725", "5.570869357", "2.962845933", "2.55903176",
"4.087597573", "5.995610538", "5.386268651", "2.750800859", "2.416572678",
"7.579148955", "3.952633067", "3.615227674", "0", "4.562838935",
"3.76942104", "3.920096905", "7.935320749", "7.501470652", "6.13700099",
"3.123910608", "6.035971952", "5.706235015", "9.254751395", "8.379630979",
"4.51391973", "5.6890651", "2.43285316", "0"), `1g` = c("6.553221221",
"6.633949852", "7.182305386", "5.769365973", "2.721354972", "1.668390466",
"4.148367057", "6.240883382", "5.458877133", "1.733842637", "2.723803355",
"7.522249899", "4.149567197", "3.780763096", "0", "4.496306813",
"3.645643535", "3.851001768", "7.678552875", "7.283411279", "6.591585956",
"2.879345378", "6.389427003", "5.911222165", "9.851084493", "9.084575304",
"4.272587776", "5.974762147", "2.98852705", "0"), `2a` = c("9.769887737",
"4.550226652", "1.869021464", "4.944848987", "7.9678549", "2.682865013",
"8.495575559", "2.234521659", "3.667196316", "9.180445037", "3.210107621",
"7.21523691", "5.714579923", "5.423986751", "9.118981459", "8.635701597",
"2.742889473", "3.712618983", "8.006057144", "4.999541279", "10.54351774",
"5.880978085", "7.145433526", "6.982416661", "9.339651188", "5.360835327",
"4.699680905", "3.423826225", "6.408271885", "5.038170992"),
`2b` = c("10.26397519", "1.945664005", "2.086763158", "2.800904763",
"8.583418657", "1.536094563", "9.32057547", "3.10685839",
"4.224502319", "10.1252842", "1.811175407", "5.714439316",
"6.039142559", "3.833361174", "9.757360286", "9.565906731",
"1.523640473", "2.315033488", "6.312524363", "4.889986456",
"10.23020108", "4.848727685", "7.533256999", "7.138160378",
"10.30380331", "3.955469283", "3.167940742", "3.599655687",
"4.828945262", "3.701043054"), `2c` = c("9.478481216", "2.789289131",
"3.393949527", "3.754810933", "8.710777154", "1.806170784",
"9.150005253", "2.612275457", "4.961073313", "9.802701699",
"2.933183115", "6.532384958", "6.919449225", "4.432699799",
"9.715063475", "9.265691356", "1.412064593", "3.330131873",
"6.665979896", "5.158526421", "9.417365584", "4.899531204",
"8.173459354", "7.271400938", "9.813068613", "4.384622077",
"3.700645365", "4.457874829", "5.649440022", "3.531010379"
), `2d` = c("8.199795497", "2.565711524", "2.202287889",
"3.856354444", "7.380224849", "2.192476466", "8.14446837",
"1.144258862", "3.31447122", "8.713146629", "2.697890381",
"6.304428859", "5.745291803", "4.898396114", "9.173362747",
"8.339933849", "3.159678152", "4.094587234", "7.608649692",
"5.280206424", "10.34630403", "5.098585806", "7.262400625",
"6.150190905", "9.316698845", "5.073027993", "4.695003229",
"2.485847024", "5.545300465", "4.350571411"), `2e` = c("8.74935033",
"3.739489484", "0.217205045", "4.413657999", "8.745525588",
"3.657060086", "9.279834921", "2.898951179", "4.282874018",
"9.610485827", "3.561102455", "7.228334332", "6.388491443",
"4.389908652", "8.781086564", "9.178866581", "2.374603596",
"3.961037408", "7.864369809", "3.654728044", "10.15284858",
"5.894439123", "7.68020282", "7.12243523", "9.998637438",
"5.092174395", "4.111530392", "3.776835632", "5.624523213",
"4.095011377"), `2f` = c("7.648310926", "4.345557215", "1.986576876",
"4.99426288", "7.087937177", "2.810917253", "7.77706637",
"2.62822773", "3.581811188", "8.470225989", "3.335757437",
"7.416094847", "5.208841128", "5.536128034", "8.255571138",
"7.993319997", "1.9209089", "3.573861828", "7.318814519",
"5.233804806", "11.05855833", "6.247720809", "6.673407583",
"6.029960625", "8.806867591", "5.459208493", "4.001428729",
"3.609936979", "5.876522973", "4.652839671"), `2g` = c("7.555235468",
"3.892899549", "1.726443458", "5.304546796", "7.039588042",
"3.027235295", "7.703852207", "1.753183519", "2.909288568",
"8.385169315", "3.902707541", "7.523315081", "4.978364017",
"5.49103181", "8.096218606", "7.944822989", "2.352609608",
"4.155433517", "7.227355741", "5.532668321", "11.24946953",
"6.159185473", "6.443203375", "5.931761874", "8.7421732",
"5.502000205", "4.652883503", "3.458323017", "6.566487449",
"4.89790353")), row.names = c(NA, 30L), class = "data.frame")
The design matrix:
structure(list(SampleID = c("1a ", "1b", "1c",
"1d", "1e", "1f",
"1g", "2a", "2b", "2c", "2d", "2e", "2f", "2g"), Group = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Replicate = c("a",
"b", "c", "d", "e", "f", "g", "a", "b", "c", "d", "e", "f", "g"
)), row.names = c(NA, 14L), class = "data.frame")
I read through this edgeR user manual to write my code: http://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
Here is how I wrote my code to sample the RNA seq data (note ~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv refers to the the sample of data above and ~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv refers to the above design matrix):
R10LB_0vs0.5 <- read.csv2("~/Documents/VOLCANO/R10LB_0ugvs0.5ug.csv", sep=",", check.names = F)
R10LB_0vs0.5 <- janitor::remove_empty(R10LB_0vs0.5, which = "cols")
R10LB_0vs0.5_design <- read.csv2("~/Documents/VOLCANO/R10LB_0vs0.5_designmatrix.csv", sep=",")
#Set rownames
rownames(R10LB_0vs0.5) <- R10LB_0vs0.5$`1`
R10LB_0vs0.5 <- R10LB_0vs0.5[,-1]
#colnames(R10LB_0ugvs0.5ug) <- R10LB_0ugvs0.5ug[1,]
#R10LB_0ugvs0.5ug <- R10LB_0ugvs0.5ug[-1,]
#Convert to Matrix
R10LB_0vs0.5 <- data.matrix(R10LB_0vs0.5)
#Create DGEList
R10LB_0vs0.5 <- DGEList(counts = R10LB_0vs0.5, group = R10LB_0vs0.5_design$Group)
str(R10LB_0vs0.5)
#Design Matrix
Group <- as.vector(as.character(R10LB_0vs0.5_design$Group))
Replicate <- as.vector((R10LB_0vs0.5_design$Replicate))
R10LB_0vs0.5_designmatrix <- model.matrix(~0+Group+Replicate)
#Filter
keep <- filterByExpr(R10LB_0vs0.5)
R10LB_0vs0.5_filter <- R10LB_0vs0.5[keep, keep.lib.sizes = FALSE]
#Estimate Sample Dispersion
R10LB_0vs0.5_Disp <- estimateDisp(R10LB_0vs0.5_filter, R10LB_0vs0.5_designmatrix)
#Create Contrasts (Group Comparisons)
CONTRASTS <- makeContrasts( Group1vs2 = Group1-Group2,
levels = R10LB_0vs0.5_designmatrix)
#GLM Fit for Group 1 vs. Group 2
R10LB_0vs0.5fit <- glmFit(R10LB_0vs0.5_Disp, contrast = CONTRASTS[,1])
R10LB_0vs0.5lrt <- glmLRT(R10LB_0vs0.5fit)
R10LB_0vs0.5TT <- topTags(R10LB_0vs0.5lrt, n=nrow(R10LB_0vs0.5_Disp))
write.csv(R10LB_0vs0.5TT, file = "R10LB_0vs0.5_comparison.csv")
saveRDS(R10LB_0vs0.5TT, file = "R10LB_0vs0.5_comparison.RDS")
Here is my code for formatting the data into a volcano plot:
#Basic scatter plot: x is "logFC", y is "PValue"
ggplot(data = R10LB_0vs0.5_comparison, aes(x = logFC, y = PValue)) + geom_point()
#Doesn't look quite like a Volcano plot... convert the p-value into a -log10(p-value)
p4 <- ggplot(data = R10LB_0vs0.5_comparison, aes(x = logFC, y = -log10(PValue))) + geom_point() + theme_minimal()
#Add vertical lines for logFC thresholds, and one horizontal line for the p-value threshold
p5 <- p4 + geom_vline(xintercept = c(-0.6, 0.6), col = "red") + geom_hline(yintercept = -log10(0.05), col = "red")
# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)
# add a column of NAs
R10LB_0vs0.5_comparison$diffexpressed <- "NO"
# if logFC > 0.6 and PValue < 0.05, set as "UP"
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC > 0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "UP"
# if logFC < -0.6 and PValue < 0.05, set as "DOWN"
R10LB_0vs0.5_comparison$diffexpressed[R10LB_0vs0.5_comparison$logFC < -0.6 & R10LB_0vs0.5_comparison$PValue < 0.05] <- "DOWN"
# Re-plot but this time color the points with "diffexpressed"
p4 <- ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed)) + geom_point() + theme_minimal()
# Add lines as before...
p5 <- p4 + geom_vline(xintercept=c(-0.6, 0.6), col="red") + geom_hline(yintercept=-log10(0.05), col="red")
## Change point color
# 1. by default, it is assigned to the categories in an alphabetical order):
p6 <- p5 + scale_color_manual(values=c("blue", "black", "red"))
# 2. to automate a bit: create a named vector: the values are the colors to be used, the names are the categories they will be assigned to:
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p6 <- p5 + scale_colour_manual(values = mycolors)
# Now write down the name of genes beside the points...
# Create a new column "delabel" to de, that will contain the name of genes differentially expressed (NA in case they are not)
R10LB_0vs0.5_comparison$delabel <- NA
R10LB_0vs0.5_comparison$delabel[R10LB_0vs0.5_comparison$diffexpressed != "NO"] <- R10LB_0vs0.5_comparison$gene_symbol[R10LB_0vs0.5_comparison$diffexpressed != "NO"]
ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed, label=delabel)) + geom_point() + theme_minimal() +geom_text()
# Finally, we can organize the labels nicely using the "ggrepel" package and the geom_text_repel() function
# load library
library(ggrepel)
# plot adding up all layers we have seen so far
ggplot(data=R10LB_0vs0.5_comparison, aes(x=logFC, y=-log10(PValue), col=diffexpressed, label=delabel)) +geom_point() + theme_minimal() +geom_text_repel() + scale_color_manual(values=c("blue", "black", "red")) + geom_vline(xintercept=c(-0.6, 0.6), col="red") +geom_hline(yintercept=-log10(0.05), col="red")
This is the volcano plot I currently have with all of my data:
Can you explain how some of these functions filtered my data (i.e. filterByExpr, estimateDisp, makeContrasts, glmFit, glmLRT)? Why are some of my data points that change from 0 TPM in one condition to some value in another condition showing up on my plot while others are not?
Are there other specific filtering processes you would recommend to change, fix, and/or improve my volcano plot?
I have the following code, that generates the following heatmap in R.
ggplot(data = hminput, color=category, aes(x = Poblaciones, y = Variantes)) +
geom_tile(aes(fill = Frecuencias)) + scale_colour_gradient(name = "Frecuencias",low = "blue", high = "white",guide="colourbar")
hminput is a data frame with three columns: Poblaciones, Variantes and Frecuencias, where the first two are the x and y axis and the third one is the color reference.
And my desired output is that the heatmap to have a bar as the reference instead of those blocks, and also that the coloring is white-blue gradient instead of that multicolor gradient.
To achieve that, I tried what's in my code, but I'm not achieving what I want (I'm getting the graph you see in the picture). Any thoughts? Thanks!
As some people asked, here is the dput of the data frame :
> dput(hminput)
structure(list(Variantes = structure(c(1L, 2L, 3L, 4L,...), .Label =
c("rs10498633", "rs10792832", "rs10838725",
"rs10948363", ..., "SNP"), class = "factor"),
Poblaciones = c("AFR", "AFR", ...), Frecuencias = structure(c(12L,
10L,...), .Label = c("0.01135", "0.0121",
"0.01286", "0.01513", "0.02194", "0.05144", "0.05825", "0.059",
"0.07716", "0.0938", "0.1051", "0.1225", "0.1346", "0.1407",
"0.1566", "0.1604", "0.1619", "0.1838", "0.1914", "0.1929",
...,
"0.45", "0.5", "0.4"), class = "factor")), .Names = c("Variantes",
"Poblaciones", "Frecuencias"), row.names = c("frqAFR.33", "frqAFR.31",
"frqAFR.27", "frqAFR.14", "frqAFR.24",...
), class = "data.frame")
I have a data frame like this.
ID read1 read2 read3 read4 class
1 5820350 0.3791915 0.3747022 0.3729779 0.3724259 1
2 5820364 0.3758676 0.3711775 0.3695976 0.3693112 2
3 5820378 0.3885081 0.3823900 0.3804273 0.3797707 2
4 5820392 0.3779945 0.3729582 0.3714910 0.3709072 1
5 5820425 0.2954782 0.2971604 0.2973882 0.2973216 3
6 5820426 0.3376101 0.3368173 0.3360203 0.3359517 3
Each row represents one sample with four values,and the last column is the classification of this sample. I want to visualize each sample curve and set the class as the color.
I tried to reshape the data frame, but I then lost the class feature which I need.
Could you please give me some hint or show me how to do that in R?
Thanks in advance.
You are going to want to tidy your data first (shown below with tidyr::gather). Then, when you plot, you will want to set your group = ID and color = factor(class) (for discrete colors):
library(tidyr)
library(ggplot2)
df <- structure(list(ID = c(5820350L, 5820364L, 5820378L, 5820392L, 5820425L, 5820426L),
read1 = c(0.3791915, 0.3758676, 0.3885081, 0.3779945, 0.2954782, 0.3376101),
read2 = c(0.3747022, 0.3711775, 0.38239, 0.3729582, 0.2971604, 0.3368173),
read3 = c(0.3729779, 0.3695976, 0.3804273, 0.371491, 0.2973882, 0.3360203),
read4 = c(0.3724259, 0.3693112, 0.3797707, 0.3709072, 0.2973216, 0.3359517),
class = c(1L, 2L, 2L, 1L, 3L, 3L)),
.Names = c("ID", "read1", "read2", "read3", "read4", "class"),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
df <- gather(df, reading, value, -c(ID, class))
ggplot(df, aes(x = reading, y = value, color = factor(class))) +
geom_line(aes(group = ID))
Here's a function that may do what you want:
PlotMultiCurve = function(x, classes, cols = NULL, colSet = "Set1", ...) {
if(!is.factor(classes)) classes = as.factor(classes)
nClasses = length(levels(classes))
if(is.null(cols)) cols = brewer.pal(nClasses, colSet)
plot(1:ncol(x), x[1,], col = cols[classes[1]], type = "l",
ylim = range(x), xaxt = "n", ...)
axis(1, 1:ncol(x), 1:ncol(x))
for(i in 2:nrow(x)) {
par(new = T)
plot(1:ncol(x), x[i,], col = cols[classes[i]], type = "l",
ylim = range(x), axes = F, xlab = "", ylab = "")
}
}
It uses chooses colors automatically from the RColorBrewer package unless you provide the colors. I copied your data directly into a text file and then ran the following:
# Prepare data
require(RColorBrewer)
myData = read.table("Data.2016-05-03.txt")
x = myData[,2:5]
classes = as.factor(myData$class)
# Plot into PNG file[![enter image description here][1]][1]
png("Plot.2016-05-03.png", width = 1000, height = 1000, res = 300)
par(cex = 0.8)
PlotMultiCurve(x = x, classes = classes, xlab = "Read", ylab = "Response")
dev.off()
A pairwise scatterplot showing relationship between genes (columns in data frame) across multiple samples (rows in data frame) is created. The samples belong to two distinct groups: group "A" and "B". Since one dot in plot represent one sample, I need to color the data points (dots) according to groups with two different colors, say group A with "green" and group B with "red". Is it possible to do that?
Any kind of help will be appreciated.
plot(DF[1:6], pch = 21) #command used for plotting, DF is data frame
Sample Data Frame Example:
CBX3 PSPH ATP2C1 SNX10 MMD ATP13A3
B 10.589844 6.842970 8.084550 8.475023 9.202490 10.403811
A 10.174385 5.517944 7.736994 9.094834 9.253766 10.133408
B 10.202084 5.669137 7.392141 7.522270 7.830969 9.123178
B 10.893231 6.630709 7.601690 7.894177 8.979142 9.791841
B 10.071038 5.091222 7.032585 8.305581 7.903737 8.994821
A 10.005002 4.708631 7.927246 7.292527 8.257853 10.054630
B 10.028055 5.080944 6.421961 7.616856 8.287496 9.642294
A 10.144115 6.626483 7.686203 7.970934 7.919615 9.475175
A 10.675386 6.874047 7.900560 7.605519 8.585158 8.858613
A 9.855063 5.164399 6.847923 8.072608 8.221344 9.077744
A 10.994228 6.545318 8.606128 8.426329 8.787876 9.857079
A 10.501266 6.677360 7.787168 8.444976 8.928174 9.542558
GGally has a good function for this as well.
library(GGally)
ggpairs(dd, color = 'CLASS',columns = 2:ncol(dd) )
It might not be that easy to do with base graphics. You could easily do this with lattice. With this sample data.frame
dd<-structure(list(CLASS = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
CBX3 = c(10.589844, 10.174385, 10.202084, 10.893231, 10.071038,
10.005002, 10.028055, 10.144115, 10.675386, 9.855063, 10.994228,
10.501266), PSPH = c(6.84297, 5.517944, 5.669137, 6.630709,
5.091222, 4.708631, 5.080944, 6.626483, 6.874047, 5.164399,
6.545318, 6.67736), ATP2C1 = c(8.08455, 7.736994, 7.392141,
7.60169, 7.032585, 7.927246, 6.421961, 7.686203, 7.90056,
6.847923, 8.606128, 7.787168), SNX10 = c(8.475023, 9.094834,
7.52227, 7.894177, 8.305581, 7.292527, 7.616856, 7.970934,
7.605519, 8.072608, 8.426329, 8.444976), MMD = c(9.20249,
9.253766, 7.830969, 8.979142, 7.903737, 8.257853, 8.287496,
7.919615, 8.585158, 8.221344, 8.787876, 8.928174), ATP13A3 = c(10.403811,
10.133408, 9.123178, 9.791841, 8.994821, 10.05463, 9.642294,
9.475175, 8.858613, 9.077744, 9.857079, 9.542558)), .Names = c("CLASS",
"CBX3", "PSPH", "ATP2C1", "SNX10", "MMD", "ATP13A3"), class = "data.frame", row.names = c(NA, -12L))
you can do
library(lattice)
splom(~dd[,-1], groups=dd$CLASS)
to get
You can add color to the points by specifying the argument col
to plot
DF <- read.delim(textConnection(
"category CBX3 PSPH ATP2C1 SNX10 MMD ATP13A3
B 10.589844 6.842970 8.084550 8.475023 9.202490 10.403811
A 10.174385 5.517944 7.736994 9.094834 9.253766 10.133408
B 10.202084 5.669137 7.392141 7.522270 7.830969 9.123178
B 10.893231 6.630709 7.601690 7.894177 8.979142 9.791841
B 10.071038 5.091222 7.032585 8.305581 7.903737 8.994821
A 10.005002 4.708631 7.927246 7.292527 8.257853 10.054630
B 10.028055 5.080944 6.421961 7.616856 8.287496 9.642294
A 10.144115 6.626483 7.686203 7.970934 7.919615 9.475175
A 10.675386 6.874047 7.900560 7.605519 8.585158 8.858613
A 9.855063 5.164399 6.847923 8.072608 8.221344 9.077744
A 10.994228 6.545318 8.606128 8.426329 8.787876 9.857079
A 10.501266 6.677360 7.787168 8.444976 8.928174 9.542558"))
plot(DF[2:7],col = ifelse(DF$category == 'A','red','green'))
A list of valid color values can be obtained by calling colors(). Vectors with a gradient of colors can be created via rainbow(), and just for fun, I use this little function for choosing pretty colors when making a figure.
(Edited per suggestions from #MrFlick)
#! #param n The number of colors to be selected
colorchoose <- function (n = 1, alpha, term = F)
{
cols <- colors()
mod <- ceiling(sqrt(length(cols)))
plot(xlab = "", ylab = "", main = "click for color name",
c(0, mod), c(0, mod), type = "n", axes = F)
s<-seq_along(cols)
dev.hold()
points(s%%mod, s%/%mod, col = cols, pch = 15, cex = 2.4)
dev.flush()
p <- locator(n)
return(cols[round(p$y) * mod + round(p$x)])
}
This question is a direct successor to a pervious question asked here called “ggplot scatter plot of two groups with superimposed means with X and Y error bars”. That questions answer looks to do exactly what I am trying to accomplish however the code provided results in an error which I can’t get around. I will use my data as example here but I have tried the original question code as well with the same result.
I have a data frame which looks like this:
structure(list(Meta_ID = structure(c(15L, 22L, 31L, 17L), .Label = c("NM*624-46",
"NM*624-54", "NM*624-56", "NM*624-61", "NM*624-70", "NM624-36",
"NM624-38", "NM624-39", "NM624-40", "NM624-41", "NM624-43", "NM624-46",
"NM624-47", "NM624-51", "NM624-54 ", "NM624-56", "NM624-57",
"NM624-59", "NM624-61", "NM624-64", "NM624-70", "NM624-73", "NM624-75",
"NM624-77", "NM624-81", "NM624-82", "NM624-83", "NM624-84", "NM625-02",
"NM625-10", "NM625-11", "SM621-43", "SM621-44", "SM621-46", "SM621-47",
"SM621-48", "SM621-52", "SM621-53", "SM621-55", "SM621-56", "SM621-96",
"SM621-97", "SM622-51", "SM622-52", "SM623-14", "SM623-23", "SM623-26",
"SM623-27", "SM623-32", "SM623-33", "SM623-34", "SM623-55", "SM623-56",
"SM623-57", "SM623-58", "SM623-59", "SM623-61", "SM623-62", "SM623-64",
"SM623-65", "SM623-66", "SM623-67", "SM680-74", "SM681-16"), class = "factor"),
Region = structure(c(1L, 1L, 1L, 1L), .Label = c("N", "S"
), class = "factor"), Tissue = structure(c(1L, 2L, 1L, 1L
), .Label = c("M", "M*"), class = "factor"), Tag_Num = structure(c(41L,
48L, 57L, 43L), .Label = c("621-43", "621-44", "621-46",
"621-47", "621-48", "621-52", "621-53", "621-55", "621-56",
"621-96", "621-97", "622-51", "622-52", "623-14", "623-23",
"623-26", "623-27", "623-32", "623-33", "623-34", "623-55",
"623-56", "623-57", "623-58", "623-59", "623-61", "623-62",
"623-64", "623-65", "623-66", "623-67", "624-36", "624-38",
"624-39", "624-40", "624-41", "624-43", "624-46", "624-47",
"624-51", "624-54", "624-56", "624-57", "624-59", "624-61",
"624-64", "624-70", "624-73", "624-75", "624-77", "624-81",
"624-82", "624-83", "624-84", "625-02", "625-10", "625-11",
"680-74", "681-16"), class = "factor"), Lab_Num = structure(1:4, .Label = c("C4683",
"C4684", "C4685", "C4686", "C4687", "C4688", "C4689", "C4690",
"C4691", "C4692", "C4693", "C4694", "C4695", "C4696", "C4697",
"C4698", "C4699", "C4700", "C4701", "C4702", "C4703", "C4704",
"C4705", "C4706", "C4707", "C4708", "C4709", "C4710", "C4711",
"C4712", "C4713", "C4714", "C4715", "C4716", "C4717", "C4718",
"C4719", "C4720", "C4721", "C4722", "C4723", "C4724", "C4725",
"C4726", "C4727", "C4728", "C4729", "C4730", "C4731", "C4732",
"C4733", "C4734", "C4735", "C4736", "C4737", "C4738", "C4739",
"C4740", "C4741", "C4742", "C4743", "C4744", "C4745", "C4746",
"C4747", "C4748"), class = "factor"), C = c(46.5, 46.7, 45,
43.6), N = c(12.9, 13.7, 14.5, 13.4), C.N = c(3.6, 3.4, 3.1,
3.3), d13C = c(-19.7, -19.5, -19.4, -19.2), d15N = c(13.3,
12.4, 11.7, 11.9)), .Names = c("Meta_ID", "Region", "Tissue",
"Tag_Num", "Lab_Num", "C", "N", "C.N", "d13C", "d15N"), row.names = c(NA,
4L), class = "data.frame")
What I want to produce is a scatter plot of the raw data with an overlay of the data means for each “Region” with bidirectional error bars. To accomplish that I use plyr to summarize my data and generate the means and SD’s. Then I use ggplot2:
library(plyr)
Basic <- ddply(First.run,.(Region),summarise,
N = length(d13C),
d13C.mean = mean(d13C),
d15N.mean = mean(d15N),
d13C.SD = sd(d13C),
d15N.SD = sd(d15N))
ggplot(data=First.run, aes(x = First.run$d13C, y = First.run$d15N))+
geom_point(aes(colour = Region))+
geom_point(data = Basic,aes(colour = Region))+
geom_errorbarh(data = Basic, aes(xmin = d13C.mean + d13C.SD, xmax = d13C.mean - d13C.SD,
y = d15N.mean, colour = Region, height = 0.01))+
geom_errorbar(data = Basic, aes(ymin = d15N.mean - d15N.SD, ymax = d15N.mean + d15N.SD,
x = d13C.mean,colour = Region))
But each time I run this code I get the same error and can’t figure out what the problem is.
Error: Aesthetics must either be length one, or the same length as the dataProblems:Region
Any help would be much appreciated.
Edit: Since my example data is taken from the head of my full dataset it only includes samples from the "N" Region. With only this one region the code works fine but if you use fix() to change the provided dataset so that at least one other Region is included (in my data the other Region is "S") then the error I get shows up. My mistake in not including some data from each Region.
I ended up changing two of the "N" Regions to "S" so I could calculate standard deviation for both groups.
I think the problem was that you were missing required aesthetics in some of your geoms (geom_point was missing x and y, for example). At least getting all the required aesthetics into each geom seemed to get everything working. I cleaned up a few other things while I was at it to shorten the code up a bit.
ggplot(data = First.run, aes(x = d13C, y = d15N, colour = Region)) +
geom_point() +
geom_point(data = Basic,aes(x = d13C.mean, y = d15N.mean)) +
geom_errorbarh(data = Basic, aes(xmin = d13C.mean + d13C.SD,
xmax = d13C.mean - d13C.SD, y = d15N.mean, x = d13C.mean), height = .5) +
geom_errorbar(data = Basic, aes(ymin = d15N.mean - d15N.SD,
ymax = d15N.mean + d15N.SD, x = d13C.mean, y = d15N.mean), width = .01)