Estimate_richness for all phyla in phyloseq

Estimate_richness for all phyla in phyloseq - r

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))
H<-alpha_diversity$Shannon
S1<-alpha_diversity$Observed
S<-log(S1)
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
suggestion by #GTM works well until last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")

You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Calculate alpha diversity measures for a specific taxon at a specified rank.
# You can pass any parameters that you normally pass to `estimate_richness`
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
# Subset to taxon of interest
tax_tbl <- as.data.frame(tax_table(ps))
keep <- tax_tbl[,tax_rank] == taxon_name
keep[is.na(keep)] <- FALSE
ps_phylum <- prune_taxa(keep, ps)
# Calculate alpha diversity and generate a table
alpha_diversity <- estimate_richness(ps_phylum, ...)
alpha_diversity$taxon <- taxon_name
alpha_diversity$sample_id <- row.names(alpha_diversity)
return(alpha_diversity)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps,
taxonomic.rank = 'Phylum')
phyla <- phyla[!is.na(phyla)]
alpha <- data.frame()
for (phylum in phyla){
a <- estimate_diversity_for_taxon(ps = ps,
taxon_name = phylum,
measure = c("Shannon", "Observed"))
alpha <- rbind(alpha, a)
}
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon/alpha$S
# Add sample data
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
smpl_data$sample_id < row.names(smpl_data)
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "sample_id"))

Related

R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

I have a list of bam files in chr16_bam folder and also an annotation file sgseq_sam.txt.
library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)
# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T)
samFile <- samFile %>%
mutate(file_bam = paste0(bamPath, file_bam))
# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"
# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)
# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")
gene.list <- gene.list %>%
mutate(entrez) %>%
filter(!is.na(entrez))
# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)
# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)
# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)
# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)
# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))
# Remove the "chr" prefix from the seqnames column
txf_df %>%
rename_all(~stringr::str_replace(.,"^chr",""))
# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)
Now, I want to create a loop for each of the genes, where upon iteration, subset Granges objects in txf_grange by only the gene, use the reduce function to collapse the ranges of the genes into a single vector, run analyzeFeatures, then annotate functions, and finally plotFeatures.
for (i in txf_grange$geneName) {
if (i=="343") {
next
}
else{
# For each of the 15 genes, subset the Granges objects by only the gene
grange.subset <- txf_grange[i == toString(i)]
# Collapse the ranges of the genes into a single vector
grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
# Run analyzeFeatures
for (j in 1:dim(samFile)[[1]]) {
si <- samFile
for (k in si) {
for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
# Annotate predicted features
sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
# Plot features
pdf(paste("plot", l, ".pdf", sep=""))
plotFeatures(sgfc_pred, geneID=1)
dev.off()
}
}
}
}
}
Data
> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"),
lengths = 6L, elementMetadata = NULL, metadata = list()),
ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L,
12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L,
147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL,
metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+",
"-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL,
metadata = list()), seqinfo = new("Seqinfo", seqnames = "16",
seqlengths = NA_integer_, is_circular = NA, genome = NA_character_),
elementMetadata = new("DFrame", rownames = NULL, nrows = 6L,
listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L,
1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"),
txName = structure(list(c("uc002dbv.3", "uc010buy.3",
"uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3",
c("uc002dbv.3", "uc010buy.3"), "uc010buy.3",
"uc002dbv.3"), class = "AsIs"), geneName = structure(list(
"608", "608", "608", "608", "608", "608"), class = "AsIs")),
elementType = "ANY", elementMetadata = NULL, metadata = list()),
elementType = "ANY", metadata = list())

change line:
analyzeFeatures(samFile, grange.subset)
Also you do not need that many loops to run for the question. The question asks for 14 plots and you might be plotting for much more with the number of loops you have.

Automatically split function output (list) into component data.frames

I have a functions which yields 2 dataframes. As functions can only return one object, I combined these dataframes as a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
mergedat <- paste(deparse(substitute(x)), "+",
deparse(substitute(y)), "=", z)
countdat <- nrow(x)
check_t1 <- data.frame(mergedat, countdat)
z1 <- join(x, y, by = crit, type = typ)
countdat <- nrow(z1)
check_t2 <- data.frame(mergedat, countdat)
doc <- rbind(doc, check_t1, check_t2)
t1<-list()
t1[["checkmerge"]]<-doc
t1[[z]]<-z1
return(t1)
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

Lattice xyplot() Adding a different mean trend line to each panel?

I have a simple trellis scatterplot. Two panels - male/female. ID is a unique number for each participant. The var1 is a total test time. Mean.values is a vector of two numbers (the means for gender).
No point including a best fit line so what I want is to plot a trend line of the mean in each panel. The two panels have different means, say male = 1 minute, female = 2 minutes.
xyplot(var1 ~ ID|Gender, data=DF,
group = Gender,
panel=function(...) {
panel.xyplot(...)
panel.abline(h=mean.values)
})
At the minute the graph is coming out so that both trendlines appear in each panel. I want only one trendline in each.
Does anyone have the way to do this?
I have tried a number of different ways including the long code for function Addline which just doesn't work for me. I just want to define which panel im looking at and i've looked at ?panel.number but not sure how that works as its coming up that I don't have a current row. (current.row(prefix)).
There must be a simple way of doing this?
[EDIT - Here's the actual data i'm using]
I've tried to simplify the DF
library(lattice)
dput(head(DF))
structure(list(ID = 1:6, Var1 = c(2333858, 4220644,
2941774, 2368496, 3165740, 3630300), mean = c(2412976, 2412976,
2412976, 2412976, 2412976, 2412976), Gender = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), .Names = c("ID",
"Var1", "mean", "Gender"), row.names = c(NA, 6L), class = "data.frame")
dput(tail(DF))
structure(list(ID = 161:166, Var1= c(2825246, 3552170,
3688882, 2487760, 3849108, 3085342), mean = c(3689805, 3689805,
3689805, 3689805, 3689805, 3689805), Gender = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")), .Names = c("ID",
"Var1", "mean", "Gender"), row.names = 109:114, class = "data.frame")
plot i'm using:
xyplot((Var1/1000) ~ ID|Gender, data=DF,
group = Gender,scales=list(x=list(at=NULL)),
panel=function(...) {
panel.xyplot(...)
panel.abline(h=mean.values) })
causes 2 lines.
[EDIT - This is the code which includes the function Addline & is everywhere on all the posts and doesn't seem to work for me]
addLine<- function(a=NULL, b=NULL, v = NULL, h = NULL, ..., once=F) { tcL <- trellis.currentLayout() k<-0 for(i in 1:nrow(tcL)) for(j in 1:ncol(tcL)) if (tcL[i,j] > 0) { k<-k+1 trellis.focus("panel", j, i, highlight = FALSE) if (once) panel.abline(a=a[k], b=b[k], v=v[k], h=h[k], ...) else panel.abline(a=a,b=b, v=v, h=h, ...) trellis.unfocus() } }
then writing after the trellis plot (mean.values being a vector of two numbers, mean for female, mean for male)
addLine(v=(mean.values), once=TRUE)
Update - I managed to do it in ggplot2.
Make the ggplot using facet_wrap then -
hline.data <- data.frame(z = c(2413, 3690), Gender = c("Female","Male"))
This creates a DF of the two means and the Gender, 2x2 DF
myplot <- myplot + geom_hline(aes(yintercept = z), hline.data)
This adds the lines to the ggplot.

If you just wanted plot the mean of values you are drawing on the plot aready, you can skip the mean.values variable and just do
xyplot(Var1 ~ ID|Gender, data=DF,
group = Gender,
panel=function(x,y,...) {
panel.xyplot(x,y,...)
panel.abline(h=mean(y))
}
)
With the sample data
DF<-data.frame(
ID=1:10,
Gender=rep(c("M","F"), each=5),
Var1=c(5,6,7,6,5,8,9,10,8,9)
)
this produces

I believe lattice has a specific panel function for this, panel.average().
Try replacing panel.abline(h=mean.values) with panel.average(...).
If that doesn't solve the problem, we might need more information; try using dput() on your data (e.g., dput(DF), or some representative subset).

prop.table doesn't work in a for-loop?

This may be a very simple question, but I don't see how to answer it.
I have the following reproducible code, where I have two small dataframes that I use to calculate a percentage value based on each column total:
#dataframe x
x <- structure(list(PROV = structure(c(1L, 1L), .Label = "AG", class = "factor"),
APT = structure(1:2, .Label = c("AAA", "BBB"), class = "factor"),
PAX.2013 = c(5L, 4L), PAX.2014 = c(4L, 2L), PAX.2015 = c(4L,0L)),
.Names = c("PROV", "APT", "PAX.2013", "PAX.2014", "PAX.2015"),
row.names = 1:2, class = "data.frame")
#dataframe y
y <- structure(list(PROV = structure(c(1L, 1L), .Label = "AQ", class = "factor"),
APT = structure(1:2, .Label = c("CCC", "AAA"), class = "factor"),
PAX.2013 = c(3L, 7L), PAX.2014 = c(2L, 1L), PAX.2015 = c(0L,3L)),
.Names = c("PROV", "APT", "PAX.2013", "PAX.2014", "PAX.2015"),
row.names = 1:2, class = "data.frame")
#list z (with x and y)
z <- list(x,y)
#percentage value of x and y based on columns total
round(prop.table(as.matrix(z[[1]][3:5]), margin = 2)*100,1)
round(prop.table(as.matrix(z[[2]][3:5]), margin = 2)*100,1)
as you can see, it works just fine.
Now I want to automate for all the list, but I can't figure out how to get the results. This is my simple code:
#for-loop that is not working
for (i in length(z))
{round(prop.table(as.matrix(z[[i]][3:5]), margin = 2)*100,1)}

You have two problems.
First, you have not put a range into your for loop so you are just trying to iterate over a single number and second, you are not assigning your result anywhere on each iteration.
Use 1:length(z) to define a range. Then assign the results to a variable.
This would work:
my_list <- list()
for (i in 1:length(z)){
my_list[[i]] <- round(prop.table(as.matrix(z[[i]][3:5]),
margin = 2)*100,1)
}
my_list
But it would be more efficient and idiomatic to use lapply:
lapply(1:length(z),
function(x) round(prop.table(as.matrix(z[[x]][3:5]), margin = 2)*100,1))

Barring discussions whether for-loops is the best approach, you had two issues. One, your for loop only iterates over 2 (which is length(z)) instead of 1:2. Two, you need to do something with the round(....) statement. In this solution, I added a print statement.
for (i in 1:length(z)){
print(round(prop.table(as.matrix(z[[i]][3:5]), margin = 2)*100,1))
}

Color data points based on sample classification

A pairwise scatterplot showing relationship between genes (columns in data frame) across multiple samples (rows in data frame) is created. The samples belong to two distinct groups: group "A" and "B". Since one dot in plot represent one sample, I need to color the data points (dots) according to groups with two different colors, say group A with "green" and group B with "red". Is it possible to do that?
Any kind of help will be appreciated.
plot(DF[1:6], pch = 21) #command used for plotting, DF is data frame
Sample Data Frame Example:
CBX3 PSPH ATP2C1 SNX10 MMD ATP13A3
B 10.589844 6.842970 8.084550 8.475023 9.202490 10.403811
A 10.174385 5.517944 7.736994 9.094834 9.253766 10.133408
B 10.202084 5.669137 7.392141 7.522270 7.830969 9.123178
B 10.893231 6.630709 7.601690 7.894177 8.979142 9.791841
B 10.071038 5.091222 7.032585 8.305581 7.903737 8.994821
A 10.005002 4.708631 7.927246 7.292527 8.257853 10.054630
B 10.028055 5.080944 6.421961 7.616856 8.287496 9.642294
A 10.144115 6.626483 7.686203 7.970934 7.919615 9.475175
A 10.675386 6.874047 7.900560 7.605519 8.585158 8.858613
A 9.855063 5.164399 6.847923 8.072608 8.221344 9.077744
A 10.994228 6.545318 8.606128 8.426329 8.787876 9.857079
A 10.501266 6.677360 7.787168 8.444976 8.928174 9.542558

GGally has a good function for this as well.
library(GGally)
ggpairs(dd, color = 'CLASS',columns = 2:ncol(dd) )

It might not be that easy to do with base graphics. You could easily do this with lattice. With this sample data.frame
dd<-structure(list(CLASS = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
CBX3 = c(10.589844, 10.174385, 10.202084, 10.893231, 10.071038,
10.005002, 10.028055, 10.144115, 10.675386, 9.855063, 10.994228,
10.501266), PSPH = c(6.84297, 5.517944, 5.669137, 6.630709,
5.091222, 4.708631, 5.080944, 6.626483, 6.874047, 5.164399,
6.545318, 6.67736), ATP2C1 = c(8.08455, 7.736994, 7.392141,
7.60169, 7.032585, 7.927246, 6.421961, 7.686203, 7.90056,
6.847923, 8.606128, 7.787168), SNX10 = c(8.475023, 9.094834,
7.52227, 7.894177, 8.305581, 7.292527, 7.616856, 7.970934,
7.605519, 8.072608, 8.426329, 8.444976), MMD = c(9.20249,
9.253766, 7.830969, 8.979142, 7.903737, 8.257853, 8.287496,
7.919615, 8.585158, 8.221344, 8.787876, 8.928174), ATP13A3 = c(10.403811,
10.133408, 9.123178, 9.791841, 8.994821, 10.05463, 9.642294,
9.475175, 8.858613, 9.077744, 9.857079, 9.542558)), .Names = c("CLASS",
"CBX3", "PSPH", "ATP2C1", "SNX10", "MMD", "ATP13A3"), class = "data.frame", row.names = c(NA, -12L))
you can do
library(lattice)
splom(~dd[,-1], groups=dd$CLASS)
to get

You can add color to the points by specifying the argument col
to plot
DF <- read.delim(textConnection(
"category CBX3 PSPH ATP2C1 SNX10 MMD ATP13A3
B 10.589844 6.842970 8.084550 8.475023 9.202490 10.403811
A 10.174385 5.517944 7.736994 9.094834 9.253766 10.133408
B 10.202084 5.669137 7.392141 7.522270 7.830969 9.123178
B 10.893231 6.630709 7.601690 7.894177 8.979142 9.791841
B 10.071038 5.091222 7.032585 8.305581 7.903737 8.994821
A 10.005002 4.708631 7.927246 7.292527 8.257853 10.054630
B 10.028055 5.080944 6.421961 7.616856 8.287496 9.642294
A 10.144115 6.626483 7.686203 7.970934 7.919615 9.475175
A 10.675386 6.874047 7.900560 7.605519 8.585158 8.858613
A 9.855063 5.164399 6.847923 8.072608 8.221344 9.077744
A 10.994228 6.545318 8.606128 8.426329 8.787876 9.857079
A 10.501266 6.677360 7.787168 8.444976 8.928174 9.542558"))
plot(DF[2:7],col = ifelse(DF$category == 'A','red','green'))
A list of valid color values can be obtained by calling colors(). Vectors with a gradient of colors can be created via rainbow(), and just for fun, I use this little function for choosing pretty colors when making a figure.
(Edited per suggestions from #MrFlick)
#! #param n The number of colors to be selected
colorchoose <- function (n = 1, alpha, term = F)
{
cols <- colors()
mod <- ceiling(sqrt(length(cols)))
plot(xlab = "", ylab = "", main = "click for color name",
c(0, mod), c(0, mod), type = "n", axes = F)
s<-seq_along(cols)
dev.hold()
points(s%%mod, s%/%mod, col = cols, pch = 15, cex = 2.4)
dev.flush()
p <- locator(n)
return(cols[round(p$y) * mod + round(p$x)])
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Estimate_richness for all phyla in phyloseq - r

Related

R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

Automatically split function output (list) into component data.frames

Lattice xyplot() Adding a different mean trend line to each panel?

prop.table doesn't work in a for-loop?

Color data points based on sample classification

Categories

Resources