batch generating PDF files with R markdown or other methods?

batch generating PDF files with R markdown or other methods? - r

I have the following dataset:
Name Score
Amy A
Bob B
Charlie C
I want to generate three PDF files:
First one: Amy.pdf, with the content "Amy, your score for MATH 101 is A."
Second one: Bob.pdf, with the content "Bob, your score for MATH 101 is B."
Third one: Charlie.pdf, with the content "Charlie, your score for MATH 101 is C."
Is there a simple to way to do this with only one R or R markdown file? There are hundreds of individuals in the actual dataset.

One way to do it is to create it as a graphic and export it as a pdf:
Data:
d <- structure(list(Name = structure(1:3, .Label = c("Amy", "Bob",
"Charlie"), class = "factor"), Score = structure(1:3, .Label = c("A",
"B", "C"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
Perform operation:
for(i in 1:nrow(d)){
pdf(file = paste0(d[i, "Name"], ".pdf"))
plot(0:1, 1:0, bty = "n", type = "n", axes = F, xlab = "", ylab = "")
text(.5, .5, paste0(d[i, "Name"], ", your score for MATH 101 is ", d[i, "Score"]))
dev.off()
}

Related

Estimate_richness for all phyla in phyloseq

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))
H<-alpha_diversity$Shannon
S1<-alpha_diversity$Observed
S<-log(S1)
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
suggestion by #GTM works well until last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")

You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Calculate alpha diversity measures for a specific taxon at a specified rank.
# You can pass any parameters that you normally pass to `estimate_richness`
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
# Subset to taxon of interest
tax_tbl <- as.data.frame(tax_table(ps))
keep <- tax_tbl[,tax_rank] == taxon_name
keep[is.na(keep)] <- FALSE
ps_phylum <- prune_taxa(keep, ps)
# Calculate alpha diversity and generate a table
alpha_diversity <- estimate_richness(ps_phylum, ...)
alpha_diversity$taxon <- taxon_name
alpha_diversity$sample_id <- row.names(alpha_diversity)
return(alpha_diversity)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps,
taxonomic.rank = 'Phylum')
phyla <- phyla[!is.na(phyla)]
alpha <- data.frame()
for (phylum in phyla){
a <- estimate_diversity_for_taxon(ps = ps,
taxon_name = phylum,
measure = c("Shannon", "Observed"))
alpha <- rbind(alpha, a)
}
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon/alpha$S
# Add sample data
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
smpl_data$sample_id < row.names(smpl_data)
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "sample_id"))

Formatting p-value cut-off line in a volcano plot in R

I am using the following function in R to develop a simple volcano plot:
EnhancedVolcano(all_genes, x = "logFC", y = "adjust.p.value", lab = all_genes$Gene.ID,
pCutoff = 10e-2, FCcutoff = 1)
I would like my pCutoff line to appear to represent p = 0.05 which on a log scale for this figure would appear as 1.3 on the y-axis. However, changing "10e-2" to say "10e-2.5" generates an error
Error: unexpected numeric constant in: "EnhancedVolcano(all_genes, x =
"logFC", y = "adjust.p.value", lab = all_genes$Gene.ID,
pCutoff = 10e-2.5"
Any suggestions on how I can get a horizontal p-value cut-off line at exactly 1.3 (currently appears at 1.2). Here is some reproducible data:
structure(list(X = 1:14, Gene.ID = c("A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N"), logFC = c(1.5,
0.17212922, 0.145542174, 0.304348578, 0.124636936, 0.247841044,
0.160818268, 0.123741518, 0.148530876, 0.148960225, 0.114135472,
-0.147118359, 0.095549291, 0.138521594), AveExpr = c(5.426424957,
4.289728233, 4.901134193, 4.742864705, 5.447030699, 4.539641767,
4.650750102, 5.901020922, 5.365944907, 5.818788787, 4.837214384,
7.017656548, 4.531897822, 5.192294452), t = c(6.15098624, 5.452898247,
4.979246654, 4.949519834, 4.818043279, 4.73403717, 4.701937811,
4.522692175, 4.518518374, 4.281900066, 4.247981727, -4.194421592,
4.10350597, 4.088357671), p.value = c(1.27e-09, 6.8e-08, 7.99e-07,
9.26e-07, 1.77e-06, 2.65e-06, 3.09e-06, 7.13e-06, 7.27e-06, 2.1e-05,
2.44e-05, 3.07e-05, 4.53e-05, 4.83e-05), adjust.p.value = c(1.64e-05,
0.000438854, 0.002987004, 0.002987004, 0.004558267, 0.005687325,
0.005687325, 0.010422933, 0.010422933, 0.027128901, 0.028601707,
0.033061438, 0.04452146, 0.04452146), B = c(11.2786109, 7.664706936,
5.439886439, 5.306497286, 4.725465519, 4.361868581, 4.224515919,
3.473656504, 3.45649938, 2.508304771, 2.376338878, 2.169980059,
1.825392322, 1.76867543)), class = "data.frame", row.names = c(NA,
-14L))

I think you want the following code where the p-value is calculated like p=10^-s where s is your 1.3 like this:
library(EnhancedVolcano)
EnhancedVolcano(all_genes, x = "logFC", y = "adjust.p.value", lab = all_genes$Gene.ID,
pCutoff = 10^-1.3, FCcutoff = 1)
Output:

Subset data to contain only label of columns whose names match a condition

I have the following sample data frame:
M <- c(1, 2)
S <- c(3, 4)
old_df <- data.frame(M, S)
label(old_df$M) <- "My Mathematics Variable"
label(old_df$S) <- "My Science Variable"
And I want to filter in order to only have columns that contains "Math" in the label of the columns.
Thanks!

You can use grepl:
old_df[, grepl("Math", label(old_df))]
# My Mathematics Variable
# [1] 1 2
data
library(Hmisc)
old_df <- structure(list(M = structure(c(1, 2), label = "My Mathematics Variable", class = c("labelled",
"numeric")), S = structure(c(3, 4), label = "My Science Variable", class = c("labelled",
"numeric"))), row.names = c(NA, -2L), class = "data.frame")

Write a Data.Table as a csv file

I have a data.table that has list values within the columns. Below is the dput:
dput(df2)
structure(list(a = list(structure(5594.05118603497, .Names = "a"),
structure(8877.42723091876, .Names = "a"), structure(2948.95666065332,
.Names = "a"),
structure(5312.77623937465, .Names = "a"), structure(676.637044992807,
.Names = "a"),
structure(323.104243007498, .Names = "a")), b =
list(structure(3.90258318853593e-06, .Names = "b"),
structure(3.89772483584672e-06, .Names = "b"), structure(3.91175458242421e-
06, .Names = "b"),
structure(3.90169532031545e-06, .Names = "b"), structure(6.54536728417568e-
06, .Names = "b"),
structure(6.59087917747312e-06, .Names = "b")), id = 1:6), .Names = c("a",
"b", "id"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000000000220788>)
Here is what the output looks like:
head(df2)
a b id
1: 5594.051 3.902583e-06 1
2: 8877.427 3.897725e-06 2
3: 2948.957 3.911755e-06 3
4: 5312.776 3.901695e-06 4
5: 676.637 6.545367e-06 5
6: 323.1042 6.590879e-06 6
This looks ok when you see it at first but if you look further into it, this is what it looks like when I want to select a column:
How do I change df2 to just be a normal dataframe where it doesn't have these extra values within a and b like this? I am trying to write this file to a csv but it will not allow me to because it is saying there are vectors as the values.
Thanks!
Edit:
This was the code that generated the lists:
test<-sapply( split( df , df$ID),
function(d){ dat <- list2env(d)
nlsfit <- nls( form = y ~ a * (1-exp(-b * x)), data=dat,
start= list( a=max(dat$y), b=b.start),
control= control1)
list(a = coef(nlsfit)[1], b = coef(nlsfit)[2])} )
df1<-as.data.frame(t(test))

Load the right package, look at its help page, search for "csv", follow the Usage section:
library(data.table)
help(pac=data.table)
fwrite(df2, file="~/test.csv") # for mac, need changing for other OS
Another approach might be:
as.data.frame( lapply(df2, unlist) )

How to plot multiple curves and color them as group using R ggplot

I have a data frame like this.
ID read1 read2 read3 read4 class
1 5820350 0.3791915 0.3747022 0.3729779 0.3724259 1
2 5820364 0.3758676 0.3711775 0.3695976 0.3693112 2
3 5820378 0.3885081 0.3823900 0.3804273 0.3797707 2
4 5820392 0.3779945 0.3729582 0.3714910 0.3709072 1
5 5820425 0.2954782 0.2971604 0.2973882 0.2973216 3
6 5820426 0.3376101 0.3368173 0.3360203 0.3359517 3
Each row represents one sample with four values,and the last column is the classification of this sample. I want to visualize each sample curve and set the class as the color.
I tried to reshape the data frame, but I then lost the class feature which I need.
Could you please give me some hint or show me how to do that in R?
Thanks in advance.

You are going to want to tidy your data first (shown below with tidyr::gather). Then, when you plot, you will want to set your group = ID and color = factor(class) (for discrete colors):
library(tidyr)
library(ggplot2)
df <- structure(list(ID = c(5820350L, 5820364L, 5820378L, 5820392L, 5820425L, 5820426L),
read1 = c(0.3791915, 0.3758676, 0.3885081, 0.3779945, 0.2954782, 0.3376101),
read2 = c(0.3747022, 0.3711775, 0.38239, 0.3729582, 0.2971604, 0.3368173),
read3 = c(0.3729779, 0.3695976, 0.3804273, 0.371491, 0.2973882, 0.3360203),
read4 = c(0.3724259, 0.3693112, 0.3797707, 0.3709072, 0.2973216, 0.3359517),
class = c(1L, 2L, 2L, 1L, 3L, 3L)),
.Names = c("ID", "read1", "read2", "read3", "read4", "class"),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
df <- gather(df, reading, value, -c(ID, class))
ggplot(df, aes(x = reading, y = value, color = factor(class))) +
geom_line(aes(group = ID))

Here's a function that may do what you want:
PlotMultiCurve = function(x, classes, cols = NULL, colSet = "Set1", ...) {
if(!is.factor(classes)) classes = as.factor(classes)
nClasses = length(levels(classes))
if(is.null(cols)) cols = brewer.pal(nClasses, colSet)
plot(1:ncol(x), x[1,], col = cols[classes[1]], type = "l",
ylim = range(x), xaxt = "n", ...)
axis(1, 1:ncol(x), 1:ncol(x))
for(i in 2:nrow(x)) {
par(new = T)
plot(1:ncol(x), x[i,], col = cols[classes[i]], type = "l",
ylim = range(x), axes = F, xlab = "", ylab = "")
}
}
It uses chooses colors automatically from the RColorBrewer package unless you provide the colors. I copied your data directly into a text file and then ran the following:
# Prepare data
require(RColorBrewer)
myData = read.table("Data.2016-05-03.txt")
x = myData[,2:5]
classes = as.factor(myData$class)
# Plot into PNG file[![enter image description here][1]][1]
png("Plot.2016-05-03.png", width = 1000, height = 1000, res = 300)
par(cex = 0.8)
PlotMultiCurve(x = x, classes = classes, xlab = "Read", ylab = "Response")
dev.off()

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

batch generating PDF files with R markdown or other methods? - r

Related

Estimate_richness for all phyla in phyloseq

Formatting p-value cut-off line in a volcano plot in R

Subset data to contain only label of columns whose names match a condition

Write a Data.Table as a csv file

How to plot multiple curves and color them as group using R ggplot

Categories

Resources