How to modify R program to support RHadoop - r

I am new to RHadoop and R. I have an ordinary R program that uses library(methylKit). Can someone give me some insight into how to run this R program on Hadoop? What do I need to modify in the original R program? It would be really helpful if someone could give me some ideas.
The Code:
# Load methylKit and read the three per-sample methylation call files.
library(methylKit)
library("graphics")

# NOTE(review): "n_sample3.txt" does not follow the new_sampleN.txt pattern
# of the other two inputs -- confirm the file name.
file.list <- list("new_sample1.txt", "new_sample2.txt", "n_sample3.txt")

# Two samples with treatment = 1 and one with treatment = 0. `pipeline`
# tells the reader which input column holds which field.
myobj <- read(
  file.list,
  sample.id = list("test1", "test2", "ctrl1"),
  assembly = "hg19",
  treatment = c(1, 1, 0),
  context = "CpG",
  pipeline = list(
    fraction = TRUE, chr.col = 1, start.col = 2, end.col = 2,
    coverage.col = 6, strand.col = 3, freqC.col = 5
  )
)

# Per sample: print the methylation statistics, then save the matching
# histogram to sampleN_statistics.pdf. (The original printed coverage
# statistics for sample 3 only, which looked like a copy-paste slip.)
for (i in seq_along(file.list)) {
  getMethylationStats(myobj[[i]], plot = FALSE, both.strands = FALSE)
  pdf(sprintf("sample%d_statistics.pdf", i))
  getMethylationStats(myobj[[i]], plot = TRUE, both.strands = FALSE)
  dev.off()
}

# Per sample: save the read-coverage histogram to sampleN_coverage.pdf.
for (i in seq_along(file.list)) {
  pdf(sprintf("sample%d_coverage.pdf", i))
  getCoverageStats(myobj[[i]], plot = TRUE, both.strands = FALSE)
  dev.off()
}
# Combine the per-sample objects into one object (strands are not merged).
meth <- unite(myobj, destrand = FALSE)

# Pairwise sample correlation plot.
pdf("correlation.pdf")
getCorrelation(meth, plot = TRUE)
dev.off()

# Hierarchical clustering dendrogram.
pdf("cluster.pdf")
clusterSamples(meth, dist = "correlation", method = "ward", plot = TRUE)
dev.off()

# Same clustering without plotting; the result is kept for later inspection.
hc <- clusterSamples(meth, dist = "correlation", method = "ward", plot = FALSE)

# PCA: scree plot and sample plot go into the same PDF.
pdf("pca.pdf")
PCASamples(meth, screeplot = TRUE)
PCASamples(meth)
# The original never closed this device, leaving pca.pdf open (and possibly
# incomplete) for the rest of the session.
dev.off()
# Differential methylation between the treatment groups, written out in full.
myDiff <- calculateDiffMeth(meth)
write.table(myDiff, "mydiff.txt", sep = "\t")

# Subsets at a 25% methylation difference and q-value 0.01. The argument is
# spelled out as `difference`; the original `differenc` only worked through
# partial argument matching.
myDiff25p.hyper <- get.methylDiff(myDiff, difference = 25, qvalue = 0.01,
                                  type = "hyper")
myDiff25p.hyper
write.table(myDiff25p.hyper, "hyper_methylated.txt", sep = "\t")

myDiff25p.hypo <- get.methylDiff(myDiff, difference = 25, qvalue = 0.01,
                                 type = "hypo")
myDiff25p.hypo
write.table(myDiff25p.hypo, "hypo_methylated.txt", sep = "\t")

myDiff25p <- get.methylDiff(myDiff, difference = 25, qvalue = 0.01)
myDiff25p
write.table(myDiff25p, "differentialy_methylated.txt", sep = "\t")

# Per-chromosome summary: first printed, then plotted to a PDF.
diffMethPerChr(myDiff, plot = FALSE, qvalue.cutoff = 0.01, meth.cutoff = 25)
pdf("diffMethPerChr.pdf")
diffMethPerChr(myDiff, plot = TRUE, qvalue.cutoff = 0.01, meth.cutoff = 25)
dev.off()
# Annotate the differentially methylated bases with gene and CpG-island
# features, using the example BED files shipped inside the methylKit package.
# NOTE(review): these BED files are hg18, while the samples were read with
# assembly = "hg19" -- confirm the coordinates are comparable.
refseq_bed <- system.file("extdata", "refseq.hg18.bed.txt",
                          package = "methylKit")
gene.obj <- read.transcript.features(refseq_bed)
write.table(gene.obj, "gene_obj.txt", sep = "\t")
annotate.WithGenicParts(myDiff25p, gene.obj)

# CpG islands and their flanking regions ("shores").
cpgi_bed <- system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit")
cpg.obj <- read.feature.flank(
  cpgi_bed,
  feature.flank.name = c("CpGi", "shores")
)
write.table(cpg.obj, "cpg_obj.txt", sep = "\t")

diffCpGann <- annotate.WithFeature.Flank(
  myDiff25p,
  cpg.obj$CpGi,
  cpg.obj$shores,
  feature.name = "CpGi",
  flank.name = "shores"
)
write.table(diffCpGann, "diffCpCann.txt", sep = "\t")
diffCpGann

# Summarise the samples over promoter regions.
promoters <- regionCounts(myobj, gene.obj$promoters)
head(promoters[[1]])
write.table(promoters, "promoters.txt", sep = "\t")

# Genic-part annotation, kept this time so it can be summarised and plotted.
diffAnn <- annotate.WithGenicParts(myDiff25p, gene.obj)
head(getAssociationWithTSS(diffAnn))
diffAnn
write.table(getAssociationWithTSS(diffAnn), "diff_ann.txt", sep = "\t")
getTargetAnnotationStats(diffAnn, percentage = TRUE, precedence = TRUE)

pdf("piechart1.pdf")
plotTargetAnnotation(
  diffAnn,
  precedence = TRUE,
  main = "differential methylation annotation"
)
dev.off()

pdf("piechart2.pdf")
plotTargetAnnotation(
  diffCpGann,
  col = c("green", "gray", "white"),
  main = "differential methylation annotation"
)
dev.off()

getFeatsWithTargetsStats(diffAnn, percentage = TRUE)

Are the *.txt files located in HDFS? If not, upload them first with hadoop fs -put. You can then use Hadoop Streaming to read the data from Hadoop.
# Read records from standard input one line at a time until input is
# exhausted (readLines() then returns a zero-length vector).
line1 <- file("stdin")
open(line1)
while (length(line <- readLines(line1, n = 1)) > 0) {
  # Process `line` here, e.g. parse the record and write the result to stdout.
}
# Release the connection; the original left it open.
close(line1)
'stdin' is the input that the Hadoop Streaming jar passes to the R program. 'line' receives a new line of data each time the loop iterates.
Inside the while loop, write the logic for what to do with line.
Use hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming.jar -input hdfs_input_file1, file2,n-files -output hdfs_output_dir -file mapper_file -file reducer_file -mapper mapper.R -reducer reducer.R to run the program.
-input accepts n input files. The Hadoop Streaming jar reads them one by one and feeds their contents to stdin.

Related

R foreach loop runs out of memory in HPC environment

I am using the foreach package in R to process raster files.
The R code below works fine locally (on Windows) when adapted to an 8-core processor, but runs out of memory in a HPC environment with 48 cores. The HPC environment has much more memory available (2 TB across all 48 cores) compared with my local box (32 GB), so that's not the limiting factor.
The memory creep occurs as the foreach loop proceeds. It's slow, but enough to eventually run out of memory.
I have tried switching parallel packages (to doMC, doSNOW), adding numerous garbage collection calls and rm() of large objects at the end of every iteration, fiddling with the number of cores used, as well as removing any temporary files immediately.
Any ideas on what may be causing my memory issues?
# Set Java memory maximum
# The option is set before library(rJava) below is loaded.
# NOTE(review): presumably this cap applies to each worker's JVM separately;
# with 48 workers the combined Java heap could approach the node's total
# memory -- worth confirming against the observed memory creep.
options(java.parameters = "-Xmx39g")
library(sp)
library(raster)
library(dismo)
library(foreach)
library(doParallel)
library(rgdal)
library(rJava)
# Set directories
# relPath is the project root; the sub-directory strings are appended to it
# with paste0() wherever a full path is needed.
relPath <- "E:/BIEN_Cactaceae/"
bufferDir <- "Data/Buffers"
climDir <- "Data/FutureClimate/"
outDir <- "Analyses/FutureRanges/"
modelDir <- "Analyses/MaxEnt/"
outfileDir <- "OutFiles/"
tempDir <- "E:/Tmp/"
# Set directory for raster temporary files
rasterOptions(tmpdir = tempDir)
# Search for models
# File names (not full paths) in the model directory that end in "rda".
models <- list.files(path = paste0(relPath, modelDir), pattern = "rda$")
# Set up cluster
# 48 workers of type "FORK"; worker output is redirected to
# predictFuture.txt inside the OutFiles directory.
cl <- makeCluster(48, type = "FORK", outfile = paste0(relPath, outfileDir, "predictFuture.txt"))
registerDoParallel(cl)
# Loop through species models in parallel and, for every climate scenario
# raster, write a prediction GeoTIFF unless it already exists.
# seq_along() is used instead of 1:length(models) so that an empty model list
# yields zero iterations rather than iterating over c(1, 0).
foreach(i = seq_along(models),
        .packages = c("sp", "raster", "dismo", "rgdal", "rJava"),
        .inorder = FALSE) %dopar% {
  # Get taxon: the model file name up to the first "."
  taxon <- strsplit(models[i], ".", fixed = TRUE)[[1]][1]
  # Get buffer polygon layer "<taxon>_buff"
  tmpBuffer <- readOGR(dsn = paste0(relPath, bufferDir),
                       layer = paste0(taxon, "_buff"), verbose = FALSE)
  # Get scenarios: file names ending in "tif" in the climate directory
  scenarios <- list.files(path = paste0(relPath, climDir), pattern = "tif$")
  # Get model; the saved file is expected to define `model_all`, used below
  load(paste0(relPath, modelDir, models[i]))
  # Loop over scenarios
  for (j in scenarios) {
    # Get scenario name: the raster file name up to the first "."
    tmpScenarioName <- strsplit(j, ".", fixed = TRUE)[[1]][1]
    # Build the output path once; it is used for both the check and the write
    outFile <- paste0(relPath, outDir, taxon, "_", tmpScenarioName, ".tif")
    # Skip scenario if already processed
    if (!file.exists(outFile)) {
      # Read, crop, mask predictors
      print(paste0(taxon, " - ", tmpScenarioName, ": processing"))
      tmpScenarioStack <- raster::stack(paste0(relPath, climDir, j))
      preds <- raster::crop(tmpScenarioStack, tmpBuffer)
      preds <- raster::mask(preds, tmpBuffer)
      # Rename predictors to "<taxon>.1" ... "<taxon>.20", replacing "-" and
      # " " in the taxon part
      tmpNames <- paste0(taxon, ".", 1:20)
      tmpNames <- gsub("-", ".", tmpNames, fixed = TRUE)
      tmpNames <- gsub(" ", "_", tmpNames, fixed = TRUE)
      names(preds) <- tmpNames
      # Predict with model
      prediction <- dismo::predict(model_all, preds, progress = "")
      # Export predictions
      writeRaster(prediction, outFile)
      # Remove raster temp files older than 2 hours
      removeTmpFiles(h = 2)
    }
  }
  # Return nothing per task (as the original did implicitly), so no rasters
  # are collected by foreach.
  NULL
}
stopCluster(cl)

How to name the PDF file of output with CMD in R

I've met a problem when reading the documentation. It says in the description of the file argument:
a character string giving the name of the file. If it is of the form "|cmd", the output is piped to the command given by cmd.
I don't quite get the meaning here.
Does it mean that I can write a statement in an R script like pdf(file = "|cmd", ...) and then run a command at the command prompt such as Rscript input.R --args 'output.pdf'?
Here is the example given in the documentation
## Test function for encodings.
## Opens a PDF device with the requested encoding, draws a square chart with
## a 17 x 17 grid, and plots each character code from 32 to 255 as a plotting
## symbol at column (code mod 16), row (code integer-divided by 16).
##
## encoding: encoding name passed to pdf().
## ...:      further arguments forwarded unchanged to pdf().
## Returns the value of dev.off().
TestChars <- function(encoding = "ISOLatin1", ...) {
  pdf(encoding = encoding, ...)
  par(pty = "s")
  axis_range <- c(-1, 16)
  plot(axis_range, axis_range, type = "n", xlab = "", ylab = "",
       xaxs = "i", yaxs = "i")
  title(paste("Centred chars in encoding", encoding))
  grid(17, 17, lty = 1)
  for (code in 32:255) {
    points(code %% 16, code %/% 16, pch = code)
  }
  dev.off()
}
## Example calls: the same encoding, first with the default font family and
## then with an explicit one.
## there will be many warnings.
TestChars("ISOLatin2")
## this does not view properly in older viewers.
TestChars("ISOLatin2", family = "URWHelvetica")
## works well for viewing in gs-based viewers, and often in xpdf.

R output to screen error

I get Rscript error of:
Error in dev.copy2pdf(file = fname, out.type = "pdf") :
can only print from a screen device
Execution halted
I am running the following R source in my Mac OSX console with:
Rscript --vanilla charts.R
I am using R version 3.3.2. Here is my source:
library(quantmod)

# Download the price history for the symbol as a return value (no
# auto-assignment into the workspace).
sym <- "IBM"
d <- getSymbols(sym, src = "yahoo", auto.assign = FALSE)
fname <- sprintf("%s.pdf", sym)

# Rscript has no screen device, so dev.copy2pdf() fails with
# "can only print from a screen device". Draw straight into a PDF file
# device instead. The RSI indicator is added through the TA string so the
# whole chart is produced by one call.
pdf(fname)
chartSeries(d, name = sym, theme = "white", bar.type = "ohlc",
            line.type = "l", TA = "addVo();addSMA();addRSI()",
            subset = "last 6 months")
dev.off()
How do I fix the dev.copy2pdf() call if I want to output a PDF when running from the console? It runs fine within RStudio.
Thanks
This seems to have fixed it
# https://stackoverflow.com/questions/5625394/problem-saving-pdf-file-in-r-with-ggplot2
# Open the PDF device first, draw the chart into it, then close the device.
pdf(fname)
chartSeries(
  d,
  name = sym,
  theme = "white",
  bar.type = "ohlc",
  line.type = "l",
  TA = "addRSI();addVo();addBBands();addSMA()",
  subset = "last 6 months"
)
dev.off()

How to run PCA, distance matrix and other math procedures on genome VCF files in R?

I am learning to process VCF (variant call format) files to produce plots and reports. Here is the R code, which crashes for reasons unknown to me. Please advise how to fix it and recommend appropriate tutorials.
library(VariantAnnotation)
library(SNPRelate)

vcf <- readVcf("test.vcf", "hg19") # load your VCF file from a set dir

# Convert the VCF to GDS format, summarise it and open it.
snpgdsVCF2GDS("test.vcf", "my.gds")
snpgdsSummary("my.gds")
genofile <- openfn.gds("my.gds")

# dendrogram
dissMatrix <- snpgdsDiss(genofile, sample.id = NULL, snp.id = NULL,
                         autosome.only = TRUE, remove.monosnp = TRUE,
                         maf = NaN, missing.rate = NaN,
                         num.thread = 10, verbose = TRUE)
# Cluster on the dissimilarity result computed above; the original passed
# `dist`, which is the base R function and not that result.
snpHCluster <- snpgdsHCluster(dissMatrix, sample.id = NULL, need.mat = TRUE,
                              hang = 0.25)
cutTree <- snpgdsCutTree(snpHCluster, z.threshold = 15, outlier.n = 5,
                         n.perm = 5000, samp.group = NULL,
                         col.outlier = "red", col.list = NULL,
                         pch.outlier = 4, pch.list = NULL,
                         label.H = FALSE, label.Z = TRUE,
                         verbose = TRUE)

# pca
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
# NOTE(review): this reads "sample.id" again, so every sample becomes its own
# population level -- confirm which node or file holds the population codes.
# (The original line was also missing its closing parenthesis.)
pop_code <- read.gdsn(index.gdsn(genofile, "sample.id"))
pca <- snpgdsPCA(genofile)

# One row per sample: id, population, first two eigenvectors.
tab <- data.frame(sample.id = pca$sample.id,
                  pop = factor(pop_code)[match(pca$sample.id, sample.id)],
                  EV1 = pca$eigenvect[, 1],
                  EV2 = pca$eigenvect[, 2],
                  stringsAsFactors = FALSE)

# plot() and legend() are separate statements; the original ran them together
# on one line, which is a syntax error.
plot(tab$EV2, tab$EV1, col = as.integer(tab$pop), xlab = "eigenvector 2",
     ylab = "eigenvector 1")
legend("topleft", legend = levels(tab$pop),
       pch = "o", col = seq_len(nlevels(tab$pop)))
Your code has several issues:
-the snpgdsHCluster step should be run on dissMatrix, not dist:
# Cluster on the dissimilarity result rather than on `dist`.
snpHCluster <- snpgdsHCluster(
  dissMatrix,
  sample.id = NULL,
  need.mat = TRUE,
  hang = 0.25
)
-you need an opening paren after data.frame in the tab line:
# One row per sample: id, population, and the first two eigenvectors.
tab <- data.frame(
  sample.id = pca$sample.id,
  pop = factor(pop_code)[match(pca$sample.id, sample.id)],
  EV1 = pca$eigenvect[, 1],
  EV2 = pca$eigenvect[, 2],
  stringsAsFactors = FALSE
)
-legend is a separate command from plot:
# Scatter plot of the first two eigenvectors, coloured by population.
plot(
  tab$EV2, tab$EV1,
  col = as.integer(tab$pop),
  xlab = "eigenvector 2",
  ylab = "eigenvector 1"
)
# The legend is drawn by its own call, after the plot exists.
legend(
  "topleft",
  legend = levels(tab$pop),
  pch = "o",
  col = 1:nlevels(tab$pop)
)
Also note that the pop_code line is missing its closing parenthesis. Otherwise, I think it should work for you.

TikZDevice: Add \caption{} and \label{} to TikZ diagram using R

I've created a for loop that outputs several plots (via ggplot2) from R into a single .tex file using the tikzDevice package. This makes it easier to include multiple diagrams from within a latex document using a single command that points to the .tex file outputted from R (say 'diagrams.tex'): \include{diagrams}.
However, I would also like to wrap each tikzpicture with the \begin{figure} environment, so that I can insert two additional lines into each respective figure: \caption{} and \label{}.
Question: is there a way to include the figure wrapper, caption, and label latex commands directly, for each respective ggplot image (from my R loop), in the outputted .tex file?
Here is reproducible R code that generates a file 'diagrams.tex' containing 3 ggplots:
# library() stops with an error if a package is missing; require() would only
# warn and let the script fail later.
library(ggplot2)
library(tikzDevice)

## Load example data frame: three states (4, 6, 8), five rows each
A1 <- as.data.frame(rbind(c(4.0, 1.5, 6.1),
                          c(4.0, 5.2, 3.5),
                          c(4.0, 3.4, 4.3),
                          c(4.0, 8.2, 7.3),
                          c(4.0, 2.9, 6.3),
                          c(6.0, 3.9, 6.6),
                          c(6.0, 1.5, 6.1),
                          c(6.0, 2.7, 5.3),
                          c(6.0, 2.9, 7.4),
                          c(6.0, 3.7, 6.0),
                          c(8.0, 3.9, 4.2),
                          c(8.0, 4.1, 3.5),
                          c(8.0, 3.7, 5.8),
                          c(8.0, 2.5, 7.5),
                          c(8.0, 4.1, 3.5)))
names(A1) <- c("state", "rmaxpay", "urate")

## name output file
tikz("diagrams.tex")

# One scatter plot per state; i = 2, 3, 4 maps to state 4, 6, 8. All plots go
# into the single tikz device opened above.
for (i in 2:4) {
  st <- i * 2
  df <- subset(A1, state == st, select = c(2:3))
  # ggplot objects must be printed explicitly inside a loop
  print(ggplot(df, aes(rmaxpay, urate)) + geom_point())
}
dev.off()
There may be a way to do this with plot hooks but as it is you can do it by using the console option and sink():
# library() stops with an error if a package is missing; require() would only
# warn and let the script fail later.
library(ggplot2)
library(tikzDevice)

## Load example data frame: three states (4, 6, 8), five rows each
A1 <- as.data.frame(rbind(c(4.0, 1.5, 6.1),
                          c(4.0, 5.2, 3.5),
                          c(4.0, 3.4, 4.3),
                          c(4.0, 8.2, 7.3),
                          c(4.0, 2.9, 6.3),
                          c(6.0, 3.9, 6.6),
                          c(6.0, 1.5, 6.1),
                          c(6.0, 2.7, 5.3),
                          c(6.0, 2.9, 7.4),
                          c(6.0, 3.7, 6.0),
                          c(8.0, 3.9, 4.2),
                          c(8.0, 4.1, 3.5),
                          c(8.0, 3.7, 5.8),
                          c(8.0, 2.5, 7.5),
                          c(8.0, 4.1, 3.5)))
names(A1) <- c("state", "rmaxpay", "urate")

# Start from an empty output file, because everything below appends to it.
fn <- "diagrams.tex"
if (file.exists(fn)) file.remove(fn)

# One figure per state; i = 2, 3, 4 maps to state 4, 6, 8.
for (i in 2:4) {
  st <- i * 2
  df <- subset(A1, state == st, select = c(2:3))

  cat("\\begin{figure}\n", file = fn, append = TRUE)

  # With console = TRUE the tikz device writes its code to standard output;
  # sink() redirects that output into the .tex file between the wrapper lines.
  sink(fn, append = TRUE)
  tikz(console = TRUE)
  print(ggplot(df, aes(rmaxpay, urate)) + geom_point())
  dev.off()
  sink()

  cat(paste0("\\caption{figure}\\label{fig:", i, "}\n"),
      file = fn, append = TRUE)
  cat("\\end{figure}\n", file = fn, append = TRUE)
}

Resources