Find data in folders and give feedback about missing data

I have an R script that creates several small parts (tiles) of a big dataset (actually a dataset of Europe). We need these small tiles because they are easier to edit than one big dataset.
I now have one Windows folder, and in this folder I have 966 auto-generated folders, each with 4 datasets (at least I hope it is 4). We need to know whether exactly these 4 datasets are present in each folder, and if a dataset is missing, which one it is. The code below creates the folders; it is posted just to show the structure.
in_file <- "P:/High_Resolution_Layers/Forest... .tif/2015/TCD_2015_020m_eu_03035_d04_full/TCD_2015_020m_eu_03035_d04_full.tif"
for (t in 1:length(tiles)) {
  tileID <- tiles[t]
  out_dir <- file.path(output_dir, tileID)
  # out_dir_tmp <- file.path(out_dir, "tmp")
  if (!dir.exists(out_dir)) { dir.create(out_dir, recursive = TRUE) }
  # if (!dir.exists(out_dir_tmp)) { dir.create(out_dir_tmp, recursive = TRUE) }
  # tmp_file <- file.path(out_dir_tmp, paste0(tileID, "_HRL_Forest.tif")) ## <- change ("_HRL_Forest.tif", "_clc_2012.tif", "_clc_2018.tif", "_slope.tif")
  out_file <- file.path(out_dir, paste0(tileID, "_HRL_Forest.tif")) ## <- change ("_HRL_Forest.tif", "_clc_2012.tif", "_clc_2018.tif", "_slope.tif")
  cmd <- paste("gdalwarp",
               "-overwrite",
               "-s_srs EPSG:3035",
               "-t_srs EPSG:3035",
               "-r near",
               "-q",
               "-tr 20 20",
               "-te ", tile_list[t, 3], tile_list[t, 4], tile_list[t, 3] + 100000, tile_list[t, 4] + 100000,
               "-tap",
               "-of GTiff",
               in_file,
               out_file)
  system(osgeo, input = cmd)
  # cmd <- sprintf('gdal_translate -ot Byte -a_nodata 255 -co "COMPRESS=LZW" %s %s', tmp_file, out_file)
  # system(osgeo, input = cmd)
  # unlink(out_dir_tmp, recursive = TRUE)
}

I'm going to make up a structure and list of files.
directories A through D
each directory must have files a.tif through c.tif
Since all dirs must have the same files within them, we can do a cartesian/outer join of them:
dirs <- LETTERS[1:4]
files_each_dir <- paste0(letters[1:3], ".tif")
(all_files <- outer(dirs, files_each_dir, file.path))
# [,1] [,2] [,3]
# [1,] "A/a.tif" "A/b.tif" "A/c.tif"
# [2,] "B/a.tif" "B/b.tif" "B/c.tif"
# [3,] "C/a.tif" "C/b.tif" "C/c.tif"
# [4,] "D/a.tif" "D/b.tif" "D/c.tif"
Since we don't need a matrix, I'll unlist them and then create the dirs/files:
c(all_files)
# [1] "A/a.tif" "B/a.tif" "C/a.tif" "D/a.tif" "A/b.tif" "B/b.tif" "C/b.tif"
# [8] "D/b.tif" "A/c.tif" "B/c.tif" "C/c.tif" "D/c.tif"
for (d in dirs) dir.create(d)
for (p in all_files) writeLines(p, p)
All expected files exist
(files_found <- list.files(pattern = "*.tif", recursive = TRUE, full.names = TRUE))
# [1] "./A/a.tif" "./A/b.tif" "./A/c.tif" "./B/a.tif" "./B/b.tif" "./B/c.tif"
# [7] "./C/a.tif" "./C/b.tif" "./C/c.tif" "./D/a.tif" "./D/b.tif" "./D/c.tif"
### remove the leading "./"
(files_found <- gsub("^\\./", "", files_found))
# [1] "A/a.tif" "A/b.tif" "A/c.tif" "B/a.tif" "B/b.tif" "B/c.tif" "C/a.tif"
# [8] "C/b.tif" "C/c.tif" "D/a.tif" "D/b.tif" "D/c.tif"
all(all_files %in% files_found)
# [1] TRUE
all_files[!all_files %in% files_found]
# character(0)
Test a missing file:
file.remove("B/c.tif")
# [1] TRUE
files_found <- list.files(pattern = "*.tif", recursive = TRUE, full.names = TRUE)
files_found <- gsub("^\\./", "", files_found)
all_files[!all_files %in% files_found]
# [1] "B/c.tif"
Note: we do not use files_each_dir for any of the follow-on tests. It is only needed if we expect a fixed set of filenames.
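Applied to the question's real structure, the same fixed-filename check might look like this; a sketch assuming output_dir and tiles are the objects from the script above, and taking the four file suffixes from its comments:
suffixes <- c("_HRL_Forest.tif", "_clc_2012.tif", "_clc_2018.tif", "_slope.tif")
expected <- outer(tiles, suffixes, function(id, sfx) file.path(id, paste0(id, sfx)))
found <- list.files(output_dir, pattern = "\\.tif$", recursive = TRUE)
expected[!expected %in% found]  # character(0) when all 966 folders are complete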
Count files within each dir
If the filenames might be different, then we can count the number of files in each directory, irrespective of the actual names.
(len3 <- lengths(split(files_found, sapply(strsplit(files_found, "[/\\]"), `[[`, 1))) == 3)
# A B C D
# TRUE FALSE TRUE TRUE
names(len3)[ !len3 ]
# [1] "B"
File contents
If you need to test the contents such that some condition is true, try something like this. Here I'm using the simple shell command grep, but any function (R or shell) that takes a path and returns something you need (size, property, etc.) should work.
func <- function(path) length(system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE)) > 0
(proper_contents <- sapply(files_found, func))
# A/a.tif A/b.tif A/c.tif B/a.tif B/b.tif C/a.tif C/b.tif C/c.tif D/a.tif D/b.tif
# TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# D/c.tif
# TRUE
Let's change one file's contents to test:
writeLines("123", "D/a.tif")
proper_contents <- sapply(files_found, func)
# Warning in system2("grep", c("-lE", "'[a-z]'", path), stdout = TRUE) :
# running command '"grep" -lE '[a-z]' D/a.tif' had status 1
names(proper_contents)[ !proper_contents ]
# [1] "D/a.tif"

Related

Save every nth file to a new subfolder following an if statement in R

First of all, apologies if this question has already been addressed somewhere else, but I couldn't find an answer.
In R I have a for loop that saves text (writeLines) to a file according to a certain condition. Every 10th written file I would like to save to a new, different subfolder, so that each subfolder holds a maximum of 10 files.
Now, I can code a script that does this as if all files were to be written (see below), but I'm not sure how to implement the change of subfolder each time a folder is full with 10 files.
The example below should clarify.
# generate random num
set.seed(15)
input <- rnorm(100, mean = 10, sd = 3)
# Write to file only if num is greater than threshold
thrshld <- 10
out_dir <- "~/R/tmp/Batch_Saving"
k <- 1
i <- 1
for (i in 1:length(input)) {
  if (any(i == seq(from = 1, to = length(input), by = 10))) {
    # Create a new subfolder every time i passes a multiple of 10
    sub_dir <- paste0(out_dir, "/Dir_Num_", k)
    k <- k + 1
  }
  if (input[i] > thrshld) {
    txt <- paste("The number", signif(input[i], 4), "is greater than", thrshld)
    # Create folder if it doesn't exist
    if (!dir.exists(sub_dir)) { dir.create(sub_dir, recursive = TRUE) }
    # Write text to file
    writeLines(text = txt,
               con = paste0(sub_dir, "/file_", i, ".txt"))
  }
}
This script creates 10 folders, each with fewer than 10 files, but not the same number of files in each. One can check this quickly in the terminal like this:
Batch_Saving > find . -maxdepth 1 -mindepth 1 -type d -exec sh -c 'echo "{} : $(find "{}" -type f | wc -l)" file\(s\)' \;
./Dir_Num_4 : 6 file(s)
./Dir_Num_3 : 7 file(s)
./Dir_Num_2 : 4 file(s)
./Dir_Num_5 : 7 file(s)
./Dir_Num_10 : 4 file(s)
./Dir_Num_9 : 5 file(s)
./Dir_Num_7 : 3 file(s)
./Dir_Num_6 : 6 file(s)
./Dir_Num_1 : 6 file(s)
./Dir_Num_8 : 4 file(s)
I believe the only way to achieve this is either to read every time how many files have already been written, or to somehow keep track of how many files have been written and change the parameter k accordingly.
I believe this could also be achieved with other methods (i.e. apply) but for sake of learning I'd like to code this in a loop.
Thanks a lot!
Try nesting the k loop for sub-directory creation inside the i loop for the threshold check, as shown below:
for (i in 1:length(input)) {
  if (input[i] > thrshld) {
    if (k %in% seq(from = 1, to = length(input), by = 10)) {
      # Create a new subfolder every time k passes a multiple of 10
      sub_dir <- paste0(out_dir, "/Dir_Num_", k)
    }
    k <- k + 1
    #
    txt <- paste("The number", signif(input[i], 4), "is greater than", thrshld)
    #
    # Create folder if it doesn't exist
    if (!dir.exists(sub_dir)) { dir.create(sub_dir, recursive = TRUE) }
    # Write text to file
    writeLines(text = txt,
               con = paste0(sub_dir, "/file_", i, ".txt"))
  }
}
Results in the following output
> list.files("~/R/tmp", recursive = T)
[1] "Dir_Num_1/file_1.txt" "Dir_Num_1/file_11.txt" "Dir_Num_1/file_13.txt" "Dir_Num_1/file_15.txt"
[5] "Dir_Num_1/file_18.txt" "Dir_Num_1/file_2.txt" "Dir_Num_1/file_4.txt" "Dir_Num_1/file_5.txt"
[9] "Dir_Num_1/file_7.txt" "Dir_Num_1/file_8.txt" "Dir_Num_11/file_21.txt" "Dir_Num_11/file_22.txt"
[13] "Dir_Num_11/file_25.txt" "Dir_Num_11/file_26.txt" "Dir_Num_11/file_27.txt" "Dir_Num_11/file_29.txt"
[17] "Dir_Num_11/file_30.txt" "Dir_Num_11/file_32.txt" "Dir_Num_11/file_33.txt" "Dir_Num_11/file_34.txt"
[21] "Dir_Num_21/file_35.txt" "Dir_Num_21/file_36.txt" "Dir_Num_21/file_38.txt" "Dir_Num_21/file_41.txt"
[25] "Dir_Num_21/file_42.txt" "Dir_Num_21/file_43.txt" "Dir_Num_21/file_46.txt" "Dir_Num_21/file_48.txt"
[29] "Dir_Num_21/file_49.txt" "Dir_Num_21/file_50.txt" "Dir_Num_31/file_51.txt" "Dir_Num_31/file_52.txt"
[33] "Dir_Num_31/file_54.txt" "Dir_Num_31/file_55.txt" "Dir_Num_31/file_58.txt" "Dir_Num_31/file_59.txt"
[37] "Dir_Num_31/file_61.txt" "Dir_Num_31/file_68.txt" "Dir_Num_31/file_70.txt" "Dir_Num_31/file_75.txt"
[41] "Dir_Num_41/file_77.txt" "Dir_Num_41/file_78.txt" "Dir_Num_41/file_80.txt" "Dir_Num_41/file_81.txt"
[45] "Dir_Num_41/file_82.txt" "Dir_Num_41/file_83.txt" "Dir_Num_41/file_88.txt" "Dir_Num_41/file_89.txt"
[49] "Dir_Num_41/file_93.txt" "Dir_Num_41/file_94.txt" "Dir_Num_51/file_96.txt" "Dir_Num_51/file_98.txt"
Try this:
# generate random num
set.seed(15)
input <- rnorm(100, mean = 10, sd = 3)
# Write to file only if num is greater than threshold
thrshld <- 10
out_dir <- "~/R/tmp/Batch_Saving"
for (i in 1:length(input)) {
  # Create sub-folder if it doesn't exist
  sub_dir <- paste0(out_dir, "/Dir_Num_", ceiling(i/10))
  dir.create(sub_dir, showWarnings = FALSE)
  if (input[i] > thrshld) {
    txt <- paste("The number", signif(input[i], 4), "is greater than", thrshld)
    # Write text to file
    writeLines(text = txt,
               con = paste0(sub_dir, "/file_", i, ".txt"))
  }
}
Number of files in each sub-folder:
sapply(list.dirs(), function(x)length(list.files(x)))
# . ./Dir_Num_1 ./Dir_Num_10 ./Dir_Num_2 ./Dir_Num_3 ./Dir_Num_4
# 11 6 4 4 7 6
# ./Dir_Num_5 ./Dir_Num_6 ./Dir_Num_7 ./Dir_Num_8 ./Dir_Num_9
# 7 6 3 4 5
Following nurandi's answer, I added another counter c that counts how many files have been written, and with an if-statement I check whether that count has passed the next multiple of 10; every time it does, I create a new folder numbered with k.
# generate random num
set.seed(15)
input <- rnorm(100, mean = 10, sd = 3)
# Write to file only if num is greater than threshold
thrshld <- 10
out_dir <- "~/R/tmp/Batch_Saving"
k <- 1
i <- 1
c <- 1
for (i in 1:length(input)) {
  if (input[i] > thrshld) {
    txt <- paste("The number", signif(input[i], 4), "is greater than", thrshld)
    if (c %in% seq(from = 1, to = length(input), by = 10)) {
      # Create a new subfolder every time c passes a multiple of 10
      sub_dir <- paste0(out_dir, "/Dir_Num_", k)
      k <- k + 1
    }
    # Create folder if it doesn't exist
    if (!dir.exists(sub_dir)) { dir.create(sub_dir, recursive = TRUE) }
    # Write text to file
    writeLines(text = txt,
               con = paste0(sub_dir, "/file_", i, ".txt"))
    # Increase written-files counter by 1
    c <- c + 1
  }
}
This way the folders are numbered consecutively:
sapply(list.dirs(recursive = F), function(x) length( list.files(x) ) )
./Dir_Num_1 ./Dir_Num_2 ./Dir_Num_3 ./Dir_Num_4
10 10 10 10
./Dir_Num_5 ./Dir_Num_6
10 2
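For completeness, the seq()/%in% test can be dropped entirely by deriving the folder index from the written-files counter with integer division; a minimal sketch under the same setup (set.seed, input, thrshld, out_dir as above):
c <- 0  # files written so far
for (i in seq_along(input)) {
  if (input[i] > thrshld) {
    # files 1-10 go to folder 1, files 11-20 to folder 2, and so on
    sub_dir <- paste0(out_dir, "/Dir_Num_", c %/% 10 + 1)
    if (!dir.exists(sub_dir)) { dir.create(sub_dir, recursive = TRUE) }
    writeLines(paste("The number", signif(input[i], 4), "is greater than", thrshld),
               paste0(sub_dir, "/file_", i, ".txt"))
    c <- c + 1
  }
}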

Can I import variables into R from a global file?

I am integrating an R script to produce some graphics into a larger project that is pulled together with a Makefile. In this larger project, I have a file called globals.mk that contains global variables used by many other scripts in the project. For example, the number of simulations I want to run is a global that I want to use in this R script. Can I "import" this as a variable, or is it necessary to manually define every variable within the R script?
Edit: here is a sample of the globals that I would need to read in.
num = 100
path = ./here/is/a/path
file = $(path)/file.csv
And I would like the R script to set the variables num as 100 (or "100"), path as "./here/is/a/path" and file as "./here/is/a/path/file.csv".
If it is OK to replace the parentheses with brace brackets, then readRenviron will read in such files, perform the substitutions, and return the contents as environment variables.
# write out test file globals2.mk which uses brace brackets
Lines <- "num = 100
path = ./here/is/a/path
file = ${path}/file.csv"
cat(Lines, file = "globals2.mk")
readRenviron("globals2.mk")
Sys.getenv("num")
## [1] "100"
Sys.getenv("path")
## [1] "./here/is/a/path"
Sys.getenv("file")
## [1] "./here/is/a/path/file.csv"
If it is important to use parentheses rather than brace brackets, read in globals.mk, replace the parentheses with brace brackets and then write the file out again.
# write out test file - this one uses parentheses as in question
Lines <- "num = 100
path = ./here/is/a/path
file = $(path)/file.csv"
cat(Lines, file = "globals.mk")
# read globals.mk, perform () to {} substitutions, write out and then re-read
tmp <- tempfile()
L <- readLines("globals.mk")
cat(paste(chartr("()", "{}", L), collapse = "\n"), file = tmp)
readRenviron(tmp)
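The re-read can be checked the same way as before; Sys.getenv also accepts a vector of names:
Sys.getenv(c("num", "path", "file"))
##         num                      path                        file
##       "100"        "./here/is/a/path" "./here/is/a/path/file.csv"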
If the .mk file has anything other than direct variable expansion (such as more complex make-rules/tricks/functions), it might be better to trust make to do the expansion for you, and then read it in. There's a post here that I found that dumps all variable contents (after processing).
TL;DR
expand_mkvars <- function(path, aslist = FALSE) {
  stopifnot(file.exists(mk <- Sys.which("make")))
  tf <- tempfile(fileext = ".mk")
  # needed on my windows system; tempfile should suffice elsewhere
  tf <- normalizePath(tf, winslash = "/", mustWork = FALSE)
  on.exit(suppressWarnings(file.remove(tf)), add = TRUE)
  writeLines(c(".PHONY: printvars",
               "printvars:",
               "\t#$(foreach V,$(sort $(.VARIABLES)), \\",
               "\t  $(if $(filter-out environment% default automatic, \\",
               "\t  $(origin $V)),$(warning $V=$($V))))"), con = tf)
  out <- system2(mk, c("-f", shQuote(path), "-f", shQuote(tf), "-n", "printvars"),
                 stdout = TRUE, stderr = TRUE)
  out <- out[grepl(paste0("^", tf), out)]
  out <- gsub(paste0("^", tf, ":[0-9]+:\\s*"), "", out)
  known_noneed <- c(".DEFAULT_GOAL", "CURDIR", "GNUMAKEFLAGS", "MAKEFILE_LIST", "MAKEFLAGS")
  out <- out[!grepl(paste0("^(", paste(known_noneed, collapse = "|"), ")="), out)]
  if (aslist) {
    spl <- strsplit(out, "=")
    nms <- sapply(spl, `[[`, 1)
    rest <- lapply(spl, function(a) paste(a[-1], collapse = "="))
    setNames(rest, nms)
  } else out
}
In action:
expand_mkvars("~/StackOverflow/karthikt.mk")
# [1] "file=./here/is/a/path/file.csv" "num=100"
# [3] "path=./here/is/a/path"
expand_mkvars("~/StackOverflow/karthikt.mk", aslist = TRUE)
# $file
# [1] "./here/is/a/path/file.csv"
# $num
# [1] "100"
# $path
# [1] "./here/is/a/path"
I have not tested this on other systems, so you might need to adjust known_noneed to add extra variables that pop up. Depending on your needs, you might be able to filter more intelligently (e.g., if none of your variables start with a capital letter), but for this example I kept it to the known-not-wanted variables that make is giving us.
The blog post suggests using a phony target of
.PHONY: printvars
printvars:
#$(foreach V,$(sort $(.VARIABLES)), \
$(if $(filter-out environment% default automatic, \
$(origin $V)),$(warning $V=$($V))))
(some are tabs, not all spaces, very important for make)
Unfortunately, it produces more output than you technically need:
$ /c/Rtools/bin/make.exe -f ~/StackOverflow/karthikt.mk printvars
C:/Users/r2/StackOverflow/karthikt.mk:10: .DEFAULT_GOAL=all
C:/Users/r2/StackOverflow/karthikt.mk:10: CURDIR=/Users/r2/Projects/Ford/shiny/shinyobjects/inst
C:/Users/r2/StackOverflow/karthikt.mk:10: GNUMAKEFLAGS=
C:/Users/r2/StackOverflow/karthikt.mk:10: MAKEFILE_LIST= C:/Users/r2/StackOverflow/karthikt.mk
C:/Users/r2/StackOverflow/karthikt.mk:10: MAKEFLAGS=
C:/Users/r2/StackOverflow/karthikt.mk:10: SHELL=sh
C:/Users/r2/StackOverflow/karthikt.mk:10: file=./here/is/a/path/file.csv
C:/Users/r2/StackOverflow/karthikt.mk:10: num=100
C:/Users/r2/StackOverflow/karthikt.mk:10: path=./here/is/a/path
make: Nothing to be done for 'printvars'.
so we need a little filtering, ergo the majority of code in the function.
Edit: if the readRenviron-to-envvar route is the best way for you, it would not be difficult to redirect the output of this make call to another file, parse out the relevant lines, and then call readRenviron on that new file. It seems more indirect due to the use of two temp files, but they're cleaned up, so that should be nothing to worry about.
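A sketch of that hand-off, reusing expand_mkvars from above (the path is the example file from earlier):
vals <- expand_mkvars("~/StackOverflow/karthikt.mk")  # "name=value" strings
tf2 <- tempfile()
writeLines(vals, tf2)  # one name=value pair per line, the format readRenviron accepts
readRenviron(tf2)
Sys.getenv("num")
## [1] "100"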

How to apply rma() normalization to a unique CEL file?

I have implemented a R script that performs batch correction on a gene expression dataset. To do the batch correction, I first need to normalize the data in each CEL file through the Affy rma() function of Bioconductor.
If I run it on the GSE59867 dataset obtained from GEO, everything works.
I define a batch as the data collection date: I put all the CEL files having the same date into a specific folder, and then consider that date/folder as a specific batch.
On the GSE59867 dataset, a batch/folder contains only 1 CEL file. Nonetheless, the rma() function works on it perfectly.
But if I try to run my script on another dataset (GSE36809), I run into trouble: applying the rma() function to a batch/folder containing only 1 file gives the following error:
Error in `colnames<-`(`*tmp*`, value = "GSM901376_c23583161.CEL.gz") :
  attempt to set 'colnames' on an object with less than two dimensions
Here's my specific R code, to let you understand.
You first have to download the file GSM901376_c23583161.CEL.gz:
setwd(".")
options(stringsAsFactors = FALSE)
fileURL <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM901nnn/GSM901376/suppl/GSM901376%5Fc23583161%2ECEL%2Egz"
fileDownloadCommand <- paste("wget ", fileURL, " ", sep="")
system(fileDownloadCommand)
Library installation:
source("https://bioconductor.org/biocLite.R")
list.of.packages <- c("easypackages")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
listOfBiocPackages <- c("oligo", "affyio","BiocParallel")
bioCpackagesNotInstalled <- which( !listOfBiocPackages %in% rownames(installed.packages()) )
cat("package missing listOfBiocPackages[", bioCpackagesNotInstalled, "]: ", listOfBiocPackages[bioCpackagesNotInstalled], "\n", sep="")
if (length(bioCpackagesNotInstalled)) {
  biocLite(listOfBiocPackages[bioCpackagesNotInstalled])
}
library("easypackages")
libraries(list.of.packages)
libraries(listOfBiocPackages)
Application of rma()
thisFileDate <- "GSM901376_c23583161.CEL.gz"
thisDateRawData <- read.celfiles(thisFileDate)
thisDateNormData <- rma(thisDateRawData)
After the call to rma(), I get the error.
How can I solve this problem?
I also tried to skip this normalization by saving the thisDateRawData object directly. But then I have the problem that I cannot combine this thisDateRawData (which is an ExpressionFeatureSet) with the outputs of rma() (which are ExpressionSet objects).
(EDIT: I extensively edited the question, and added a piece of R code you should be able to run on your pc.)
Hmm, this is a puzzling problem. The oligo::rma() function seems to be buggy for class GeneFeatureSet with single samples. I got it to work with a single sample by using lower-level functions, but it means I also had to create the expression set from scratch by specifying the slots:
# source("https://bioconductor.org/biocLite.R")
# biocLite("GEOquery")
# biocLite("pd.hg.u133.plus.2")
# biocLite("pd.hugene.1.0.st.v1")
library(GEOquery)
library(oligo)
# # Instead of using .gz files, I extracted the actual CELs.
# # This is just to illustrate how I read in the files; your usage will differ.
# projectDir <- "" # Path to .tar files here
# setwd(projectDir)
# untar("GSE36809_RAW.tar", exdir = "GSE36809")
# untar("GSE59867_RAW.tar", exdir = "GSE59867")
# setwd("GSE36809"); gse3_cels <- dir()
# sapply(paste(gse3_cels, sep = "/"), gunzip); setwd(projectDir)
# setwd("GSE59867"); gse5_cels <- dir()
# sapply(paste(gse5_cels, sep = "/"), gunzip); setwd(projectDir)
#
# Read in CEL
#
# setwd("GSE36809"); gse3_cels <- dir()
# gse3_efs <- read.celfiles(gse3_cels[1])
# # Assuming you've read in the CEL files as a GeneFeatureSet or
# # ExpressionFeatureSet object (i.e. gse3_efs in this example),
# # you can now fit the RMA and create an ExpressionSet object with it:
exprsData <- basicRMA(exprs(gse3_efs), pnVec = featureNames(gse3_efs))
gse3_expset <- new("ExpressionSet")
slot(gse3_expset, "assayData") <- assayDataNew(exprs = exprsData)
slot(gse3_expset, "phenoData") <- phenoData(gse3_efs)
slot(gse3_expset, "featureData") <- annotatedDataFrameFrom(attr(gse3_expset,
'assayData'), byrow = TRUE)
slot(gse3_expset, "protocolData") <- protocolData(gse3_efs)
slot(gse3_expset, "annotation") <- slot(gse3_efs, "annotation")
Hopefully the above approach will work in your code.
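On the follow-up concern about combining objects: ExpressionSet objects with matching featureNames can generally be merged with Biobase::combine. A toy sketch with made-up matrices, not run against the GEO data:
library(Biobase)
m1 <- matrix(rnorm(10), ncol = 1, dimnames = list(paste0("f", 1:10), "s1"))
m2 <- matrix(rnorm(10), ncol = 1, dimnames = list(paste0("f", 1:10), "s2"))
combined <- combine(ExpressionSet(assayData = m1), ExpressionSet(assayData = m2))
dim(combined)  # 10 features, 2 samples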

R: Memory Management during xmlEventParse of Huge (>20GB) files

Building on this previous question (see here), I am attempting to read in many, large xml files via xmlEventParse whilst saving node-varying data. Working with this sample xml: https://www.nlm.nih.gov/databases/dtd/medsamp2015.xml.
The code below uses xpathSApply to extract the necessary values and a series of if statements to combine the values in a way that matches the unique value (PMID) to each of the non-unique values (LastName) within a record, for which there may be no LastNames. The goal is to write a series of small CSVs along the way (here, after every 1000 LastNames) to minimize the amount of memory used.
When run on the full-sized dataset, the code successfully outputs files in batches, but something is still being stored in memory that eventually causes a system error once all RAM is used. I've watched the task manager while the code runs and can see R's memory grow as the program progresses. If I stop the program mid-run and then clear the R workspace, including hidden items, the memory still appears to be in use by R. It is not until I shut down R that the memory is freed again.
Run this a few times yourself and you'll see R's memory usage grow even after clearing the workspace.
Please help! This problem appears to be common to others reading in large XML files in this manner (See for example comments in this question).
My code is as follows:
library(XML)
filename <- "~/Desktop/medsamp2015.xml"
tempdat <- data.frame(pmid = as.numeric(),
                      lname = character(),
                      stringsAsFactors = FALSE)
cnt <- 1
branchFunction <- function() {
  func <- function(x, ...) {
    v1 <- xpathSApply(x, path = "//PMID", xmlValue)
    v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
    print(cbind(c(rep(v1, length(v2))), v2))
    # below is where I store/write the temp data along the way
    # but even without doing this, memory is used (even after clearing)
    tempdat <<- rbind(tempdat, cbind(c(rep(v1, length(v2))), v2))
    if (nrow(tempdat) > 1000) {
      outname <- paste0("~/Desktop/outfiles", cnt, ".csv")
      write.csv(tempdat, outname, row.names = FALSE)
      tempdat <<- data.frame(pmid = as.numeric(),
                             lname = character(),
                             stringsAsFactors = FALSE)
      cnt <<- cnt + 1
    }
  }
  list(MedlineCitation = func)
}
myfunctions <- branchFunction()
# RUN
xmlEventParse(
  file = filename,
  handlers = NULL,
  branches = myfunctions
)
Here is an example: we have a launch script, invoke.sh, that calls an R script and passes the URL and filename as parameters. In this case, I had previously downloaded the test file medsamp2015.xml and put it in the ./data directory.
My sense would be to create a loop in the invoke.sh script and iterate through the list of target file names. For each file you invoke an R instance, download it, process the file and move on to the next.
Caveat: I didn't check or change your function against any other download files and formats. I would turn off the printing of the output by removing the print() wrapper on line 62.
print( cbind(c(rep(v1, length(v2))), v2))
See: runtime.txt for print out.
The output .csv files are placed in the ./data directory.
Note: This is a derivative of a previous answer provided by me on this subject:
R memory not released in Windows. I hope it helps by way of example.
Launch Script
#!/usr/local/bin/bash -x

R --no-save -q --slave < ./47162861.R --args "https://www.nlm.nih.gov/databases/dtd" "medsamp2015.xml"
R File - 47162861.R
# Set working directory
projectDir <- "~/dev/stackoverflow/47162861"
setwd(projectDir)
# -----------------------------------------------------------------------------
# Load required Packages...
requiredPackages <- c("XML")
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
ipak(requiredPackages)
# -----------------------------------------------------------------------------
# Load required Files
# trailingOnly=TRUE means that only your arguments are returned
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 0) {
  dataDir <- file.path(projectDir, "data")
  fileUrl <- args[1]
  fileName <- args[2]
} else {
  dataDir <- file.path(projectDir, "data")
  fileUrl <- "https://www.nlm.nih.gov/databases/dtd"
  fileName <- "medsamp2015.xml"
}
# -----------------------------------------------------------------------------
# Download file
# Does the directory exist? If it doesn't, create it
if (!file.exists(dataDir)) {
  dir.create(dataDir)
}
# Now we check if we have downloaded the data already; if not, we download it
if (!file.exists(file.path(dataDir, fileName))) {
  download.file(fileUrl, file.path(dataDir, fileName), method = "wget")
}
# -----------------------------------------------------------------------------
# Now we extract the data
tempdat <- data.frame(pmid = as.numeric(), lname = character(),
                      stringsAsFactors = FALSE)
cnt <- 1
branchFunction <- function() {
  func <- function(x, ...) {
    v1 <- xpathSApply(x, path = "//PMID", xmlValue)
    v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
    print(cbind(c(rep(v1, length(v2))), v2))
    # below is where I store/write the temp data along the way
    # but even without doing this, memory is used (even after clearing)
    tempdat <<- rbind(tempdat, cbind(c(rep(v1, length(v2))), v2))
    if (nrow(tempdat) > 1000) {
      outname <- file.path(dataDir, paste0(cnt, ".csv"))  # Create file name
      write.csv(tempdat, outname, row.names = FALSE)      # Write file to created directory
      tempdat <<- data.frame(pmid = as.numeric(), lname = character(),
                             stringsAsFactors = FALSE)
      cnt <<- cnt + 1
    }
  }
  list(MedlineCitation = func)
}
myfunctions <- branchFunction()
# -----------------------------------------------------------------------------
# RUN
xmlEventParse(file = file.path(dataDir, fileName),
              handlers = NULL,
              branches = myfunctions)
Test File and output
~/dev/stackoverflow/47162861/data/medsamp2015.xml
$ ll
total 2128
drwxr-xr-x# 7 hidden staff 238B Nov 10 11:05 .
drwxr-xr-x# 9 hidden staff 306B Nov 10 11:11 ..
-rw-r--r--# 1 hidden staff 32K Nov 10 11:12 1.csv
-rw-r--r--# 1 hidden staff 20K Nov 10 11:12 2.csv
-rw-r--r--# 1 hidden staff 23K Nov 10 11:12 3.csv
-rw-r--r--# 1 hidden staff 37K Nov 10 11:12 4.csv
-rw-r--r--# 1 hidden staff 942K Nov 10 11:05 medsamp2015.xml
Runtime Output
> ./invoke.sh > runtime.txt
+ R --no-save -q --slave --args https://www.nlm.nih.gov/databases/dtd medsamp2015.xml
Loading required package: XML
File: runtime.txt
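One further observation on the branch function itself, independent of the XML internals: growing tempdat with rbind() on every record copies the accumulated data each time, which adds to the memory pressure. A common workaround is to collect the pieces in a list and bind once per batch; a sketch (untested against the full 20GB inputs, and flushing every 1000 records rather than every 1000 rows, for simplicity):
rows <- list()
n <- 0L
cnt <- 1
func <- function(x, ...) {
  v1 <- xpathSApply(x, path = "//PMID", xmlValue)
  v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
  n <<- n + 1L
  rows[[n]] <<- data.frame(pmid = rep(v1, length(v2)), lname = v2,
                           stringsAsFactors = FALSE)
  if (n >= 1000L) {
    # bind and flush one batch to disk, then reset the accumulator
    write.csv(do.call(rbind, rows), file.path(dataDir, paste0(cnt, ".csv")),
              row.names = FALSE)
    rows <<- list()
    n <<- 0L
    cnt <<- cnt + 1
  }
}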

Getting example codes of R functions into knitr using helpExtract function

I want to get the example code of R functions to use in knitr. There might be an easy way, but I tried the following approach using the helpExtract function, which can be obtained from here (written by @AnandaMahto). With my approach I have to check whether a function has an Examples section and include only those functions that do.
This is a very inefficient and naive approach. I tried the following code, but it is not working as desired. How can I extract the Examples code from an R package?
\documentclass{book}
\usepackage[T1]{fontenc}
\begin{document}
<< label=packages, echo=FALSE>>=
library(ggplot2)
library(devtools)
source_gist("https://gist.github.com/mrdwab/7586769")
library(noamtools) # install_github("noamtools", "noamross")
@
\chapter{Linear Model}
<< label = NewTest1, results="asis">>=
tryCatch(
  {helpExtract(lm, section = "Examples", type = "s_text");
    cat(
      "\\Sexpr{
        knit_child(
          textConnection(helpExtract(lm, section=\"Examples\", type = \"s_text\"))
          , options = list(tidy = FALSE, eval = TRUE)
        )
      }", "\n"
    )
  }
  , error = function(e) FALSE
)
@
\chapter{Modify properties of an element in a theme object}
<< label = NewTest2, results="asis">>=
tryCatch(
  {helpExtract(add_theme, section = "Examples", type = "s_text");
    cat(
      "\\Sexpr{
        knit_child(
          textConnection(helpExtract(add_theme, section=\"Examples\", type = \"s_text\"))
          , options = list(tidy = FALSE, eval = TRUE)
        )
      }", "\n"
    )
  }
  , error = function(e) FALSE
)
@
\end{document}
I've done some quick work modifying the function (which I've included at this Gist). The Gist also includes a sample Rnw file (I haven't had a chance to check an Rmd file yet).
The function now looks like this:
helpExtract <- function(Function, section = "Usage", type = "m_code", sectionHead = NULL) {
  A <- deparse(substitute(Function))
  x <- capture.output(tools:::Rd2txt(utils:::.getHelpFile(utils::help(A)),
                                     options = list(sectionIndent = 0)))
  B <- grep("^_", x)                     ## section start lines
  x <- gsub("_\b", "", x, fixed = TRUE)  ## remove "_\b"
  X <- rep(FALSE, length(x))             ## Create a FALSE vector
  X[B] <- 1                              ## Initialize
  out <- split(x, cumsum(X))             ## Create a list of sections
  sectionID <- vapply(out, function(x)   ## Identify where the section starts
    grepl(section, x[1], fixed = TRUE), logical(1L))
  if (!any(sectionID)) {                 ## If the section is missing...
    ""                                   ## ... just return an empty character
  } else {                               ## Else, get that list item
    out <- out[[which(sectionID)]][-c(1, 2)]
    while (TRUE) {                       ## Remove the extra empty lines
      out <- out[-length(out)]           ## from the end of the file
      if (out[length(out)] != "") { break }
    }
    switch(                              ## Determine the output type
      type,
      m_code = {
        before <- "```r"
        after <- "```"
        c(sectionHead, before, out, after)
      },
      s_code = {
        before <- "<<eval = FALSE>>="
        after <- "@"
        c(sectionHead, before, out, after)
      },
      m_text = {
        c(sectionHead, paste(" ", out, collapse = "\n"))
      },
      s_text = {
        before <- "\\begin{verbatim}"
        after <- "\\end{verbatim}"
        c(sectionHead, before, out, after)
      },
      stop("`type` must be either `m_code`, `s_code`, `m_text`, or `s_text`")
    )
  }
}
What has changed?
A new argument sectionHead has been added. This is used to be able to specify the section title in the call to the helpExtract function.
The function checks to see whether the relevant section is available in the parsed document. If it is not, it simply returns a "" (which doesn't get printed).
Example use would be:
<<echo = FALSE>>=
mySectionHeading <- "\\section{Some cool section title}"
@

\Sexpr{knit_child(textConnection(
  helpExtract(cor, section = "Examples", type = "s_code",
              sectionHead = mySectionHeading)),
  options = list(tidy = FALSE, eval = FALSE))}
Note: Since Sexpr doesn't allow curly brackets to be used ({), we need to specify the title outside of the Sexpr step, which I have done in a hidden code chunk.
This is not a complete answer so I'm marking it as community wiki. Here are two simple lines to get the examples out of the Rd file for a named function (in this case lm). The code is much simpler than Ananda's gist in my opinion:
x <- utils:::.getHelpFile(utils::help(lm))
sapply(x[sapply(x, function(z) attr(z, "Rd_tag") == "\\examples")][[1]], `[[`, 1)
The result is a simple vector of all of the text in the Rd "examples" section, which should be easy to parse, evaluate, or include in a knitr doc.
[1] "\n"
[2] "require(graphics)\n"
[3] "\n"
[4] "## Annette Dobson (1990) \"An Introduction to Generalized Linear Models\".\n"
[5] "## Page 9: Plant Weight Data.\n"
[6] "ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)\n"
[7] "trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)\n"
[8] "group <- gl(2, 10, 20, labels = c(\"Ctl\",\"Trt\"))\n"
[9] "weight <- c(ctl, trt)\n"
[10] "lm.D9 <- lm(weight ~ group)\n"
[11] "lm.D90 <- lm(weight ~ group - 1) # omitting intercept\n"
[12] "\n"
[13] "\n"
[14] "opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))\n"
[15] "plot(lm.D9, las = 1) # Residuals, Fitted, ...\n"
[16] "par(opar)\n"
[17] "\n"
[18] "\n"
[19] "### less simple examples in \"See Also\" above\n"
Perhaps the following might be useful.
get.examples <- function(pkg = NULL) {
  suppressWarnings(f <- unique(utils:::index.search(TRUE, find.package(pkg))))
  out <- setNames(sapply(f, function(x) {
    tf <- tempfile("Rex")
    tools::Rd2ex(utils:::.getHelpFile(x), tf)
    if (!file.exists(tf)) return(invisible())
    readLines(tf)
  }), basename(f))
  out[!sapply(out, is.null)]
}
ex.base <- get.examples('base')
ex.base <- get.examples('base')
This returns the examples for all functions (that have documentation containing examples) within the specified vector of packages. If pkg=NULL, it returns the examples for all functions within loaded packages.
For example:
ex.base['scan']
# $scan
# [1] "### Name: scan"
# [2] "### Title: Read Data Values"
# [3] "### Aliases: scan"
# [4] "### Keywords: file connection"
# [5] ""
# [6] "### ** Examples"
# [7] ""
# [8] "cat(\"TITLE extra line\", \"2 3 5 7\", \"11 13 17\", file = \"ex.data\", sep = \"\\n\")"
# [9] "pp <- scan(\"ex.data\", skip = 1, quiet = TRUE)"
# [10] "scan(\"ex.data\", skip = 1)"
# [11] "scan(\"ex.data\", skip = 1, nlines = 1) # only 1 line after the skipped one"
# [12] "scan(\"ex.data\", what = list(\"\",\"\",\"\")) # flush is F -> read \"7\""
# [13] "scan(\"ex.data\", what = list(\"\",\"\",\"\"), flush = TRUE)"
# [14] "unlink(\"ex.data\") # tidy up"
# [15] ""
# [16] "## \"inline\" usage"
# [17] "scan(text = \"1 2 3\")"
# [18] ""
# [19] ""
# [20] ""
# [21] ""
