loop with rsource inside not working - r

I am using rsource in order to produce certain variables in this way:
set more off
local n 4
local i 100
rsource, terminator(END_OF_R) rpath(C:\Program Files\R\R-3.2.5\bin\R.exe) roptions(`" --vanilla --args "`i'" "`n'" "')
library(foreign)
trailargs <- commandArgs(trailingOnly=TRUE);
trailargs;
i<- as.numeric(trailargs[1])
n<- as.numeric(trailargs[2])
y = n-i
sample = data.frame(y)
path = paste("C:/Users/.../Desktop/sample","_",n,"_test.dta",sep="")
write.dta(sample, path)
END_OF_R
This is working, and i do get my sample_100_test.dta
However, i would like to produce a series of these datasets, in this way
set more off
local n 4
forvalues i = 1/10 {
rsource, terminator(END_OF_R) rpath(C:\Program Files\R\R-3.2.5\bin\R.exe) roptions(`" --vanilla --args "`i'" "`n'" "')
library(foreign)
trailargs <- commandArgs(trailingOnly=TRUE);
trailargs;
i<- as.numeric(trailargs[1])
n<- as.numeric(trailargs[2])
y = n-i
sample = data.frame(y)
path = paste("C:/Users/.../Desktop/sample","_",n,"_test.dta",sep="")
write.dta(sample, path)
END_OF_R
..... within loop
use sample","_",n,"_test.dta, clear
...estimation
erase sample","_",n,"_test.dta
}
And this is not working. What is the problem with this loop, why do i get error
"End of R output: command library is unrecognized"
Thanks for suggestions

This is the approach I would use, which is to increment i within R:
set more off
local n 100
local maxi 4
local seeds "1234 12345 123467"
rsource, terminator(END_OF_R) rpath(C:\Program Files\R\R-3.2.5\bin\R.exe) roptions(`" --vanilla --args "`n'" "`maxi'" "`seeds'" "')
library(foreign)
trailargs <- commandArgs(trailingOnly=TRUE);
trailargs;
n <- as.numeric(trailargs[1])
maxi <- as.numeric(trailargs[2])
seeds <- as.numeric(unlist(strsplit(trailargs[3], split=" ")))
print(seeds)
for(seed in seeds) {
set.seed(seed)
for(i in 1:maxi) {
y = n-i
sample = data.frame(y)
path = paste("C:/Users/.../Desktop/sample","_",i,"_test.dta",sep="")
write.dta(sample, path)
}
}
END_OF_R
As I said in my comments, this should be a lot more efficient than opening and closing R 10 times.

Related

Can a Y/N prompt in the RStudio Console be deactivated?

I'm using a function from an R package called RAC (R Package for Aqua Culture). It generates a Y/N prompt in the console window prior to execution. Is there a way to deactivate the prompt or automatically answer N every time?
The function Bass_pop_main will generate:
Do you want to change the inputs? [y/n]
Here's an example:
library(RAC)
setwd("../RAC_seabass") #working directory
userpath <- "../RAC_seabass" #userpath
Bass_pop_skeleton(userpath) #create input and output folders
forcings <- Bass_pop_dataloader(userpath) #load environmental variables
output <- Bass_pop_main("../RAC", forcings) #run growth model
Not sure if there is any setting that you can supply externally which will allow you to answer "No" automatically every time. However, we can change the source code of Bass_pop_main according to our requirement and use it. The source code is available if you enter Bass_pop_main in the console.
library(RAC)
Bass_pop_main_revised <- function (userpath, forcings) {
rm(list = ls())
cat("Sea Bass population bioenergetic model\n")
cat(" \n")
currentpath = getwd()
out_pre <- Bass_pop_pre(userpath, forcings)
Param = out_pre[[1]]
Tint = out_pre[[2]]
Gint = out_pre[[3]]
Food = out_pre[[4]]
IC = out_pre[[5]]
times = out_pre[[6]]
Dates = out_pre[[7]]
N = out_pre[[8]]
CS = out_pre[[9]]
out_RKsolver <- Bass_pop_loop(Param, Tint, Gint, Food, IC, times, N, userpath)
out_post <- Bass_pop_post(userpath, out_RKsolver, times, Dates, N, CS)
cat(" ")
cat("End")
return(out_post)
}
Now use Bass_pop_main_revised function instead of Bass_pop_main and it will never ask for input.
setwd("../RAC_seabass")
userpath <- "../RAC_seabass"
Bass_pop_skeleton(userpath)
forcings <- Bass_pop_dataloader(userpath)
output <- Bass_pop_main_revised("../RAC", forcings)

R interate over files and write results

Making my first attempt in R to iterate over data files to do an analysis and write output to a file. Here is the code I have:
# load packages igraph, dils, sna
sink('analysis-output.txt')
for (week in c("002","003","004","005","006")) {
cat("*** ",week,"\n")
obs <- read.table(paste(week,"obs.txt",sep="-"),sep="\t", header=FALSE)
per <- read.table(paste(week,"per.txt",sep="-"),sep="\t", header=FALSE)
n <- length(obs)
mper <- AdjacencyFromEdgelist(per, check.full = TRUE)
mobs <- AdjacencyFromEdgelist(obs, check.full = TRUE)
g<-array(c(mper[[1]],mobs[[1]]),c(n,n,2))
q<-qaptest(g,gcor,g1=1,g2=2)
summary(q)
}
sink()
When I run this, there are no errors, but analysis-output.txt contains only the output of the cat function, i.e.
*** 002
*** 003
*** 004
*** 005
*** 006
However, when I run the code interior to the loop by itself like so
sink('analysis-output.txt')
week = "002"
cat("*** ",week,"\n")
obs <- read.table(paste(week,"obs.txt",sep="-"),sep="\t", header=FALSE)
per <- read.table(paste(week,"per.txt",sep="-"),sep="\t", header=FALSE)
n <- length(obs)
mper <- AdjacencyFromEdgelist(per, check.full = TRUE)
mobs <- AdjacencyFromEdgelist(obs, check.full = TRUE)
g<-array(c(mper[[1]],mobs[[1]]),c(n,n,2))
q<-qaptest(g,gcor,g1=1,g2=2)
summary(q)
sink()
I do get the summary results in the output file:
*** 002
QAP Test Results
Estimated p-values:
p(f(perm) >= f(d)): 0.355
p(f(perm) <= f(d)): 0.656
Test Diagnostics:
Test Value (f(d)): -0.09045692
Replications: 1000
Distribution Summary:
Min: -0.207416
1stQ: -0.1971768
Med: -0.1720356
Mean: 0.01895663
3rdQ: 0.1185298
Max: 0.9982945
What am I doing wrong?
sink diverts the console output to a file, so the data needs to be output by R to appear in the file. I think what's missing is that, within a loop, you need to explicitly print the output of summary to get it to appear in the console (and thus be written by sink).
Compare the console output of these two loops:
for (i in 1:5) {
print(summary(cars))
}
Prints results to console
for (i in 1:5) {
summary(cars)
}
Does not print to console

Load the MNIST digit recognition dataset with R and see any results

In the book "Machine Learning - A Probabilistic Perspective" by Kevin P. Murphy the first task reads:
Exercise 1.1 KNN classifier on shuffled MNIST data
Run mnist1NNdemo
and verify that the misclassification rate (on the first 1000 test
cases) of MNIST of a 1-NN classifier is 3.8%. (If you run it all on
all 10,000 test cases, the error rate is 3.09%.) Modify the code so
that you first randomly permute the features (columns of the training
and test design matrices), as in shuffledDigitsDemo, and then apply
the classifier. Verify that the error rate is not changed.
My simple understanding is that the exercise is looking for the 1-NN after loading the files(kNN() in R).
The files:
train-images-idx3-ubyte.gz: training set images (9912422 bytes)
train-labels-idx1-ubyte.gz: training set labels (28881 bytes)
t10k-images-idx3-ubyte.gz: test set images (1648877 bytes)
t10k-labels-idx1-ubyte.gz: test set labels (4542 bytes)
are taken from the The MNIST DATABASE
I found a popular template for loading the files:
# for the kNN() function
library(VIM)
load_mnist <- function() {
load_image_file <- function(filename) {
ret = list()
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
ret$n = readBin(f,'integer',n=1,size=4,endian='big')
nrow = readBin(f,'integer',n=1,size=4,endian='big')
ncol = readBin(f,'integer',n=1,size=4,endian='big')
x = readBin(f,'integer',n=ret$n*nrow*ncol,size=1,signed=F)
ret$x = matrix(x, ncol=nrow*ncol, byrow=T)
close(f)
ret
}
load_label_file <- function(filename) {
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
n = readBin(f,'integer',n=1,size=4,endian='big')
y = readBin(f,'integer',n=n,size=1,signed=F)
close(f)
y
}
train <<- load_image_file("train-images.idx3-ubyte")
test <<- load_image_file("t10k-images.idx3-ubyte")
train$y <<- load_label_file("train-labels.idx1-ubyte")
test$y <<- load_label_file("t10k-labels.idx1-ubyte")
}
show_digit <- function(arr784, col=gray(12:1/12)) {
image(matrix(arr784, nrow=28)[,28:1], col=col)
}
According to the comment, in the command line this should work:
# Error "Error in matrix(arr784, nrow = 28) : object 'train' not found"
show_digit(train$x[5,])
The question is how can I use the show_digit function ?
Edit Remove extra question
What I figured out for the problem:
First run the whole file in R Studio or ESS, then call the load_mnist() from the console.
After that execute show_digit(train$x[3,]) in the console again and it works.
Finding the KNN classifier can be done on the whole data set:
a <- knn(train, test, train$y) but it would be a very slow process.
Predictions for the result can be done like table(test$y, a), test$y is predicted, a is the actual result.

R: Memory Management during xmlEventParse of Huge (>20GB) files

Building on this previous question (see here), I am attempting to read in many, large xml files via xmlEventParse whilst saving node-varying data. Working with this sample xml: https://www.nlm.nih.gov/databases/dtd/medsamp2015.xml.
The code below uses xpathSapply to extract the necessary values and a series of if statements to combine the values in a way that matches the unique value (PMID) to each of the non-unique values (LastName) within a record - for which there may be no LastNames. The goal is to write a series of small csv's along the way (here, after every 1000 LastNames) to minimize the amount of memory used.
When run on the full-sized data set, the code successfully outputs files in batches, however something is still being stored in memory that eventually causes a system error once all RAM is used. I've watched the task manager while the code runs and can see R's memory grow as the program progresses. And if I stop the program mid-run and then clear the R workspace, including hidden items, the memory still appears to be in use by R. It is not until I shutdown R is the memory freed up again.
Run this a few times yourself and you'll see R's memory usage grow even after clearing the workspace.
Please help! This problem appears to be common to others reading in large XML files in this manner (See for example comments in this question).
My code is as follows:
library(XML)
filename <- "~/Desktop/medsamp2015.xml"
tempdat <- data.frame(pmid=as.numeric(),
lname=character(),
stringsAsFactors=FALSE)
cnt <- 1
branchFunction <- function() {
func <- function(x, ...) {
v1 <- xpathSApply(x, path = "//PMID", xmlValue)
v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
print(cbind(c(rep(v1,length(v2))), v2))
#below is where I store/write the temp data along the way
#but even without doing this, memory is used (even after clearing)
tempdat <<- rbind(tempdat,cbind(c(rep(v1,length(v2))), v2))
if (nrow(tempdat) > 1000){
outname <- paste0("~/Desktop/outfiles",cnt,".csv")
write.csv(tempdat, outname , row.names = F)
tempdat <<- data.frame(pmid=as.numeric(),
lname=character(),
stringsAsFactors=FALSE)
cnt <<- cnt+1
}
}
list(MedlineCitation = func)
}
myfunctions <- branchFunction()
#RUN
xmlEventParse(
file = filename,
handlers = NULL,
branches = myfunctions
)
Here is an example, we have a launch script invoke.sh, that calls an R Script and passes the url and filename as parameters... In this case, I had previously downloaded the test file medsamp2015.xml and put in the ./data directory.
My sense would be to create a loop in the invoke.sh script and iterate through the list of target file names. For each file you invoke an R instance, download it, process the file and move on to the next.
Caveat: I didn't check or change your function against any other download files and formats. I would turn off the printing of the output by removing the print() wrapper on line 62.
print( cbind(c(rep(v1, length(v2))), v2))
See: runtime.txt for print out.
The output .csv files are placed in the ./data directory.
Note: This is a derivative of a previous answer provided by me on this subject:
R memory not released in Windows. I hope it helps by way of example.
Launch Script
1 #!/usr/local/bin/bash -x
2
3 R --no-save -q --slave < ./47162861.R --args "https://www.nlm.nih.gov/databases/dtd" "medsamp2015.xml"
R File - 47162861.R
# Set working directory
projectDir <- "~/dev/stackoverflow/47162861"
setwd(projectDir)
# -----------------------------------------------------------------------------
# Load required Packages...
requiredPackages <- c("XML")
ipak <- function(pkg) {
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
ipak(requiredPackages)
# -----------------------------------------------------------------------------
# Load required Files
# trailingOnly=TRUE means that only your arguments are returned
args <- commandArgs(trailingOnly = TRUE)
if ( length(args) != 0 ) {
dataDir <- file.path(projectDir,"data")
fileUrl = args[1]
fileName = args[2]
} else {
dataDir <- file.path(projectDir,"data")
fileUrl <- "https://www.nlm.nih.gov/databases/dtd"
fileName <- "medsamp2015.xml"
}
# -----------------------------------------------------------------------------
# Download file
# Does the directory Exist? If it does'nt create it
if (!file.exists(dataDir)) {
dir.create(dataDir)
}
# Now we check if we have downloaded the data already if not we download it
if (!file.exists(file.path(dataDir, fileName))) {
download.file(fileUrl, file.path(dataDir, fileName), method = "wget")
}
# -----------------------------------------------------------------------------
# Now we extrat the data
tempdat <- data.frame(pmid = as.numeric(), lname = character(),
stringsAsFactors = FALSE)
cnt <- 1
branchFunction <- function() {
func <- function(x, ...) {
v1 <- xpathSApply(x, path = "//PMID", xmlValue)
v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
print(cbind(c(rep(v1, length(v2))), v2))
# below is where I store/write the temp data along the way
# but even without doing this, memory is used (even after
# clearing)
tempdat <<- rbind(tempdat, cbind(c(rep(v1, length(v2))),
v2))
if (nrow(tempdat) > 1000) {
outname <- file.path(dataDir, paste0(cnt, ".csv")) # Create FileName
write.csv(tempdat, outname, row.names = F) # Write File to created directory
tempdat <<- data.frame(pmid = as.numeric(), lname = character(),
stringsAsFactors = FALSE)
cnt <<- cnt + 1
}
}
list(MedlineCitation = func)
}
myfunctions <- branchFunction()
# -----------------------------------------------------------------------------
# RUN
xmlEventParse(file = file.path(dataDir, fileName),
handlers = NULL,
branches = myfunctions)
Test File and output
~/dev/stackoverflow/47162861/data/medsamp2015.xml
$ ll
total 2128
drwxr-xr-x# 7 hidden staff 238B Nov 10 11:05 .
drwxr-xr-x# 9 hidden staff 306B Nov 10 11:11 ..
-rw-r--r--# 1 hidden staff 32K Nov 10 11:12 1.csv
-rw-r--r--# 1 hidden staff 20K Nov 10 11:12 2.csv
-rw-r--r--# 1 hidden staff 23K Nov 10 11:12 3.csv
-rw-r--r--# 1 hidden staff 37K Nov 10 11:12 4.csv
-rw-r--r--# 1 hidden staff 942K Nov 10 11:05 medsamp2015.xml
Runtime Output
> ./invoke.sh > runtime.txt
+ R --no-save -q --slave --args https://www.nlm.nih.gov/databases/dtd medsamp2015.xml
Loading required package: XML
File: runtime.txt

How can I capture warning messages from rstan R package's stan() function, when running an R script that calls stan() on the command line?

In the R script Fit12_for_stack.R, I call rstan package's stan() function. When I run the Fit12_for_stack.R code in an interactive R session, I get these warning messages from stan():
Warning messages:
1: There were 13 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help.
2: Examine the pairs() plot to diagnose sampling problems
When I run the script Fit12_for_stack.R on the command line with the command:
Rscript Fit12_for_stack.R
I get output, but not the warning messages. How can I capture the stan() warning messages when running the R script that calls stan() on the command line?
From the post How to save all console output to file in R?, I tried adding
con <- file("test.log")
sink(con, append=TRUE)
sink(con, append=TRUE, type="message")
to the top of the script, but test.log again showed output, without stan() warning messages.
This is what Fit12_for_stack.R looks like:
con <- file("test.log")
sink(con, append=TRUE)
sink(con, append=TRUE, type="message")
library("rstan")
J <- 2
L <- 3
X <- matrix(c(98, 22, 42, 99, 68, 61), nrow = L, ncol = J)
N <- matrix(100, nrow = L, ncol = J)
fit <- stan(file="try8.stan",
data=list(J, L, X, N),
iter=100, chains=4, seed = 1)
This is what try8.stan looks like:
data{
int<lower=0> J;
int<lower=0> L;
// Declare arrays with integer entries.
int X[L,J];
int N[L,J];
}
parameters {
// Add parameters:
// - pi_vec = [pi_1, ..., pi_L]
// - C_vec = [C_11, ..., C_JJ]
vector<lower=0,upper=1>[L] pi_vec;
vector<lower=0,upper=1>[J] C_vec;
matrix<lower=0,upper=1>[L,J] Alpha;
}
transformed parameters {
}
model {
for (i in 1:L) {
pi_vec[i] ~ uniform(0,1);
}
for (j in 1:J) {
C_vec[j] ~ uniform(0,1);
}
for (i in 1:L) {
for (j in 1:J) {
Alpha[i,j] ~ normal(pi_vec[i], sqrt(C_vec[j]*(pi_vec[i])*(1 - pi_vec[i]))) T[0,1];
// For the (Like = 1) test, have X[i,j] ~ U(0,1),
// i.e. set the likelihood's density to 1 so that posterior density = prior density.
X[i,j] ~ uniform(0,1);
}
}
}
I tried adding stan(..., cores = 2) and the warning messages were logged. I skimmed through the source, and my guess (I could be wrong) is that when cores = 1, the warning is only thrown if R is in interactive mode (right at the very end of the script).
When cores is more than 1, sink() does not appear to log the output from the workers, but redirecting the output seems to work, e.g.
Rscript Fit12_for_stack.R > test.Rout
So I removed the lines
con <- file("test.log")
sink(con, append=TRUE)
sink(con, append=TRUE, type="message")
from
Fit12_for_stack.R
and ran the program with the command
R CMD BATCH Fit12_for_stack.R
This produced an output file
Fit12_for_stack.Rout
with the stan() warnings included.

Resources