Parallel processing in R doParallel foreach save data

Progress has been made on getting the parallel processing part working, but saving the vector with the fetch distances is not working properly. The error I get is:
df_Test_Fetch <- data.frame(x_lake_length)
Error in data.frame(x_lake_length) : object 'x_lake_length' not found
write.table(df_Test_Fetch,file="C:/tempTest_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
Error in is.data.frame(x) : object 'df_Test_Fetch' not found
I have tried altering the code below so that the foreach step is assigned to x_lake_length, but that did not output the vector as I hoped. How can I get the actual results saved to a CSV file? I am running a Windows 8 computer with R x64 3.3.0.
Thank you in advance
Jen
Here is the full code.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho, rgdal, maptools, sp, doParallel, foreach)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# when I run the script on my own data, I use the commented-out line below
# instead of the one above, making the name changes as needed
# dsn <- setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
}
foreach(i = 1:nrow(x), .combine = cbind,
        .packages = c("lakemorpho", "rgdal")) %dopar% (
  myfun(x, i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())

I think this is what you want, though without understanding the subject matter I can't be 100% sure.
What I did was add a return() to your parallelized function and assign the returned value to x_lake_length when calling foreach. But I'm only guessing that that's what you were trying to do, so please correct me if I'm wrong.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho, rgdal, maptools, sp, doParallel, foreach)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# when I run the script on my own data, I use the commented-out line below
# instead of the one above, making the name changes as needed
# dsn <- setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
  return(x_lake_length)
}
x_lake_length <- foreach(i = 1:nrow(x), .combine = cbind,
                         .packages = c("lakemorpho", "rgdal")) %dopar% (
  myfun(x, i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())
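A possible simplification, if (as I'm guessing) you only need one scalar length per lake: have the worker return just the i-th value and combine with c(), so foreach yields a plain numeric vector instead of an nrow(x)-by-nrow(x) matrix. A sketch, untested on your data:
myfun2 <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  lakeMaxLength(tmp, 200)
}
x_lake_length <- foreach(i = 1:nrow(x), .combine = c,
                         .packages = c("lakemorpho", "rgdal")) %dopar% {
  myfun2(x, i)
}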

Related

R GSIF package buffer.dist(): 'subscript out of bounds'

I wish to use the buffer.dist() function of the GSIF package developed by Tomislav Hengl et al. (2018). It has not been updated since 2019 and was taken down from CRAN.
I downloaded the latest version of GSIF (v0.5-5 - 2019-01-04) from the CRAN repository and loaded the functions manually into the R workspace. All functions can be found in the folder "R".
> sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 11.6
# Manually load GSIF environment (manually download from CRAN repository)
source("AAAA.R") # needs to be loaded first
# Manually load function buffer.dist()
source("buffer.dist.R")
# Load library
library(sp)
library(plotKML)
library(raster)
library(gstat)
## Follow the workflow in the tutorial: https://peerj.com/articles/5518/GeoMLA_README_thengl.pdf
# Load example data from gstat package
data(meuse, echo = FALSE)
data(meuse.grid)
# transform into SpatialPoints objects (input data requirement for buffer.dist() )
meuse.sp <- SpatialPointsDataFrame(meuse[1:2], meuse[3:14], proj4string = CRS('+init=epsg:4326'))
meuse.grid.spdf <- SpatialPixelsDataFrame(meuse.grid[1:2], meuse.grid[6], proj4string = CRS('+init=epsg:4326'))
# derive buffer distances for each individual point, using the buffer function in the raster package, which derives a gridded map for each observation point
grid.dist0 <- buffer.dist(meuse.sp["zinc"],
                          meuse.grid.spdf[1],
                          as.factor(1:nrow(meuse.sp)))
This gives me the following error message:
Error in x@coords[i, , drop = FALSE] : subscript out of bounds
Here is the buffer.dist() function (Hengl et al., 2018):
setMethod("buffer.dist", signature(observations = "SpatialPointsDataFrame", predictionDomain = "SpatialPixelsDataFrame"), function(observations, predictionDomain, classes, width, ...){
if(missing(width)){ width <- sqrt(areaSpatialGrid(predictionDomain)) }
if(!length(classes)==length(observations)){ stop("Length of 'observations' and 'classes' does not match.") }
## remove classes without any points:
xg = summary(classes, maxsum=length(levels(classes)))
selg.levs = attr(xg, "names")[xg > 0]
if(length(selg.levs)<length(levels(classes))){
fclasses <- as.factor(classes)
fclasses[which(!fclasses %in% selg.levs)] <- NA
classes <- droplevels(fclasses)
}
## derive buffer distances
s <- list(NULL)
for(i in 1:length(levels(classes))){
s[[i]] <- raster::distance(rasterize(observations[which(classes==levels(classes)[i]),1]#coords, y=raster(predictionDomain)), width=width, ...)
}
s <- s[sapply(s, function(x){!is.null(x)})]
s <- brick(s)
s <- as(s, "SpatialPixelsDataFrame")
s <- s[predictionDomain#grid.index,]
return(s)
})
I went through all steps of the function manually. The bug seems to occur in the second-to-last row:
s <- s[predictionDomain@grid.index,]
Error in x@coords[i, , drop = FALSE] : subscript out of bounds
Do you have any suggestion how to fix the issue?
You do not describe what that method does, but it seems that it does something like this:
bufdist <- function(obs, r, classes, width) {
  s <- list()
  cls <- sort(unique(classes))
  for (i in 1:length(cls)) {
    obsi <- obs[classes == cls[i], ]
    x <- rasterize(obsi, r)
    s[[i]] <- buffer(x, width, background = NA)
  }
  names(s) <- cls
  rast(s)
}
library(terra)
f <- system.file("ex/elev.tif", package="terra")
r <- rast(f)
set.seed(1)
v <- spatSample(r, 50, as.points=TRUE)
cls <- sample(LETTERS[1:4], 50, replace=TRUE)
b <- bufdist(v, r, cls, 7500)
plot(b, col="red")

How to fix C function R_nc4_get_vara_double returned error in ncdf4 parallel processing in R

I want to download .nc data through OPeNDAP from remote storage. I use a parallel backend with a foreach %dopar% loop as follows:
# INPUTS
inputs=commandArgs(trailingOnly = T)
interimpath=as.character(inputs[1])
gcm=as.character(inputs[2])
period=as.character(inputs[3])
var=as.character(inputs[4])
source='MACAV2'
cat('\n\n EXTRACTING DATA FOR',var, gcm, period, '\n\n')
# CHANGING LIBRARY PATHS
.libPaths("/storage/home/htn5098/local_lib/R40") # local library for packages
setwd('/storage/work/h/htn5098/DataAnalysis')
source('./src/Rcodes/CWD_function_package.R') # Calling the function Rscript
# CALLING PACKAGES
library(foreach)
library(doParallel)
library(parallel)
library(filematrix)
# REGISTERING CORES FOR PARALLEL PROCESSING
no_cores <- detectCores()
cl <- makeCluster(no_cores)
registerDoParallel(cl)
invisible(clusterEvalQ(cl,.libPaths("/storage/home/htn5098/local_lib/R40"))) # Really have to import library paths into the workers
invisible(clusterEvalQ(cl, c(library(ncdf4))))
# EXTRACTING DATA FROM THE .NC FILES TO MATRIX FORM
url <- readLines('./data/external/MACAV2_OPENDAP_allvar_allgcm_allperiod.txt')
links <- grep(x = url,pattern = paste0('.*',var,'.*',gcm,'_.*',period), value = T)
start=c(659,93,1) # lon, lat, time
count=c(527,307,-1)
spfile <- read.csv('./data/external/SERC_MACAV2_Elev.csv',header = T)
grids <- sort(unique(spfile$Grid))
clusterExport(cl,list('ncarray2matrix','start','count','grids')) #exporting data into clusters for parallel processing
cat('\nChecking when downloading all grids\n')
# k <- foreach(x = links,.packages = c('ncdf4')) %dopar% {
# nc <- nc_open(x)
# nc.var=ncvar_get(nc,varid=names(nc$var),start=start,count=count)
# return(nc.var)
# nc_close(nc)
# }
k <- foreach(x = links, .packages = c('ncdf4'), .errorhandling = 'pass') %dopar% {
  nc <- nc_open(x)
  print(nc)
  nc.var = ncvar_get(nc, varid = names(nc$var), start = c(659,93,1), count = c(527,307,-1))
  nc_close(nc)
  return(dim(nc.var))
  Sys.sleep(10)
}
# k <- parSapply(cl,links,function(x) {
# nc <- nc_open(x)
# nc.var=ncvar_get(nc,varid=names(nc$var),start=start,count=count)
# nc_close(nc)
# return(nc.var)
# })
print(k)
However, I keep getting this error:
<simpleError in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval, addOffset, scaleFact, start = start, count = count, verbose = verbose, signedbyte = signedbyte, collapse_degen = collapse_degen): C function R_nc4_get_vara_double returned error>
What could be the reason for this problem? Can you recommend a solution for this that is time-efficient (I have to repeat this for about 20 files)?
Thank you.
I had the same error in my code. The problem was not the code itself; it was one of the files that I wanted to read. Something was wrong with it, so R couldn't open it. I identified the file and downloaded it again, and the same code worked perfectly.
I also encountered the same error. For me, restarting R session did the trick.
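If it helps to pinpoint which file is at fault, here is a minimal sketch (assuming the links vector from the question) that tries to open each URL and records the failures so those files can be re-downloaded:
library(ncdf4)
bad <- character(0)
for (x in links) {
  ok <- tryCatch({ nc <- nc_open(x); nc_close(nc); TRUE },
                 error = function(e) FALSE)
  if (!ok) bad <- c(bad, x)  # collect the unreadable files
}
print(bad)  # re-download these, then rerun the foreach loop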

How to apply rma() normalization to a single CEL file?

I have implemented an R script that performs batch correction on a gene expression dataset. To do the batch correction, I first need to normalize the data in each CEL file through the Affy rma() function of Bioconductor.
If I run it on the GSE59867 dataset obtained from GEO, everything works.
I define a batch as the data collection date: I put all the CEL files having the same date into a specific folder, and then consider that date/folder as a specific batch.
On the GSE59867 dataset, a batch/folder contains only 1 CEL file. Nonetheless, the rma() function works on it perfectly.
But if I try to run my script on another dataset (GSE36809), I run into trouble: if I apply the rma() function to a batch/folder containing only 1 file, I get the following error:
Error in `colnames<-`(`*tmp*`, value = "GSM901376_c23583161.CEL.gz") :
attempt to set 'colnames' on an object with less than two dimensions
Here's my specific R code, to show you what I mean.
You first have to download the file GSM901376_c23583161.CEL.gz:
setwd(".")
options(stringsAsFactors = FALSE)
fileURL <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM901nnn/GSM901376/suppl/GSM901376%5Fc23583161%2ECEL%2Egz"
fileDownloadCommand <- paste("wget ", fileURL, " ", sep="")
system(fileDownloadCommand)
Library installation:
source("https://bioconductor.org/biocLite.R")
list.of.packages <- c("easypackages")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
listOfBiocPackages <- c("oligo", "affyio","BiocParallel")
bioCpackagesNotInstalled <- which( !listOfBiocPackages %in% rownames(installed.packages()) )
cat("package missing listOfBiocPackages[", bioCpackagesNotInstalled, "]: ", listOfBiocPackages[bioCpackagesNotInstalled], "\n", sep="")
if( length(bioCpackagesNotInstalled) ) {
biocLite(listOfBiocPackages[bioCpackagesNotInstalled])
}
library("easypackages")
libraries(list.of.packages)
libraries(listOfBiocPackages)
Application of rma()
thisFileDate <- "GSM901376_c23583161.CEL.gz"
thisDateRawData <- read.celfiles(thisFileDate)
thisDateNormData <- rma(thisDateRawData)
After the call to rma(), I get the error.
How can I solve this problem?
I also tried to skip this normalization by saving the thisDateRawData object directly. But then I have the problem that I cannot combine this thisDateRawData (an ExpressionFeatureSet) with the outputs of rma() (which are ExpressionSet objects).
(EDIT: I extensively edited the question, and added a piece of R code you should be able to run on your pc.)
Hmm, this is a puzzling problem. The oligo::rma() function might be buggy for class GeneFeatureSet with single samples. I got it to work with a single sample by using lower-level functions, but it means I also had to create the expression set from scratch by specifying the slots:
# source("https://bioconductor.org/biocLite.R")
# biocLite("GEOquery")
# biocLite("pd.hg.u133.plus.2")
# biocLite("pd.hugene.1.0.st.v1")
library(GEOquery)
library(oligo)
# # Instead of using .gz files, I extracted the actual CELs.
# # This is just to illustrate how I read in the files; your usage will differ.
# projectDir <- "" # Path to .tar files here
# setwd(projectDir)
# untar("GSE36809_RAW.tar", exdir = "GSE36809")
# untar("GSE59867_RAW.tar", exdir = "GSE59867")
# setwd("GSE36809"); gse3_cels <- dir()
# sapply(paste(gse3_cels, sep = "/"), gunzip); setwd(projectDir)
# setwd("GSE59867"); gse5_cels <- dir()
# sapply(paste(gse5_cels, sep = "/"), gunzip); setwd(projectDir)
#
# Read in CEL
#
# setwd("GSE36809"); gse3_cels <- dir()
# gse3_efs <- read.celfiles(gse3_cels[1])
# # Assuming you've read in the CEL files as a GeneFeatureSet or
# # ExpressionFeatureSet object (i.e. gse3_efs in this example),
# # you can now fit the RMA and create an ExpressionSet object with it:
exprsData <- basicRMA(exprs(gse3_efs), pnVec = featureNames(gse3_efs))
gse3_expset <- new("ExpressionSet")
slot(gse3_expset, "assayData") <- assayDataNew(exprs = exprsData)
slot(gse3_expset, "phenoData") <- phenoData(gse3_efs)
slot(gse3_expset, "featureData") <- annotatedDataFrameFrom(attr(gse3_expset,
'assayData'), byrow = TRUE)
slot(gse3_expset, "protocolData") <- protocolData(gse3_efs)
slot(gse3_expset, "annotation") <- slot(gse3_efs, "annotation")
Hopefully the above approach will work in your code.
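As a usage note, the single-sample path above can be wrapped together with the normal rma() path into one helper for your batch/folder workflow. This is only a sketch based on the code above; normalizeBatch and celfiles are hypothetical names:
normalizeBatch <- function(celfiles) {
  raw <- read.celfiles(celfiles)
  if (length(celfiles) > 1) return(rma(raw))  # usual multi-sample path
  # single-sample fallback via basicRMA, exactly as in the answer above
  exprsData <- basicRMA(exprs(raw), pnVec = featureNames(raw))
  eset <- new("ExpressionSet")
  slot(eset, "assayData") <- assayDataNew(exprs = exprsData)
  slot(eset, "phenoData") <- phenoData(raw)
  slot(eset, "featureData") <- annotatedDataFrameFrom(attr(eset, "assayData"), byrow = TRUE)
  slot(eset, "protocolData") <- protocolData(raw)
  slot(eset, "annotation") <- slot(raw, "annotation")
  eset
}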

Using R on a Hadoop installed cluster

I am an R user. I know very little about Linux commands, PuTTY, or Hadoop/Hive, so please correct me if I am wrong.
I am now working with a team. They have a running Ubuntu system on a cluster. I can use PuTTY to access this Ubuntu system and access the data files by using code:
user$hadoop fs -ls /datafolder/
or by using hive:
user$hive
hive>use datafolder;
hive>show tables;
Conversely, the team I am working with knows very little about R, so they want me to do the R part. I have installed R on the cluster, along with rJava, RHive, and other packages in R. (I am not sure I did this correctly, but R seems to be running OK.)
Now I can do some testing. I can run the following code in R on the cluster:
user$R
>install.packages(c('Rcpp', 'RJSONIO', 'bitops', 'digest', 'functional', 'stringr', 'plyr', 'reshape2','caTools'))
>Sys.setenv(HADOOP_CMD="/opt/cloudera/bin/hadoop")
>Sys.setenv(HADOOP_HOME="/opt/cloudera/lib/hadoop")
>Sys.setenv(HADOOP_STREAMING="/opt/cloudera/lib/hadoop-mapreduce/hadoop-streaming.jar")
>library(rmr2)
>library(rhdfs)
>hdfs.init()
Testing:
>ints = to.dfs(1:10)
>calc = mapreduce(input = ints, map = function(k,v) cbind(v, v/2, 2*v))
>test <- from.dfs(calc)
>test
I can successfully load "test" back by using from.dfs. It seemed like I can save a dummy dataset to Hadoop, and can get it back from Hadoop successfully (correct?)
Now, my question is, how to let R import those datasets that I can see from
user$hadoop fs -ls /datafolder/
or
>hive use datafolder;
Here is an example of wordcount that loads the result back into R:
Sys.setenv(HADOOP_CMD="/usr/bin/hadoop")
Sys.setenv(HADOOP_STREAMING="/opt/cloudera/parcels/CDH-5.1.0-1.cdh5.1.0.p0.53/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming.jar")
Sys.setenv(JAVA_HOME="/usr/java/jdk1.7.0_55-cloudera")
Sys.setenv(HADOOP_COMMON_LIB_NATIVE_DIR="/opt/cloudera/parcels/CDH-5.1.0-1.cdh5.1.0.p0.53/lib/hadoop/lib/native")
Sys.setenv(HADOOP_OPTS="-Djava.library.path=HADOOP_HOME/lib")
library(rhdfs)
hdfs.init()
library(rmr2)
## space and word delimiter
map <- function(k, lines) {
  words.list <- strsplit(lines, '\\s')
  words <- unlist(words.list)
  return( keyval(words, 1) )
}
reduce <- function(word, counts) {
  keyval(word, sum(counts))
}
wordcount <- function(input, output = NULL) {
  mapreduce(input = input, output = output, input.format = "text", map = map, reduce = reduce)
}
## variables
hdfs.root <- '/user/node'
hdfs.data <- file.path(hdfs.root, 'data')
hdfs.out <- file.path(hdfs.root, 'out')
## run mapreduce job
##out <- wordcount(hdfs.data, hdfs.out)
system.time(out <- wordcount(hdfs.data, hdfs.out))
## fetch results from HDFS
results <- from.dfs(out)
results.df <- as.data.frame(results, stringsAsFactors=F)
colnames(results.df) <- c('word', 'count')
##head(results.df)
## sorted output TOP10
head(results.df[order(-results.df$count),],10)
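To read an existing delimited file that hadoop fs -ls /datafolder/ shows, without writing a custom job, one option is rmr2's built-in csv input format. This is a sketch with a hypothetical path and separator, untested on your cluster:
csv.fmt <- make.input.format("csv", sep = ",")  # adjust sep to your files
dat <- from.dfs(mapreduce(input = "/datafolder/myfile.csv",  # hypothetical path
                          input.format = csv.fmt))
df <- values(dat)  # the rows as a data frame
For Hive tables, the RHive package offers a direct query interface (assuming it is installed and configured on your cluster):
# library(RHive)
# rhive.connect()
# df2 <- rhive.query("SELECT * FROM datafolder.mytable LIMIT 1000")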

How to implement parallel jags on Windows with foreach?

I would like to run JAGS models in parallel on my Windows computer with 4 cores, but have not been able to figure out why my model will not run. I have searched the web extensively, including these posts:
http://andrewgelman.com/2011/07/23/parallel-jags-rngs/
http://users.soe.ucsc.edu/~draper/eBay-Google-2013-parallel-rjags-example.txt
When I run a simple example (see code below) with %do%, the model runs fine (serially of course). When I use %dopar%, I receive the error:
Error in { : task 1 failed - "Symbol table is empty"
library(rjags)
library(coda)
library(foreach)
library(doParallel)
library(random)
load.module("lecuyer")
### Data generation
y <- rnorm(100)
n <- length(y)
win.data <- list(y=y, n=n)
# Define model
sink("model.txt")
cat("
model {
  # Priors
  mu ~ dnorm(0, 0.001)
  tau <- 1 / (sigma * sigma)
  sigma ~ dunif(0, 10)
  # Likelihood
  for (i in 1:n) {
    y[i] ~ dnorm(mu, tau)
  }
}
", fill=TRUE)
sink()
inits <- function(){ list(mu=rnorm(1), sigma=runif(1, 0, 10),
                          .RNG.name = "lecuyer::RngStream",
                          .RNG.seed = as.numeric(randomNumbers(n = 1, min = 1, max = 1e+06, col = 1))) }
params <- c('mu','sigma')
cl <- makePSOCKcluster(3)
clusterSetRNGStream(cl)
registerDoParallel(cl)
model.wd <- paste(getwd(), '/model.txt', sep='') # I wondered if the cores were having trouble finding the model.
m <- foreach(i=1:3, .packages=c('rjags','random','coda'), .multicombine=TRUE) %dopar% {
  load.module("lecuyer")
  model.jags <- jags.model(model.wd, win.data, inits=inits, n.chains=1, n.adapt=1000, quiet=TRUE)
  result <- coda.samples(model.jags, params, 1000, thin=5)
  return(result)
}
stopCluster(cl)
# Error in { : task 1 failed - "Symbol table is empty"
sessionInfo()
# R version 3.0.1 (2013-05-16)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
#
# locale:
# [1] LC_COLLATE=English_Canada.1252 LC_CTYPE=English_Canada.1252 LC_MONETARY=English_Canada.1252
# [4] LC_NUMERIC=C LC_TIME=English_Canada.1252
#
# attached base packages:
# [1] parallel stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] random_0.2.1 doParallel_1.0.3 iterators_1.0.6 foreach_1.4.1 rjags_3-10 coda_0.16-1
# [7] lattice_0.20-21
#
# loaded via a namespace (and not attached):
# [1] codetools_0.2-8 compiler_3.0.1 grid_3.0.1 tools_3.0.1
More details:
The problem occurs on a Windows 7 computer with NO admin privileges, but not on a computer WITH admin privileges. The problem occurs with Rgui and Rterm, and with the new rjags package 3-11. The error message occurs within the function jags.model.
The problem appears to stem from a mismatch in writing and reading files to a temporary directory. When I start R, it automatically creates a temporary folder. When I close R, this folder is automatically deleted, unless it contains files.
For example, when I start R it creates this folder:
C:\Users\jesse whittington\AppData\Local\Temp\RtmpoBe1gw.
When I run an rjags model with
m <- jags.model(file='model.txt', data=win.data, inits=inits, n.chains=3, n.adapt=1000, quiet=FALSE)
No files are written to this temporary directory.
When I run 3 chains serially with foreach and %do%, 3 temporary files are written to this folder. These files are 1 kb in size and when I open with a text editor they appear blank.
wd <- getwd()
cl <- makePSOCKcluster(3, outfile=paste(wd,'/Out_messages.txt', sep='')) # 3 chains
clusterSetRNGStream(cl)
registerDoParallel(cl)
m <- foreach(i=1:3, .packages=c('rjags','random','coda'), .multicombine=TRUE) %do% {
  load.module("lecuyer")
  result <- jags.model(file='model.txt', data=win.data, inits=inits, n.chains=1, n.adapt=1000, quiet=FALSE)
  return(result)
}
stopCluster(cl)
When I run 3 chains in parallel with foreach and %dopar%, 3 temporary files are written to the folder ..Temp\RtmpoBe1gw. The error messages in the outfile suggest that the function is looking for DIFFERENT files in DIFFERENT temporary directories. When I include a line to create a tempfile directory and name, I see that 3 new temporary folders are created (they are later deleted by stopCluster). jags.model looks in these 3 folders for the temporary files and fails because there is nothing in them. Thus, I suspect the tempfiles are written to one temporary directory (associated with the parent R session), and the call then fails when trying to open different tmpfiles in the 3 temporary directories created within foreach.
wd <- getwd()
cl <- makePSOCKcluster(3, outfile=paste(wd,'/Out_messages.txt', sep='')) # 3 chains
clusterSetRNGStream(cl)
registerDoParallel(cl)
m <- foreach(i=1:3, .packages=c('rjags','random','coda'), .multicombine=TRUE) %dopar% {
  load.module("lecuyer")
  tmp <- tempfile()
  print(tmp)
  result <- jags.model(file='model.txt', data=win.data, inits=inits, n.chains=1, n.adapt=1000, quiet=FALSE)
  return(result)
}
stopCluster(cl)
From Out_messages.txt
starting worker pid=4396 on localhost:11109 at 08:34:06.430
starting worker pid=6548 on localhost:11109 at 08:34:06.879
starting worker pid=6212 on localhost:11109 at 08:34:07.418
Loading required package: coda
Loading required package: lattice
Loading required package: coda
Loading required package: lattice
Loading required package: coda
Loading required package: lattice
Linked to JAGS 3.3.0
Loaded modules: basemod,bugs
Linked to JAGS 3.3.0
Loaded modules: basemod,bugs
Linked to JAGS 3.3.0
Loaded modules: basemod,bugs
module lecuyer loaded
module lecuyer loaded
module lecuyer loaded
[1] "C:\\Users\\JESSEW~1\\AppData\\Local\\Temp\\RtmpQbPAVC\\file112c8077a0" # Note this is from: tmp <- tempfile()
[1] "C:\\Users\\JESSEW~1\\AppData\\Local\\Temp\\RtmpMPMpcY\\file199489564c6"
[1] "C:\\Users\\JESSEW~1\\AppData\\Local\\Temp\\Rtmpk9vMR5\\file18445f6b2fd4"
Compiling model graph
Compiling model graph
Compiling model graph
Warning messages:
1: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Unused variable "y" in data
2: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Unused variable "n" in data
3: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Failed to open file C:\Users\JESSEW~1\AppData\Local\Temp\RtmpQbPAVC\file112c394b4eef
Nothing to compile
4: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Unused initial value for "mu" in chain 1
5: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Unused initial value for "sigma" in chain 1
6: In jags.model(file = "model.txt", data = win.data, inits = inits, :
Can't initialize. No nodes in graph (Have you compiled the model?)
The folder RtmpQbPAVC is created but the file file112c394b4eef does not exist.
Steve brought this to my attention, but your second example shows that it is not a problem with rjags. I am unable to reproduce the bug in either example using the same setup (Windows 7, R 3.0.1, JAGS 3.0.3, ordinary user without admin access).
Since the errors are caused by writing and reading the model file, I suggest that you bypass that issue by using the "textConnection" function. This can be used to create a file-like object without creating an actual file, thus avoiding the need for temporary files. I modified your example to demonstrate this:
library(rjags)
library(doParallel)
library(random)
load.module("lecuyer")
y <- rnorm(100)
n <- length(y)
win.data <- list(y=y, n=n)
model <- "
model {
# Priors
mu ~ dnorm(0, 0.001)
tau <- 1 / (sigma * sigma)
sigma ~ dunif(0, 10)
# Likelihood
for (i in 1:n) {
y[i] ~ dnorm(mu, tau)
}
}"
inits <- function() {
list(mu=rnorm(1), sigma=runif(1, 0, 10),
.RNG.name="lecuyer::RngStream",
.RNG.seed=as.numeric(randomNumbers(n=1, min=1, max=1e+06, col=1)))
}
params <- c('mu', 'sigma')
cl <- makePSOCKcluster(3)
clusterSetRNGStream(cl)
registerDoParallel(cl)
m <- foreach(i=1:3, .packages=c('rjags', 'random'),
             .combine='c', .final=mcmc.list) %dopar% {
  load.module("lecuyer")
  model.jags <- jags.model(textConnection(model), win.data, inits=inits,
                           n.chains=1, n.adapt=1000, quiet=TRUE)
  coda.samples(model.jags, params, 1000, thin=5)
}
I also changed the result handling so that the value returned by the foreach loop is an "mcmc.list" object, which is what the "coda.samples" function returns.
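As a usage note, the combined object is a standard mcmc.list, so the usual coda diagnostics apply to it directly:
summary(m)      # posterior summaries pooled across the three chains
gelman.diag(m)  # between-chain convergence diagnostic
plot(m)         # trace and density plots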
I have identified the source of the problem.
I can write and read files to and from a temporary directory when using R normally.
When in parallel, I can write files to the temporary directories, but I do NOT have permission to read files.
The problem occurs with both text files (using writeLines and readLines) and CSV files.
I have since found that if I receive this message: "Error in { : task 1 failed - cannot open the connection", I can rectify the problem by deleting all temporary files in TEMP. For some locked files, I have to shut down and restart the computer before I am able to delete the necessary files. Even so, within the same R session I might receive the error message and then be able to successfully run the program on my next try. The problem likely stems from our government anti-virus software and/or the structure of our remote network access.
Here is an example that writes and reads text files for simplicity.
library(foreach)
library(doParallel)
wd <- getwd()
data <- data.frame(x=1:10, y=1:10)
This works fine.
modfile <- tempfile()
print(modfile)
# "C:\\Users\\JESSEW~1\\AppData\\Local\\Temp\\RtmpsvYfFk\\filef38a272022"
write.csv(data, modfile, row.names=F)
m <- read.csv(modfile)
This does not work
cl <- makePSOCKcluster(3, outfile=paste(wd,'/Out_messages.txt', sep='')) # 3 chains
clusterSetRNGStream(cl)
registerDoParallel(cl)
m <- foreach(i=1:3) %dopar% {
  modfile <- tempfile()
  write.csv(data, modfile, row.names=F)
  x <- read.csv(modfile)
  return(x)
}
# Error in { : task 1 failed - "cannot open the connection"
stopCluster(cl)
Here is the output from Out_messages.txt. Note the "Permission denied" at the end of each message.
starting worker pid=6852 on localhost:11611 at 22:09:19.488
starting worker pid=6984 on localhost:11611 at 22:09:19.926
starting worker pid=3384 on localhost:11611 at 22:09:20.441
Warning message:
Warning message:
In file(con, "r") :
cannot open file 'C:\Users\JESSEW~1\AppData\Local\Temp\Rtmp6dEZLP\file1ac44a506032': Permission denied
In file(con, "r") :
cannot open file 'C:\Users\JESSEW~1\AppData\Local\Temp\RtmpuydRvR\file1b48185f2a2d': Permission denied
Warning message:
In file(con, "r") :
cannot open file 'C:\Users\JESSEW~1\AppData\Local\Temp\RtmpAbOIng\filed382ef37d51': Permission denied
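One untested workaround sketch for this situation: point TMPDIR at a folder the workers are known to be able to read and write before creating the cluster. PSOCK workers are child processes and inherit the environment, so their tempdir() should land there instead of the locked per-user Temp folder (C:/Rtemp is a hypothetical location):
dir.create("C:/Rtemp", showWarnings = FALSE)
Sys.setenv(TMPDIR = "C:/Rtemp")
library(doParallel)
cl <- makePSOCKcluster(3)
registerDoParallel(cl)
m <- foreach(i = 1:3) %dopar% {
  modfile <- tempfile()  # should now resolve under C:/Rtemp
  write.csv(data.frame(x = 1:10), modfile, row.names = FALSE)
  read.csv(modfile)
}
stopCluster(cl)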
