How to clusterExport in a foreach loop (OS: Windows) in R

I have the following problem: I need the time index of variables in my global environment, but when I try to export them from the global environment to my cluster workers during parallel processing, I get the following message:
Error in { : task 1 failed - "object 'Szenario' not found"
A minimal example of my original code, which produces the error:
Historical <- structure(c(18.5501872568473, 24.3295787432955, 14.9342384460288,
13.0653757599636, 8.67294618896797, 13.4587662721594, 20.792126254714,
17.5162747884424, 28.8253151239752, 23.0568765432192), index = structure(c(-7305,
-7304, -7303, -7302, -7301, -7300, -7299, -7298, -7297, -7296
), class = "Date"), class = "zoo")
Szenario <- structure(c(10.2800684124652, 14.5495871489361, 9.8565852930294,
21.1654540678686, 21.1936990312861, 12.4209005842752, 9.77473132000267,
17.1997402736739, 17.884107611858, 13.622588360244), index = structure(c(13149,
13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158
), class = "Date"), class = "zoo")
library(doParallel)
library(foreach)
library(raster)
library(zoo)
library(parallel)
# Parallelisation Settings
# Define how many cores you want to use
UseCores <- detectCores() - 2 # use at most detectCores() - 1, since at least one core should stay free for other tasks
# Register CoreCluster
cl <- makeCluster(UseCores)
registerDoParallel(cl)
foreach(fn=1:1) %dopar% {
library(raster) # required packages have to be loaded inside the loop, because each worker runs its own R session
library(zoo)
library(base)
library(parallel)
# In my original script I loop through file names, which are named like the variables
# in my global environment (without the .tif extension); the variable names are stored as character strings
file.referenz.name <- c("Historical")
file.szenario.name <- c("Szenario")
# Create time index for raster stacks to subset them later on (getZ, setZ)
clusterExport(cl, varlist = c(file.szenario.name, file.referenz.name), envir = .GlobalEnv)
time.index.szenario <- index(get(file.szenario.name))
time.index.referenz <- index(get(file.referenz.name))
}
#end cluster
stopCluster(cl)

Try this:
foreach(fn = 1:1, .export = c("Szenario", "Historical"),
        .packages = c("raster", "zoo", "parallel")) %dopar%
Also, calling clusterExport inside a %dopar% {} block doesn't make sense: that code runs on the workers, not on the master. Either call clusterExport on cl before the foreach, or simply use foreach's .export argument.
With .packages in place, you can also remove the library() statements inside the %dopar% {} block.
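Putting this together, a minimal corrected version of the loop might look like the sketch below. Historical is exported as well, since it is likewise looked up via get() inside the loop:
library(doParallel)
library(foreach)
library(zoo)

UseCores <- detectCores() - 2
cl <- makeCluster(UseCores)
registerDoParallel(cl)

file.referenz.name <- "Historical"
file.szenario.name <- "Szenario"

result <- foreach(fn = 1:1,
                  .export = c("Szenario", "Historical"),
                  .packages = "zoo") %dopar% {
  # get() resolves the character names; the objects themselves arrive via .export
  time.index.szenario <- index(get(file.szenario.name))
  time.index.referenz <- index(get(file.referenz.name))
  list(szenario = time.index.szenario, referenz = time.index.referenz)
}

stopCluster(cl)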

Related

`doParallel` vs `future` while using `Seurat` package

Here is the story.
According to the Seurat vignette, FindMarkers() can be accelerated by utilizing the future package: future::plan("multiprocess", workers = 4).
However, I am running a simulation in which I need to use FindAllMarkers() inside a doParallel::foreach() loop after doParallel::registerDoParallel(cores = 10).
What parallelization happens behind the scenes?
How can I leverage the most power of the HPC under this setup?
How many CPUs should I allocate to this job to maximize parallelization?
Any idea is welcome.
Below is a minimal example. pbmc.rds is here.
library(Seurat)
# Enable parallelization for `FindAllMarkers()`
library(future)
plan("multiprocess", workers = 4)
# Enable parallelization for `foreach()` loop
library(doParallel)
registerDoParallel(cores = 10)
pbmc <- readRDS("pbmc.rds")
rst <- foreach(i = (1:10)/10, .combine = "cbind") %dopar% {
pbmc <- FindClusters(pbmc, resolution = i)
# should put future command here instead?
# plan("multiprocess", workers = 4)
DEgenes <- FindAllMarkers(pbmc)
write.csv(DEgenes, paste0("DEgenes_resolution_", i, ".csv"))
pbmc$seurat_clusters
}
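One way to structure this (my illustration, not an answer from the thread) is to declare both levels of parallelism through future's nested topology and drive foreach with the doFuture adapter, so a single plan() controls the whole worker budget. A sketch, using multisession in place of the now-deprecated "multiprocess":
library(Seurat)
library(future)
library(doFuture)
library(foreach)

registerDoFuture() # %dopar% now dispatches through future
# Outer level: 2 workers for the foreach loop.
# Inner level: 4 workers each for FindAllMarkers(), so up to 8 cores in total.
plan(list(tweak(multisession, workers = 2),
          tweak(multisession, workers = 4)))

pbmc <- readRDS("pbmc.rds")
rst <- foreach(i = (1:10)/10, .combine = "cbind") %dopar% {
  pbmc <- FindClusters(pbmc, resolution = i)
  DEgenes <- FindAllMarkers(pbmc) # picks up the inner level of the plan
  write.csv(DEgenes, paste0("DEgenes_resolution_", i, ".csv"))
  pbmc$seurat_clusters
}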

How to fix "C function R_nc4_get_vara_double returned error" in ncdf4 parallel processing in R

I want to download .nc data through OPeNDAP from remote storage. I use a parallel backend with a foreach %dopar% loop as follows:
# INPUTS
inputs=commandArgs(trailingOnly = T)
interimpath=as.character(inputs[1])
gcm=as.character(inputs[2])
period=as.character(inputs[3])
var=as.character(inputs[4])
source='MACAV2'
cat('\n\n EXTRACTING DATA FOR',var, gcm, period, '\n\n')
# CHANGING LIBRARY PATHS
.libPaths("/storage/home/htn5098/local_lib/R40") # local library for packages
setwd('/storage/work/h/htn5098/DataAnalysis')
source('./src/Rcodes/CWD_function_package.R') # Calling the function Rscript
# CALLING PACKAGES
library(foreach)
library(doParallel)
library(parallel)
library(filematrix)
# REGISTERING CORES FOR PARALLEL PROCESSING
no_cores <- detectCores()
cl <- makeCluster(no_cores)
registerDoParallel(cl)
invisible(clusterEvalQ(cl,.libPaths("/storage/home/htn5098/local_lib/R40"))) # Really have to import library paths into the workers
invisible(clusterEvalQ(cl, c(library(ncdf4))))
# EXTRACTING DATA FROM THE .NC FILES TO MATRIX FORM
url <- readLines('./data/external/MACAV2_OPENDAP_allvar_allgcm_allperiod.txt')
links <- grep(x = url,pattern = paste0('.*',var,'.*',gcm,'_.*',period), value = T)
start=c(659,93,1) # lon, lat, time
count=c(527,307,-1)
spfile <- read.csv('./data/external/SERC_MACAV2_Elev.csv',header = T)
grids <- sort(unique(spfile$Grid))
clusterExport(cl,list('ncarray2matrix','start','count','grids')) #exporting data into clusters for parallel processing
cat('\nChecking when downloading all grids\n')
# k <- foreach(x = links,.packages = c('ncdf4')) %dopar% {
# nc <- nc_open(x)
# nc.var=ncvar_get(nc,varid=names(nc$var),start=start,count=count)
# return(nc.var)
# nc_close(nc)
# }
k <- foreach(x = links, .packages = c('ncdf4'), .errorhandling = 'pass') %dopar% {
  nc <- nc_open(x)
  print(nc)
  nc.var <- ncvar_get(nc, varid = names(nc$var), start = c(659,93,1), count = c(527,307,-1))
  nc_close(nc)
  Sys.sleep(10) # brief pause between requests
  return(dim(nc.var))
}
# k <- parSapply(cl,links,function(x) {
# nc <- nc_open(x)
# nc.var=ncvar_get(nc,varid=names(nc$var),start=start,count=count)
# nc_close(nc)
# return(nc.var)
# })
print(k)
However, I keep getting this error:
<simpleError in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval, addOffset, scaleFact, start = start, count = count, verbose = verbose, signedbyte = signedbyte, collapse_degen = collapse_degen): C function R_nc4_get_vara_double returned error>
What could be the reason for this problem? Can you recommend a time-efficient solution (I have to repeat this for about 20 files)?
Thank you.
I had the same error in my code. The problem was not the code itself but one of the files I wanted to read: something was wrong with it, so R couldn't open it. I identified the file, downloaded it again, and the same code worked perfectly.
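Since the question already uses .errorhandling = 'pass', a minimal sketch (my addition) of how to pinpoint the failing file from the foreach result:
# Failed tasks come back as condition objects when .errorhandling = 'pass'
failed <- vapply(k, inherits, logical(1), what = "error")
links[failed] # the URLs whose reads failed; re-check or re-download these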
I also encountered the same error. For me, restarting the R session did the trick.

R foreach loop runs out of memory in HPC environment

I am using the foreach package in R to process raster files.
The R code below works fine locally (on Windows) when adapted to an 8-core processor, but runs out of memory in an HPC environment with 48 cores. The HPC environment has much more memory available (2 TB across all 48 cores) than my local box (32 GB), so that's not the limiting factor.
The memory creep occurs as the foreach loop proceeds. It's slow, but enough to eventually exhaust the available memory.
I have tried switching parallel packages (to doMC, doSNOW), adding numerous garbage collection calls and rm() of large objects at the end of every iteration, fiddling with the number of cores used, as well as removing any temporary files immediately.
Any ideas on what may be causing my memory issues?
# Set Java memory maximum
options(java.parameters = "-Xmx39g")
library(sp)
library(raster)
library(dismo)
library(foreach)
library(doParallel)
library(rgdal)
library(rJava)
# Set directories
relPath <- "E:/BIEN_Cactaceae/"
bufferDir <- "Data/Buffers"
climDir <- "Data/FutureClimate/"
outDir <- "Analyses/FutureRanges/"
modelDir <- "Analyses/MaxEnt/"
outfileDir <- "OutFiles/"
tempDir <- "E:/Tmp/"
# Set directory for raster temporary files
rasterOptions(tmpdir = tempDir)
# Search for models
models <- list.files(path = paste0(relPath, modelDir), pattern = "rda$")
# Set up cluster
cl <- makeCluster(48, type = "FORK", outfile = paste0(relPath, outfileDir, "predictFuture.txt"))
registerDoParallel(cl)
# Loop through species and predict current ranges
foreach(i = seq_along(models),
        .packages = c("sp", "raster", "dismo", "rgdal", "rJava"),
        .inorder = FALSE) %dopar% {
# Get taxon
taxon <- strsplit(models[i], ".", fixed = T)[[1]][1]
# Get buffer
tmpBuffer <- readOGR(dsn = paste0(relPath, bufferDir), layer = paste0(taxon, "_buff"), verbose = F)
# Get scenarios
scenarios <- list.files(path = paste0(relPath, climDir), pattern = "tif$")
# Get model
load(paste0(relPath, modelDir, models[i]))
# Loop over scenarios
for (j in scenarios) {
# Get scenario name
tmpScenarioName <- strsplit(j, ".", fixed = T)[[1]][1]
# Skip scenario if already processed
if (!file.exists(paste0(relPath, outDir, taxon, "_", tmpScenarioName, ".tif"))) {
# Read, crop, mask predictors
print(paste0(taxon, " - ", tmpScenarioName, ": processing"))
tmpScenarioStack <- raster::stack(paste0(relPath, climDir, j))
preds <- raster::crop(tmpScenarioStack, tmpBuffer)
preds <- raster::mask(preds, tmpBuffer)
# Rename predictors
tmpNames <- paste0(taxon, ".", 1:20)
tmpNames <- gsub("-", ".", tmpNames, fixed = T)
tmpNames <- gsub(" ", "_", tmpNames, fixed = T)
names(preds) <- tmpNames
# Predict with model
prediction <- dismo::predict(model_all, preds, progress = "")
# Export predictions
writeRaster(prediction, paste0(relPath, outDir, taxon, "_", tmpScenarioName, ".tif"))
removeTmpFiles(h = 2)
}
}
}
stopCluster(cl)
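One generic mitigation worth trying (my illustration, not from the thread) is to process the models in batches and restart the cluster between batches, so any memory a worker leaks is returned to the OS when its process exits. A sketch, where predict_one_model() is a hypothetical wrapper around the taxon/scenario body above:
# Tear workers down between batches so leaked memory is reclaimed
batches <- split(seq_along(models), ceiling(seq_along(models) / 100))
for (b in batches) {
  cl <- makeCluster(48, type = "FORK",
                    outfile = paste0(relPath, outfileDir, "predictFuture.txt"))
  registerDoParallel(cl)
  foreach(i = b, .inorder = FALSE) %dopar% {
    predict_one_model(i) # hypothetical: the loop body from the question
  }
  stopCluster(cl)
  gc() # collect in the master between batches
}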

Parallel processing in R doParallel foreach save data

Progress has been made on getting the parallel processing part working, but saving the vector with the fetch distances is not working properly. The error I get is:
df_Test_Fetch <- data.frame(x_lake_length)
Error in data.frame(x_lake_length) : object 'x_lake_length' not found
write.table(df_Test_Fetch,file="C:/tempTest_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
Error in is.data.frame(x) : object 'df_Test_Fetch' not found
I have tried altering the code below so that the result of the foreach step is assigned to x_lake_length, but that did not output the vector as I hoped. How can I get the actual results saved to a CSV file? I am running a Windows 8 computer with R x64 3.3.0.
Thank you in advance
Jen
Here is the full code.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho, rgdal, maptools, sp, doParallel, foreach)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# the line below is commented out; when I run the script on my own data, I use it instead of the one above,
# making the name changes as needed
# dsn<-setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
}
foreach(i = 1:nrow(x),.combine=cbind,.packages=c("lakemorpho","rgdal")) %dopar% (
myfun(x,i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())
I think this is what you want, though without understanding the subject matter I can't be 100% sure.
What I did was add a return() to your parallelized function and assign the value returned by the foreach call to x_lake_length. But I'm only guessing that that's what you were trying to do, so please correct me if I'm wrong.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho, rgdal, maptools, sp, doParallel, foreach)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# the line below is commented out; when I run the script on my own data, I use it instead of the one above,
# making the name changes as needed
# dsn<-setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
  return(x_lake_length)
}
x_lake_length <- foreach(i = 1:nrow(x),.combine=cbind,.packages=c("lakemorpho","rgdal")) %dopar% (
myfun(x,i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())
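A leaner variant (my suggestion, not part of the answer above): since each call fills only position i of the vector, you can return just the scalar and combine with .combine = "c", which avoids building the nrow(x)-by-nrow(x) matrix that the cbind version produces:
# Return one length per lake and concatenate the results into a single vector
myfun2 <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  lakeMaxLength(tmp, 200)
}
x_lake_length <- foreach(i = 1:nrow(x), .combine = "c",
                         .packages = c("lakemorpho", "rgdal")) %dopar% {
  myfun2(x, i)
}
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch, file = "C:/temp/Test_Fetch.csv",
            row.names = TRUE, col.names = TRUE, sep = ",")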

Using package snow's parRapply: argument missing error

I want to find documents whose similarity to other documents is larger than a given value (0.1), by cutting the documents into blocks.
library(tm)
data("crude")
sample.dtm <- DocumentTermMatrix(
crude, control=list(
weighting=function(x) weightTfIdf(x, normalize=FALSE),
stopwords=TRUE
)
)
step = 5
n = nrow(sample.dtm)
block = n %/% step
start = (c(1:block)-1)*step+1
end = start+step-1
j = unlist(lapply(1:(block-1),function(x) rep(((x+1):block),times=1)))
i = unlist(lapply(1:block,function(x) rep(x,times=(block-x))))
ij <- cbind(i,j)
library(skmeans)
getdocs <- function(k){
  ci <- c(start[k[[1]]]:end[k[[1]]])
  cj <- c(start[k[[2]]]:end[k[[2]]])
  combi <- sample.dtm[ci]
  combj <- sample.dtm[cj]
  rownames(combi) <- ci
  rownames(combj) <- cj
  comb <- c(combi, combj)
  sim <- 1 - skmeans_xdist(comb)
  cat("Block", k[[1]], "with Block", k[[2]], "\n")
  flush.console()
  tri.sim <- upper.tri(sim, diag = FALSE)
  results <- tri.sim & sim > 0.1
  docs <- apply(results, 1, function(x) length(x[x == TRUE]))
  docnames <- names(docs)[docs > 0]
  gc()
  return(docnames)
}
It works well when using apply:
system.time(rmdocs<-apply(ij,1,getdocs))
When using parRapply:
library(snow)
library(skmeans)
cl<-makeCluster(2)
clusterExport(cl,list("getdocs","sample.dtm","start","end"))
system.time(rmdocs<-parRapply(cl,ij,getdocs))
Error:
Error in checkForRemoteErrors(val) :
2 nodes produced errors; first error: attempt to set 'rownames' on an object with no dimensions
Timing stopped at: 0.01 0 0.04
It seems that sample.dtm couldn't be used in parRapply. I'm confused. Can anyone help me? Thanks!
In addition to exporting objects, you need to load the necessary packages on the cluster workers. In your case, the result of not doing so is that there isn't a dimnames method defined for "DocumentTermMatrix" objects, causing rownames<- to fail.
You can load packages on the cluster workers with the clusterEvalQ function:
clusterEvalQ(cl, { library(tm); library(skmeans) })
After doing that, rownames(combi)<-ci will work correctly.
Also, if you want to see the output from cat, you should use the makeCluster outfile argument:
cl <- makeCluster(2, outfile='')
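Putting both pieces together, the full cluster setup might look like this sketch, which consolidates the calls above:
library(snow)
library(skmeans)

cl <- makeCluster(2, outfile = '') # outfile = '' echoes the workers' cat() output

# Load the required packages on each worker, then ship over the objects they need
clusterEvalQ(cl, { library(tm); library(skmeans) })
clusterExport(cl, list("getdocs", "sample.dtm", "start", "end"))

system.time(rmdocs <- parRapply(cl, ij, getdocs))
stopCluster(cl)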
