R foreach loop runs out of memory in HPC environment

I am using the foreach package in R to process raster files.
The R code below works fine locally (on Windows) when adapted to an 8-core processor, but runs out of memory in an HPC environment with 48 cores. The HPC environment has far more memory available (2 TB across all 48 cores) than my local machine (32 GB), so memory capacity is not the limiting factor.
Memory usage creeps up as the foreach loop proceeds - slowly, but enough to eventually exhaust it.
I have tried switching parallel packages (to doMC, doSNOW), adding numerous garbage collection calls and rm() of large objects at the end of every iteration, fiddling with the number of cores used, as well as removing any temporary files immediately.
Any ideas on what may be causing my memory issues?
# Set Java memory maximum
options(java.parameters = "-Xmx39g")
library(sp)
library(raster)
library(dismo)
library(foreach)
library(doParallel)
library(rgdal)
library(rJava)
# Set directories
relPath <- "E:/BIEN_Cactaceae/"
bufferDir <- "Data/Buffers"
climDir <- "Data/FutureClimate/"
outDir <- "Analyses/FutureRanges/"
modelDir <- "Analyses/MaxEnt/"
outfileDir <- "OutFiles/"
tempDir <- "E:/Tmp/"
# Set directory for raster temporary files
rasterOptions(tmpdir = tempDir)
# Search for models
models <- list.files(path = paste0(relPath, modelDir), pattern = "rda$")
# Set up cluster
cl <- makeCluster(48, type = "FORK", outfile = paste0(relPath, outfileDir, "predictFuture.txt"))
registerDoParallel(cl)
# Loop through species and predict current ranges
foreach(i = 1:length(models),
        .packages = c("sp", "raster", "dismo", "rgdal", "rJava"),
        .inorder = FALSE) %dopar% {
  # Get taxon
  taxon <- strsplit(models[i], ".", fixed = TRUE)[[1]][1]
  # Get buffer
  tmpBuffer <- readOGR(dsn = paste0(relPath, bufferDir), layer = paste0(taxon, "_buff"), verbose = FALSE)
  # Get scenarios
  scenarios <- list.files(path = paste0(relPath, climDir), pattern = "tif$")
  # Get model
  load(paste0(relPath, modelDir, models[i]))
  # Loop over scenarios
  for (j in scenarios) {
    # Get scenario name
    tmpScenarioName <- strsplit(j, ".", fixed = TRUE)[[1]][1]
    # Skip scenario if already processed
    if (!file.exists(paste0(relPath, outDir, taxon, "_", tmpScenarioName, ".tif"))) {
      # Read, crop, mask predictors
      print(paste0(taxon, " - ", tmpScenarioName, ": processing"))
      tmpScenarioStack <- raster::stack(paste0(relPath, climDir, j))
      preds <- raster::crop(tmpScenarioStack, tmpBuffer)
      preds <- raster::mask(preds, tmpBuffer)
      # Rename predictors
      tmpNames <- paste0(taxon, ".", 1:20)
      tmpNames <- gsub("-", ".", tmpNames, fixed = TRUE)
      tmpNames <- gsub(" ", "_", tmpNames, fixed = TRUE)
      names(preds) <- tmpNames
      # Predict with model
      prediction <- dismo::predict(model_all, preds, progress = "")
      # Export predictions
      writeRaster(prediction, paste0(relPath, outDir, taxon, "_", tmpScenarioName, ".tif"))
      removeTmpFiles(h = 2)
    }
  }
}
stopCluster(cl)
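For reference, a minimal sketch of the per-iteration cleanup described above, with the loop body elided (the explicit rm()/gc() calls and the NULL return are illustrative attempts, not a known fix):
# Sketch: per-iteration cleanup (loop body elided; object names as in the code above)
foreach(i = 1:length(models),
        .packages = c("sp", "raster", "dismo", "rgdal", "rJava"),
        .inorder = FALSE) %dopar% {
  # ... per-species work exactly as above ...
  rm(list = intersect(c("tmpScenarioStack", "preds", "prediction"), ls()))  # drop large rasters if present
  removeTmpFiles(h = 2)  # delete raster temp files older than 2 hours
  gc()                   # ask the worker to release freed memory
  NULL                   # return nothing so the master does not collect large results
}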

Related

How to fix C function R_nc4_get_vara_double returned error in ncdf4 parallel processing in R

I want to download .nc data through OPeNDAP from remote storage. I use a parallel backend with a foreach %dopar% loop, as follows:
# INPUTS
inputs <- commandArgs(trailingOnly = TRUE)
interimpath <- as.character(inputs[1])
gcm <- as.character(inputs[2])
period <- as.character(inputs[3])
var <- as.character(inputs[4])
source <- 'MACAV2'
cat('\n\n EXTRACTING DATA FOR', var, gcm, period, '\n\n')
# CHANGING LIBRARY PATHS
.libPaths("/storage/home/htn5098/local_lib/R40") # local library for packages
setwd('/storage/work/h/htn5098/DataAnalysis')
source('./src/Rcodes/CWD_function_package.R') # Calling the function Rscript
# CALLING PACKAGES
library(foreach)
library(doParallel)
library(parallel)
library(filematrix)
# REGISTERING CORES FOR PARALLEL PROCESSING
no_cores <- detectCores()
cl <- makeCluster(no_cores)
registerDoParallel(cl)
invisible(clusterEvalQ(cl,.libPaths("/storage/home/htn5098/local_lib/R40"))) # Really have to import library paths into the workers
invisible(clusterEvalQ(cl, c(library(ncdf4))))
# EXTRACTING DATA FROM THE .NC FILES TO MATRIX FORM
url <- readLines('./data/external/MACAV2_OPENDAP_allvar_allgcm_allperiod.txt')
links <- grep(x = url, pattern = paste0('.*', var, '.*', gcm, '_.*', period), value = TRUE)
start <- c(659, 93, 1)  # lon, lat, time
count <- c(527, 307, -1)
spfile <- read.csv('./data/external/SERC_MACAV2_Elev.csv', header = TRUE)
grids <- sort(unique(spfile$Grid))
clusterExport(cl, list('ncarray2matrix', 'start', 'count', 'grids'))  # exporting data to the workers for parallel processing
cat('\nChecking when downloading all grids\n')
# k <- foreach(x = links, .packages = c('ncdf4')) %dopar% {
#   nc <- nc_open(x)
#   nc.var <- ncvar_get(nc, varid = names(nc$var), start = start, count = count)
#   return(nc.var)
#   nc_close(nc)
# }
k <- foreach(x = links, .packages = c('ncdf4'), .errorhandling = 'pass') %dopar% {
  nc <- nc_open(x)
  print(nc)
  nc.var <- ncvar_get(nc, varid = names(nc$var), start = c(659, 93, 1), count = c(527, 307, -1))
  nc_close(nc)
  return(dim(nc.var))
  Sys.sleep(10)
}
# k <- parSapply(cl, links, function(x) {
#   nc <- nc_open(x)
#   nc.var <- ncvar_get(nc, varid = names(nc$var), start = start, count = count)
#   nc_close(nc)
#   return(nc.var)
# })
print(k)
However, I keep getting this error:
<simpleError in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval, addOffset, scaleFact, start = start, count = count, verbose = verbose, signedbyte = signedbyte, collapse_degen = collapse_degen): C function R_nc4_get_vara_double returned error>
What could be the reason for this problem? Can you recommend a time-efficient solution (I have to repeat this for about 20 files)?
Thank you.
I had the same error in my code. The problem was not the code itself; it was one of the files I wanted to read. Something was wrong with it, so R couldn't open it. I identified the file, downloaded it again, and the same code worked perfectly.
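For illustration, a hedged sketch of one way to pinpoint a file that cannot be opened (it reuses the links vector from the question and runs serially on purpose, so the failing URL is easy to spot):
library(ncdf4)
# Try to open each file and record the ones that fail
bad <- character(0)
for (x in links) {
  ok <- tryCatch({
    nc <- nc_open(x)
    nc_close(nc)
    TRUE
  }, error = function(e) FALSE)
  if (!ok) bad <- c(bad, x)
}
print(bad)  # candidates to re-download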
I also encountered the same error. For me, restarting the R session did the trick.

How to read in parallel multiple chunks from the same connection in R?

I have a .bz2 file that I want to read and process. The file cannot be loaded into memory. I want to do some computations on the chunks I read; they can be performed independently of one another, so I thought I would try to do it in parallel.
I tried the following:
library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
con <- file("myfile.bz2", "r")
parLapply(cl, con,
          function(x)
            print(head(read.csv(x, nrows = 100, stringsAsFactors = F, header = F, colClasses = "character", fill = F), 16)))
## Doesn't work: Error in checkForRemoteErrors(val) :
##   one node produced an error: 'file' must be a character string or connection
parLapply(cl, list(con, con, con),
          function(x)
            print(head(read.csv(x, nrows = 100, stringsAsFactors = F, header = F, colClasses = "character", fill = F), 16)))
## Doesn't work: Error in checkForRemoteErrors(val) :
##   3 nodes produced errors; first error: invalid connection
Can this be made to work somehow?
Any other recommendation on how to go about this would also be really helpful, as I am new to the world of parallel processing.
You cannot and must not use connections from one R process in another R process - connections are unique to the R session where they are created.
Internally, they are just integer indices, and there is very little in R that protects you from mistakenly trying to use them in other R processes. If you want the details, see https://github.com/HenrikBengtsson/Wishlist-for-R/issues/81.
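As a quick illustration of that point (a throwaway connection is enough):
con <- file(tempfile(), "w")  # any connection will do for this check
unclass(con)                  # a small integer index plus a conn_id external pointer
close(con)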
FWIW, if you use the future framework for parallelization and set R option future.globals.onReference to "error", then it protects you against this mistake (https://cran.r-project.org/web/packages/future/vignettes/future-4-non-exportable-objects.html). For example,
library(future.apply)
options(future.globals.onReference = "error")
library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
cat("Hello world\n", file = bzfile("myfile.bz2", open="wb"))
con <- file("myfile.bz2", "r")
y <- future_lapply(list(con, con, con), FUN = function(x) {
  data <- read.csv(x, nrows = 100, stringsAsFactors = FALSE, header = FALSE, colClasses = "character", fill = FALSE)
  print(head(data), 16)
})
Error: Detected a non-exportable reference ('externalptr') in one of the globals (<unknown>) used in the future expression
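A common workaround (a hedged sketch, not part of the answer above) is to read chunks from the connection in the main session and pass only plain character data to the workers:
library(parallel)
no_cores <- 2  # illustrative worker count
cl <- makeCluster(no_cores)
con <- file("myfile.bz2", "r")
results <- list()
repeat {
  # Read one chunk per worker in the main session; only the text is shipped to the workers
  chunks <- list()
  for (i in seq_len(no_cores)) {
    lines <- readLines(con, n = 100)
    if (length(lines) > 0) chunks[[length(chunks) + 1]] <- lines
  }
  if (length(chunks) == 0) break
  batch <- parLapply(cl, chunks, function(x) {
    read.csv(text = paste(x, collapse = "\n"), header = FALSE,
             stringsAsFactors = FALSE, colClasses = "character", fill = FALSE)
  })
  results <- c(results, batch)
}
close(con)
stopCluster(cl)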

mclapply: all scheduled cores encountered errors in user code

The following is my code. I am trying to get a list of all the files (~20000) that end with .idat and read each file using the function illuminaio::readIDAT.
library(illuminaio)
library(parallel)
library(data.table)
# number of cores to use
ncores <- 8
# this gets all the files with .idat extension, ~20000 files
files <- list.files(path = './', pattern = "*.idat", full.names = TRUE)
# function to read an idat file and create a data.table with the filename and two more columns,
# written out as csv using fwrite
get.chiptype <- function(x) {
  idat <- readIDAT(x)
  res <- data.table(filename = x, nSNPs = nrow(idat$Quants), Chip = idat$ChipType)
  fwrite(res, file.path = 'output.csv', append = TRUE)
}
# using mclapply, call the function get.chiptype on all 20000 files, using 8 cores at a time
mclapply(files, FUN = function(x) get.chiptype(x), mc.cores = ncores)
After reading and writing info about 1200 files, I get the following message:
Warning message:
In mclapply(files, FUN = function(x) get.chiptype(x), mc.cores = ncores) :
all scheduled cores encountered errors in user code
How do I resolve it?
Calling mclapply() in some instances requires you to specify a random number generator that supports multiple independent streams of random numbers.
R version 2.14.0 and later include an implementation of Pierre L'Ecuyer's multiple-stream pseudo-random number generator.
Try adding the following before the mclapply() call, with a pre-specified value for 'my.seed':
set.seed(my.seed, kind = "L'Ecuyer-CMRG")
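In the context of the question's code, that would look something like this (the seed value is just a placeholder):
my.seed <- 123  # placeholder seed
set.seed(my.seed, kind = "L'Ecuyer-CMRG")
mclapply(files, FUN = function(x) get.chiptype(x), mc.cores = ncores)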

R RDS file size much larger than object size

I have an object x that contains a list of lists of matrices and model objects from lm and gbm, etc. object.size(x) shows only about 50MB, but the file resulting from saveRDS is more than 5 times larger at more than 250MB. In general, what are some of the common causes for the RDS file to be much larger than the corresponding object size? And what can I do to minimize the discrepancy between the object size and the file size?
EDIT:
I have trimmed down my original problem enough to give a reproducible example (I know the code is lapplying over only one element, but this is a reduced example). There seem to be at least two problems:
1) The resulting RDS files are about 2-3 times larger than their corresponding object sizes.
2) The objects from lapply and mclapply have nearly the same object.size, yet the resulting file is 1.5 times larger for the object returned from mclapply.
Since fit1 and fit2 have almost the same size, inspecting the size of their components within R doesn't seem to be too helpful. Does anyone have a suggestion on how to debug this?
library(doParallel)
library(data.table)
library(caret)
fitModels <- function(dmy, dat, file.name) {
  methods <- list(
    list(method = 'knn', tuneLength = 1),
    list(method = 'svmRadial', tuneLength = 1)
  )
  opts <- list(
    form = as.formula('X1 ~ .'),
    data = as.data.frame(dat),
    trControl = trainControl(method = 'none', returnData = F)
  )
  fit <- mclapply(methods, function(x) do.call(train, c(opts, x)), mc.cores = 2)
  saveRDS(fit, paste(file.name, 'rds', sep = '.'))
  return(fit)
}
dat <- data.frame(matrix(rnorm(5e4), nrow = 1e3))
fit1 <- lapply(1, fitModels, dat, file.name = 'test1')
fit2 <- mclapply(1, fitModels, dat, file.name = 'test2', mc.cores = 1)
print(object.size(fit1))
print(object.size(fit2))
print(file.info('test1.rds')$size)
print(file.info('test2.rds')$size)
The output is:
2148744 bytes
2149208 bytes
[1] 4659831
[1] 6968437
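One hedged way to dig further (a sketch, not from the question or an answer): compare the in-memory size of each component with the size of its serialized representation, since saveRDS writes the serialized form.
# Compare object.size() with the serialized size per component of the fitted lists
component_sizes <- function(x) {
  data.frame(
    component     = seq_along(x),
    object_kb     = sapply(x, function(e) as.numeric(object.size(e)) / 1024),
    serialized_kb = sapply(x, function(e) length(serialize(e, NULL)) / 1024)
  )
}
component_sizes(fit1[[1]])
component_sizes(fit2[[1]])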

Up-to-date multicore computing in triple for-loop in R

I want to calculate GLCM textures for 488 raster files. Because of the enormous calculation time, I want to use all the power of my multicore processor (AMD Phenom II, 6 cores).
library("glcm")
library(raster)
library(devtools)
install_github('azvoleff/glcm')
setwd(working dir.)
rasters <- list.files()[grep("()\\w*.tif", list.files())]
statistics <- c("mean", "variance", "homogeneity", "contrast", "dissimilarity", "entropy","second_moment", "correlation")
shift1 <- c(0,0,1,1)
shift2 <- c(0,1,0,1)
for (j in 1:length(rasters)){
raster1 <- raster(rasters[j])
for (i in 1:length(statistics)){
for (k in 1:length(shift1)){
GLCM <- glcm(raster1, window=c(11,11), statistics=statistics[i], shift = c(shift1[k],shift2[k]), na_opt="ignore")
file <- paste("./GLCM/", substr(tiles[j],0,nchar(tiles[j])-4),"_", statistics[i], "_shift_",shift1[k], shift2[k] , ".tif", sep="")
writeRaster(GLCM, filename = file, type = "GTIFF")
}
}
gc()
}
I searched the internet for multicore solutions in R, but could not find out which one is up to date. So I hope someone can help me.
glcm is not coded to run in parallel, but given that you are processing 488 rasters, I wouldn't worry about running the algorithm itself in parallel - processing the rasters in parallel (say, two at a time on an average laptop, more if you have more processing power and RAM) is the simplest approach here. glcm versions > 1.4 will automatically run block by block over large images (and will account for edge effects), so memory shouldn't be an issue.
Something like the below should get you started (based on your code):
library(glcm)
library(raster)
library(foreach)
library(doParallel)
cl <- makeCluster(2)  # e.g. two workers, as suggested above; increase if you have more cores and RAM
registerDoParallel(cl)
setwd(working dir.)
rasters <- list.files()[grep("()\\w*.tif", list.files())]
statistics <- c("mean", "variance", "homogeneity", "contrast",
                "dissimilarity", "entropy", "second_moment",
                "correlation")
shift1 <- c(0, 0, 1, 1)
shift2 <- c(0, 1, 0, 1)
foreach(j = 1:length(rasters), .packages = c('raster', 'glcm')) %dopar% {
  raster1 <- raster(rasters[j])
  for (i in 1:length(statistics)) {
    for (k in 1:length(shift1)) {
      GLCM <- glcm(raster1, window = c(11, 11), statistics = statistics[i],
                   shift = c(shift1[k], shift2[k]), na_opt = "ignore")
      file <- paste("./GLCM/", substr(rasters[j], 0, nchar(rasters[j]) - 4),
                    "_", statistics[i], "_shift_", shift1[k], shift2[k],
                    ".tif", sep = "")
      writeRaster(GLCM, filename = file, format = "GTiff")
    }
  }
}
stopCluster(cl)
