Parallel foreach() %dopar% {...} computing in JAGS - r

I wonder whether I can use parallel computing with JAGS the way I want. Here is my R script:
library(foreach)
list.data2 <- foreach(i = 1:n.rep) %do% {
  foreach(j = 1:2) %do% {
    list(cap = cap_data[[i]][[j]],
         loc = loc_data[[i]][[j]],
         eff = eff_data[[i]][[j]],
         trap.numb = trap.numb2,
         av = av,
         forest = env$forest,
         crop = env$crop,
         bamboo = env$bamboo,
         grass = env$grass,
         abandoned = env$abandoned,
         city = env$city,
         rate = env$for_cr_rate,
         m.numb = m.numb,
         ones = matrix(1, m.numb, 5)
    ) #, bound_mat = bound_mat, bound_numb = bound_numb
  }
}
inits2 <- foreach(j = 1:2) %do% {
  list(n = n.inits2[[j]],
       b0 = 0.5, b1 = 0.1, b2 = 0.1, b3 = 0.1, b4 = 0.1, b5 = 0.1, b6 = 0.1,
       a0 = 5, a1 = 0.5, a2 = 0.5, a3 = 0.5, a4 = 0.5, a5 = 0.5, a6 = 0.5,
       sd = 1,
       err = rep(0, m.numb),
       r_capt = 0.10
  )
}
para2 <- c("a0", "a1", "a2", "a3", "a4", "a5", "a6",
           "b0", "b1", "b2", "b3", "b4", "b5", "b6", "n28", "n29", "r_capt")
library(R2jags)
start.time <- Sys.time()
install.packages("doParallel")
library(doParallel)
registerDoParallel(cores = 6)
x_real2 <- foreach(i = 1:2,
                   .packages = "R2jags") %dopar% {
  jags("realdata_5years.txt",
       data = list.data2[[i]][[?]],
       inits = inits2[[i]],
       para = para2,
       n.chain = 3,
       n.iter = 1000000,
       n.burnin = 400000,
       n.thin = 200
  )
}
sum_real2 <- foreach(i = 1:2) %do% {x_real2[[i]]$BUGSoutput$summary}
---------------------------------------------------------------------
So I have two data sets, and each one is repeated 30 (== n.rep) times, which gives 60 data lists in total.
I would like to use six cores across the two data sets, each run with 3 MCMC chains, and I need to repeat this calculation 30 (== n.rep) times.
However, I have no idea how to write this; my problem is with the last four lines.
Should I use %dopar% twice, or should I use jags.parallel in addition to foreach?
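Regarding those last lines, one possible structure is sketched below. This is a rough sketch, not tested against this model, and it assumes list.data2[[i]][[j]] and inits2[[j]] are indexed exactly as they are built above: flatten the 30 x 2 repetition/data-set combinations into a single index and run one jags() call per %dopar% task. The three chains then run sequentially inside each task, so the six workers stay busy with six combinations at a time; calling jags.parallel() inside %dopar% (or nesting %dopar% twice) would spawn extra processes per task and typically oversubscribe the six cores.
library(doParallel)
registerDoParallel(cores = 6)
# one task per (repetition, data set) combination: 30 x 2 = 60 tasks on 6 workers
grid <- expand.grid(i = 1:n.rep, j = 1:2)
x_real2 <- foreach(k = 1:nrow(grid), .packages = "R2jags") %dopar% {
  i <- grid$i[k]   # repetition
  j <- grid$j[k]   # data set
  jags(model.file = "realdata_5years.txt",
       data = list.data2[[i]][[j]],
       inits = inits2[[j]],
       parameters.to.save = para2,
       n.chains = 3,          # the 3 chains run one after another inside this task
       n.iter = 1000000,
       n.burnin = 400000,
       n.thin = 200)
}
sum_real2 <- foreach(k = 1:nrow(grid)) %do% {x_real2[[k]]$BUGSoutput$summary}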

Related

parallel foreach can't find object created within the loop

This is my first question here, so I hope I'm doing it right.
I'm trying to run a variant of RandomForest called Geographical Regression Forest (package SpatialML). To train the models I'm running a foreach loop in parallel and using a sample with replacement of the calibration data.
library(SpatialML)
library(doParallel)
rm(list=ls())
ds <- SpatialML::random.test.data()
ds
# Parallel settings
ncores <- detectCores() - 1
cl <- makePSOCKcluster(ncores)
y <- names(ds)[1]
x <- paste(names(ds)[c(2,3)], collapse = "+")
f <- as.formula(paste0(y,"~",x));f
clusterEvalQ(cl , expr = c(library(SpatialML)))
clusterExport(cl, list("ds"))
#### Bootstraps ####
registerDoParallel(cl)
time <- system.time(
  foreach(i = 1:10) %dopar% {
    # sample with replacement
    trainingREP <- sample.int(nrow(ds), 1*nrow(ds), replace = T)
    # Geographical Regression Forest
    grf.boot <- grf(formula = f, dframe = ds[trainingREP, ],
                    bw = round(nrow(ds)/10, digits = 0), kernel = "adaptive",
                    coords = ds[trainingREP, c(4,5)], ntree = 500, importance = T)
    # Save GRF
    modelFile <- paste("./bootModGRF_", i, ".rds", sep = "")
    saveRDS(object = grf.boot, file = modelFile)
  }
)
stopCluster(cl)
but when I run this code I get
Error in { : task 1 failed - "object 'trainingREP' not found"
Why can't the foreach loop read an object that is created within the same loop?
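One thing worth trying (this is an assumption on my part, not verified against SpatialML's internals: some modelling functions re-evaluate their data arguments from the expression passed in, so ds[trainingREP, ] may be evaluated again in an environment where trainingREP does not exist) is to materialise the bootstrap sample and its coordinates into plain objects before calling grf():
foreach(i = 1:10, .packages = "SpatialML") %dopar% {
  trainingREP <- sample.int(nrow(ds), nrow(ds), replace = T)
  boot.ds     <- ds[trainingREP, ]           # materialised resample
  boot.coords <- ds[trainingREP, c(4, 5)]    # matching coordinates
  grf.boot <- grf(formula = f, dframe = boot.ds,
                  bw = round(nrow(ds)/10, digits = 0), kernel = "adaptive",
                  coords = boot.coords, ntree = 500, importance = T)
  saveRDS(grf.boot, file = paste("./bootModGRF_", i, ".rds", sep = ""))
}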

Making sense of the foreach

I'm new to parallelizing for loops with foreach and I'm struggling to understand how it works. As an exercise, I created a simple list (input2) based on a data frame (input). I try to calculate b by looping over h and j.
library(doParallel)
library(foreach)
library(dplyr)
input <- data.frame(matrix(rnorm(200*200, 0, .5), ncol=200))
input[input <=0] =0
input['X201'] <- seq(from = 0, to = 20, length.out = 10)
input <- input %>% select(c(X201, 1:200))
input2 <- split(input, f= input$X201)
a = 0
b= 0
cl <- parallel::makeCluster(20)
doParallel::registerDoParallel(cl)
tm1 <- system.time(
  y <-
    foreach(h = length(input2), .combine = 'cbind') %:%
      foreach(j = nrow(input2[[h]]), .combine = 'c', packages = 'foreach') %dopar% {
        a = input2[[h]][j, 3]
        b = b + a
      }
)
parallel::stopCluster(cl)
registerDoSEQ()
print("Cluster stopped.")
y is about 0.55 (the exact value depends on the random numbers generated), which is the value of input2[[10]][20,3], not the cumulative value I wanted. I checked the manual of the foreach package but am still not sure I fully understand how foreach works.
foreach returns its results back to the caller rather than modifying variables outside the loop, so don't expect a and b to be updated correctly.
Try the following
cl <- parallel::makeCluster(20)
doParallel::registerDoParallel(cl)
tm2 <- system.time(
  results <- foreach(h = 1:length(input2), .combine = "c") %dopar% {
    sum(input2[[h]][1:nrow(input2[[h]]), 3])
  }
)
b <- sum(results)
parallel::stopCluster(cl)
registerDoSEQ()
b
tm2
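An equivalent formulation (a sketch under the same setup, with a backend still registered) lets foreach do the accumulation itself through an additive .combine, so no variable outside the loop needs to be updated at all:
b_alt <- foreach(h = 1:length(input2), .combine = "+") %dopar% {
  sum(input2[[h]][1:nrow(input2[[h]]), 3])
}
b_alt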

foreach with serial and parallel backend giving different results

I have a strange case where using foreach with a serial and a parallel backend gives different results the first time I call the function, but on later calls the results match. I used doRNG to make the results reproducible for the same seed.
Below is a sample function that illustrates the scenario:
func <- function(ncores = NULL, seed = 1234){
  if (!is.null(ncores)){ # this block registers for parallel backend
    cl <- makeCluster(ncores)
    registerDoParallel(cl)
    registerDoRNG(seed, once = TRUE)
    on.exit(stopCluster(cl))
  } else { # this block registers for serial computation
    registerDoSEQ()
    registerDoRNG(seed, once = TRUE)
  }
  w = foreach(i = 1:10, .combine = 'c') %dorng% {
    mean(sample(1:100, 50, replace = TRUE))
  }
  attr(w, "rng") <- NULL
  return(w)
}
# first time running below 2 lines
# case 1 : serial
w1 <- func(ncores = NULL)
# Case 2 : parallel
w2 <- func(ncores= 5)
identical(w1, w2)
# second time running below 2 lines
# case 1: serial
w3 <- func(ncores = NULL)
# case 2: parallel
w4 <- func(ncores= 5)
identical(w1, w2)
# [1] FALSE
identical(w3, w4)
# [1] TRUE
Am I missing something when registering the sequential backend?
The solution is to use the following expression:
w = foreach(i = 1:10, .combine = 'c', .options.RNG = seed) %dorng% {
  mean(sample(1:100, 50, replace = TRUE))
}
You can find an explanation in the doRNG vignette.
So your function looks like this:
func <- function(ncores = NULL, seed = 1234){
  if (!is.null(ncores)){ # this block registers for parallel backend
    cl <- makeCluster(ncores)
    registerDoParallel(cl)
    on.exit(stopCluster(cl))
  } else { # this block registers for serial computation
    registerDoSEQ()
  }
  w = foreach(i = 1:10, .combine = 'c', .options.RNG = seed) %dorng% {
    mean(sample(1:100, 50, replace = TRUE))
  }
  attr(w, "rng") <- NULL
  return(w)
}
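As a quick check of the fixed version (same setup as above), the serial and parallel backends should now return identical vectors even on the first call:
w1 <- func(ncores = NULL)   # serial
w2 <- func(ncores = 5)      # parallel
identical(w1, w2)
# expected: TRUE, because .options.RNG seeds the %dorng% loop independently of the backend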

Foreach Parallel - Combine function for Multiple Outputs

I have a set of ratings for 45000 users and about 40 movies. I need to predict new ratings for each user based on their Pearson correlation with other users. I also need to store the set of similar users and their similarities for each user-movie combination. I am using the foreach package to execute the loops in parallel. The code that I have managed to write is this:
library(foreach)
library(doParallel)
x <- matrix(rnorm(1:1000), nrow = 100, ncol = 10)
df = list()
# correlation matrix
cor_mat <- cor(t(x))
cor_mat = abs(cor_mat)
# similarity limits
upper = 1
lower = 0.04
# Initiating parallel environment
cl = makeCluster(3)
registerDoParallel(cl)
res <- foreach(i = 1:nrow(x), .combine = rbind, .packages = c('base', 'foreach')) %dopar% {
  foreach(j = 1:ncol(x), .combine = c, .packages = c('base', 'foreach')) %do% {
    sim_user = which(cor_mat[i, ] >= lower & cor_mat[i, ] < upper)
    bx = as.numeric(t(x[sim_user, j]) %*%
                      cor_mat[sim_user, j] / sum(cor_mat[sim_user, j]))
    df[[length(df) + 1]] = data.frame(i, j, sim_user, cor_mat[sim_user, j])
    return(bx)
  }
}
stopCluster(cl)
I am able to accomplish half of my task, i.e. creating a matrix of predicted ratings from the foreach output res. But the list df, to which I am appending the similar users, is empty at the end of the foreach loop.
What customized combine function can be written to output both the matrix of predicted ratings and the list of similar users?
For multiple output functions, it is always better to return everything inside a list. In that case, it means that you need to specify your own functions to combine data. Here, I return two elements each time: bx and df. My combine functions therefore combine each of those two elements separately and return them in a length-2 list.
combine_custom_j <- function(LL1, LL2) {
  bx <- c(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}
combine_custom_i <- function(LL1, LL2) {
  bx <- rbind(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}
res <- foreach(i = 1:nrow(x), .combine = combine_custom_i, .packages = c('base', 'foreach')) %dopar% {
  foreach(j = 1:ncol(x), .combine = combine_custom_j, .packages = c('base', 'foreach')) %do% {
    sim_user = which(cor_mat[i, ] >= lower & cor_mat[i, ] < upper)
    bx = as.numeric(t(x[sim_user, j]) %*%
                      cor_mat[sim_user, j] / sum(cor_mat[sim_user, j]))
    return(list(bx = bx, df = data.frame(i, j, sim_user, cor_mat[sim_user, j])))
  }
}
Although I have returned your data frames in a list like your code suggested, I believe you might want to rbind them? In that case, you can simply replace the c(LL1$df, LL2$df) by rbind(LL1$df, LL2$df) in both combine functions.
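One note on the c() version: c() applied to two data frames concatenates their columns into a single list rather than building a list of data frames, so if you want one tidy table of similar users the rbind variant is the one to use. With rbind in both combine functions, the two outputs can then be pulled apart like this (a sketch under the setup above):
pred_ratings <- res$bx   # matrix of predicted ratings (users in rows, movies in columns)
sim_users    <- res$df   # single data frame with i, j, sim_user and the similarity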

R loops and data.frame

A part of the code is
sse <- c()
k <- c()
for (i in seq(3, 15, 1)) {
  y_pred <- knn(train = newdata.training, test = newdata.test,
                cl = newdata.trainLabels, k = i)
  pred_y <- as.numeric(levels(y_pred)[y_pred])
  sse[i] <- sum((newdata.trainLabels - pred_y)^2)
  k[i] <- i
}
pred_y is a column for each i, and I want to create a data frame holding all 13 columns. Can this be done with a for loop, or how else can it be accomplished? I need suggestions.
You can use foreach, which has the added advantage that it can run in parallel if you have multiple cores in your CPU. Here is the non-parallel code:
library("iterators")
library("foreach")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values),.combine = cbind) %do%
{
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainlabels, k=i, prob = TRUE)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i - start] <- sum((as.numeric(newdata.trainlabels)-pred_y)^2)
pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
Here is the parallel version:
# Parallel version
library("iterators")
library("foreach")
library("parallel")
library("doParallel")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
num.cores = detectCores()
clusters <- makeCluster(num.cores)
registerDoParallel(clusters)
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values), .combine = cbind, .packages = c("FNN")) %dopar% {
  y_pred <- knn(train = newdata.training, test = newdata.test,
                cl = newdata.trainlabels, k = i, prob = TRUE)
  pred_y <- as.numeric(levels(y_pred)[y_pred])
  sse[i - start] <- sum((as.numeric(newdata.trainlabels) - pred_y)^2)
  pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
stopCluster(clusters)
There are only a few differences between the non-parallel and the parallel code. First, there are additional libraries to load. Second, you need to create and register the cluster that will do the parallel computation (and stop it when you are done). Third, foreach uses the %dopar% infix operator instead of %do%. Fourth, foreach needs the .packages argument so that FNN is loaded on each of the workers.
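One caveat worth adding: the assignment to sse inside the %dopar% block happens on the workers, so sse on the master is left unchanged in the parallel version. A small sketch (assuming results1, newdata.trainlabels and k.values as defined above) that recomputes it on the master from the returned predictions:
sse <- sapply(results1, function(pred_y) sum((as.numeric(newdata.trainlabels) - pred_y)^2))
names(sse) <- k.values
sse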
