Only one processor being used while running NetLogo models using parApply - r

I am using the 'RNetLogo' package to run sensitivity analyses on my NetLogo model. My model has 24 parameters I need to vary - so parallelising this process would be ideal! I've been following along with the example in Thiele's "Parallel processing with the RNetLogo package" vignette, which uses the 'parallel' package in conjunction with 'RNetLogo'.
I've managed to get R to initialise the NetLogo model across all 12 of my processors, which I've verified using gui=TRUE. The problem comes when I try to run the simulation code across the 12 processors using 'parApply'. This line runs without error, but it only runs on one of the processors (using around 8% of my total CPU power). Here's a mock up of my R code file - I've included some commented-out code at the end, showing how I run the simulation without trying to parallelise:
### Load packages
library(parallel)
### Set up initialisation function
prepro <- function(dummy, gui, nl.path, model.path) {
library(RNetLogo)
NLStart(nl.path, gui=gui)
NLLoadModel(model.path)
}
### Set up finalisation function
postpro <- function(x) {
NLQuit()
}
### Set paths
# For NetLogo
nl.path <- "C:/Program Files/NetLogo 6.0/app"
nl.jarname <- "netlogo-6.0.0.jar"
# For the model
model.path <- "E:/Model.nlogo"
# For the function "sim" code
sim.path <- "E:/sim.R"
### Set base values for parameters
base.param <- c('prey-max-velocity' = 25,
'prey-agility' = 3.5,
'prey-acceleration' = 20,
'prey-deceleration' = 25,
'prey-vision-distance' = 10,
'prey-vision-angle' = 240,
'time-to-turn' = 5,
'time-to-return-to-foraging' = 300,
'time-spent-circling' = 2,
'predator-max-velocity' = 35,
'predator-agility' = 3.5,
'predator-acceleration' = 20,
'predator-deceleration' = 25,
'predator-vision-distance' = 20,
'predator-vision-angle' = 200,
'time-to-give-up' = 120,
'number-of-safe-zones' = 1,
'number-of-target-patches' = 5,
'proportion-obstacles' = 0.05,
'obstacle-radius' = 2.0,
'obstacle-radius-range' = 0.5,
'obstacle-sensitivity-for-prey' = 0.95,
'obstacle-sensitivity-for-predators' = 0.95,
'safe-zone-attractiveness' = 500
)
## Get names of parameters
param.names <- names(base.param)
### Load the code of the simulation function (name: sim)
source(file=sim.path)
### Convert "base.param" to a matrix, as required by parApply
base.param <- matrix(base.param, nrow=1, ncol=24)
### Get the number of simulations we want to run
design.combinations <- length(base.param[[1]])
already.processed <- 0
### Initialise NetLogo
processors <- detectCores()
cl <- makeCluster(processors)
clusterExport(cl, 'sim')
gui <- FALSE
invisible(parLapply(cl, 1:processors, prepro, gui=gui, nl.path=nl.path, model.path=model.path))
### Run the simulation across all processors, using parApply
sim.result.base <- parApply(cl, base.param, 1, sim,
param.names,
no.repeated.sim = 100,
trace.progress = FALSE,
iter.length = design.combinations,
function.name = "base parameters")
### Run the simulation on a single processor
#sim.result.base <- sim(base.param,
# param.names,
# no.repeated.sim = 100,
# my.nl1,
# trace.progress = TRUE,
# iter.length = design.combinations,
# function.name = "base parameters")
Here's a mock up for the 'sim' function (adapted from Thiele's paper "Facilitating parameter estimation and sensitivity analyses of agent-based models - a cookbook using NetLogo and R"):
sim <- function(param.set, parameter.names, no.repeated.sim, trace.progress, iter.length, function.name) {
# Some security checks
if (length(param.set) != length(parameter.names))
{ stop("Wrong length of param.set!") }
if (no.repeated.sim <= 0)
{ stop("Number of repetitions must be > 0!") }
if (length(parameter.names) <= 0)
{ stop("Length of parameter.names must be > 0!") }
# Create an empty list to save the simulation results
eval.values <- NULL
# Run the repeated simulations (to control stochasticity)
for (i in 1:no.repeated.sim)
{
# Create a random-seed for NetLogo from R, based on min/max of NetLogo's random seed
NLCommand("random-seed",runif(1,-2147483648,2147483647))
## This is the stuff for one simulation
cal.crit <- NULL
# Set NetLogo parameters to current parameter values
lapply(seq(1:length(parameter.names)), function(x) {NLCommand("set ",parameter.names[x], param.set[x])})
NLCommand("setup")
# This should run "go" until prey-win =/= 5, i.e. when the pursuit ends
NLDoCommandWhile("prey-win = 5", "go")
# Report a value
prey <- NLReport("prey-win")
# Report another value
pred <- NLReport("predator-win")
## Extract the values we are interested in
cal.crit <- rbind(cal.crit, c(prey, pred))
# append to former results
eval.values <- rbind(eval.values,cal.crit)
}
## Make sure eval.values has column names
names(eval.values) <- c("PreySuccess", "PredSuccess")
# Return the mean of the repeated simulation results
if (no.repeated.sim > 1) {
return(colMeans(eval.values))
}
else {
return(eval.values)
}
}
I think the problem might lie in the "nl.obj" string that RNetLogo uses to identify the NetLogo instance you want to run the code on - however, I've tried several different methods of fixing this, and I haven't been able to come up with a solution that works. When I initialise NetLogo across all the processors using the code provided in Thiele's example, I don't set an "nl.obj" value for each instance, so I'm guessing RNetLogo uses some kind of default list? However, in Thiele's original code, the "sim" function requires you to specify which NetLogo instance you want to run it on - so R will spit an error when I try to run the final line (Error in checkForRemoteErrors(val) : one node produced an error: argument "nl.obj" is missing, with no default). I have modified the "sim" function code so that it doesn't require this argument and just accepts the default setting for nl.obj - but then my simulation only runs on a single processor. So, I think that by default, "sim" must only be running the code on a single instance of NetLogo. I'm not certain how to fix it.
This is also the first time I've used the 'parallel' package, so I could be missing something obvious to do with 'parApply'. Any insight would be much appreciated!
Thanks in advance!

I am still in the process of applying a similar technique to perform a Morris Elementary Effects screening with my NetLogo model. For me the parallel execution works fine. I compared your script to mine and noticed that in my version the 'parApply' call of the simulation function (simfun) is embedded in a function statement (see below). Maybe including the function already solves your issue.
sim.results.morris <- parApply(cl, mo$X, 1, function(x) {simfun(param.set=x,
no.repeated.sim=no.repeated.sim,
parameter.names=input.names,
iter.length=iter.length,
fixed.values=fixed.values,
model.seed=new.model.seed,
function.name="Morris")})

Related

how to write out multiple files in R?

I am a newbie R user. Now, I have a question related to write out multiple files with different names. Lets says that my data has the following structure:
IV_HAR_m1<-matrix(rnorm(1:100), ncol=30, nrow = 2000)
DV_HAR_m1<-matrix(rnorm(1:100), ncol=10, nrow = 2000)
I am trying to estimate multiple LASSO regressions. At the beginning, I was storing the iterations in one object called Dinamic_beta. This object was stored in only one file, and it saves the required information each time that my code iterate.
For doing this I was using stew which belongs to pomp package, but the total process takes 5 or 6 days and I am worried about a power outage or a fail in my computer.
Now, I want to save each environment (iterations) in a .Rnd file. I do not know how can I do that? but the code that I am using is the following:
library(glmnet)
library(Matrix)
library(pomp)
space <- 7 #THE NUMBER OF FILES THAT I would WANT TO CREATE
Dinamic_betas<-array(NA, c(10, 31, (nrow(IV_HAR_m1)-space)))
dimnames(Dinamic_betas) <- list(NULL, NULL)
set.seed(12345)
stew( #stew save the enviroment in a .Rnd file
file = "Dinamic_LASSO_RD",{ # The name required by stew for creating one file with all information
for (i in 1:dim(Dinamic_betas)[3]) {
tryCatch( #print messsages
expr = {
cv_dinamic <- cv.glmnet(IV_HAR_m1[i:(space+i-1),],
DV_HAR_m1[i:(space+i-1),], alpha = 1, family = "mgaussian", thresh=1e-08, maxit=10^9)
LASSO_estimation_dinamic<- glmnet(IV_HAR_m1[i:(space+i-1),], DV_HAR_m1[i:(space+i-1),],
alpha = 1, lambda = cv_dinamic$lambda.min, family = "mgaussian")
coefs <- as.matrix(do.call(cbind, coef(LASSO_estimation_dinamic)))
Dinamic_betas[,,i] <- t(coefs)
},
error = function(e){
message("Caught an error!")
print(e)
},
warning = function(w){
message("Caught an warning!")
print(w)
},
finally = {
message("All done, quitting.")
}
)
if (i%%400==0) {print(i)}
}
}
)
If someone can suggest another package that stores the outputs in different files I will grateful.
Try adding this just before the close of your loop
save.image(paste0("Results_iteration_",i,".RData"))
This should save your entire workspace to disk for every iteration. You can then use load() to load the workspace of every environment. Let me know if this works.

Load the MNIST digit recognition dataset with R and see any results

In the book "Machine Learning - A Probabilistic Perspective" by Kevin P. Murphy the first task reads:
Exercise 1.1 KNN classifier on shuffled MNIST data
Run mnist1NNdemo
and verify that the misclassification rate (on the first 1000 test
cases) of MNIST of a 1-NN classifier is 3.8%. (If you run it all on
all 10,000 test cases, the error rate is 3.09%.) Modify the code so
that you first randomly permute the features (columns of the training
and test design matrices), as in shuffledDigitsDemo, and then apply
the classifier. Verify that the error rate is not changed.
My simple understanding is that the exercise is looking for the 1-NN after loading the files(kNN() in R).
The files:
train-images-idx3-ubyte.gz: training set images (9912422 bytes)
train-labels-idx1-ubyte.gz: training set labels (28881 bytes)
t10k-images-idx3-ubyte.gz: test set images (1648877 bytes)
t10k-labels-idx1-ubyte.gz: test set labels (4542 bytes)
are taken from the The MNIST DATABASE
I found a popular template for loading the files:
# for the kNN() function
library(VIM)
load_mnist <- function() {
load_image_file <- function(filename) {
ret = list()
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
ret$n = readBin(f,'integer',n=1,size=4,endian='big')
nrow = readBin(f,'integer',n=1,size=4,endian='big')
ncol = readBin(f,'integer',n=1,size=4,endian='big')
x = readBin(f,'integer',n=ret$n*nrow*ncol,size=1,signed=F)
ret$x = matrix(x, ncol=nrow*ncol, byrow=T)
close(f)
ret
}
load_label_file <- function(filename) {
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
n = readBin(f,'integer',n=1,size=4,endian='big')
y = readBin(f,'integer',n=n,size=1,signed=F)
close(f)
y
}
train <<- load_image_file("train-images.idx3-ubyte")
test <<- load_image_file("t10k-images.idx3-ubyte")
train$y <<- load_label_file("train-labels.idx1-ubyte")
test$y <<- load_label_file("t10k-labels.idx1-ubyte")
}
show_digit <- function(arr784, col=gray(12:1/12)) {
image(matrix(arr784, nrow=28)[,28:1], col=col)
}
According to the comment, in the command line this should work:
# Error "Error in matrix(arr784, nrow = 28) : object 'train' not found"
show_digit(train$x[5,])
The question is how can I use the show_digit function ?
Edit Remove extra question
What I figured out for the problem:
First run the whole file in R Studio or ESS, then call the load_mnist() from the console.
After that execute show_digit(train$x[3,]) in the console again and it works.
Finding the KNN classifier can be done on the whole data set:
a <- knn(train, test, train$y) but it would be a very slow process.
Predictions for the result can be done like table(test$y, a), test$y is predicted, a is the actual result.

How to visulize the convolution layer and feature layer in mxnet after cnn was finished trained?

I want to plot or visualize the result of each layers out from a trained CNN with mxnet in R. Like w´those abstract art from what a nn's each layer can see.
But I don't know how. Please somebody help me. One way I can think out is to put the weights and bias back to every step and plot the step out. But when I try to put model$arg.params$convolution0_weight back to mx.symbol.Convolution(), I get
Error in mx.varg.symbol.Convolution(list(...)) :
./base.h:291: Unsupported parameter type object type for argument weight, expect integer, logical, or string.
Can anyone help me?
I thought out one way, but encounter a difficulty at one step. Here is what I did.
I found all the trained cnn's parameters inmodel$arg.params , and to compute with parameters we can use mx.nd... founctions as bellow:
`#convolution 1_result
conv1_result<- mxnet::mx.nd.Convolution(data=mx.nd.array(train_array),weight=model$arg.params$convolution0_weight,bias=model$arg.params$convolution0_bias,kernel=c(8,8),num_filter = 50)
str(conv1_result)
tanh1_result<-mx.nd.Activation(data= conv1_result, act_type = "sigmoid")
pool1_result <- mx.nd.Pooling(data = tanh1_result, pool_type = "avg", kernel = c(4,4), stride = c(4,4))
conv2 result
conv2_result<- mxnet::mx.nd.Convolution(data=pool1_result,weight=model$arg.params$convolution1_weight,bias=model$arg.params$convolution1_bias,kernel=c(5,5),num_filter = 50)
tanh2_result<-mx.nd.Activation(data= conv1_result, act_type = "sigmoid")
pool2_result <- mx.nd.Pooling(data = tanh1_result, pool_type = "avg", kernel = c(4,4), stride = c(4,4))
1st fully connected layer result
flat_result <- mx.nd.flatten(data = pool2_result)
fcl_1_result <- mx.nd.FullyConnected(data = flat_result,weight = model$arg.params$fullyconnected0_weight,bias = model$arg.params$fullyconnected0_bias, num_hidden = 500)
tanh_3_result <- mx.nd.Activation(data = fcl_1_result, act_type = "tanh")
2nd fully connected layer result
fcl_2_result <- mx.nd.FullyConnected(data = tanh_3,weight = model$arg.params$fullyconnected1_weight,bias = model$arg.params$fullyconnected1_bias, num_hidden =100)`
but when I came to mx.nd.FullyConnected() step , I encountered not sufficient memory(i have 16 GB RAM) and R crashed.
So, does anyone know how to batch_size the input data in
mx.nd.FullyConnected(), or any method to make mx.nd.FullyConnected() run successfully as mx.model.FeedForward.create()
did?
Here is the code that can help you to achieve what you want. The code below displays activations of 2 convolution layers of LeNet. The code gets as an input MNIST dataset, which is 28x28 grayscale images (downloaded automatically), and produces images as activations.
You can grab outputs from executor. To see the list of available outputs use names(executor$ref.outputs)
The result of each output is available as a matrix with values in [-1; 1] range. The dimensions of the matrix depends on parameters of the layer. The code use these matrices to display as greyscaled images where -1 is white pixel, 1 - black pixel. (most of the code is taken from https://github.com/apache/incubator-mxnet/issues/1152 and massaged a little bit)
The code is a self sufficient to run, but I have noticed that if I build the model second time in the same R session, the names of ouputs get different indices, and later the code fails because the expected names of outputs are hard coded. So if you decide to create a model more than once, you will need to restart R session.
Hope it helps and you can adjust this example to your case.
library(mxnet)
download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip')
unzip('mnist_csv.zip', exdir = '.')
train <- read.csv('train.csv', header=TRUE)
data.x <- train[,-1]
data.x <- data.x/255
data.y <- train[,1]
val_ind = 1:100
train.x <- data.x[-val_ind,]
train.x <- t(data.matrix(train.x))
train.y <- data.y[-val_ind]
val.x <- data.x[val_ind,]
val.x <- t(data.matrix(val.x))
val.y <- data.y[val_ind]
train.array <- train.x
dim(train.array) <- c(28, 28, 1, ncol(train.x))
val.array <- val.x
dim(val.array) <- c(28, 28, 1, ncol(val.x))
# input layer
data <- mx.symbol.Variable('data')
# first convolutional layer
convLayer1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=30)
convAct1 <- mx.symbol.Activation(data=convLayer1, act_type="tanh")
poolLayer1 <- mx.symbol.Pooling(data=convAct1, pool_type="max", kernel=c(2,2), stride=c(2,2))
# second convolutional layer
convLayer2 <- mx.symbol.Convolution(data=poolLayer1, kernel=c(5,5), num_filter=60)
convAct2 <- mx.symbol.Activation(data=convLayer2, act_type="tanh")
poolLayer2 <- mx.symbol.Pooling(data=convAct2, pool_type="max",
kernel=c(2,2), stride=c(2,2))
# big hidden layer
flattenData <- mx.symbol.Flatten(data=poolLayer2)
hiddenLayer <- mx.symbol.FullyConnected(flattenData, num_hidden=500)
hiddenAct <- mx.symbol.Activation(hiddenLayer, act_type="tanh")
# softmax output layer
outLayer <- mx.symbol.FullyConnected(hiddenAct, num_hidden=10)
LeNet1 <- mx.symbol.SoftmaxOutput(outLayer)
# Group some output layers for visual analysis
out <- mx.symbol.Group(c(convAct1, poolLayer1, convAct2, poolLayer2, LeNet1))
# Create an executor
executor <- mx.simple.bind(symbol=out, data=dim(val.array), ctx=mx.cpu())
# Prepare for training the model
mx.set.seed(0)
# Set a logger to keep track of callback data
logger <- mx.metric.logger$new()
# Using cpu by default, but set gpu if your machine has a supported one
devices=mx.cpu(0)
# Train model
model <- mx.model.FeedForward.create(LeNet1, X=train.array, y=train.y,
eval.data=list(data=val.array, label=val.y),
ctx=devices,
num.round=1,
array.batch.size=100,
learning.rate=0.05,
momentum=0.9,
wd=0.00001,
eval.metric=mx.metric.accuracy,
epoch.end.callback=mx.callback.log.train.metric(100, logger))
# Update parameters
mx.exec.update.arg.arrays(executor, model$arg.params, match.name=TRUE)
mx.exec.update.aux.arrays(executor, model$aux.params, match.name=TRUE)
# Select data to use
mx.exec.update.arg.arrays(executor, list(data=mx.nd.array(val.array)), match.name=TRUE)
# Do a forward pass with the current parameters and data
mx.exec.forward(executor, is.train=FALSE)
# List of outputs available.
names(executor$ref.outputs)
# Plot the filters of a sample from validation set
sample_index <- 99 # sample number in validation set. Change it to if you want to see other samples
activation0_filter_count <- 30 # number of filters of the "convLayer1" layer
par(mfrow=c(6,5), mar=c(0.1,0.1,0.1,0.1)) # number of rows x columns in output
dim(executor$ref.outputs$activation0_output)
for (i in 1:activation0_filter_count) {
outputData <- as.array(executor$ref.outputs$activation0_output)[,,i,sample_index]
image(outputData,
xaxt='n', yaxt='n',
col=gray(seq(1,0,-0.1)))
}
activation1_filter_count <- 60 # number of filters of the "convLayer2" layer
dim(executor$ref.outputs$activation1_output)
par(mfrow=c(6,10), mar=c(0.1,0.1,0.1,0.1)) # number of rows x columns in output
for (i in 1:activation1_filter_count) {
outputData <- as.array(executor$ref.outputs$activation1_output)[,,i,sample_index]
image(outputData,
xaxt='n', yaxt='n',
col=gray(seq(1,0,-0.1)))
}
As a result you should see the following images for a validation sample #2 (use RStudio left and right arrows to navigate between them).

replacement has length zero in list() in r

I'm trying to run this code, and I'm using mhadaptive package, but the problem is that when I run these code without writing metropolis_hastings (that is one part of mhadaptive package) error does not occur, but when I add mhadaptive package the error occur. What should I do?
li_F1<-function(pars,data) #defining first function
{
a01<-pars[1] #defining parameters
a11<-pars[2]
epsilon<<-pars[3]
b11<-pars[4]
a02<-pars[5]
a12<-pars[6]
b12<-pars[7]
h<-pars[8]
h[[i]]<-list() #I want my output is be listed in the h
h[[1]]<-0.32082184 #My first value of h is known and other values should calculate by formula
for(i in 2:nrow(F_2_))
{
h[[i]]<- ((a01+a11*(h[[i-1]])*(epsilon^2)*(h[[i-1]])*b11)+(F1[,2])*((a02+a12*(h[[i-1]])*(epsilon^2)+(h[[i-1]])*b12)))
pred<- h[[i]]
}
log_likelihood<-sum(dnorm(prod(h[i]),pred,sd = 1 ,log = TRUE))
return(h[i])
prior<- prior_reg(pars)
return(log_likelihood + prior)
options(digits = 22)
}
prior_reg<-function(pars) #defining another function
{
epsilon<<-pars[3] #error
prior_epsilon<-pt(0.95,5,lower.tail = TRUE,log.p = FALSE)
return(prior_epsilon)
}
F1<-as.matrix(F_2_) #defining my importing data and simulatunig data with them
x<-F1[,1]
y<-F1[,2]
d<-cbind(x,y)
#using mhadaptive package
mcmc_r<-Metro_Hastings(li_func = li_F1,pars=c(10,15,10,10,10,15),par_names=c('a01','a02','a11','a12','b11','b12'),data=d)
By running this code this error occur.
Error in h[[i]] <- list() : replacement has length zero
I'll so much appreciate who help me.

How could this custom process be done in parallel? or multicores?

I am trying to figure out how i could use any of the parallel processing packages like foreach or doParallel in this random forest loop i have created:
ModelInfo <- data.frame ( model=as.numeric()
,Nodesize=as.numeric()
,Mrty=as.numeric()
,Maxdepth=as.numeric()
,Cp=as.numeric()
,Accuracy_Training=as.numeric()
,AUC_Training=as.numeric())
w=1
set.seed(1809)
NumberOfSamples=1
# Number of iterations
rfPred=list()
pred=list()
roundpred=list()
cTab=list()
Acc=list()
pred.to.roc=list()
pred.rocr=list()
perf.rocr=list()
AUC=list()
Var_imp=list()
rf_model_tr = list()
length(rf_model_tr) <- NumberOfSamples
for (i in 1:NumberOfSamples)
{
rf_model_tr[[i]] = list()
rfPred[[i]]=list()
pred[[i]]=list()
roundpred[[i]]=list()
cTab[[i]]=list()
Acc[[i]]=list()
pred.to.roc[[i]]=list()
pred.rocr[[i]]=list()
perf.rocr[[i]]=list()
AUC[[i]]=list()
Var_imp[[i]]=list()
## Tune nodesize
nodesize =c(10,20,50,80,100,200)
n=length(nodesize)
length(rf_model_tr[[i]]) <- n
for ( j in 1: length (nodesize))
{
rf_model_tr[[i]][[j]] = list()
rfPred[[i]][[j]]=list()
pred[[i]][[j]]=list()
roundpred[[i]][[j]]=list()
cTab[[i]][[j]]=list()
Acc[[i]][[j]]=list()
pred.to.roc[[i]][[j]]=list()
pred.rocr[[i]][[j]]=list()
perf.rocr[[i]][[j]]=list()
AUC[[i]][[j]]=list()
Var_imp[[i]][[j]]=list()
## Tune mrty
mrtysize =c(2,3,4)
m=length(mrtysize)
length(rf_model_tr[[i]][[j]]) <- m
for ( k in 1: length (mrtysize))
{
rf_model_tr[[i]][[j]][[k]] = list()
rfPred[[i]][[j]][[k]]=list()
pred[[i]][[j]][[k]]=list()
roundpred[[i]][[j]][[k]]=list()
cTab[[i]][[j]][[k]]=list()
Acc[[i]][[j]][[k]]=list()
pred.to.roc[[i]][[j]][[k]]=list()
pred.rocr[[i]][[j]][[k]]=list()
perf.rocr[[i]][[j]][[k]]=list()
AUC[[i]][[j]][[k]]=list()
Var_imp[[i]][[j]][[k]]=list()
## Tune maxdepth
maxdep =c(10,20,30)
z=length(maxdep)
length(rf_model_tr[[i]][[j]][[k]]) <- z
for (l in 1:length (maxdep))
{
rf_model_tr[[i]][[j]][[k]][[l]] = list()
rfPred[[i]][[j]][[k]][[l]]=list()
pred[[i]][[j]][[k]][[l]]=list()
roundpred[[i]][[j]][[k]][[l]]=list()
cTab[[i]][[j]][[k]][[l]]=list()
Acc[[i]][[j]][[k]][[l]]=list()
pred.to.roc[[i]][[j]][[k]][[l]]=list()
pred.rocr[[i]][[j]][[k]][[l]]=list()
perf.rocr[[i]][[j]][[k]][[l]]=list()
AUC[[i]][[j]][[k]][[l]]=list()
Var_imp[[i]][[j]][[k]][[l]]=list()
## Tune cp
cp =c(0,0.01,0.001)
p=length(cp)
length(rf_model_tr[[i]][[j]][[k]][[l]]) <- p
for (m in 1:length (cp))
{
rf_model_tr[[i]][[j]][[k]][[l]][[m]]= randomForest (as.factor(class) ~.
, data=train,mtry=mrtysize[[k]],maxDepth = maxdep[[l]], replace=F, importance=T, do.trace=10, ntree=200,nodesize=nodesize[j],cp=cp[[m]])
#Accuracy
rfPred[[i]][[j]][[k]][[l]][[m]] <- predict(rf_model_tr[[i]][[j]][[k]][[l]][[m]], train, type = "prob")
pred[[i]][[j]][[k]][[l]][[m]] <- colnames(rfPred[[i]][[j]][[k]][[l]][[m]] )[apply(rfPred[[i]][[j]][[k]][[l]][[m]] ,1,which.max)]
cTab[[i]][[j]][[k]][[l]][[m]] = table(pred[[i]][[j]][[k]][[l]][[m]],train$class)
Acc[[i]][[j]][[k]][[l]][[m]]<- sum(diag(cTab[[i]][[j]][[k]][[l]][[m]])) / sum(cTab[[i]][[j]][[k]][[l]][[m]])
#AUC
pred.to.roc[[i]][[j]][[k]][[l]][[m]]<-rfPred[[i]][[j]][[k]][[l]][[m]][,2]
pred.rocr[[i]][[j]][[k]][[l]][[m]]<-prediction(pred.to.roc[[i]][[j]][[k]][[l]][[m]],as.factor(train$class))
perf.rocr[[i]][[j]][[k]][[l]][[m]]<-performance(pred.rocr[[i]][[j]][[k]][[l]][[m]],measure="auc",x.measure="cutoff")
AUC[[i]][[j]][[k]][[l]][[m]]<-as.numeric(perf.rocr[[i]][[j]][[k]][[l]][[m]]#y.values)
#Variable Importance
Var_imp[[i]][[j]][[k]][[l]][[m]]<-(importance(rf_model_tr[[i]][[j]][[k]][[l]][[m]],type=2))
ModelInfo[w,1]<-w
ModelInfo[w,2]<-nodesize[[j]]
ModelInfo[w,3]<-mrtysize[[k]]
ModelInfo[w,4]<-maxdep[[l]]
ModelInfo[w,5]<-cp[[m]]
ModelInfo[w,6]<-Acc[[i]][[j]][[k]][[l]][[m]]
ModelInfo[w,7]<-AUC[[i]][[j]][[k]][[l]][[m]]
w=w+1
}
}
}
}
}
Basically ,what i am doing is that i am creating all possible model variations with one dataset based on the available tuning parameters for a random forest (nodesize,cp ect) and storing that information to the table model info as every iteration goes by. In addition i add measures like accuracy and AUC, so as to compare the different models created in the end and make a pick.
The reason i am looking for an alternative, is that the caret package offers me only to tune the mtry allthough there i do have the chance to run parRF which could solve my problem, but i prefer to incorporate something here, how would that be possible?
I have read about the foreach and doParallel packages but i dont quite get how this could be syntaxed here.
If the initial data is needed please let me know, i just thought at this point to show the part that neeeds to be parallel computed.
Thank you in advance
Hi I normally just code everything manually. In linux/mac I use parallel package and mclapply which can use memory forking. Forking processes use less memory and are faster to start up. Windows do not support forking thus I use doParallel package (other packages could do also). the foreach() function is a user friendly parallel mapper. I find myself to spend more time setting up single PC parallel computing than saving from speed-up. Still fun :)
If you work on a university, you may have access to a large cluster. The BatchJobs package is another mapper which can use many different backends, e.g. a Torque/PBS que system. I can borrow 80 nodes with 4 CPU's giving me a potential 320 times speedup (more like 150 times in practice). I learned about BatchJobs from this great introduction. I like that BatchJobs also can run single or multi-core locally, which is much easier to debug.
The code below introduces how to create a list of jobs with both foreach and BatchJobs. Each job is a set of arguments. The job arguments are fused with standard arguments and a model is trained. Some statistics is returned and all results and arguments are combined into a data.frame.
useForeach = FALSE #If FALSE, will run as batchjobs. Only faster for cluster computing.
library(randomForest)
#load a data set
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
download.file(url,destfile="winequality-white.csv",mode="w")
wwq = read.csv(file="winequality-white.csv",header=T,sep=";")
X = wwq[,names(wwq) != "quality"]
y = wwq[,"quality"]
#2 - make jobs
pars = expand.grid(
mtry = c(1:3),
sampsize = floor(seq(1000,1898,length.out = 3)),
nodesize = c(1,3)
)
jobs = lapply(1:dim(pars)[1], function(i) pars[i,])
#3 - make node function there will excute a number of jobs
test.pars = function(someJobs,useForeach=TRUE) {
#if running cluster, global environment imported manually
if(!useForeach) load(file="thisState.rda")
do.call(rbind,lapply(someJobs,function(aJob){ #do jobs and bind results by rows
print(aJob)
merged.args = c(alist(x=X,y=y),as.list(aJob)) #merge std. and job args
run.time = system.time({rfo = do.call(randomForest,merged.args)}) #run a job
data.frame(accuracy=tail(rfo$rsq,1),run.time=run.time[3],mse=tail(rfo$mse,1))
}))
}
##test function single core
jobsToTest = 1:5
out = test.pars(jobs[jobsToTest])
print(cbind(out,do.call(rbind,jobs[jobsToTest])))
#4a execute jobs with foreach package:
if(useForeach) {
library(foreach)
library(doParallel)
CPUs=4
cl = makeCluster(CPUs)#works both for windows and linux, otherwise forking is better
registerDoParallel(cl)
nodes=min(CPUs,length(jobs)) #how many splits of jobList, not so important for foreach...
job.array = suppressWarnings(split(jobs,1:nodes)) #split warns if each core cannot get same amount of jobs
out = foreach(i=job.array,.combine=rbind,.packages="randomForest") %dopar% test.pars(i)
stopCluster(cl)
} else {
library(BatchJobs)
#4b - execute jobs with BatchJobs package (read manual how to set up on cluster)
nodes=min(80,length(jobs)) # how many nodes to split job onto
job.array = split(jobs,1:nodes)
save(list=ls(),file="thisState.rda") #export this state(global environment) to every node
#initiate run
reg = makeRegistry(id ="myFirstBatchJob",packages="randomForest")
batchMap(reg,fun=test.pars,someJobs = job.array,more.args=list(useForeach=FALSE))
submitJobs(reg)
waitForJobs(reg)
out = loadResults(reg)
#6- wrap up save filnalResults to user
finalResult = cbind(do.call(rbind,jobs),do.call(rbind,out))
save(out,file="finalResult.rda")
removeRegistry(reg,ask="no")
}
#7- print final result
print(cbind(do.call(rbind,jobs),out))

Resources