This is my first question here, so I hope I'm doing it right.
I'm trying to run a variant of random forest called Geographical Regression Forest (from the SpatialML package). To train the models, I'm running a foreach loop in parallel and taking a sample with replacement from the calibration data on each iteration.
library(SpatialML)
library(doParallel)
rm(list=ls())
ds <- SpatialML::random.test.data()
ds
# Parallel settings
ncores <- detectCores() - 1
cl <- makePSOCKcluster(ncores)
y <- names(ds)[1]
x <- paste(names(ds)[c(2,3)], collapse = "+")
f <- as.formula(paste0(y,"~",x));f
clusterEvalQ(cl, expr = library(SpatialML))
clusterExport(cl, list("ds"))
#### Bootstraps ####
registerDoParallel(cl)
time <- system.time(foreach (i = 1:10) %dopar% {
# sample with replacement
trainingREP <- sample.int(nrow(ds), 1*nrow(ds), replace = T)
# Geographical Regression Forest
grf.boot <- grf(formula = f, dframe = ds[trainingREP, ],
bw = round(nrow(ds)/10, digits = 0), kernel = "adaptive",
coords = ds[trainingREP, c(4,5)], ntree = 500, importance = T)
# Save GRF
modelFile <- paste("./bootModGRF_",i,".rds",sep="")
saveRDS( object = grf.boot, file = modelFile)
})
stopCluster(cl)
But when I run this code I get:
Error in { : task 1 failed - "object 'trainingREP' not found"
Why can't the foreach loop read an object that is created within the same loop?
These are the problem instructions I was given:
Build a K-NN classifier, use 5-fold cross-validation to evaluate its performance based on average accuracy.
Report accuracy measure for k = 2, ..., 10
Write your code below (Hint: you need a loop within a loop, with the outer loop going through each value of k and inner loop going through each fold):
You can manually try k = 2, ..., 10, but try to use an outer loop through each value of k.
I was given two for loops, listed below: one for creating folds and the other for looping over k = 1:10.
# Given data
library(datasets)
data(iris)
library(dplyr)
library(caret)  # createFolds(), confusionMatrix()
library(rpart)  # rpart()
# (IrisData and its "class" column below refer to the iris data as set up in the course materials)
normalize = function(x){
return ((x - min(x))/(max(x) - min(x)))}
# normalize
Iris_normalized = IrisData %>% mutate_at(1:4, normalize)
# Create folds
cv = createFolds(y = IrisData$class, k = 5)
accuracy = c()
for (test_rows in cv) {
IrisData_train = IrisData[-test_rows,]
IrisData_test = IrisData[test_rows,]
tree = rpart(class ~ ., data = IrisData_train,
method = "class", parms = list(split = "information"))
pred_tree = predict(tree, IrisData_test, type = "class")
cm = confusionMatrix(pred_tree, IrisData_test[,5])
accuracy = c(accuracy, cm$overall[1])
}
print(mean(accuracy))
# Manual K validation
SSE_curve <- c()
for (k in 1:10) {
print(k)
kcluster = kmeans(utility_normalized, center = k)
sse = kcluster$tot.withinss
print(sse)
SSE_curve[k] = sse
}
So if I am understanding the instructions correctly, I need to:
Create 5 folds using normalized data with a for loop and set.seed.
Use a for loop to find the accuracy in k=1:10 for each fold.
I am not sure how these two for loops combine to give the result described in the instructions.
I imagine the code you provided is just an example, and this question sounds a lot like a student homework problem; you should at least show your effort so far.
However, here are two possible solutions:
1) Two nested for loops:
library(class)
library(caret)  # createFolds(), confusionMatrix()
library(dplyr)
data("iris")
normalize = function(x){
return ((x - min(x))/(max(x) - min(x)))}
# normalize
Iris_normalized = iris %>% mutate_at(1:4, normalize)
av_accuracy <- as.list(2:10)
for (k in 2:10) {
set.seed(4)
cv <- createFolds(y = Iris_normalized$Species, k = 5)
accuracy <- c()
for (i in cv) {
IrisData_train = Iris_normalized[-i,]
IrisData_test = Iris_normalized[i,]
tree <- knn(IrisData_train[,-5],IrisData_test[,-5],cl=IrisData_train$Species,k=k)
cm <- confusionMatrix(tree, IrisData_test[,5])
accuracy <- c(accuracy, cm$overall[1])
}
av_accuracy[[k-1]] <- mean(accuracy)
}
results <- data.frame(k=2:10,mean.accuracy=unlist(av_accuracy))
2) Using the caret framework, which is built exactly for this kind of task:
control <- trainControl(method = "cv", number = 5, returnResamp = "all")
grid <- expand.grid(k=2:10)
fit <-
train(
Species ~ .,
data = Iris_normalized,
trControl = control,
tuneGrid = grid,
method = "knn"
)
fit$results
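As a usage note (my addition, not part of the original answer), the fitted train object also exposes the tuning summary through standard caret accessors:
fit$bestTune   # the k with the highest cross-validated accuracy
plot(fit)      # mean accuracy as a function of k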
I wrote the R function below to do the following tasks:
Simulate a time series data set of length 10 from an ARIMA model through the arima.sim() function.
Split the series into sub-series (blocks) of sizes 2, 3, 4, 5, 6, 7, 8, and 9.
For each block size, resample the blocks with replacement to form a new series and obtain the best ARIMA model for that series through the auto.arima() function.
Obtain the RMSE for each block size.
## Load packages and prepare multicore process
library(forecast)
library(future.apply)
plan(multisession)
library(parallel)
library(foreach)
library(doParallel)
n_cores <- detectCores()
cl <- makeCluster(n_cores)
registerDoParallel(cores = detectCores())
## simulate ARIMA(1,0, 0)
#n=10; phi <- 0.6; order <- c(1, 0, 0)
bootstrap1 <- function(n, phi){
ts <- arima.sim(n, model = list(ar=phi, order = c(1, 0, 0)), sd = 1)
########################################################
## create a vector of block sizes
t <- length(ts) # the length of the time series
lb <- seq(n-2)+1 # vector of block sizes to be 1 < l < n (i.e to be between 1 and n exclusively)
########################################################
## This section create matrix to store block means
BOOTSTRAP <- matrix(nrow = 1, ncol = length(lb))
colnames(BOOTSTRAP) <-lb
########################################################
## This section use foreach function to do detail in the brace
BOOTSTRAP <- foreach(b = 1:length(lb), .combine = 'cbind') %do%{
l <- lb[b]# block size at each instance
m <- ceiling(t / l) # number of blocks
blk <- split(ts, rep(1:m, each=l, length.out = t)) # divides the series into blocks
######################################################
res<-sample(blk, replace=T, 10) # resamples the blocks
res.unlist <- unlist(res, use.names = FALSE) # unlist the bootstrap series
train <- head(res.unlist, round(length(res.unlist) - 10)) # Train set
test <- tail(res.unlist, length(res.unlist) - length(train)) # Test set
nfuture <- forecast::forecast(train, model = forecast::auto.arima(train), lambda = 0, biasadj = TRUE, h = length(test))$mean # forecast of the test set
RMSE <- Metrics::rmse(test, nfuture) # RETURN RMSE
BOOTSTRAP[b] <- RMSE
}
BOOTSTRAPS <- matrix(BOOTSTRAP, nrow = 1, ncol = length(lb))
colnames(BOOTSTRAPS) <- lb
BOOTSTRAPS
return(list(BOOTSTRAPS))
}
If the function is called as below:
bootstrap1(10, 0.6)
I get the following result:
##$BOOTSTRAPS
## 2 3 4 5 6 7 8 9
##[1,] 1.287224 2.264574 2.998069 2.349261 1.677791 1.183126 2.021157 1.357658
Here is my attempt to use the MonteCarlo package to make my function run three (3) different and distinct times:
param_list=list("n"=10, "phi"=0.6)
library(MonteCarlo)
MC_result<-MonteCarlo(func = bootstrap1, nrep=3, param_list = param_list)
I got the following error message:
Error in MonteCarlo(func = bootstrap1, nrep = 3, param_list = param_list) :
func has to return a list with named components. Each component has to be scalar.
Please help me figure out what I did wrong, either in my function or in the MonteCarlo() call.
Based on the error message, I would try replacing the end of your function with something like:
names(BOOTSTRAPS) <- letters[seq_along(BOOTSTRAPS)]
return(as.list(BOOTSTRAPS))
Then the resulting output is a named list with one scalar component per block size, which is what MonteCarlo() expects.
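For concreteness, here is a minimal sketch (my own, not part of the original answer) of how the tail of bootstrap1() could look with that change; the rmse_ prefix is an arbitrary choice, and lb is the vector of block sizes already defined inside the function:
  # end of bootstrap1(): return one named scalar per block size
  BOOTSTRAPS <- as.vector(BOOTSTRAP)
  names(BOOTSTRAPS) <- paste0("rmse_", lb)
  return(as.list(BOOTSTRAPS))
}
# then call it exactly as in the question
param_list <- list("n" = 10, "phi" = 0.6)
MC_result <- MonteCarlo(func = bootstrap1, nrep = 3, param_list = param_list)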
I use foreach + doParallel to apply a function to each row of a matrix in parallel in R. When the matrix has many rows, foreach takes a long time before and after actually running the iterations in parallel.
For example, if I run:
library(foreach)
library(doParallel)
doWork <- function(data) {
# setup parallel backend to use many processors
cores=detectCores()
number_of_cores_to_use = cores[1]-1 # not to overload the computer
cat(paste('number_of_cores_to_use:',number_of_cores_to_use))
cl <- makeCluster(number_of_cores_to_use)
# clusterExport() is not needed here: the loop body uses no objects from the calling environment
registerDoParallel(cl)
cat('...Starting foreach initialization')
output <- foreach(i=1:length(data[,1]), .combine=rbind) %dopar% {
cat(i)
y = data[i,5]
a = 100
for (i in 1:3) { # Useless busy work
b=matrix(runif(a*a), nrow = a, ncol=a)
}
return(runif(10))
}
# stop cluster
cat('...Stop cluster')
stopCluster(cl)
return(output)
}
r = 100000
c = 10
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data)
output[1:10,]
(CPU usage screenshots, including an annotated version, omitted; 100% means all cores are fully utilized.)
How can I optimize the code so that foreach doesn't take a long time before and after running the iterations in parallel? The main time sink is the time spent after. The time spent after grows significantly with the number of foreach iterations, sometimes making the code as slow as if a simple for loop were used.
Another example (let's assume lm and poly cannot take matrices as arguments):
library(foreach)
library(doParallel)
doWork <- function(data,weights) {
# setup parallel backend to use many processors
cores=detectCores()
number_of_cores_to_use = cores[1]-1 # not to overload the computer
cat(paste('number_of_cores_to_use:',number_of_cores_to_use))
cl <- makeCluster(number_of_cores_to_use)
clusterExport(cl=cl, varlist=c('weights'))
registerDoParallel(cl)
cat('...Starting foreach initialization')
output <- foreach(i=1:nrow(data), .combine=rbind) %dopar% {
x = sort(data[i,])
fit = lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = 2,raw=TRUE), na.action=na.omit, weights=weights)
return(fit$coef)
}
# stop cluster
cat('...Stop cluster')
stopCluster(cl)
return(output)
}
r = 10000
c = 10
weights=runif(c-1)
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data,weights)
output[1:10,]
Try this:
devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
options(bigstatsr.ncores.max = parallel::detectCores())
doWork2 <- function(data, weights, ncores = parallel::detectCores() - 1) {
big_parallelize(data, p.FUN = function(X.desc, ind, weights) {
X <- bigstatsr::attach.BM(X.desc)
output.part <- matrix(0, 3, length(ind))
for (i in seq_along(ind)) {
x <- sort(X[, ind[i]])
fit <- lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = 2, raw = TRUE),
na.action = na.omit, weights = weights)
output.part[, i] <- fit$coef
}
t(output.part)
}, p.combine = "rbind", ncores = ncores, weights = weights)
}
system.time({
data.bm <- as.big.matrix(t(data))
output2 <- doWork2(data.bm, weights)
})
all.equal(output, output2, check.attributes = FALSE)
This is twice as fast on my computer (which has only 4 cores). Remarks:
Using more than half of the cores is often useless.
Your data is not very large, so using a big.matrix may not be useful here.
big_parallelize separates the matrix into ncores blocks of columns, applies your function to each block, and then combines the results.
In the function, it's better to allocate the output before the loop and then fill it than to use a foreach that rbinds all the results (see the sketch after these remarks).
I'm accessing only columns, not rows.
So all of these are good practices, yet they are not really what matters for data of this size. The gain should be higher when using more cores and for larger datasets.
Basically, if you want to be super fast, reimplementing the lm part in Rcpp would be a good solution.
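To make the preallocation remark concrete, here is a minimal sequential sketch (my own illustration, reusing the data and weights objects from the question): the result matrix is allocated once and filled row by row instead of being grown with rbind.
n_rows <- nrow(data)
out <- matrix(NA_real_, nrow = n_rows, ncol = 3)  # 3 coefficients per quadratic fit
for (i in seq_len(n_rows)) {
  x <- sort(data[i, ])
  fit <- lm(x[1:(length(x) - 1)] ~ poly(x[-1], degree = 2, raw = TRUE), weights = weights)
  out[i, ] <- coef(fit)
}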
As F. Privé mentioned in the comment:
The problem is with rbind, I think. rbind-ing lots of values from a list takes a long time. Also, filling rows is bad, because matrices are stored by column. Also, making a long foreach loop is not efficient (use blocks instead).
To use blocks instead (if 5 cores are used, each core receives 20% of the matrix):
library(foreach)
library(doParallel)
array_split <- function(data, number_of_chunks) {
# [Partition matrix into N equally-sized chunks with R](https://stackoverflow.com/a/45198299/395857)
# Author: lmo
rowIdx <- seq_len(nrow(data))
lapply(split(rowIdx, cut(rowIdx, pretty(rowIdx, number_of_chunks))), function(x) data[x, ])
}
doWork <- function(data) {
# setup parallel backend to use many processors
cores=detectCores()
number_of_cores_to_use = cores[1]-1 # not to overload the computer
cat(paste('number_of_cores_to_use:',number_of_cores_to_use))
cl <- makeCluster(number_of_cores_to_use)
clusterExport(cl=cl, varlist=c('weights'))
registerDoParallel(cl)
cat('...Starting array split')
number_of_chunks = number_of_cores_to_use
data_chunks = array_split(data=data, number_of_chunks=number_of_chunks)
degree_poly = 2
cat('...Starting foreach initialization')
output <- foreach(i=1:length(data_chunks), .combine=rbind) %dopar% {
data_temporary = data_chunks[[i]]
output_temporary = matrix(0, nrow=nrow(data_temporary), ncol = degree_poly + 1)
for(j in 1:nrow(data_temporary)) {
x = sort(data_temporary[j,])
fit = lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = degree_poly, raw = TRUE), na.action = na.omit, weights = weights)
output_temporary[j,] = fit$coef
}
return(output_temporary)
}
# stop cluster
cat('...Stop cluster')
stopCluster(cl)
return(output)
}
r = 100000
c = 10
weights=runif(c-1)
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data)
output[1:10,]
FYI:
Partition matrix into N equally-sized chunks with R
Using parLapply and clusterExport inside a function
I have a multiclass classification problem and I'm trying to run the KNN algorithm to find the 50 nearest neighbors around each data point. I have used the FNN package in R; however, it takes a long time since my dataset has about 29 million rows. I was wondering if there is a package in R that can run KNN in parallel. Do you have any suggestions, with an example of its usage?
You can use the following by modifying it according to KNN; if needed, I will provide you with the exact code. The following code is for an SVM classifier:
pkgs <- c('foreach', 'doParallel')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 4)
### PREPARE FOR THE DATA ###
df1 <- read.csv(...... your dataset path........)
## do normalization if needed ##
### SPLIT DATA INTO K FOLDS ###
set.seed(2016)
df1$fold <- caret::createFolds(1:nrow(df1), k = 10, list = FALSE)
### PARAMETER LIST ###
cost <- 10^(-1:4)
gamma <- 2^(-4:-1)
parms <- expand.grid(cost = cost, gamma = gamma)
### LOOP THROUGH PARAMETER VALUES ###
result <- foreach(i = 1:nrow(parms), .combine = rbind) %do% {
c <- parms[i, ]$cost
g <- parms[i, ]$gamma
### K-FOLD VALIDATION ###
out <- foreach(j = 1:max(df1$fold), .combine = rbind, .inorder = FALSE) %dopar% {
deve <- df1[df1$fold != j, ]
test <- df1[df1$fold == j, ]
mdl <- e1071::svm(Classification-type-column ~ ., data = deve, type = "C-classification", kernel = "radial", cost = c, gamma = g, probability = TRUE)
pred <- predict(mdl, test, decision.values = TRUE, probability = TRUE)
data.frame(y = test$DEFAULT, prob = attributes(pred)$probabilities[, 2])
}
### CALCULATE SVM PERFORMANCE ###
roc <- pROC::roc(as.factor(out$y), out$prob)
data.frame(parms[i, ], roc = roc$auc[1])
}
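As a rough adaptation for KNN (my own sketch, not the original answer's exact code): keep the same fold structure and swap the SVM fit for FNN::knn. Here Class is a hypothetical name for the label column, fold is the fold column created above, the predictors are assumed to be numeric, and k.values is an arbitrary set of candidate neighbour counts.
### K-FOLD VALIDATION WITH KNN (sketch) ###
k.values <- c(25, 50, 75)
knn.result <- foreach(k = k.values, .combine = rbind) %do% {
  out <- foreach(j = 1:max(df1$fold), .combine = rbind, .inorder = FALSE, .packages = "FNN") %dopar% {
    deve <- df1[df1$fold != j, ]
    test <- df1[df1$fold == j, ]
    feat <- setdiff(names(df1), c("Class", "fold"))  # predictor columns
    pred <- FNN::knn(train = deve[, feat], test = test[, feat], cl = deve$Class, k = k)
    data.frame(y = test$Class, pred = pred)
  }
  data.frame(k = k, accuracy = mean(as.character(out$pred) == as.character(out$y)))
}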
A part of my code is:
sse <-c()
k <- c()
for (i in seq(3, 15, 1)) {
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainLabels, k=i)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i] <- sum((newdata.trainLabels-pred_y)^2)
k[i] <- i
}
pred_y is a column for each i. I want to create a data frame with all 13 columns. Can this be done using a for loop, or how else can it be accomplished? I need suggestions.
You can use foreach, which has the added advantage that it can be run in parallel if your CPU has multiple cores. Here is the non-parallel code:
library("iterators")
library("foreach")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values),.combine = cbind) %do%
{
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainlabels, k=i, prob = TRUE)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i - start] <- sum((as.numeric(newdata.trainlabels)-pred_y)^2)
pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
Here is the parallel version:
# Parallel version
library("iterators")
library("foreach")
library("parallel")
library("doParallel")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
num.cores = detectCores()
clusters <- makeCluster(num.cores)
registerDoParallel(clusters)
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values),.combine = cbind, .packages=c("FNN")) %dopar%
{
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainlabels, k=i, prob = TRUE)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i - start] <- sum((as.numeric(newdata.trainlabels)-pred_y)^2)
pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
stopCluster(clusters)
There are only a few differences between the non-parallel and the parallel code. First, there are additional libraries to load. Second, you need to create and register the cluster that will do the parallel computation (and stop it when you are done). Third, foreach uses the %dopar% infix operator instead of %do%. Fourth, the foreach call needs the .packages argument so that the FNN package is loaded on each cluster worker. One caveat: assignments to sse inside %dopar% happen on the worker processes and are not copied back to your session, so if you need the SSE values you should return them from the loop body as well.