I'm new to parallelising for loops with foreach and am struggling to understand how it works. As an exercise, I created a simple list (input2) from a data frame (input), and I try to calculate b by looping over h and j.
library(doParallel)
library(foreach)
library(dplyr)
input <- data.frame(matrix(rnorm(200*200, 0, .5), ncol = 200))
input[input <= 0] <- 0
input['X201'] <- seq(from = 0, to = 20, length.out = 10)
input <- input %>% select(c(X201, 1:200))
input2 <- split(input, f = input$X201)
a <- 0
b <- 0
cl <- parallel::makeCluster(20)
doParallel::registerDoParallel(cl)
tm1 <- system.time(
  y <-
    foreach(h = length(input2), .combine = 'cbind') %:%
      foreach(j = nrow(input2[[h]]), .combine = 'c', .packages = 'foreach') %dopar% {
        a = input2[[h]][j, 3]
        b = b + a
      }
)
parallel::stopCluster(cl)
registerDoSEQ()
print("Cluster stopped.")
y is about 0.55 (the exact value depends on the random numbers generated), which is the value of input2[[10]][20,3], not the cumulative sum I wanted. I checked the manual of the foreach package but am still not sure I fully understand how foreach works.
R's foreach returns its results rather than letting you modify variables outside the loop, so don't expect a and b to be updated correctly.
Try the following
cl <- parallel::makeCluster(20)
doParallel::registerDoParallel(cl)
tm2 <- system.time({
  results <- foreach(h = 1:length(input2), .combine = "c") %dopar% {
    sum(input2[[h]][1:nrow(input2[[h]]), 3])
  }
  b <- sum(results)
})
parallel::stopCluster(cl)
registerDoSEQ()
b
tm2
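Since the per-group sums are simply added together, the same result can also be obtained with a single foreach by letting .combine do the accumulation. A minimal variation (my sketch, not part of the answer above; the cluster size is arbitrary):
cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl)

# Each task returns the column sum for one element of input2;
# .combine = "+" adds the pieces together as they arrive.
b2 <- foreach(h = seq_along(input2), .combine = "+") %dopar% {
  sum(input2[[h]][, 3])
}

parallel::stopCluster(cl)
registerDoSEQ()
all.equal(b, b2)  # should be TRUE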
I'm carrying out several independent (random) experiments on an igraph graph. I'd like to run them in parallel.
An experiment has several iterations (a for loop), and each iteration consists of a certain number of parallelized tasks (foreach, %dopar%). The results of all iterations (and thus the results of the experiment) are stored in a list. This concludes the experiment.
I've tried using the %:% operator along with %dopar%, as per "R - is there a way to create a nested %dopar% foreach loop?" and https://cran.r-project.org/web/packages/foreach/vignettes/nested.html. But my situation is different because a for loop (over iterations) sits between the outer loop (over experiments) and the inner foreach %dopar% loop, so I don't know what to do.
My code looks like this:
library(igraph)
library(foreach)
library(doParallel)
nb_trials <- 15
nb_iterations <- 60
nb_tasks <- 10
K <- 0.05
n.cores <- parallel::detectCores() - 1
my.cluster <- parallel::makeCluster(
  n.cores,
  type = "PSOCK"
)
doParallel::registerDoParallel(cl = my.cluster)
# Utility function for combining foreach outputs
comb <- function(x, ...) {
  lapply(seq_along(x),
         function(i) c(x[[i]], lapply(list(...), function(y) y[[i]])))
}
results_per_trial <- vector("list", nb_trials)
for (trial in 1:nb_trials) {
  initial_graph <- make_empty_graph()
  initial_graph <- initial_graph + vertices(name = c("A", "B", "C", "D"))
  initial_graph <- initial_graph + edges(c("A", "B", "A", "C", "A", "D"),
                                         attr1 = c(0, 0, 0), attr2 = c(0, 0, 0))
  results_per_iteration <- vector("list", nb_iterations)
  for (iter in 1:nb_iterations) {
    performance <- foreach(task = 1:nb_tasks, .combine = "comb", .packages = "igraph",
                           .multicombine = TRUE, .init = list(list(), list())) %dopar% {
      current_graph <- initial_graph
      edge_attr(current_graph, "attr1", sample(3, 1)) <- rnorm(1)
      edge_attr(current_graph, "attr2", sample(3, 1)) <- runif(1)
      list(E(current_graph)$attr1, E(current_graph)$attr2)
    }
    edge_attr(initial_graph, "attr1") <- (1 - K)*E(initial_graph)$attr1 + Reduce('+', performance[[1]])
    edge_attr(initial_graph, "attr2") <- (1 - K)*E(initial_graph)$attr2 + Reduce('+', performance[[2]])
    results_per_iteration[[iter]] <- performance
  }
  results_per_trial[[trial]] <- results_per_iteration
}
I'd like to replace the "for (trial in 1:nb_trials)" loop with a foreach %dopar% one. Is this possible?
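One possible direction, offered only as a sketch (not a tested drop-in replacement): since the trials are independent while the iterations within a trial are not, parallelise at the trial level and run the iteration and task loops sequentially inside each worker. This reuses the objects defined above (my.cluster, nb_trials, nb_iterations, nb_tasks, K); for reproducible random numbers across workers you would additionally want something like the doRNG package.
# Sketch: parallelise over trials, keep iterations and tasks sequential per worker.
results_per_trial <- foreach(trial = 1:nb_trials, .packages = "igraph") %dopar% {
  initial_graph <- make_empty_graph()
  initial_graph <- initial_graph + vertices(name = c("A", "B", "C", "D"))
  initial_graph <- initial_graph + edges(c("A", "B", "A", "C", "A", "D"),
                                         attr1 = c(0, 0, 0), attr2 = c(0, 0, 0))
  results_per_iteration <- vector("list", nb_iterations)
  for (iter in 1:nb_iterations) {
    # The tasks now run sequentially inside this worker; the parallelism
    # lives at the trial level instead.
    performance_tasks <- lapply(1:nb_tasks, function(task) {
      current_graph <- initial_graph
      edge_attr(current_graph, "attr1", sample(3, 1)) <- rnorm(1)
      edge_attr(current_graph, "attr2", sample(3, 1)) <- runif(1)
      list(E(current_graph)$attr1, E(current_graph)$attr2)
    })
    # Reshape into the same two-element structure that comb() produced above
    performance <- list(lapply(performance_tasks, `[[`, 1),
                        lapply(performance_tasks, `[[`, 2))
    edge_attr(initial_graph, "attr1") <- (1 - K)*E(initial_graph)$attr1 + Reduce('+', performance[[1]])
    edge_attr(initial_graph, "attr2") <- (1 - K)*E(initial_graph)$attr2 + Reduce('+', performance[[2]])
    results_per_iteration[[iter]] <- performance
  }
  results_per_iteration
}
parallel::stopCluster(my.cluster)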
I am trying to convert the following for loop to foreach to take advantage of parallel execution.
dt <- data.frame(t(data.frame(a = sample(1:10, 10), b = sample(1:10, 10),
                              c = sample(1:10, 10), d = sample(1:10, 10))))
X <- as.matrix(dt)
c <- ncol(X)
itemnames <- names(dt)
sm <- matrix(0, c, c)
colnames(sm) <- itemnames
row.names(sm) <- itemnames
for (j in 1:c) {
  ind <- setdiff(1:c, j)
  print(ind)
  print(j)
  sm[j, ind] <- sign(X[j] - X[ind])
  print(sm[j, ind])
}
cvec = 1:c
r = foreach(d = cvec, .combine = rbind) %dopar% {
ind = setdiff(1:10,d)
sm[d,ind]=sign(X[d]-X[ind])
}
With the for loop I get a 10*10 matrix where the sign function above fills the off-diagonal elements and the diagonal elements are 0.
But with foreach I get a 10*9 matrix; the diagonal elements are missing and everything else is the same.
Please help me get the same output as the for loop. Thanks in advance.
I am not sure what you are trying to achieve here, since you are only using the first ten elements of your matrix. This can be done without any loops:
sign(outer(X[1:10], X[1:10], FUN = "-"))
In addition, I am not sure that parallel processing will be faster for this kind of problem, even assuming that the real case is much bigger. But if you want to use foreach, you should not assign to the global sm within the loop but instead return a suitable vector at the end:
foreach(d = cvec, .combine = rbind) %dopar% {
  ind <- setdiff(cvec, d)
  res <- rep(0, c)                 # a row of zeros keeps the diagonal at 0
  res[ind] <- sign(X[d] - X[ind])
  res                              # return the row; .combine = rbind stacks them
}
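For example, a small usage sketch (my addition, assuming the objects from the question): register a backend, capture the combined rows in r, and check the result against the for-loop matrix sm.
library(doParallel)            # also attaches foreach
cl <- parallel::makeCluster(2)
registerDoParallel(cl)

r <- foreach(d = cvec, .combine = rbind) %dopar% {
  ind <- setdiff(cvec, d)
  res <- rep(0, c)
  res[ind] <- sign(X[d] - X[ind])
  res
}
parallel::stopCluster(cl)

all.equal(unname(r), unname(sm))            # should be TRUE
dimnames(r) <- list(itemnames, itemnames)   # label like the original sm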
If you want to assign to a matrix in parallel, you'll need a shared matrix:
# devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
sm <- FBM(c, c)
library(foreach)
cl <- parallel::makeCluster(3)
doParallel::registerDoParallel(cl)
r <- foreach(d = cvec, .combine = c) %dopar% {
  ind <- setdiff(1:10, d)
  sm[d, ind] <- sign(X[d] - X[ind])   # writes into the shared FBM
  NULL                                # return nothing; the side effect is the point
}
parallel::stopCluster(cl)
sm[]
I wonder whether I can use parallel computing with JAGS in the way I want.
Here is my R script.
library(foreach)
list.data2 <- foreach(i=1:n.rep) %do% {
foreach(j=1:2) %do% {list( cap = cap_data[[i]][[j]],
loc = loc_data[[i]][[j]],
eff = eff_data[[i]][[j]],
trap.numb = trap.numb2,
av = av,
forest = env$forest,
crop = env$crop,
bamboo = env$bamboo,
grass = env$grass,
abandoned = env$abandoned,
city = env$city,
rate = env$for_cr_rate,
m.numb = m.numb,
ones = matrix( 1, m.numb, 5 )
) #,bound_mat=bound_mat,bound_numb=bound_numb
}
}
inits2 <- foreach(j=1:2) %do% {list( n=n.inits2[[j]],
b0=0.5, b1=0.1, b2=0.1, b3=0.1, b4=0.1, b5=0.1, b6=0.1,
a0=5, a1=0.5, a2=0.5, a3=0.5, a4=0.5, a5=0.5, a6=0.5,
sd=1,
err=rep(0,m.numb),
r_capt=0.10
)
}
para2 <- c("a0","a1","a2","a3","a4", "a5","a6",
"b0","b1","b2","b3","b4", "b5","b6", "n28", "n29", "r_capt")
library(R2jags)
start.time <- Sys.time()
install.packages("doParallel")
library(doParallel)
registerDoParallel(cores=6)
x_real2 <- foreach( i = 1:2,
.packages = "R2jags"
) %dopar% {jags( "realdata_5years.txt",
data = list.data2[[i]][[?]],
inits = inits2[[i]],
para = para2,
n.chain = 3,
n.iter = n.1000000,
n.burnin = 400000,
n.thin = 200
)
}
sum_real2 <- foreach(i = 1:2) %do% {x_real2[[i]]$BUGSoutput$summary}
---------------------------------------------------------------------
So, I have two data sets, and each has 30 (== n.rep) repetitions.
Therefore I have 60 data lists in total.
I would like to use six cores: the 2 data sets in parallel, each running its 3 MCMC chains.
Moreover, I need to repeat this calculation 30 (== n.rep) times.
However, I don't know how to write it this way; my problem is in the last four lines.
Should I use %dopar% twice, or should I use jags.parallel in addition to foreach?
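One possible structure, offered only as a sketch (it reuses list.data2, inits2 and para2 from above and assumes the n.1000000 above was meant to be 1000000): parallelise over the n.rep replicates and the 2 data sets with a nested foreach, and let each jags() call run its three chains sequentially. With six registered workers, six jags() runs execute at a time; calling jags.parallel() inside a %dopar% worker would spawn processes from within worker processes, which is harder to control.
library(foreach)
library(doParallel)
library(R2jags)

registerDoParallel(cores = 6)

x_real2 <- foreach(i = 1:n.rep) %:%
  foreach(j = 1:2, .packages = "R2jags") %dopar% {
    jags(data               = list.data2[[i]][[j]],
         inits              = inits2[[j]],
         parameters.to.save = para2,
         model.file         = "realdata_5years.txt",
         n.chains           = 3,
         n.iter             = 1000000,  # assuming n.1000000 above meant 1000000
         n.burnin           = 400000,
         n.thin             = 200)
  }

# x_real2[[i]][[j]] is the fit for replicate i of data set j
sum_real2 <- lapply(x_real2, function(rep_fits)
  lapply(rep_fits, function(fit) fit$BUGSoutput$summary))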
I have a set of ratings for 45,000 users and 40-odd movies. I need to predict new ratings for each user based on their Pearson correlation with other users. I also need to store the set of similar users and their similarities for each user-movie combination. I am using the foreach package to execute the loops in parallel. The code that I have managed to write is this:
library(foreach)
library(doParallel)
x <- matrix(rnorm(1:1000), nrow = 100 , ncol =10 )
df = list()
# correlation matrix
cor_mat <- cor(t(x))
cor_mat = abs(cor_mat)
# similarity limits
upper = 1
lower = 0.04
# Initiating parallel environment
cl = makeCluster(3)
registerDoParallel(cl)
res <- foreach(i = 1:nrow(x) , .combine = rbind,.packages= c('base','foreach')) %dopar%{
foreach(j = 1:ncol(x) , .combine = c, .packages = c('base','foreach')) %do%{
sim_user = which(cor_mat[i,] >= lower & cor_mat[i,] < upper)
bx = as.numeric(t(x[sim_user,j]) %*%
cor_mat[sim_user,j]/sum(cor_mat[sim_user,j]))
df[[length(df)+1]] = data.frame(i,j,sim_user,cor_mat[sim_user,j])
return(bx)
}
}
stopCluster(cl)
I am able to accomplish half of my task, i.e. creating a matrix of predicted ratings from the foreach output res. But my list df, to which I append the similar users, is empty at the end of the foreach loop.
What customized combine function can be written to output both the matrix of predicted ratings and the list of similar users?
When a loop body has multiple outputs, it is always better to return everything inside a list. In that case, it means that you need to specify your own functions to combine the data. Here, each iteration returns two elements: bx and df. My combine functions therefore combine each of those two elements separately and return them in a length-2 list.
# Combine results across j (columns): concatenate the bx values and the
# lists of data frames.
combine_custom_j <- function(LL1, LL2) {
  bx <- c(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}
# Combine results across i (rows): stack the bx vectors into a matrix and
# concatenate the lists of data frames.
combine_custom_i <- function(LL1, LL2) {
  bx <- rbind(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}
res <- foreach(i = 1:nrow(x) , .combine = combine_custom_i,.packages= c('base','foreach')) %dopar%{
foreach(j = 1:ncol(x) , .combine = combine_custom_j, .packages = c('base','foreach')) %do%{
sim_user = which(cor_mat[i,] >= lower & cor_mat[i,] < upper)
bx = as.numeric(t(x[sim_user,j]) %*%
cor_mat[sim_user,j]/sum(cor_mat[sim_user,j]))
# Wrap the data frame in list() so that c() in the combine functions
# builds a list of data frames instead of splicing their columns
return(list(bx = bx, df = list(data.frame(i, j, sim_user, cor_mat[sim_user, j]))))
}
}
Although I have returned your data frames in a list, as your code suggested, I believe you might want to rbind them into a single data frame? In that case, you can simply call do.call(rbind, res$df) on the final result.
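For example, a small usage sketch (my addition, assuming res was produced by the nested foreach above):
pred_ratings  <- res$bx                  # matrix of predicted ratings (one row per user)
similar_users <- do.call(rbind, res$df)  # all similar-user records in one data frame
dim(pred_ratings)
head(similar_users)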
A part of the code is
sse <-c()
k <- c()
for (i in seq(3, 15, 1)) {
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainLabels, k=i)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i] <- sum((newdata.trainLabels-pred_y)^2)
k[i] <- i
}
pred_y is a column for each i. I want to create a data frame with all 13 columns. Can this be done using a for loop, or how else can it be accomplished? I need suggestions.
You can use foreach, which has the added advantage that it can be run in parallel if you have multiple cores in your CPU. Here is the non-parallel code:
library("iterators")
library("foreach")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values),.combine = cbind) %do%
{
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainlabels, k=i, prob = TRUE)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i - start] <- sum((as.numeric(newdata.trainlabels)-pred_y)^2)
pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
Here is the parallel version:
# Parallel version
library("iterators")
library("foreach")
library("parallel")
library("doParallel")
library("FNN")
data(iris3)
newdata.training <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
newdata.test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
newdata.trainlabels <- factor(c(rep(1,25), rep(2,25), rep(3,25)))
num.cores = detectCores()
clusters <- makeCluster(num.cores)
registerDoParallel(clusters)
k.values = seq(3, 15, 1)
start = 2 # to index sse array using k.values
sse = numeric(length = length(k.values))
results = foreach(i = iter(k.values),.combine = cbind, .packages=c("FNN")) %dopar%
{
y_pred <-knn(train = newdata.training, test = newdata.test,
cl = newdata.trainlabels, k=i, prob = TRUE)
pred_y <- as.numeric(levels(y_pred)[y_pred])
sse[i - start] <- sum((as.numeric(newdata.trainlabels)-pred_y)^2)
pred_y
}
results1 = data.frame(results)
colnames(results1) = k.values
stopCluster(clusters)
There are only a few differences between the non-parallel code and the parallel code. First, there are additional libraries to load. Second, you need to create and register the cluster that will do the parallel computation (and stop it when you are done). Third, foreach uses the %dopar% infix operator instead of %do%. Fourth, the foreach call needs the .packages argument so that the FNN package (which provides knn) is loaded on each of the workers.
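One caveat worth noting (my observation, not part of the answer above): in the parallel version the sse[i - start] assignments happen inside the workers, so for the same reason discussed in the first answer on this page, sse is not filled in in your session. A small sketch of one way around this is to recompute the SSE from the returned prediction columns afterwards:
# Recompute the SSE per k from the prediction columns returned by foreach,
# instead of relying on assignments made inside the parallel workers.
true_labels <- as.numeric(newdata.trainlabels)
sse <- apply(results, 2, function(pred_y) sum((true_labels - pred_y)^2))
names(sse) <- k.values
sse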