foreach (parallel) for matrix operation in R

I am trying to convert the following for loop to foreach to take advantage of parallel processing.
dt = data.frame(t(data.frame(a=sample(1:10,10), b=sample(1:10,10), c=sample(1:10,10), d=sample(1:10,10))))
X = as.matrix(dt)
c = ncol(X)
itemnames=names(dt)
sm=matrix(0,c,c)
colnames(sm)=itemnames
row.names(sm)=itemnames
for (j in 1:c) {
  ind = setdiff(1:c, j)
  print(ind)
  print(j)
  sm[j, ind] = sign(X[j] - X[ind])
  print(sm[j, ind])
}
cvec = 1:c
r = foreach(d = cvec, .combine = rbind) %dopar% {
  ind = setdiff(1:10, d)
  sm[d, ind] = sign(X[d] - X[ind])
}
With the for loop I get a 10*10 matrix where the sign function above fills the off-diagonal elements and the diagonal elements stay 0.
But with foreach, I get a 10*9 matrix: it is missing the diagonal elements and everything else is the same.
Please help me get the same output as the for loop. Thanks in advance.

I am not sure what you are trying to achieve here, since you are only using the first ten elements of your matrix. This can be done without any loops:
sign(outer(X[1:10], X[1:10], FUN = "-"))
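A quick check against the for-loop result (sm carries dimnames, so compare values only):
all.equal(unname(sm), sign(outer(X[1:10], X[1:10], FUN = "-")))  # should be TRUE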
In addition, I am not sure that parallel processing will be faster for this kind of problem, even assuming that the real case is much bigger. But if you want to use foreach, you should not assign to the global sm within the loop; instead, return a suitable vector at the end:
foreach(d = cvec, .combine = rbind) %dopar% {
  ind <- setdiff(cvec, d)
  res <- rep(0, 10)
  res[ind] <- sign(X[d] - X[ind])
  res
}
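If that loop is assigned to a variable, say sm_par (a name I'm introducing here), you can check it against the for-loop result and restore the names afterwards:
all.equal(sm_par, sm, check.attributes = FALSE)  # values should match the for-loop result
dimnames(sm_par) <- list(itemnames, itemnames)   # restore the names from the question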

If you want to assign to a matrix in parallel, you'll need a shared matrix:
# devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
sm <- FBM(c, c)
library(foreach)
cl <- parallel::makeCluster(3)
doParallel::registerDoParallel(cl)
r = foreach(d = cvec, .combine = c) %dopar% {
  ind = setdiff(1:10, d)
  sm[d, ind] = sign(X[d] - X[ind])
  NULL
}
parallel::stopCluster(cl)
sm[]
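If you then need an ordinary in-memory matrix with the original names (using a new name, sm_mat, here), a small follow-up:
sm_mat <- sm[]                                   # copy the file-backed matrix back into RAM
dimnames(sm_mat) <- list(itemnames, itemnames)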


How to fill in matrix inside of foreach loop?

I would like to fill in the matrix inside of the parallel loop.
When I call the function, it returns an empty matrix.
I was wondering whether someone could help me with that.
Compute_TaskSimilarity <- function(X, ...) {
  Task_similarity <- matrix(0, nrow = 100, ncol = 100)
  foreach(i = 1:K, .combine = "cbind") %dopar% {
    for (j in (i + 1):(ncol(Task_similarity))) {
      Myvalue <- ComputeValue   # placeholder for the real computation
      if (Myvalue != 0) {
        TaskSimilarity[i, j] <- Myvalue
      } else {
        TaskSimilarity[i, j] <- 0.0
      }
    }
    return(TaskSimilarity)
  }
}
Maybe do something like this: create a data.frame of all combinations of indices, then apply over all combinations, then reshape into a matrix of the right size. (I added a simple multiplication as an example of a more complex operation and used a smaller size of 10):
indices <- data.frame(x = rep(1:10, each = 10), y = rep(1:10, 10))
result <- foreach(i = 1:nrow(indices)) %dopar% {
  # just an example for a more complex calculation
  indices$x[i] * indices$y[i]
}
result <- do.call(c, result)
dim(result) <- c(10, 10)
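For the upper-triangular case in your question, the same idea works if the index table is restricted to pairs with x < y. A sketch (ComputeValue is whatever your real computation is, so a placeholder product is used here, and n stands in for your matrix size):
library(foreach)
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)

n <- 100
# keep only the upper-triangle index pairs
indices <- subset(expand.grid(x = 1:n, y = 1:n), x < y)

vals <- foreach(k = 1:nrow(indices), .combine = c) %dopar% {
  i <- indices$x[k]
  j <- indices$y[k]
  i * j   # placeholder for ComputeValue(i, j)
}

Task_similarity <- matrix(0, nrow = n, ncol = n)
Task_similarity[cbind(indices$x, indices$y)] <- vals   # fill the upper triangle only

stopCluster(cl)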

How can I reduce the time foreach takes before and after going over the iterations in parallel?

I use foreach + doParallel to apply a function to each row of a matrix in parallel in R. When the matrix has many rows, foreach takes a long time before and after going over the iterations in parallel.
For example, if I run:
library(foreach)
library(doParallel)
doWork <- function(data) {
  # setup parallel backend to use many processors
  cores = detectCores()
  number_of_cores_to_use = cores[1] - 1 # not to overload the computer
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  clusterExport(cl = cl, varlist = c('ns', 'weights'))
  registerDoParallel(cl)
  cat('...Starting foreach initialization')
  output <- foreach(i = 1:length(data[,1]), .combine = rbind) %dopar% {
    cat(i)
    y = data[i, 5]
    a = 100
    for (i in 1:3) { # Useless busy work
      b = matrix(runif(a*a), nrow = a, ncol = a)
    }
    return(runif(10))
  }
  # stop cluster
  cat('...Stop cluster')
  stopCluster(cl)
  return(output)
}
r = 100000
c = 10
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data)
output[1:10,]
The CPU usage is as follows (100% means all cores are fully utilized); the screenshots, plain and with annotations, are not reproduced here.
How can I optimize the code so that foreach doesn't take a long time before and after going over the iterations in parallel? The main time sink is the time spent after. The time spent after grows significantly with the number of foreach iterations, sometimes making the code as slow as if a simple for loop had been used.
Another example (let's assume lm and poly cannot take matrices as arguments):
library(foreach)
library(doParallel)
doWork <- function(data, weights) {
  # setup parallel backend to use many processors
  cores = detectCores()
  number_of_cores_to_use = cores[1] - 1 # not to overload the computer
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  clusterExport(cl = cl, varlist = c('weights'))
  registerDoParallel(cl)
  cat('...Starting foreach initialization')
  output <- foreach(i = 1:nrow(data), .combine = rbind) %dopar% {
    x = sort(data[i,])
    fit = lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = 2, raw = TRUE), na.action = na.omit, weights = weights)
    return(fit$coef)
  }
  # stop cluster
  cat('...Stop cluster')
  stopCluster(cl)
  return(output)
}
r = 10000
c = 10
weights=runif(c-1)
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data,weights)
output[1:10,]
Try this:
devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
options(bigstatsr.ncores.max = parallel::detectCores())
doWork2 <- function(data, weights, ncores = parallel::detectCores() - 1) {
  big_parallelize(data, p.FUN = function(X.desc, ind, weights) {
    X <- bigstatsr::attach.BM(X.desc)
    output.part <- matrix(0, 3, length(ind))
    for (i in seq_along(ind)) {
      x <- sort(X[, ind[i]])
      fit <- lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = 2, raw = TRUE),
                na.action = na.omit, weights = weights)
      output.part[, i] <- fit$coef
    }
    t(output.part)
  }, p.combine = "rbind", ncores = ncores, weights = weights)
}
system.time({
  data.bm <- as.big.matrix(t(data))
  output2 <- doWork2(data.bm, weights)
})
all.equal(output, output2, check.attributes = FALSE)
This is twice as fast on my computer (which has only 4 cores). Remarks:
Using more than half of the cores is often useless.
Your data is not very large, so using a big.matrix may not be useful here.
big_parallelize separates the matrix into ncores blocks of columns, applies your function to each block, and then combines the results.
In the function, it's better to allocate the output before the loop and then fill it than to use a foreach that rbinds all the results.
I'm accessing only columns, not rows.
So all of these are good practices, yet they are not really relevant for your data. The gain should be higher with more cores and larger datasets.
Basically, if you want to be super fast, reimplementing the lm part in Rcpp would be a good solution.
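Short of a full Rcpp rewrite, one intermediate option (my own suggestion, not part of the original answer) is to skip the formula interface and call base R's lm.wfit() directly; for this fixed degree-2 fit it gives the same coefficients while avoiding the per-row model.frame()/poly() overhead. A sketch of the per-row fit (fit_row is just an illustrative helper name):
# Weighted degree-2 polynomial fit without the formula interface (a sketch).
# x is one sorted row of the data; weights has length ncol(data) - 1.
fit_row <- function(x, weights) {
  y <- x[1:(length(x) - 1)]        # response, as in the original lm() call
  z <- x[-1]                       # predictor
  design <- cbind(1, z, z^2)       # raw degree-2 polynomial design matrix
  lm.wfit(design, y, w = weights)$coefficients
}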
As F. Privé mentioned in a comment:
The problem is with rbind I think. rbind of lots of values from a list takes a long time. Also, filling rows is bad, because matrices are stored by column. Also, making a long foreach loop is not efficient (use blocks instead).
To use blocks instead (if 5 cores are used, each core receives 20% of the matrix):
library(foreach)
library(doParallel)
array_split <- function(data, number_of_chunks) {
  # [Partition matrix into N equally-sized chunks with R](https://stackoverflow.com/a/45198299/395857)
  # Author: lmo
  rowIdx <- seq_len(nrow(data))
  lapply(split(rowIdx, cut(rowIdx, pretty(rowIdx, number_of_chunks))), function(x) data[x, ])
}

doWork <- function(data) {
  # setup parallel backend to use many processors
  cores = detectCores()
  number_of_cores_to_use = cores[1] - 1 # not to overload the computer
  cat(paste('number_of_cores_to_use:', number_of_cores_to_use))
  cl <- makeCluster(number_of_cores_to_use)
  clusterExport(cl = cl, varlist = c('weights'))
  registerDoParallel(cl)
  cat('...Starting array split')
  number_of_chunks = number_of_cores_to_use
  data_chunks = array_split(data = data, number_of_chunks = number_of_chunks)
  degree_poly = 2
  cat('...Starting foreach initialization')
  output <- foreach(i = 1:length(data_chunks), .combine = rbind) %dopar% {
    data_temporary = data_chunks[[i]]
    output_temporary = matrix(0, nrow = nrow(data_temporary), ncol = degree_poly + 1)
    for (i in 1:nrow(data_temporary)) {
      x = sort(data_temporary[i,])
      fit = lm(x[1:(length(x)-1)] ~ poly(x[-1], degree = degree_poly, raw = TRUE), na.action = na.omit, weights = weights)
      output_temporary[i,] = fit$coef
    }
    return(output_temporary)
  }
  # stop cluster
  cat('...Stop cluster')
  stopCluster(cl)
  return(output)
}
r = 100000
c = 10
weights=runif(c-1)
data = matrix(runif(r*c), nrow = r, ncol=c)
output = doWork(data)
output[1:10,]
FYI:
Partition matrix into N equally-sized chunks with R
Using parLapply and clusterExport inside a function

How to construct in R a parallel version of nested for loop to compute values for a square matrix where the function is dependent on i and j?

I have a function that takes i and j as parameters and returns a single value, and I currently have a nested loop designed to compute a value for each entry in a square matrix. But in essence each individual value can be computed in parallel. Is there a way I can apply lapply in this situation? The resulting matrix must be N x N and the function is dependent on i and j. Thanks
for (i in 1:matrixRowLength) {
  for (j in 1:matrixColLength) {
    result_matrix[i, j] <- fun(i, j)   # fun(i, j) stands for the user-supplied function
  }
}
The foreach package has a nesting operator that can be useful when parallelizing nested for loops. Here's an example:
library(doSNOW)
cl <- makeSOCKcluster(3)
registerDoSNOW(cl)
matrixRowLength <- 5
matrixColLength <- 5
fun <- function(i, j) 10 * i + j
result_matrix.1 <-
  foreach(j = 1:matrixColLength, .combine = 'cbind') %:%
    foreach(i = 1:matrixRowLength, .combine = 'c') %dopar% {
      fun(i, j)
    }
Note that I reversed the order of the loops so that the matrix is computed column by column. This is generally preferable since matrices in R are stored in column-major order.
The nesting operator is useful if you have large tasks and at least one of the loops may have a small number of iterations. But in many cases, it's safer to only parallelize the outer loop:
result_matrix.2 <-
  foreach(j = 1:matrixColLength, .combine = 'cbind') %dopar% {
    x <- double(matrixRowLength)
    for (i in 1:matrixRowLength) {
      x[i] <- fun(i, j)
    }
    x
  }
Note that it can also be useful to use chunking in the outer loop to decrease the amount of post processing performed by the master process. Unfortunately, this technique is a bit more tricky:
library(itertools)
nw <- getDoParWorkers()
result_matrix.3 <-
  foreach(jglobals = isplitIndices(matrixColLength, chunks = nw),
          .combine = 'cbind') %dopar% {
    localColLength <- length(jglobals)
    m <- matrix(0, nrow = matrixRowLength, ncol = localColLength)
    for (j in 1:localColLength) {
      for (i in 1:matrixRowLength) {
        m[i, j] <- fun(i, jglobals[j])
      }
    }
    m
  }
In my experience, this method often gives the best performance.
Thanks for an interesting question / use case. Here's a solution using the future package (I'm the author):
First, define (*):
future_array_call <- function(dim, FUN, ..., simplify = TRUE) {
  args <- list(...)
  idxs <- arrayInd(seq_len(prod(dim)), .dim = dim)
  idxs <- apply(idxs, MARGIN = 1L, FUN = as.list)
  y <- future::future_lapply(idxs, FUN = function(idx_list) {
    do.call(FUN, args = c(idx_list, args))
  })
  if (simplify) y <- simplify2array(y)
  dim(y) <- dim
  y
}
This function does not make any assumptions about what data type your function returns, but with the default simplify = TRUE it will try to simplify the returned data type if possible (similar to how sapply() works).
Then with your matrix dimensions (**):
matrixRowLength <- 5
matrixColLength <- 5
dim <- c(matrixRowLength, matrixColLength)
and function:
slow_fun <- function(i, j, ..., a = 1.0) {
  Sys.sleep(0.1)
  a * i + j
}
you can calculate slow_fun(i, j, a = 10) for all elements as:
y <- future_array_call(dim, FUN = slow_fun, a = 10)
To do it in parallel on your local machine, use:
library("future")
plan(multiprocess)
y <- future_array_call(dim, FUN = slow_fun, a = 10)
On a cluster of machines (for which you have SSH access with SSH-key authentication), use:
library("future")
plan(cluster, workers = c("machine1", "machine2"))
y <- future_array_call(dim, FUN = slow_fun, a = 10)
Footnotes:
(*) If you wonder how it works, just replace the future::future_lapply() statement with a regular lapply().
(**) future_array_call(dim, FUN) should work for any length(dim), not just for two (= matrices).
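For instance, a toy sketch of the same helper on a 3-D array (my own example, assuming the future_array_call() definition above):
# 2 x 3 x 4 array where each cell encodes its own index triple
y3 <- future_array_call(c(2, 3, 4), FUN = function(i, j, k) 100 * i + 10 * j + k)
dim(y3)       # 2 3 4
y3[2, 3, 4]   # 234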

Foreach Parallel - Combine function for Multiple Outputs

I have a set of ratings for 45000 users and 40-odd movies. I need to predict new ratings for each user based on their Pearson correlation with other users. I also need to store the set of similar users and their similarities for each user-movie combination. I am using the foreach package to execute the loops in parallel. The code that I have managed to write is this:
library(foreach)
library(doParallel)
x <- matrix(rnorm(1:1000), nrow = 100 , ncol =10 )
df = list()
# correlation matrix
cor_mat <- cor(t(x))
cor_mat = abs(cor_mat)
# similarity limits
upper = 1
lower = 0.04
# Initiating parallel environment
cl = makeCluster(3)
registerDoParallel(cl)
res <- foreach(i = 1:nrow(x), .combine = rbind, .packages = c('base', 'foreach')) %dopar% {
  foreach(j = 1:ncol(x), .combine = c, .packages = c('base', 'foreach')) %do% {
    sim_user = which(cor_mat[i,] >= lower & cor_mat[i,] < upper)
    bx = as.numeric(t(x[sim_user, j]) %*%
                      cor_mat[sim_user, j] / sum(cor_mat[sim_user, j]))
    df[[length(df) + 1]] = data.frame(i, j, sim_user, cor_mat[sim_user, j])
    return(bx)
  }
}
stopCluster(cl)
I am able to accomplish half of my task, i.e. creating a matrix of predicted ratings from the foreach output 'res'. But my list df, to which I am appending the similar users, is empty at the end of the foreach loop.
What customized combine function can be written to output both the matrix of predicted ratings and the list of similar users?
For functions with multiple outputs, it is always better to return everything inside a list. That means you need to specify your own functions to combine the data. Here, I return two elements each time: bx and df. My combine functions therefore combine each of those two elements separately and return them in a length-2 list.
combine_custom_j <- function(LL1, LL2) {
  bx <- c(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}

combine_custom_i <- function(LL1, LL2) {
  bx <- rbind(LL1$bx, LL2$bx)
  dfs <- c(LL1$df, LL2$df)
  return(list(bx = bx, df = dfs))
}
res <- foreach(i = 1:nrow(x), .combine = combine_custom_i, .packages = c('base', 'foreach')) %dopar% {
  foreach(j = 1:ncol(x), .combine = combine_custom_j, .packages = c('base', 'foreach')) %do% {
    sim_user = which(cor_mat[i,] >= lower & cor_mat[i,] < upper)
    bx = as.numeric(t(x[sim_user, j]) %*%
                      cor_mat[sim_user, j] / sum(cor_mat[sim_user, j]))
    return(list(bx = bx, df = data.frame(i, j, sim_user, cor_mat[sim_user, j])))
  }
}
Although I have returned your data frames in a list like your code suggested, I believe you might want to rbind them? In that case, you can simply replace the c(LL1$df, LL2$df) by rbind(LL1$df, LL2$df) in both combine functions.
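Note that c() on two data frames concatenates their columns into a plain list, so the rbind() variant is usually what you want. With that variant, pulling the two outputs apart afterwards looks like this (a small usage sketch):
dim(res$bx)    # 100 x 10 matrix of predicted ratings
head(res$df)   # one stacked data.frame of similar users per (i, j) pair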

How to use foreach to calculate each element in the upper triangular matrix?

I want to calculate each element in the upper triangular matrix using the foreach function
library(foreach)
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
tempdata <- matrix(0, nrow = 10, ncol = 10)
tempdata2 <- matrix(0, nrow = 10, ncol = 10)
foreach(i = 1:9, .combine = 'rbind') %do% {
  for (j in (i+1):10) {
    tempdata[i, j] <- i + j
    tempdata2[i, j] <- i * j
  }
}
It works when I use %do%, but when I use %dopar% I get nothing.
What am I doing wrong? Thank you, any suggestion will be appreciated.
You can't modify variables defined outside of the foreach loop and expect that data to be sent back to the master process. for loops allow that kind of side effect, but it doesn't work in parallel computing unless the workers are threads within the same process, and that isn't supported by any of the R parallel processing packages because R is single threaded.
Instead, you need to return a value from the body of the foreach loop and combine those values to get the desired result. In your case, you compute two values per iteration of the foreach loop, so you have to bundle them into a list, which means you need a more complicated combine function. Here's one way to do it:
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
comb <- function(...) {
  mapply(rbind, ..., SIMPLIFY = FALSE)
}

r <- foreach(i = 1:9, .combine = 'comb', .multicombine = TRUE) %dopar% {
  tmp <- double(10)
  tmp2 <- double(10)
  for (j in (i+1):10) {
    tmp[j] <- i + j
    tmp2[j] <- i * j
  }
  list(tmp, tmp2)
}
tempdata <- r[[1]]
tempdata2 <- r[[2]]
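One small difference from the %do% version: r[[1]] and r[[2]] have 9 rows (one per value of i), while the original matrices were 10 x 10 with an all-zero last row. If an exact match matters, pad the result:
tempdata  <- rbind(r[[1]], double(10))   # add the all-zero 10th row back
tempdata2 <- rbind(r[[2]], double(10))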
