Parallelization: apply to parRapply in R

My data set is:
ll <- matrix(c(5, 6, 60, 60), ncol=2)
And I use the function spDistsN1 from the library "sp" to obtain a distance matrix with apply:
apply(ll, 1, function(x) spDistsN1(as.matrix(ll), x, longlat = T))
But I want to do this in parallel, so I tried:
library(parallel)
ncore <- detectCores()
cl <- makeCluster(ncore)
clusterEvalQ(cl = cl, expr = c(library(sp)))
parRapply(cl = cl, x = ll, FUN = function(x) spDistsN1(as.matrix(ll), x,
longlat = T))
It shows the following error:
Error in checkForRemoteErrors(val) :
4 nodes produced errors; first error: object 'll' not found
How do I fix it?

An easier alternative to parallel's parApply() or parRapply() is future_apply() from the future.apply package (disclaimer: I'm the author). Global variables are automatically exported, so there is no need to worry about parallel::clusterExport() etc. Just use it as you would use apply(), e.g.
library(sp)
library(future.apply)
plan(multiprocess) ## parallelize on local machine
ll <- matrix(c(5, 6, 60, 60), ncol = 2)
## Sequentially
y0 <- apply(ll, 1, function(x) spDistsN1(ll, x, longlat = TRUE))
print(y0)
# [,1] [,2]
# [1,] 0.00000 55.79918
# [2,] 55.79918 0.00000
## In parallel
y1 <- future_apply(ll, 1, function(x) spDistsN1(ll, x, longlat = TRUE))
print(y1)
# [,1] [,2]
# [1,] 0.00000 55.79918
# [2,] 55.79918 0.00000
print(identical(y1, y0))
# [1] TRUE
You may also find the blog post future.apply - Parallelize Any Base R Apply Function helpful.

You need to export all variables that the function uses (here, ll) to the workers; see ?parallel::clusterExport.
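A minimal sketch of that fix applied to the code in the question (only the clusterExport() call is new; everything else comes from the question itself):
library(parallel)
library(sp)
ll <- matrix(c(5, 6, 60, 60), ncol = 2)
cl <- makeCluster(detectCores())
clusterEvalQ(cl, library(sp))
clusterExport(cl, "ll") # make the global 'll' visible on every worker
res <- parRapply(cl, x = ll, FUN = function(x) spDistsN1(as.matrix(ll), x, longlat = TRUE))
stopCluster(cl)
# parRapply() returns a plain vector; reshape it to match apply()'s output
matrix(res, nrow = nrow(ll))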

Related

parallel computing in R

I am familiar with Matlab, but recently started to use R. I encountered a problem when using parallel computing in R.
I want to use a matrix or a 3-d array as an output after parallel computing. In Matlab, an example of what I want to do is as follows.
X=zeros(10,5,100);
Y=zeros(100,2);
parfor i=1:100;
X(:,:,i) = randn(10,5);
Y(i,:) = randn(1,2);
end
However, as far as I can tell, foreach in R seems to return only a list (of vectors), and assigning into a matrix or an array does not seem to be allowed. I'm wondering how I should write the code to do in R what the Matlab code does.
Here is my suggested solution based on the Matlab code in the question.
#install.packages("foreach")
#install.packages("doParallel")
library(foreach)
library(doParallel)
X <- array(c(rep(0,10), rep(0,5), rep(0,100)),dim = c(10,5,100))
Y <- array(c(rep(0,100), rep(0,2)),dim = c(100,2))
X=foreach(i=1:100) %dopar% {
X[ , , i]= array(c( rnorm(10), rnorm(5)),dim = c(10,5))
X
}
Y=foreach(i=1:100) %dopar% {
Y[ i , ]= array(c( rnorm(1), rnorm(2)),dim = c(1,2))
Y
}
You can use the following R code to replicate your Matlab code. Note that R is vectorized, so for loops are often unnecessary.
X <- rep(0, 10*5*100)
Y <- rep(0, 100*2)
dim(X) <- c(10,5,100)
dim(Y) <- c(100,2)
set.seed(1234) # This is just for replication. Can omit if you want fully random numbers
X[] <- rnorm(length(X))
Y[] <- rnorm(length(Y))
apply(X, c(1,2), mean)
apply(Y, 2, mean)
Output:
> apply(X, c(1,2), mean)
[,1] [,2] [,3] [,4] [,5]
[1,] 0.055024278 -0.0205575146 -0.071816174 -0.087659326 0.12707484
[2,] -0.018254630 0.0821710032 -0.005589141 0.049391162 -0.01413225
[3,] -0.009338280 0.0284398951 -0.004083183 0.013750904 -0.02076238
[4,] 0.039544807 -0.0425689205 0.016054568 0.013936608 -0.06183537
[5,] 0.053641184 0.1362104005 0.069674908 0.008190821 -0.21042331
[6,] 0.065895652 -0.0098767327 -0.082148255 0.038705556 -0.05255018
[7,] 0.007917696 -0.0002747114 -0.045812106 -0.062452164 0.23984287
[8,] -0.173435275 0.0328011859 -0.053835173 0.057308693 -0.03760174
[9,] 0.047481900 -0.0225967973 -0.161777736 0.005625679 -0.05406814
[10,] -0.031460338 0.0628553230 -0.176667084 -0.098304874 0.06060704
> apply(Y, 2, mean)
[1] 0.06083109 -0.03338719
You can also simplify this a little with the following commands:
set.seed(1234)
X <- rnorm(10*5*100)
Y <- rnorm(100*2)
dim(X) <- c(10,5,100)
dim(Y) <- c(100,2)
apply(X, c(1,2), mean)
apply(Y, 2, mean)

add a list of vectors to get sum vector from tibble [duplicate]

I'm wondering about an elegant way to sum (or to calculate the mean of) the numeric elements of a list, e.g.
x <- list( a = matrix(c(1,2,3,4), nc=2), b = matrix(1, nc=2, nr=2))
and want to get
x[[1]]+x[[2]]
or a mean:
(x[[1]]+x[[2]])/2
You can use Reduce to successively apply a binary function to elements in a list.
Reduce("+",x)
[,1] [,2]
[1,] 2 4
[2,] 3 5
Reduce("+",x)/length(x)
[,1] [,2]
[1,] 1.0 2.0
[2,] 1.5 2.5
Even though Reduce() is the standard answer to summing a list of matrices and has been pointed out many times, I collected some of the most prominent ways to achieve this goal in the following code. The main purpose is to see whether any option is clearly better than the others for speed and "precision".
# load libraries
library(microbenchmark)
library(ggplot2)
# generate the data with ten matrices to sum
mat_list <- lapply(1:10, function(x) matrix(rnorm(100), nrow = 10, ncol = 10))
# larger and longer test set
mat_list_large <- lapply(1:1000, function(x) matrix(rnorm(100000), nrow = 1000, ncol = 100))
# function with reduce #james
f1 <- function(mat_list){
Reduce(`+`, mat_list)
}
# function with apply #Jilber Urbina
f2 <- function(mat_list){
apply(simplify2array(mat_list), c(1:2), sum)
}
# function with do.call #Tyler Rinker
f3 <- function(mat_list){
x <- mat_list[[1]]
lapply(seq_along(mat_list)[-1], function(i){
x <<- do.call("+", list(x, mat_list[[i]]))
})
return(x)
}
# function with loop modified from #Carl Witthoft
f4 <- function(mat_list){
out_mat <- mat_list[[1]]
for (i in 2:length(mat_list)) out_mat <- out_mat + mat_list[[i]]
return(out_mat)
}
# test that the results are all equal; note that all.equal() only compares
# its first two arguments, so check each function against f1 in turn
all.equal(f1(mat_list), f2(mat_list), tolerance = 1.5e-8) # TRUE
all.equal(f1(mat_list), f3(mat_list)) # TRUE
all.equal(f1(mat_list), f4(mat_list)) # TRUE
# ps: the second method seems to differ slightly from the others
# run each function 100 times to get statistics on their speed
mb <- microbenchmark("Reduce" = f1(mat_list),
"apply" = f2(mat_list),
"do.call" = f3(mat_list),
"loop" = f4(mat_list),
times = 100)
mb2 <- microbenchmark("Reduce" = f1(mat_list_large),
"apply" = f2(mat_list_large),
"do.call" = f3(mat_list_large),
"loop" = f4(mat_list_large),
times = 100)
# see output using a violin plot
autoplot(mb)
autoplot(mb2) # longer version for bigger datasets
Therefore, it is probably better to use Reduce(), for both its median speed and the clarity of the code.

Matrix of distances with Geosphere: avoid repeat calculus

I want to compute the distance among all points in a very large matrix using distm from geosphere.
See a minimal example:
library(geosphere)
library(data.table)
coords <- data.table(coordX=c(1,2,5,9), coordY=c(2,2,0,1))
distances <- distm(coords, coords, fun = distGeo)
The issue is that, due to the nature of the distances I am computing, distm gives me back a symmetric matrix, so I could avoid calculating more than half of the distances:
structure(c(0, 111252.129800202, 497091.059564718, 897081.91986428,
111252.129800202, 0, 400487.621661164, 786770.053508848, 497091.059564718,
400487.621661164, 0, 458780.072878927, 897081.91986428, 786770.053508848,
458780.072878927, 0), .Dim = c(4L, 4L))
Could you help me find a more efficient way to compute all those distances without computing each one twice?
If you want to compute all pairwise distances for points x, it is better to use distm(x) rather than distm(x,x). The distm function returns the same symmetric matrix in both cases but when you pass it a single argument it knows that the matrix is symmetric, so it won't do unnecessary computations.
You can time it.
library("geosphere")
n <- 500
xy <- matrix(runif(n*2, -90, 90), n, 2)
system.time( replicate(100, distm(xy, xy) ) )
# user system elapsed
# 61.44 0.23 62.79
system.time( replicate(100, distm(xy) ) )
# user system elapsed
# 36.27 0.39 38.05
You can also look at the R code for geosphere::distm to check that it treats the two cases differently.
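For instance, simply printing the function is enough to see that the one-argument and two-argument cases are handled by separate branches:
# print the source of distm and compare the branch taken when y is missing
geosphere::distm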
Aside: a quick Google search finds parallelDist: Parallel Distance Matrix Computation on CRAN. The geodesic distance is one of the available measures.
You can prepare a data frame of all possible combinations without repetition (with the gtools package) and then compute the distances only for those pairs. Here is the code:
library(gtools)
library(geosphere)
library(data.table)
coords <- data.table(coordX = c(1, 2, 5, 9), coordY = c(2, 2, 0, 1))
pairs <- combinations(n = nrow(coords), r = 2, repeats.allowed = F, v = c(1:nrow(coords)))
distances <- apply(pairs, 1, function(x) {
distm(coords[x[1], ], coords[x[2], ], fun = distGeo)
})
# Construct the symmetric distance matrix: fill the lower triangle
# (its column-wise order matches the order of the pairs), then mirror it
dist_mat <- matrix(0, nrow = nrow(coords), ncol = nrow(coords))
dist_mat[lower.tri(dist_mat)] <- distances
dist_mat <- dist_mat + t(dist_mat)
print(dist_mat)
The results:
         [,1]     [,2]     [,3]     [,4]
[1,]      0.0 111252.1 497091.1 897081.9
[2,] 111252.1      0.0 400487.6 786770.1
[3,] 497091.1 400487.6      0.0 458780.1
[4,] 897081.9 786770.1 458780.1      0.0
Using combn() from base R might be slightly simpler and probably faster than loading additional packages. Also, distm(..., fun = distGeo) just calls distGeo() underneath, so using the latter directly should be even faster. distGeo() is vectorised over pairs of points, so the distances for all unique pairs can be computed in one call:
pairs <- t(combn(nrow(coords), 2))
m <- as.matrix(coords) # distGeo() expects a matrix (or data frame) of lon/lat
cbind(pairs, geosphere::distGeo(m[pairs[, 1], ], m[pairs[, 2], ]))
#      [,1] [,2]     [,3]
# [1,]    1    2 111252.1
# [2,]    1    3 497091.1
# [3,]    1    4 897081.9
# [4,]    2    3 400487.6
# [5,]    2    4 786770.1
# [6,]    3    4 458780.1
We could check it out with a benchmark.
Unit: microseconds
expr min lq mean median uq max neval cld
distm 555.690 575.846 597.7672 582.352 596.1295 904.718 100 b
distGeo 426.335 434.372 450.0196 441.516 451.8490 609.524 100 a
Looks good.
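The exact expressions timed above are not shown, but as a rough sketch the comparison can be reproduced with microbenchmark along these lines (full distm() matrix versus the unique-pairs distGeo() call):
library(microbenchmark)
m <- as.matrix(coords)
pairs <- t(combn(nrow(m), 2))
microbenchmark(
  distm = distm(m, fun = distGeo), # full 4 x 4 matrix, every distance twice
  distGeo = distGeo(m[pairs[, 1], ], m[pairs[, 2], ]), # unique pairs only
  times = 100
)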

Error with parallelization in R for S4 objects

I am trying to optimize a function that I am going to apply to several rasters with millions of cells, so I want to parallelize this function.
The initial Raster
So this is the initial raster:
library(raster)
SPA <- raster(nrows=3, ncols=3, xmn = -10, xmx = -4, ymn = 4, ymx = 10)
values(SPA) <- c(0.1, 0.4, 0.6, 0, 0.2, 0.4, 0, 0.1, 0.2)
plot(SPA)
The objective of the function is to get a data frame with the distance between all of the cells present in the raster, with a from column, a to column, and a dist column.
Transition layer
In order to do that, I create a transition layer using the gdistance package:
library(gdistance)
h16 <- transition(SPA, transitionFunction=function(x){1},16,symm=FALSE)
h16 <- geoCorrection(h16, scl=FALSE)
and the origin points for every cell:
B <- xyFromCell(SPA, cell = 1:ncell(SPA))
head(B)
x y
[1,] -9 9
[2,] -7 9
[3,] -5 9
[4,] -9 7
[5,] -7 7
[6,] -5 7
Distance function
With some help from some Stack Overflow answers, I made this function, which is faster than the accCost one in gdistance:
accCost2 <- function(x, fromCoords) {
  fromCells <- cellFromXY(x, fromCoords)
  tr <- transitionMatrix(x)
  # Matrix's rBind()/cBind() are defunct; base rbind()/cbind() dispatch on sparse matrices
  tr <- rbind(tr, rep(0, nrow(tr)))
  tr <- cbind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
What I want to parallelize
And using apply I get my desired data.frame
connections <- data.frame(from = rep(1:nrow(B), each = nrow(B)),
                          to = rep(1:nrow(B), nrow(B)),
                          dist = as.vector(apply(B, 1, accCost2, x = h16)))
head(connections)
from to dist
1 1 1 0.0
2 1 2 219915.7
3 1 3 439831.3
4 1 4 221191.8
5 1 5 312305.7
6 1 6 493316.1
This is what I tried with parApply
library("parallel")
cl = makeCluster(3)
clusterExport(cl, c("B", "h16", "accCost2"))
clusterEvalQ(cl, library(gdistance), library(raster))
connections <- data.frame(from = rep(1:nrow(B), each = nrow(B)),to = rep(1:nrow(B), nrow(B)), dist =as.vector(parRapply(cl, B,1, accCost2, x = h16)))
stopCluster(cl)
But I get the following error:
Error in x[i, , drop = FALSE] : object of type 'S4' is not subsettable
I am fairly new to parallelization, and I am not sure what I am doing wrong.
There are several syntax issues in your code.
This code works for me.
library("parallel")
accCost_wrap <- function(x){accCost2(h16,x)}
#Instead of including h16 in the parRapply function,
#just get it in the node environment
cl = makeCluster(3)
clusterExport(cl, c("h16", "accCost2"))
#B will be "sent" to the nodes through the parRapply function.
clusterEvalQ(cl, {library(gdistance)})
#raster is a dependency of gdistance, so no need to include raster here.
pp <- parRapply(cl, x=B, FUN=accCost_wrap)
stopCluster(cl)
connections <- data.frame(from = rep(1:nrow(B), each = nrow(B)),
to = rep(1:nrow(B), nrow(B)),
dist = as.vector(pp))
Your version of accCost is indeed faster than the version in gdistance. Your version omits the checks to see if your points are within the extent of your transition layer. Proceed with caution.
(You could make your function even faster by taking the cell numbers as input. Also, sending so much data back from each node does not seem very efficient.)
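A minimal sketch of that first suggestion: a hypothetical accCost2_cells(), identical to accCost2() except that it takes cell numbers directly, so the cellFromXY() lookup is skipped on every call.
accCost2_cells <- function(x, fromCells) {
  tr <- transitionMatrix(x)
  tr <- rbind(tr, rep(0, nrow(tr)))
  tr <- cbind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode]
}
# usage on the cluster set up above (export h16 and accCost2_cells as before):
# pp <- parSapply(cl, 1:nrow(B), function(i) accCost2_cells(h16, i))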
