avoid R loop and parallelize with snow - r

I have a large loop that will take too long (~100 days). I'm hoping to speed it up with the snow library, but I'm not great with apply statements. This is only part of the loop, but if I can figure this part out, the rest should be straightforward. I'm ok with a bunch of apply statements or loops, but one apply statement using a function to get object 'p' would be ideal.
Original data
dim(m1) == x x # x >>> 0
dim(m2) == y x # y >>> 0, y > x, y > x-10
dim(mout) == x x
thresh == x-10 #specific to my data, actual number probably unimportant
len(v1) == y #each element is a random integer, min==1, max==thresh
len(v2) == y #each element is a random integer, min==1, max==thresh
Original loop
p <- rep(NA,y)
for (k in 1:y){
mout <- m1 * matrix(m2[k,],x,x)
mout <- mout/sum(mout)
if (v1[k] < thresh + 1){
if(v2[k] < thresh + 1){
p[k] <- out[v1[k],v2[k]]
}
if(v2[k] > thresh){
p[k] <- sum(mout[v1[k],(thresh+1):x])
}
}
#do stuff with object 'p'
}

library(snow)
dostuff <- function(k){
#contents of for-loop
mout <- m1 * matrix(m2[k,],x,x)
mout <- mout/sum(mout)
if (v1[k] < thresh + 1){
if(v2[k] < thresh + 1){
p <- out[v1[k],v2[k]]
}
if(v2[k] > thresh){
p <- sum(mout[v1[k],(thresh+1):x])
}
}
#etc etc
return(list(p,
other_vars))
}
exports = c('m1',
'm2',
'thresh',
'v1',
'x' ,
'v2')
cl = makeSOCKcluster(4)
clusterExport(cl,exports)
loop <- as.array(1:y)
out <- parApply(cl,loop,1,dostuff)
p <- rep(NA,y)
for(k in 1:y){
p[k] <- out[[k]][[1]]
other_vars[k] <- out[[k]][[2]]
}

Related

Error in seq.default(a, length = max(0, b - a - 1)) : length must be non-negative number

I tried running the code below.
set.seed(307)
y<- rnorm(200)
h2=0.3773427
t=seq(-3.317670, 2.963407, length.out=500)
fit=density(y, bw=h2, n=1024, kernel="epanechnikov")
integrate.xy(fit$x, fit$y, min(fit$x), t[407])
However, i recived the following message:
"Error in seq.default(a, length = max(0, b - a - 1)) :
length must be non-negative number"
I am not sure what's wrong.
I do not encounter any problem when i use t[406] or t[408] as follow:
integrate.xy(fit$x, fit$y, min(fit$x), t[406])
integrate.xy(fit$x, fit$y, min(fit$x), t[408])
Does anyone know what's the problem and how to fix it? Appreciate your help please. Thanks!
I went through the source code for the integrate.xy function, and there seems to be a bug relating to the usage of the xtol argument.
For reference, here is the source code of integrate.xy function:
function (x, fx, a, b, use.spline = TRUE, xtol = 2e-08)
{
dig <- round(-log10(xtol))
f.match <- function(x, table) match(signif(x, dig), signif(table,
dig))
if (is.list(x)) {
fx <- x$y
x <- x$x
if (length(x) == 0)
stop("list 'x' has no valid $x component")
}
if ((n <- length(x)) != length(fx))
stop("'fx' must have same length as 'x'")
if (is.unsorted(x)) {
i <- sort.list(x)
x <- x[i]
fx <- fx[i]
}
if (any(i <- duplicated(x))) {
n <- length(x <- x[!i])
fx <- fx[!i]
}
if (any(diff(x) == 0))
stop("bug in 'duplicated()' killed me: have still multiple x[]!")
if (missing(a))
a <- x[1]
else if (any(a < x[1]))
stop("'a' must NOT be smaller than min(x)")
if (missing(b))
b <- x[n]
else if (any(b > x[n]))
stop("'b' must NOT be larger than max(x)")
if (length(a) != 1 && length(b) != 1 && length(a) != length(b))
stop("'a' and 'b' must have length 1 or same length !")
else {
k <- max(length(a), length(b))
if (any(b < a))
stop("'b' must be elementwise >= 'a'")
}
if (use.spline) {
xy <- spline(x, fx, n = max(1024, 3 * n))
if (xy$x[length(xy$x)] < x[n]) {
if (TRUE)
cat("working around spline(.) BUG --- hmm, really?\n\n")
xy$x <- c(xy$x, x[n])
xy$y <- c(xy$y, fx[n])
}
x <- xy$x
fx <- xy$y
n <- length(x)
}
ab <- unique(c(a, b))
xtol <- xtol * max(b - a)
BB <- abs(outer(x, ab, "-")) < xtol
if (any(j <- 0 == apply(BB, 2, sum))) {
y <- approx(x, fx, xout = ab[j])$y
x <- c(ab[j], x)
i <- sort.list(x)
x <- x[i]
fx <- c(y, fx)[i]
n <- length(x)
}
ai <- rep(f.match(a, x), length = k)
bi <- rep(f.match(b, x), length = k)
dfx <- fx[-c(1, n)] * diff(x, lag = 2)
r <- numeric(k)
for (i in 1:k) {
a <- ai[i]
b <- bi[i]
r[i] <- (x[a + 1] - x[a]) * fx[a] + (x[b] - x[b - 1]) *
fx[b] + sum(dfx[seq(a, length = max(0, b - a - 1))])
}
r/2
}
The value given to the xtol argument, is being overwritten in the line xtol <- xtol * max(b - a). But the value of the dig variable is calculated based on the original value of xtol, as given in the input to the function. Because of this mismatch, f.match function, in the line bi <- rep(f.match(b, x), length = k), returns no matches between x and b (i.e., NA). This results in the error that you have encountered.
A simple fix, at least for the case in question, would be to remove the xtol <- xtol * max(b - a) line. But, you should file a bug report with the maintainer of this package, for a more rigorous fix.

Repeating a function multiple times using a for loop

A have code that creates a random graph in the form of a matrix. Now I would like it to create many, say m, random graphs so the output is m matrices. I am trying to do this with a for loop. This would be my preferred method however I am open to other suggestions (apply family?). Here is my code, where n is the number of nodes/vertices the graph has and beta is the amount of preferential attachment (keep this between 0 and 1.5)
multiplerandomgraphs <- function(n, beta, m) {
for(k in 1:m) {
randomgraph <- function(n, beta) {
binfunction <- function(y) {
L <- length(y)
x <- c(0, cumsum(y))
U <- runif(1, min = 0 , max = sum(y))
for(i in 1:L) {
if(x[i] <= U && x[i+1] > U){
return(i)
}
}
}
mat <- matrix(0,n,n)
mat[1,2] <- 1
mat[2,1] <- 1
for(i in 3:n) {
degvect <- colSums(mat[ , (1:(i-1))])
degvect <- degvect^(beta)
j <- binfunction(degvect)
mat[i,j] <- 1
mat[j,i] <- 1
}
return(mat)
}
}
}
You can define your randomgraph function as randomgraph <- function(i, n, beta) {} with the body the same as your definition, leaves the parameter i as a dummy parameter. And then use apply function as listOfMatrix <- lapply(1:m, randomgraph, n, beta) which return a list of matrix.

looping through a matrix with a function

I'd like to perform this function on a matrix 100 times. How can I do this?
v = 1
m <- matrix(0,10,10)
rad <- function(x) {
idx <- sample(length(x), size=1)
flip = sample(0:1,1,rep=T)
if(flip == 1) {
x[idx] <- x[idx] + v
} else if(flip == 0) {
x[idx] <- x[idx] - v
return(x)
}
}
This is what I have so far but doesn't work.
for (i in 1:100) {
rad(m)
}
I also tried this, which seemed to work, but gave me an output of like 5226 rows for some reason. The output should just be a 10X10 matrix with changed values depending on the conditions of the function.
reps <- unlist(lapply(seq_len(100), function(x) rad(m)))
Ok I think I got it.
The return statement in your function is only inside a branch of an if statement, so it returns a matrix with a probability of ~50% while in the other cases it does not return anything; you should change the code function into this:
rad <- function(x) {
idx <- sample(length(x), size=1)
flip = sample(0:1,1,rep=T)
if(flip == 1) {
x[idx] <- x[idx] + v
} else if(flip == 0) {
x[idx] <- x[idx] - v
}
return(x)
}
Then you can do:
for (i in 1:n) {
m <- rad(m)
}
Note that this is semantically equal to:
for (i in 1:n) {
tmp <- rad(m) # return a modified verion of m (m is not changed yet)
# and put it into tmp
m <- tmp # set m equal to tmp, then in the next iteration we will
# start from a modified m
}
When you run rad(m) is not do changes on m.
Why?
It do a local copy of m matrix and work on it in the function. When function end it disappear.
Then you need to save what function return.
As #digEmAll write the right code is:
for (i in 1:100) {
m <- rad(m)
}
You don't need a loop here. The whole operation can be vectorized.
v <- 1
m <- matrix(0,10,10)
n <- 100 # number of random replacements
idx <- sample(length(m), n, replace = TRUE) # indices
flip <- sample(c(-1, 1), n, replace = TRUE) # subtract or add
newVal <- aggregate(v * flip ~ idx, FUN = sum) # calculate new values for indices
m[newVal[[1]]] <- m[newVal[[1]]] + newVal[[2]] # add new values

R foreach loops on 3D array data

Some background: I need to work out the distance from some point to each cell in a 3D grid, then apply a function to this distance. I need to do this for multiple points, and add the functions values in each cell for all the points. I can do this using the following code for points located at (x,y,z):
x <- c(1,2,3,4,5)
y <- x
z <- x
radius <- c(0.4,0.5,0.6,0.7,0.8)
numsphere <- length(x)
radius_buffer <- 0.2
xvox <- seq((min(x)-1),(max(x)+2),0.5)
yvox <- xvox
zvox <- xvox
probability_array <<- array(0,dim=c(length(xvox),length(yvox),length(zvox)))
for (j in 1:length(yvox)){ # for every y element
for (i in 1:length(xvox)){ # for every x element
for (k in length(zvox):1){ # for every z element
for (n in 1:numsphere){ # for the total number of points
dist_sd <- ((xvox[i]-x[n])^2+(yvox[j]-y[n])^2+(zvox[k]-z[n])^2)^0.5
probability_array[i,j,k] <- probability_array[i,j,k] +
round(exp(-1*(dist_sd-radius[n])^2/(2*radius_buffer^2)),3)
}
}
}
}
The output is an array and the plotted result looks like this:
probability_array <- probability_array/max(probability_array)
contour3d(probability_array,level=c(0.2,0.8,0.9),x=xvox,y=yvox,z=zvox,color = c("aquamarine","gold","darkorange"),alpha = c(0.1,0.2,0.5),add=T)
I have tried to parallelise this because it seems ideal for it, but can't get it to work.
I've tried:
cl<-makeCluster(detectCores(),type="SOCK")
registerDoSNOW(cl)
for (j in 1:length(yvox)){
for (i in 1:length(xvox)){
for(k in length(zvox):1){
probability_array[i,j,k] <- foreach(n=1:numsphere, .combine='+') %dopar% {
dist_sd <- ((xvox[i]-x[n])^2+(yvox[j]-y[n])^2+(zvox[k]-z[n])^2)^0.5
round(exp(-1*(dist_sd-radius[n])^2/(2*radius_buffer^2)),3)
}
}
}
}
and things like:
r <- foreach(j=1:length(yvox)) %:% foreach(i=1:length(xvox)) %:% foreach(k=length(zvox):1) %:% foreach(n=1:numsphere, .combine='+') %do% {
dist_sd <- ((xvox[i]-x[n])^2+(yvox[j]-y[n])^2+(zvox[k]-z[n])^2)^0.5
probability_array[i,j,k] <- probability_array[i,j,k] + round(exp(-1*(dist_sd-radius[n])^2/(2*radius_buffer^2)),3)
probability_array[i,j,k]
}
But I'm missing something important. Any help would be greatly appreciated.
Cheers
When parallelizing computations, because of the overhead it introduces,
it is preferable to run large pieces of computations in parallel,
rather than small -- outer loops, rather than inner loops.
In this case, however, there is no need to parallelize the computations:
you can just vectorize them.
# 3-dimensional analogue of row() and col()
dim3 <- function( a, i ) {
stopifnot( length(dim(a)) == 3 )
r <- a
if( i == 1 ) { r[] <- rep(1:dim(a)[1], dim(a)[2] * dim(a)[3]) }
if( i == 2 ) { r[] <- rep(1:dim(a)[2], each = dim(a)[1], times = dim(a)[3]) }
if( i == 3 ) { r[] <- rep(1:dim(a)[3], each = dim(a)[1] * dim(a)[2]) }
r
}
probability_array <- array(0,dim=c(length(xvox),length(yvox),length(zvox)))
i <- dim3(probability_array,1)
j <- dim3(probability_array,2)
k <- dim3(probability_array,3)
for (n in 1:numsphere){
dist_sd <- sqrt(
(xvox[i]-x[n])^2 + (yvox[j]-y[n])^2 + (zvox[k]-z[n])^2
)
probability_array <- probability_array +
# Rounding intermediate results looks suspicious
round(exp(-1*(dist_sd-radius[n])^2/(2*radius_buffer^2)),3)
}

for loop creating vector r

I am trying to create a function to calculate the Box-Cox transformation in R, where you iterate values of lambda (lambdas) in a formula to maximize L. What I ultimately want is a vector of L, such that for all i in lambda, there is a corresponding L value.
y <- c(256,256,231,101,256,213,241,246,207,143,287,240,262,234,146,255,184,161,252,229,283,132,218,113,194,237,181,262,104)
df <- 28
n=29
lambdas <- seq(-3,3,0.001)
L <- c(rep(NA,length(lambdas)))
for(i in lambdas) {
if(i != 0) {
yprime <- (((y^i)-1)/i)
} else
{ yprime <- log(y)
}
st2 <- var(yprime)
L <- (((-df/2)*(log(st2))) + ((i-1)*(df/n)*(sum(log(y)))))
}
What I typically end up with L as a vector of 1, with the final iteration calculated.
Use seq_along to generate an index for lambdas[] and L[]
for(i in seq_along(lambdas)) {
if(i != 0) {
yprime <- (((y^lambdas[i])-1)/lambdas[i])
} else {
yprime <- log(y)
}
st2 <- var(yprime)
L[i] <- (((-df/2)*(log(st2))) + ((lambdas[i]-1)*(df/n)*(sum(log(y)))))
}
plot(L)

Resources