When using the doParallel library, I encountered a weird error; the system throws:
Error in { : task 1 failed - could not find function "%dopar%"
To be specific, this is what I did:
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)
# read the data
coin95 <- read.csv('~/Documents/coin95.csv')
coin95 <- coin95[, !(names(coin95) %in% c("X"))]
coin95[c("Person")] <- sapply(coin95[c("Person")], as.character)
# create the name list
coin95_name <- as.character(coin95$Person)
coin95_name <- unique(coin95_name)
n <- as.numeric(length(coin95_name))
# the average counting process
ntw <- function(now){
  foreach(Ii = coin95_name, .combine = "+", .export = c("coin95", "n")) %dopar% {
    time <- subset(coin95, subset = coin95$Person == Ii)$duration
    stepfun(time, seq(0, length(time)))(now) / n
  }
}
# the average cumulative hazard
lambda <- function(now, params){
  b <- params[1]
  sigma <- params[2]
  mu <- params[3]
  xi <- params[4]
  beta1 <- params[5]
  beta2 <- params[6]
  k <- function(spread){
    L0 <- (1 + (spread - mu) * xi / sigma)^(-1/xi)
    return(L0)
  }
  foreach(Ii = coin95_name, .combine = "+", .export = c("coin95", "n")) %dopar% {
    time <- subset(coin95, subset = coin95$Person == Ii)$duration
    noncov <- subset(coin95, subset = coin95$Person == Ii)$noncovered
    reim <- subset(coin95, subset = coin95$Person == Ii)$reimbursement
    (b * now + sum(exp(-k(now - time[(time < now)]) + beta1 * noncov[(time < now)] + beta2 * reim[(time < now)]))) / n
  }
}
So far, everything is good: I have created two functions, ntw and lambda, using foreach, and they work perfectly.
Then I create a third function, also using foreach:
# the distance
Time <- coin95$duration
Time <- sort(as.double(Time))
jl <- function(params){
  res <- foreach(Ii = Time, .combine = "rbind", .export = c("ntw", "lambda")) %dopar% {
    (ntw(Ii) - ntw(Ii - 1e-7)) * (ntw(Ii) - lambda(Ii, params))^2
  }
  return(sqrt(sum(res)))
}
guess<-c(0.0,1.3333,0.0,0.1,-1.2,3e-3)
Typing jl(guess) gives:
> jl(guess)
Error in { : task 1 failed - could not find function "%dopar%"
Any idea what's going wrong?
A quick fix for problems with foreach %dopar% is to reinstall these packages:
install.packages("doSNOW")
install.packages("doParallel")
install.packages("doMPI")
These packages provide the parallel backends for foreach in R. A bug that existed in old versions of these packages has since been fixed. I should mention that reinstalling will most likely help even if you are not using all of these packages in your code.
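After reinstalling, a quick sanity check that %dopar% resolves again might look like this (my own minimal sketch, using a small local cluster, not code from the original post):
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
# If the backend packages are installed and registered correctly, this trivial
# loop returns 1 2 3 4 instead of failing with 'could not find function "%dopar%"'.
foreach(i = 1:4, .combine = c) %dopar% i
stopCluster(cl)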
Related
I'm trying to use a foreach loop, but if I try to fit a Gumbel distribution within the loop I get the error
"Error in { : task 1 failed - "The dgumbel function must be defined""
I have read many posts about functions and foreach loops, and they all seem to suggest putting .export = "functionname" into the foreach line. I did this and don't understand why it is still not running. If I exclude the Gumbel fitdist line and fit only normal distributions, it all works fine.
If I use a normal for loop, the fitdist of a Gumbel distribution works fine, too.
# create data that will be used to fit a distribution
cum_prec <- matrix(nrow = 80, ncol = 10)
x <- c(1:6, 8:10)
for (i in x){
  cum_prec[, i] <- rnorm(80, 400, 50)
}
cum_prec[, 7] <- rGumbel(80, 400, 50)
# package fitdistrplus does not work for the gumbel distribution. Therefore, the gumbel distribution has to be added manually.
dgumbel <- function(x, a, b) {1/b * exp((a - x)/b) * exp(-exp((a - x)/b))}
pgumbel <- function(q, a, b) {exp(-exp((a - q)/b))}
qgumbel <- function(p, a, b) {a - b * log(-log(p))}
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)
clim <- foreach(n = 1:10, .export = c("dgumbel", "pgumbel", "qgumbel"), .combine = cbind, .packages = "fitdistrplus") %dopar% {
  rep <- 10000
  normal <- matrix(nrow = rep, ncol = 10)
  x <- c(1:6, 8:10)
  for (i in x){
    Normal <- fitdist(cum_prec[, i], "norm")
    normal[, i] <- rnorm(rep, Normal$estimate[1], Normal$estimate[2])
  }
  Gumbel <- fitdist(cum_prec[, 7], "gumbel", start = list(a = 0, b = 1), optim.method = "Nelder-Mead")
  normal[, 7] <- rGumbel(rep, Gumbel$estimate[1], Gumbel$estimate[2])
  SU <- vector("numeric", rep)
  for (i in 1:rep){
    su <- (quantile(normal[, 1], probs = runif(1, 0, 1)) + quantile(normal[, 2], probs = runif(1, 0, 1)) + quantile(normal[, 3], probs = runif(1, 0, 1))
           + quantile(normal[, 4], probs = runif(1, 0, 1)) + quantile(normal[, 5], probs = runif(1, 0, 1)) + quantile(normal[, 6], probs = runif(1, 0, 1))
           + quantile(normal[, 7], probs = runif(1, 0, 1)) + quantile(normal[, 8], probs = runif(1, 0, 1)) + quantile(normal[, 9], probs = runif(1, 0, 1))
           + quantile(normal[, 10], probs = runif(1, 0, 1))) / 10
    SU[i] <- su
  }
}
stopCluster(cl)
This works normally on my computer:
registerDoSNOW(makeCluster(2, type = "SOCK"))
foreach(i = 1:M,.combine = "c") %dopar% {
sum(rnorm(M))
}
So I can say that I can run parallelized code on this computer, right?
Ok. I have a piece of code that I wish to run in parallel with foreach. It runs perfectly when it's written with %do%, but doesn't work properly when I change it to %dopar%. (PS: I have already initialized the cluster with registerDoSNOW(makeCluster(2, type = "SOCK")) in the same way as before.)
My main interest in the code is getting the vector u.varpred. I get it nicely with %do%, but when I run it with %dopar%, the vector comes back as NULL.
Here is the loop with the code that's needed to run it all properly. It uses functions in the geoR package.
#you can pretty much ignore all this, it's just preparation for the loop
N=20
NN=10
set.seed(111);
datap <- grf(N, cov.pars=c(20, 5),nug=1)
grid.o <- expand.grid(seq(0, 1, l=100), seq(0, 1, l=100))
grid.c <- expand.grid(seq(0, 1, l=NN), seq(0,1, l=NN))
beta1=mean(datap$data)
emv<- likfit(datap, ini=c(10,0.4), nug=1)
krieging <- krige.conv(datap, loc=grid.o,
krige=krige.control(type.krige="SK", trend.d="cte",
beta =beta1, cov.pars=emv$cov.pars))
names(grid.c) = names(as.data.frame(datap$coords))
list.geodatas<-list()
valores<-c(datap$data,0)
list.dataframes<-list()
list.krigings<-list(); i=0; u.varpred=NULL;
#here is the foreach code
t<-proc.time()
foreach(i = 1:length(grid.c[,1]), .packages = 'geoR') %do% {
  list.dataframes[[i]] <- rbind(datap$coords, grid.c[i,])
  list.geodatas[[i]] <- as.geodata(data.frame(cbind(list.dataframes[[i]], valores)))
  list.krigings[[i]] <- krige.conv(list.geodatas[[i]], loc = grid.o,
                                   krige = krige.control(type.krige = "SK", trend.d = "cte",
                                                         beta = beta1, cov.pars = emv$cov.pars))
  u.varpred[i] <- mean(krieging$krige.var - list.krigings[[i]]$krige.var)
  # I don't need these objects anymore, but since they are lists I don't
  # want to assign NULL, as that would ruin their ordering
  list.dataframes[[i]] <- 0
  list.krigings[[i]] <- 0
  list.geodatas[[i]] <- 0
}
t<-proc.time()-t
t
You can check that this runs nicely (provided you have the following packages: geoR, foreach and doSNOW). But once I use registerDoSNOW(......) and %dopar%, u.varpred comes back as NULL.
Could you please take a look and see whether I made a mistake in the foreach statement/process, or whether this is just code that can't be parallelized? (I thought it could be, because any given iteration does not depend on any of the iterations before it.)
I am sorry both the code and this question are so long. Thanks in advance for taking the time to read it.
My friend helped me out directly. Here is a way that works:
u.varpred <- foreach(i = 1:length(grid.c[,1]), .packages = 'geoR', .combine = "c") %dopar% {
  list.dataframes[[i]] <- rbind(datap$coords, grid.c[i,])
  list.geodatas[[i]] <- as.geodata(data.frame(cbind(list.dataframes[[i]], valores)))
  list.krigings[[i]] <- krige.conv(list.geodatas[[i]], loc = grid.o,
                                   krige = krige.control(type.krige = "SK", trend.d = "cte",
                                                         beta = beta1, cov.pars = emv$cov.pars))
  u.varpred <- mean(krieging$krige.var - list.krigings[[i]]$krige.var)
  list.dataframes[[i]] <- 0
  list.krigings[[i]] <- 0
  list.geodatas[[i]] <- 0
  u.varpred  # this makes the results go into u.varpred
}
He gave me an example of why this works:
a <- NULL
foreach(i = 1:10) %dopar% {
a <- 5
}
print(a)
# a is still NULL
a <- NULL
a <- foreach(i = 1:10) %dopar% {
a <- 5
a
}
print(a)
#now it works
Hope this helps someone.
I am working on data in R with 18M records. My computer does not have a wealth of RAM available, so I am trying the "ff" package to compensate. To make the amount of time reasonable, I am also using the "foreach" package and running the job in parallel. I am having issues when I run "foreach" in parallel with the full data; smaller groups of the data (say first 100K rows) run correctly.
What I am trying to obtain is rolling daily averages of people's values based on dates. I want the average daily value for the past 7, 28, 91, etc. days. I am relatively new to R, so I do not understand all of its nuances. When I run this on the full data, it stops after an hour and gives the error:
Task 1 failed - object 'PersonID' not found
What can I do to use the "ff" package appropriately with the "foreach" package? Also, it would be great if there were some way to output the results into an ff data frame and then into SQL. The code is below:
library("ff")
library("ffbase")
library("RODBC")
myconn <- odbcConnect("NO SHOW")
data <- as.ffdf(sqlFetch(myconn, "NO SHOW"))
#data[data=="NULL"] <- NA
#persons <- unique(data$PersonID, incomparables=FALSE)
persons <- aggregate(Value ~ PersonID, data=data, FUN=length)$PersonID
rollingLength <- 7
rollingTimes <- c(7,28,91,182,364,728,100000000)
valueCol <- 6
sinceCol <- 4
func <- function(stuff, id) {
  check <- subset(stuff, PersonID == id)
  tempvalue <- data.frame(matrix(, nrow = nrow(check), ncol = 7, byrow = TRUE))
  colnames(tempvalue) <- c("value7","value28","value91","value182","value364","value728","valueLTD")
  tempvalue[1,] <- c(NA,NA,NA,NA,NA,NA,NA)
  rollingTrips <- c(1,1,1,1,1,1,1)
  rollingSinceLast <- c(0,0,0,0,0,0,0)
  startIndex <- c(1,1,1,1,1,1,1)
  rollingvalues <- c(0,0,0,0,0,0,0)
  rollingvalues[1:rollingLength] <- check[1, valueCol]
  if (nrow(check) > 1) {
    for (r in 2:nrow(check)) {
      tempvalue[r,] <- rollingvalues / rollingTrips
      rollingvalues <- rollingvalues + check[r, valueCol]
      rollingTrips <- rollingTrips + 1
      rollingSinceLast <- rollingSinceLast + ifelse(is.na(check[r, sinceCol]), 0, check[r, sinceCol])
      for (c in 1:(rollingLength - 1)) {
        while (rollingSinceLast[c] >= rollingTimes[c]) {
          rollingvalues[c] <- rollingvalues[c] - check[startIndex[c], valueCol]
          rollingTrips[c] <- rollingTrips[c] - 1
          rollingSinceLast[c] <- rollingSinceLast[c] - check[startIndex[c] + 1, sinceCol]
          startIndex[c] <- startIndex[c] + 1
        }
      }
    }
  }
  return(cbind(check, tempvalue))
}
library(foreach)
library(doParallel)
cl<-makeCluster(12)
registerDoParallel(cl)
strt<-Sys.time()
outdata <- foreach(id=persons, .combine="rbind", .packages="ff") %dopar% func(data,id)
print(Sys.time()-strt)
stopCluster(cl)
sqlSave(myconn, outdata)
odbcClose(myconn)
The foreach package's %dopar% loop needs to iterate over explicit boundaries of a key value rather than over every individual ID.
You can simply split your personID into ranges. Also, you should set the partition value to less than the number of workers in makeCluster(). If you don't, you will get the message file.access(filename, 0) == 0 is not TRUE, because you cannot access the pre-saved ff package file from the same cluster.
An example of splitting personID:
split_min <- min(persons$personID)
split_max <- max(persons$personID)
partition <- 12  # "partition < cluster" is good.
quart_half <- floor((split_max - split_min) / partition)
split_num <- matrix(0, partition, 2)
split_num[1, 1] <- split_min
split_num[1, 2] <- quart_half + split_min
if (partition >= 3){
  for (i in 2:(partition - 1)){
    split_num[i, 1] <- split_num[i - 1, 2] + 1
    split_num[i, 2] <- split_num[i - 1, 2] + quart_half
  }
}
split_num[partition, 1] <- split_num[partition - 1, 2] + 1
split_num[partition, 2] <- split_max
Then change the foreach statement:
outdata <- foreach(i = 1:partition, .combine = "rbind", .packages = "ff") %dopar% {
  IDs <- subset(persons, personID >= split_num[i, 1] & personID <= split_num[i, 2])$personID
  # collect the per-person results and row-bind them so the task returns them
  do.call(rbind, lapply(IDs, function(z) func(data, z)))
}
or,
outdata <- foreach(i = 1:partition, .combine = "rbind") %dopar% {
  require(ff)  # or require(ffbase)
  IDs <- subset(persons, personID >= split_num[i, 1] & personID <= split_num[i, 2])$personID
  do.call(rbind, lapply(IDs, function(z) func(data, z)))
}
Good luck to you.
I have tried running the following code on a Unix machine with 20 CPUs, using the R foreach, parallel, doParallel, and party packages (my objective is to have the party varimp function working on several CPUs in parallel):
parallel_compute_varimp <- function (object, mincriterion = 0, conditional = FALSE, threshold = 0.2,
                                     nperm = 1, OOB = TRUE, pre1.0_0 = conditional)
{
  response <- object@responses
  input <- object@data@get("input")
  xnames <- colnames(input)
  inp <- initVariableFrame(input, trafo = NULL)
  y <- object@responses@variables[[1]]
  error <- function(x, oob) mean((levels(y)[sapply(x, which.max)] != y)[oob])
  w <- object@initweights
  perror <- matrix(0, nrow = nperm * length(object@ensemble), ncol = length(xnames))
  colnames(perror) <- xnames
  data = foreach(b = 1:length(object@ensemble), .packages = c("party", "stats"), .combine = rbind) %dopar%
  {
    try({
      tree <- object@ensemble[[b]]
      oob <- object@weights[[b]] == 0
      p <- .Call("R_predict", tree, inp, mincriterion, -1L, PACKAGE = "party")
      eoob <- error(p, oob)
      for (j in unique(varIDs(tree))) {
        for (per in 1:nperm) {
          if (conditional || pre1.0_0) {
            tmp <- inp
            ccl <- create_cond_list(conditional, threshold, xnames[j], input)
            if (is.null(ccl)) {
              perm <- sample(which(oob))
            }
            else {
              perm <- conditional_perm(ccl, xnames, input, tree, oob)
            }
            tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
            p <- .Call("R_predict", tree, tmp, mincriterion, -1L, PACKAGE = "party")
          }
          else {
            p <- .Call("R_predict", tree, inp, mincriterion, as.integer(j), PACKAGE = "party")
          }
          perror[b, j] <- (error(p, oob) - eoob)
        }
      }
      ########
      # return data to the %dopar% loop data variable
      perror[b, ]
      ########
    })  # END OF TRY
  }  # END OF LOOP WITH PARALLEL COMPUTING
  perror = data
  perror <- as.data.frame(perror)
  return(MeanDecreaseAccuracy = colMeans(perror))
}
environment(parallel_compute_varimp) <- asNamespace('party')
cl <- makeCluster(detectCores())
registerDoParallel(cl, cores = detectCores())
<...>
system.time(data.cforest.varimp <- parallel_compute_varimp(data.cforest, conditional = TRUE))
but I am getting an error:
> system.time(data.cforest.varimp <- parallel_compute_varimp(data.cforest, conditional = TRUE))
Error in unserialize(socklist[[n]]) : error reading from connection
Timing stopped at: 58.302 13.197 709.307
The code was working with a smaller dataset on 4 CPUs.
I am running out of ideas. Can someone suggest a way to reach my objective of running party package varimp function on parallel CPUs?
The error:
Error in unserialize(socklist[[n]]) : error reading from connection
means that the master process got an error when calling unserialize to read from the socket connection to one of the workers. That probably means that the corresponding worker died, thus dropping its end of the socket connection. Unfortunately, it may have died for any number of reasons, many of which are very system specific.
You can usually figure out why the worker died by using the makeCluster "outfile" option so that the error message generated by the worker isn't thrown away. I usually recommend using outfile="" as described in this answer. Note that the "outfile" option works the same in both the snow and parallel packages.
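For example, the diagnostic setup could look like this (a minimal sketch; the cluster size here is only an assumption):
library(doParallel)
# outfile = "" redirects each worker's output and error messages to the master
# R session instead of discarding them, which usually reveals why a worker
# died before the unserialize error shows up.
cl <- makeCluster(detectCores(), outfile = "")
registerDoParallel(cl)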
You could also verify that your foreach loop works correctly when executed sequentially by registering the sequential backend:
registerDoSEQ()
If you're lucky, the foreach loop will fail when executed sequentially, since it's usually easier to figure out what is going wrong.
I would like to do a bootstrap of residuals for nls fits in a loop. I use nlsBoot, and in order to decrease computation time I would like to do this in parallel (on a Windows 7 system at the moment). Here is some code that reproduces my problem:
#function for fitting
Falge2000 <- function(GP2000,alpha,PAR) {
(GP2000*alpha*PAR)/(GP2000+alpha*PAR-GP2000/2000*PAR)
}
#some data
PAR <- 10:1600
GPP <- Falge2000(-450,-0.73,PAR) + rnorm(length(PAR),sd=0.0001)
df1 <- data.frame(PAR,GPP)
#nls fit
mod <- nls(GPP~Falge2000(GP2000,alpha,PAR),start=list(GP2000=-450,alpha=-0.73),data=df1, upper=c(0,0),algorithm="port")
#bootstrap of residuals
library(nlstools)
summary(nlsBoot(mod,niter=5))
#works
#now do it several times
#and in parallel
library(foreach)
library(doParallel)
cl <- makeCluster(1)
registerDoParallel(cl)
ttt <- foreach(1:5, .packages = 'nlstools', .export = "df1") %dopar% {
  res <- nlsBoot(mod, niter = 5)
  summary(res)
}
#Error in { :
#task 1 failed - "Procedure aborted: the fit only converged in 1 % during bootstrapping"
stopCluster(cl)
I suspect this is an issue with environments; after looking at the code of nlsBoot, the problem seems to arise from the use of an anonymous function in an lapply call:
l1 <- lapply(1:niter, function(i) {
data2[, var1] <- fitted1 + sample(scale(resid1, scale = FALSE),
replace = TRUE)
nls2 <- try(update(nls, start = as.list(coef(nls)), data = data2),
silent = TRUE)
if (inherits(nls2, "nls"))
return(list(coef = coef(nls2), rse = summary(nls2)$sigma))
})
if (sum(sapply(l1, is.null)) > niter/2)
stop(paste("Procedure aborted: the fit only converged in",
round(sum(sapply(l1, is.null))/niter), "% during bootstrapping"))
Is there a way to use nlsBoot in a parallel loop? Or do I need to modify the function? (I could try to use a for loop instead of lapply.)
By moving the creation of the mod object into the %dopar% loop, it looks like everything works OK. Also, this automatically exports the df1 object, so you can remove the .export argument.
ttt <- foreach(1:5, .packages = 'nlstools') %dopar% {
  mod <- nls(GPP ~ Falge2000(GP2000, alpha, PAR), start = list(GP2000 = -450, alpha = -0.73),
             data = df1, upper = c(0, 0), algorithm = "port")
  res <- nlsBoot(mod, niter = 5)
  capture.output(summary(res))
}
However, you might need to work out what you want returned. Using capture.output was just to see if things were working, since summary(res) seemed to only return NULL.
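For instance, one option (just a sketch; it assumes the nlsBoot result stores its resampled parameter estimates in a coefboot component, which you can confirm with str(res) for your nlstools version) is to return those estimates from each task and stack them:
# Return the matrix of bootstrapped coefficients from each task and
# row-bind all resamples into one matrix.
ttt <- foreach(1:5, .combine = rbind, .packages = "nlstools") %dopar% {
  mod <- nls(GPP ~ Falge2000(GP2000, alpha, PAR),
             start = list(GP2000 = -450, alpha = -0.73),
             data = df1, upper = c(0, 0), algorithm = "port")
  res <- nlsBoot(mod, niter = 5)
  res$coefboot
}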