Error in { : task 1 failed - "could not find function "ranger"" > - r

I was able to run the following code without any problems:
# first code: works fine
library(dplyr)
library(ranger)
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
I am now trying to run the same code (Step 2, Step 3, Step 4) in parallel - here is my attempt:
# second code: does not work fine
library(doParallel)
library(foreach)
registerDoParallel(cores = detectCores())
foreach(i = 1:100) %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
}
stopImplicitCluster()
This is giving me the following error:
Error in { : task 1 failed - "could not find function "ranger""
I am not sure why this error is being produced, seeing as I have loaded the "ranger" library.
My Question: Can someone please show me what I am doing wrong and how can I make the second code run like the first code?
Thanks!
Note : After adding the suggestion made by #Waldi, the code doesn't produce an error, but is taking a very long time to run. Does anyone have any recommendations on how to improve this?

You can specify the packages you need using the .packages argument in foreach:
foreach(i = 1:100, .packages = 'ranger') %dopar% {...}
Detailed explanation on footnote regarding parallel processing being slow can be found here

Related

How can I catch fatal error on rpart in R for loop?

I am perofrimn grid search using rpart tree models. On some iteration I got fatal error due to values pass in control argument. Is there an easy way to stop R from crashing if it cannot fit tree in this iteration?
#large_grid
complexity_par_val <- seq(0.001, 0.01, 0.001)
min_bin_val <- seq(500, 5000, 500)
max_depth_val <- seq(1, 30, 1)
freq_tree_large_grid <- expand.grid(cp = complexity_par_val, min_bin = min_bin_val, max_depth = max_depth_val)
#random serach
set.seed(123)
n_search <- 500
sample_for_r_search <- freq_tree_large_grid[sample(nrow(freq_tree_large_grid), n_search), ]
result_of_r_search_freq_old <- result_of_r_search_freq
result_of_r_search_freq <- data.frame()
start_time <- Sys.time()
for(i in 1:n_search) {
cp_1 <- sample_for_r_search$cp[i]
min_bin_1 <- sample_for_r_search$min_bin[i]
max_depth_1 <- sample_for_r_search$max_depth[i]
cntr <- list(cp=cp_1, minbucket = min_bin_1, maxdepth = max_depth_1, xval = 0)
sum_dev <- 0
for (j in 1:8){
FREQ_V <- FREQ_TRAIN[FREQ_TRAIN$ValRandom10 == j,]
FREQ_D <- FREQ_TRAIN[FREQ_TRAIN$ValRandom10 != j,]
tryCatch({
tree <- rpart( formula = formula_tree,
data = FREQ_D,
method = "poisson" ,
control = cntr
)}, error=function(e){})
pred <- predict(tree, newdata = FREQ_V )*FREQ_V$Exposure
Dev <- Deviance_Poisson(pred, FREQ_V$ClaimNb)
sum_dev <- sum_dev+Dev
print('cv')
print(j)
}
CV8_DEV <- sum_dev/8
result_of_r_search_freq <- rbind(result_of_r_search_freq, data.frame(CV8_DEV, cp_1, min_bin_1, max_depth_1))
print('ending the cross validation nr:')
print(i)
}
end_time <- Sys.time()

R C5.0: Error while including minCases in tunegrid (caret)

i am trying to implement the minCases-argument into my tuning process of a c5.0 model.
As i am using the caret package i am trying to get that argument into the "tuneGrid".
For that purpose i found the following Tutorial.
https://www.euclidean.com/machine-learning-in-practice/2015/6/12/r-caret-and-parameter-tuning-c50
After implementing the code into my syntax i get the following error:
**Error: The tuning parameter grid should have columns NA, NA, NA, splits**
Anyone knows where there is a mistake?
The error occurs as soon as i am building my model "mdl" in the last line of the code.
With regard to the Tutorial mentionend above my current code is the following:
library(datasets)
data(iris)
library('gmodels')
library("RcppCNPy")
library("class")
library("C50")
library('caret')
library('mlbench')
####Customizing the C5.0
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, c("model", "winnow","splits"), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow == loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
# add the splits parameter to the fit function
# minCases is a function of splits
theDots <- list(...)
splits <- param$splits
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
# get the default C5.0 model functions
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
# modify the parameters data frame so that it includes splits
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(levels(c5ModelInfo$parameters$parameter),'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(levels(c5ModelInfo$parameters$label),'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))
# replace the default c5.0 functions with ones that are aware of the splits parameter
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$grid <- C5CustomGrid
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
#Building the actual model
x_a <- iris[c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")]
y_a <-as.factor(iris[,c("Species")])
fitControl <- trainControl(method = "cv", number = 10)
grida <- expand.grid( .winnow = "FALSE", .trials=c(1,5,10,15,20), .model="tree", .splits=c(2,5,10,15,20,25,50,100) )
mdl<- train(x=x_a,y=y_a,tuneGrid=grida,trControl=fitControl,method=c5info)
the problem seems to be in some of the Custom functions, i have this other version that works for me:
library(caret)
library(C50)
library(mlbench)
library(tidyverse)
library(plyr)
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, .(winnow,model, splits,trials), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow ==
loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials !=
loop$trials[i]],winnow = loop$winnow[i], model=loop$model[i],splits=loop$splits[i])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
splits <- loop$splits
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(c5ModelInfo$parameters$parameter,'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(c5ModelInfo$parameters$label,'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))parameter
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
splits<-c(5,25,100)
grid <- expand.grid( winnow = c(FALSE), trials=c(5,6), model=c("tree"), splits=c(5,25,100) )
data(PimaIndiansDiabetes2)
x <- PimaIndiansDiabetes2[c("age","glucose","insulin","mass","pedigree","pregnant","pressure","triceps")]
y <- PimaIndiansDiabetes2$diabetes
mdl<- train(x=x,y=y,tuneGrid=grid,trControl=fitControl,method=c5info,verbose=FALSE)

Unable to run foreach in doParallel package

I'm trying to run the following R codes (https://www.r-bloggers.com/general-regression-neural-network-with-r/) to implement a General Regression Neural Network (GRNN) in R. "foreach" function is used (two times) to search for the optimal value of sigma.
pkgs <- c('MASS', 'doParallel', 'foreach', 'grnn')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 8)
data(Boston)
# PRE-PROCESSING DATA
X <- Boston[-14]
st.X <- scale(X)
Y <- Boston[14]
boston <- data.frame(st.X, Y)
# SPLIT DATA SAMPLES
set.seed(2013)
rows <- sample(1:nrow(boston), nrow(boston) - 200)
set1 <- boston[rows, ]
set2 <- boston[-rows, ]
# DEFINE A FUNCTION TO SCORE GRNN
pred_grnn <- function(x, nn){
xlst <- split(x, 1:nrow(x))
pred <- foreach(i = xlst, .combine = rbind) %dopar% {
data.frame(pred = guess(nn, as.matrix(i)), i, row.names = NULL)
}
}
# SEARCH FOR THE OPTIMAL VALUE OF SIGMA BY THE VALIDATION SAMPLE
cv <- foreach(s = seq(0.2, 1, 0.05), .combine = rbind) %dopar% {
grnn <- smooth(learn(set1, variable.column = ncol(set1)), sigma = s)
pred <- pred_grnn(set2[, -ncol(set2)], grnn)
test.sse <- sum((set2[, ncol(set2)] - pred$pred)^2)
data.frame(s, sse = test.sse)
}
cat("\n### SSE FROM VALIDATIONS ###\n")
print(cv)
jpeg('grnn_cv.jpeg', width = 800, height = 400, quality = 100)
with(cv, plot(s, sse, type = 'b'))
cat("\n### BEST SIGMA WITH THE LOWEST SSE ###\n")
print(best.s <- cv[cv$sse == min(cv$sse), 1])
# SCORE THE WHOLE DATASET WITH GRNN
final_grnn <- smooth(learn(set1, variable.column = ncol(set1)), sigma = best.s)
pred_all <- pred_grnn(boston[, -ncol(set2)], final_grnn)
jpeg('grnn_fit.jpeg', width = 800, height = 400, quality = 100)
plot(pred_all$pred, boston$medv)
dev.off()
But the following error occurred after the second "foreach" function (I mean, after cv).
Error in { : task 1 failed - "unused argument (sigma = s)"
any help would be appreciated.

Function inputs from a list

How can I run a function (in R) where some of the inputs are pulled from a list (or data frame)? Am I right in thinking that this would be more efficient than running a for-loop?
I am running simulations and want to change the variable values, but as they take a long time to run I want them to run overnight and to just tick through the different values automatically.
Here's the code for the function:
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000 ### REDUCED FOR THIS EXAMPLE ###
var.values <- NULL
var.values.pop <- NULL
hist.fn <- function(n,mu,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
for(i in 1:isim) {
for(j in 1:iboot) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
hist.pop <- hist(var.values.pop,plot=F)
hist.boot <- hist(var.values,plot=F)
#mypath = file.path("C:", "Output", paste("hist.boot_n.", n.sub, "_var.", sd^2, "_isim.", isim, "_iboot.", iboot, ".wmf", sep=""))
#win.metafile(file=mypath)
plot.new() #### ADDED FOR THIS EXAMPLE INSTEAD OF OUTPUTTING TO FILE ####
plot(hist.pop, freq=FALSE, xlim=range(var.values.pop, var.values), ylim=range(hist.pop$density, hist.boot$density), main = paste("Histogram of variances \n n=",n.sub," mu=",mu,"var=",sd^2,"\n n.sim=",isim,"n.boot=",iboot,"\n"), cex.main=0.8, xlab="Variance", col="red")
plot(hist.boot, freq=FALSE, col="blue", border="blue", add=T, density=20, angle=45)
abline(v=var.pop, lty=2, col="black", lwd=2)
legend("topright", legend=c("sample","bootstrap"),col=c("red","blue"),lty=1,lwd=2,bty="n",cex=0.7)
#dev.off()
}
hist.fn(n,mu,sd,n.sub,iboot)
Then I want sd, n.sub, and iboot to change by running through the following values:
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
Perhaps something like this?
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
# hist.fn parameters: n,mu,sd,n.sub,iboot
params <- expand.grid(n = n, mu = mu, sd = sd,
n.sub = n.sub, iboot = iboot)
apply(params, 1, FUN = function(x) do.call(hist.fn, as.list(x) ) )
You probably want to put these:
var.values <- NULL
var.values.pop <- NULL
Inside hist.fn, because assigning values to variables outside a function doesn't work like you seem to think.
You should use do.call, which will apply the function using arguments in a list. I have simplified your example to run less loops for the example. You can modify the printline of the script in order to monitor your progress for a larger job:
# The function
hist.fn <- function(n,mu,isim,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
var.values <- NaN*seq(isim) # sets up an empty vector for results
var.values.pop <- NaN*seq(isim) # sets up an empty vector for results
for(i in seq(isim)) {
for(j in seq(iboot)) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
print(paste("i =", i, "; j =", j))
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
list(var.values=var.values, var.values.pop=var.values.pop) #returns results in the form of a list
}
# Global variables
n = 100
mu = 0
isim = 10
# Changing variables
sd <- c(1,10,20,30)
n.sub <- c(4,10,20,30)
iboot <- c(100,200,300,400)
df <- data.frame(sd=sd, n.sub=n.sub, iboot=iboot)
res <- vector(mode="list", nrow(df)) # sets up an empty list for results
for(i in seq(nrow(df))){
res[[i]] <- do.call(hist.fn, c(n=n, mu=mu, isim=isim, df[i,]) )
}
res # show results
sd <- 1:3
n.sub <- 4:6
iboot <- 7:9
funct1<-function(x,y,z) print(x+y+z)
for (i in 1:length(sd)){
funct1(sd[i],n.sub[i],iboot[i])
}
just an example. Doing it with loop.

create data frame in R

see: Selecting significant cases from a chi-squared test
The model example given in the case above is:
f = function(N=1000){
out <- data.frame("Row" = 1:N
, "Column" = 1:N
, "Chi.Square" = runif(N)
, "df"= sample(N, 1:10, replace=T)
, "p.value" = round(runif(N), 3)
)
return(out)
}
but when I would apply this to my model I would turn this into:
f = function(N=7000){
combos <- combn(ncol(final),2)
adply(combos, 2, function(x) {
test <- chisq.test(final[, x[1]], final[, x[2]])
out <- data.frame("Row" = colnames(final)[x[1]]
, "Column" = colnames(final[x[2]])
, "Chi.Square" = round(test$statistic,3)
, "df"= test$parameter
, "p.value" = round(test$p.value, 3)
)
return(out)
}}
yet R does not see this as a finished command line. Why?
Get yourself a decent editor :-)
adply(
isn't closed.
Edit: nor
function(...){
It looks like the final } should really be a ) + }

Resources