I was able to run the following code without any problems:
# first code: works fine
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
results_df <-, results)
X<-split(results_df, results_df$iteration)
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
final_predictions = aggregate(.~ id,, results_2), mean)
I am now trying to run the same code (Step 2, Step 3, Step 4) in parallel - here is my attempt:
# second code: does not work fine
registerDoParallel(cores = detectCores())
foreach(i = 1:100) %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
results_df <-, results)
X<-split(results_df, results_df$iteration)
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
final_predictions = aggregate(.~ id,, results_2), mean)
This is giving me the following error:
Error in { : task 1 failed - "could not find function "ranger""
I am not sure why this error is being produced, seeing as I have loaded the "ranger" library.
My Question: Can someone please show me what I am doing wrong and how can I make the second code run like the first code?
Note : After adding the suggestion made by #Waldi, the code doesn't produce an error, but is taking a very long time to run. Does anyone have any recommendations on how to improve this?

You can specify the packages you need using the .packages argument in foreach:
foreach(i = 1:100, .packages = 'ranger') %dopar% {...}
Detailed explanation on footnote regarding parallel processing being slow can be found here


