Get multiple values out of parallel foreach loop - r

I've tried searching but haven't located anything that's gotten me all the way.
I'm running an occupancy prediction model on a stack of three rasters. Due to the large amount of processing that needs to happen I'm using a parallel foreach loop.
I need to retrieve three variables out of the results from the loop: test, na, and pred. I need those three values to fill in the new raster values and maintain the same extent. Unless someone knows a way to fill in the gaps created by NA values during processing?
Below is the code I've been trying to use based on posts I've found.
I also tried nesting foreach loops, but I'm not sure I understand how those work, or if that would achieve my ends.
library(parallel)
library(doSNOW)
# Container for the values produced by one foreach iteration.
#
# Arguments (all default to NULL): test, tmp, na, pred.
# Returns a four-element named list (test, tmp, na, pred) whose class vector
# is c("list", "multiResultClass").
multiResultClass <- function(test = NULL, tmp = NULL, na = NULL, pred = NULL) {
  structure(
    list(test = test, tmp = tmp, na = na, pred = pred),
    class = c("list", "multiResultClass")
  )
}
# Start a cluster on all but one core and register it as the foreach backend.
nc<- detectCores()-1
cl<- makeCluster(nc)
registerDoSNOW(cl)
# One task per raster row. No .combine is given, so foreach returns a list
# with one multiResultClass object per value of i.
predicts<- foreach (i = 1:nrow(pm), .multicombine = T, .maxcombine = 1000,
.packages = c("unmarked", "raster"), .verbose = T)%dopar%{
results<- multiResultClass()
# Cell numbers of raster row i, then the values of those cells as a data.frame.
test<- cellFromRow(pm, i)
tmp<- data.frame(pm[test])
# NOTE(review): tmp[i, ] selects only the i-th row of tmp, so `na` is a single
# TRUE/FALSE rather than one flag per cell.
na<- any(is.na(tmp[i, ]))
# NOTE(review): the closing parenthesis of length() encloses the comparison,
# so the condition is the length of a logical vector (0 or 1); the branch runs
# only when `na` is TRUE.
if(length(which(na) != nrow(tmp))){
pred<- predict(fmBest, "state", tmp)
}
# NOTE(review): `pred` is not assigned in this iteration when the branch above
# is skipped.
results$test<- test
results$tmp<- tmp
results$na<- na
results$pred<- pred
return(results)
}
# Sequential pass over the collected results. The value returned by this
# foreach call is not assigned to anything.
foreach(i = 1:nrow(pm))%do%{
test<- predicts[[i]]$test
na<- predicts[[i]]$na
pred<- predicts[[i]]$pred
}
stopCluster(cl)
I have a working foreach loop that gets me the pred values, but without test and na I haven't found a way to properly fill in the raster template the data needs to go into. That foreach loop is below:
library(parallel)
library(doSNOW)
# Start a cluster on all but one core (printing the cluster object) and
# register it as the foreach backend.
ns<- detectCores()-1
cl<- makeCluster(ns);cl
registerDoSNOW(cl)
# One task per raster row; the value of each task is the value of the `if`
# expression, i.e. the prediction, or NULL when the branch is not taken.
predictions<-
foreach (i = 1:nrow(pm), .multicombine = T, .maxcombine = 5000,
.packages = c("unmarked", "raster"), .verbose = T)%dopar%{
# Cell numbers of raster row i, then the values of those cells as a data.frame.
test<- cellFromRow(pm, i)
tmp<- data.frame(pm[test])
# NOTE(review): tmp[i, ] selects only the i-th row of tmp, so `na` is a single
# TRUE/FALSE rather than one flag per cell.
na<- any(is.na(tmp[i, ]))
# NOTE(review): the comparison sits inside length(), so the condition is 0 or 1
# and the branch runs only when `na` is TRUE.
if(length(which(na) != nrow(tmp))){
predict(fmBest, "state", tmp)
}
}
stopCluster(cl)

I finally found a combine function I could make work. Below is the code I used that returns values for test, na, and pred all in a large list.
library(doSNOW)
# Start a cluster on all but one core and register it as the foreach backend.
# detectCores() lives in the parallel package, which this snippet does not
# attach, so it is called with an explicit namespace.
nc <- parallel::detectCores() - 1
cl <- makeCluster(nc)
cl
registerDoSNOW(cl)

# Combine per-row results element-wise: every task returns
# list(test, na, pred), and the k-th elements of all tasks are row-bound
# together, giving a list of three stacked objects.
comb <- function(...) {
  mapply("rbind", ..., SIMPLIFY = FALSE)
}

predictions <- foreach(
  i = seq_len(nrow(pm)), .combine = "comb", .multicombine = TRUE,
  .maxcombine = 200, .packages = c("unmarked", "raster"), .verbose = TRUE,
  .inorder = FALSE
) %dopar% {
  # get cell number values from raster stack (all cells of raster row i)
  test <- cellFromRow(pm, i)
  # make into a data.frame for prediction (one row per cell)
  tmp <- data.frame(pm[test])
  # one flag per cell: TRUE when any layer is NA for that cell, so that
  # test[!na] identifies the cells that can be predicted
  na <- !stats::complete.cases(tmp)
  # always define pred so the returned list has three elements even when
  # the whole raster row is NA
  pred <- NULL
  # avoid calling predict when every cell in the row is NA
  if (!all(na)) {
    # Predict the new data
    pred <- predict(fmBest, "state", tmp)
  }
  list(test, na, pred)
}
stopCluster(cl)

Related

Parallelized version of code takes longer to run

I have the following code that is running fine:
# first code: works fine
# Step 1 : Create Data for Example:
library(dplyr)
library(ranger)
# Simulated two-class data: 10,000 rows of class 1 and 100 rows of class 0.
original_data <- rbind(
  data_1 = data.frame(
    class = 1,
    height = rnorm(10000, mean = 180, sd = 10),
    weight = rnorm(10000, mean = 90, sd = 10),
    salary = rnorm(10000, mean = 50000, sd = 10000)
  ),
  data_2 = data.frame(
    class = 0,
    height = rnorm(100, mean = 160, sd = 10),
    weight = rnorm(100, mean = 100, sd = 10),
    salary = rnorm(100, mean = 40000, sd = 10000)
  )
)
original_data$class <- as.factor(original_data$class)
original_data$id <- 1:nrow(original_data)

# Hold out 30 class-0 rows and 2000 class-1 rows (sampled without replacement)
# as the test set; every remaining row forms the training set.
test_set <- rbind(
  original_data[sample(which(original_data$class == "0"), size = 30, replace = FALSE), ],
  original_data[sample(which(original_data$class == "1"), size = 2000, replace = FALSE), ]
)
train_set <- anti_join(original_data, test_set)
The actual code starts here:
Step 2:
# Step 2: Create "Balanced" Random Subsets:
# Row positions of each class are fixed, so look them up once instead of on
# every iteration.
class_0_rows <- which(train_set$class == "0")
class_1_rows <- which(train_set$class == "1")

# Each subset holds 50 class-0 rows and 60 class-1 rows drawn with
# replacement, tagged with the iteration number as a factor.
results <- lapply(1:100, function(i) {
  sample_i <- rbind(
    train_set[sample(class_0_rows, size = 50, replace = TRUE), ],
    train_set[sample(class_1_rows, size = 60, replace = TRUE), ]
  )
  data.frame(iteration_i = as.factor(i), sample_i)
})

results_df <- do.call(rbind.data.frame, results)
# Use the full column name: `$iteration` only worked through partial matching.
X <- split(results_df, results_df$iteration_i)

# Also expose every subset as train_set_1, train_set_2, ... in the global
# environment (kept for compatibility; X already holds the same data).
invisible(list2env(
  setNames(results, paste0("train_set_", seq_along(results))),
  envir = .GlobalEnv
))
Step 3:
# Step 3: Train Models on Each Subset:
wd <- getwd()
# Preallocate instead of growing the list inside the loop.
results_1 <- vector("list", 100)
for (i in 1:100) {
  model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
  # Build the path from the wd variable. The previous paste0("wd", paste(...))
  # used the literal string "wd" and inserted spaces, producing file names
  # such as "wdmodel_ 1 .RDS".
  saveRDS(model_i, file.path(wd, paste0("model_", i, ".RDS")))
  results_1[[i]] <- model_i
}
Step 4:
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
# Predict the test set with each of the 100 models; every result is a
# data.frame of the model's predictions plus a row id used for averaging.
results_2 <- lapply(1:100, function(k) {
  probs <- data.frame(predict(results_1[[k]], data = test_set)$predictions)
  probs$id <- 1:nrow(probs)
  probs
})
# Average the predictions of all models per test-set row.
final_predictions <- aggregate(. ~ id, data = do.call(rbind, results_2), FUN = mean)
I am planning on running this code on a dataset of about 200 million rows. I would like to speed this code up (Step 2, Step 3, Step 4) - I tried looking at different ways to do this, and came across "parallelization". Apparently, this can be done using libraries such as "future"/"foreach". Here was my attempt to parallelize the above code:
# second code: takes a long time to run
library(doParallel)
library(foreach)
# Register a doParallel backend using every core reported by detectCores().
registerDoParallel(cores = detectCores())
# NOTE(review): each of the 100 foreach tasks runs the complete Step 2-4
# pipeline below (the inner loops also run over 1:100 and reuse the name i),
# so the whole workload is repeated 100 times rather than divided among the
# workers. The value of the foreach call is not assigned.
foreach(i = 1:100, .packages = 'ranger') %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
# `$iteration` resolves to the iteration_i column through partial matching.
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
# NOTE(review): "wd" is a string literal here, not the wd variable, and paste()
# inserts spaces, so the file name is e.g. "wdmodel_ 1 .RDS".
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
}
stopImplicitCluster()
For some reason, it seems that contrary to what I would have thought - parallelization is making this code take a lot longer to run.
My Question: Does anyone know if there are any other ways to speed up this code? I have a feeling I have not correctly understood the concepts behind parallelization - can someone please show me how to do this?
Parallel processing comes with the overhead of launching parallel tasks and putting together the results : it isn't always faster.
Before thinking about parallelizing, you could first identify the most time consuming parts of your code.
profvis package is a way of profiling code:
library(profvis)
# Profile the original sequential Step 2-4 code as a single expression.
profvis({
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
# `$iteration` resolves to the iteration_i column through partial matching.
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
# NOTE(review): "wd" is a string literal here, not the wd variable, and paste()
# inserts spaces, so the file name is e.g. "wdmodel_ 1 .RDS".
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
})
According to profvis, the most time consuming step is saveRDS:
However, this only accounts for 1.3 seconds, whereas using system.time() instead of profvis shows that the code needs about 6 seconds to complete.
Reading profvis FAQ explains that :
Calls to external programs and libraries also may not show up in the profiling data. If you call functions from a package to fetch data from external sources, keep in mind that time spent in those functions may not show in the profiler.
Timing each step alone shows that step 4 takes around 3 seconds and isn't accounted for by profvis.
This leads to the function which is called there : predict.ranger
?ranger::predict.ranger shows that this function is multithreaded :
num.threads : Number of threads. Default is number of CPUs available.
This means that the CPU is already using all its processors most of the time, so extra parallel processing won't help much, and might even be slower!
This can be seen on the task manager (x-axis = time, y-axis = CPU use from 0 to 100%):
with for loop :
with foreach loop :
You'll find hereafter the parallelized (or not) code used to compare performance. I put all loops together in one single loop.
Note that under Windows, you should use makeCluster instead of registerDoParallel to setup number of cores used.
library(doParallel)
library(foreach)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
wd <- getwd()
# Row positions of each class do not change, so look them up once.
class_0_rows <- which(train_set$class == "0")
class_1_rows <- which(train_set$class == "1")

# Measure performance
system.time({
  # Each task draws one balanced subset (Step 2), trains a model on it
  # (Step 3) and predicts the test set (Step 4).
  # Workers cannot modify lists that live on the master, so every task
  # returns its results and foreach collects them in `fits`.
  # For a sequential comparison run, replace %dopar% with %do%.
  fits <- foreach(i = 1:1000, .packages = "ranger") %dopar% {
    # Step 2: Create "Balanced" Random Subsets:
    sample_i <- rbind(
      train_set[sample(class_0_rows, size = 50, replace = TRUE), ],
      train_set[sample(class_1_rows, size = 60, replace = TRUE), ]
    )
    results_tmp <- data.frame(iteration_i = as.factor(i), sample_i)

    # Step 3: Train Models on Each Subset:
    model_i <- ranger(class ~ height + weight + salary, data = results_tmp, probability = TRUE)
    # Build the path from the wd variable; the literal "wd" plus paste()
    # previously produced names such as "wdmodel_ 1 .RDS".
    saveRDS(model_i, file.path(wd, paste0("model_", i, ".RDS")))

    # Step 4: Use the model to make predictions on the test set:
    predict_i <- data.frame(predict(model_i, data = test_set)$predictions)
    predict_i$id <- seq_len(nrow(predict_i))

    list(i = i, sample = results_tmp, model = model_i, predictions = predict_i)
  }
})

# Unpack the per-task results into the lists used by the sequential code.
results <- lapply(fits, `[[`, "sample")
results_1 <- lapply(fits, `[[`, "model")
results_2 <- lapply(fits, `[[`, "predictions")

# Average the predictions of all models per test-set row.
final_predictions <- aggregate(. ~ id, data = do.call(rbind, results_2), FUN = mean)
stopCluster(cl)
A few notes:
When running on 2E8 rows, you may want to make sure not to keep everything in memory and use fast operations. The data.table package may be useful here due to its performance and in-memory replacements. Maybe you do not need to export all training sets into the Global environment in step 2; I do not see where you use that, and it will take up a lot of memory (memory usage may become a primary concern here).
Looking purely at performance, saving all the models as RDS objects is quite time consuming. Unless required later, skipping this step might speed things up quite a bit. If you have memory issues and need to spill to disk, you may consider saving the predicted output, perhaps with data.table::fwrite and subsequently read it in with data.table::fread.
For some reason, despite the ranger and predict functions using multiple threads, running these steps in parallel may still give some speed improvements, depending on the way you can parallelize. In a linux environment, mclapply forks the process and does not copy data to all the nodes, so YMMV using other parallelization options. A few good suggestions for alternative ways to schedule in parallel are already in other comments/replies.
Unless I overlooked it, it seems to me that you could sample your training set once and then split into multiple parallel sets, as I did not see where you would use multiple iterations that feed sequentially into each other.
Below is one example that probably could be optimized further, depending on the memory profile
library(data.table)
library(parallel)
# NOTE(review): this evaluates to 0 when detectCores() is 2 or 3 -- confirm
# that mc.cores below is at least 1 on the target machine.
ncores <- floor(detectCores()/2)-1 # set no. of cores for parallel processing
# NOTE(review): setDT() converts its argument by reference, so train_set
# itself presumably becomes a data.table as well -- confirm this is intended.
trs <- setDT(train_set, keep.rownames = TRUE) # turn into data.table
n <- 1e2 # number of sampling iterations
# sample once, then label as iterations
# (50*n class-0 rows followed by 60*n class-1 rows, drawn with replacement)
results <- trs[c(sample(which(trs$class==0), 50*n, replace = TRUE),
sample(which(trs$class==1), 60*n, replace = TRUE))]
results[, iteration:=NA_character_]
# cut() divides the row sequence of each class into n equal-width intervals
# labelled 1..n, i.e. consecutive runs of 50 (class 0) or 60 (class 1) rows.
results[class==0, iteration := as.character(cut(1:(50*n), n, labels = 1:n))]
results[class==1, iteration := as.character(cut(1:(60*n), n, labels = 1:n))]
# NOTE(review): the factor levels are taken from order(), which equals the
# sorted labels only when the labels first appear in increasing order.
results[, iteration := factor(iteration, order(unique(as.numeric(iteration))))]
# Step 3: Train Models on Each Subset:
# calc_model fits a probability forest on one subset; predict_model fits the
# model and returns its test-set predictions with a row id column.
calc_model <- \(x) ranger(class ~ height + weight + salary, data = x, probability = TRUE)
predict_model <- \(x) data.table(predict(calc_model(x), data = test_set)$predictions)[, id:=.I]
# save time and memory not saving model as RDS file; potentially, the list of models could
# be saved in one write operation, which could also be faster
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
# for some reason, despite predict using multiple threads, I still profit
# from parallelization here; skipping generation of X to save memory
results_2 <- mclapply(results[, unique(iteration)],
\(x){predict_model(results[iteration == x])}, mc.cores=ncores)
# Stack all per-iteration predictions and average columns "0" and "1" per id.
final_predictions <- rbindlist(results_2)[, lapply(.SD, mean), .SDcols=c("0", "1"), by="id"]
Created on 2022-06-27 by the reprex package (v2.0.1)

Rhadoop mapreduce for multiple input files

I'm building a mapreduce program, using R, that extracts the relevant features from a set of features in a dataset using genetic algorithm. I need to put many files as an input to my mapreduce job. My code below is my mapreduce program but it works only for one input file (data.csv).
library(caret)
library(dplyr)
library(rmr2)
Sys.setenv(HADOOP_CMD="/home/rania/hadoop-2.7.3/bin/hadoop")
Sys.getenv("HADOOP_CMD")
Sys.setenv(HADOOP_STREAMING="/home/rania/hadoop-streaming-2.7.3.jar")
library(rhdfs)
hdfs.init()
rmr.options(backend = "hadoop")
hdfs.mkdir("/user/rania/genetic")
hdfs.mkdir("/user/rania/genetic/data")
I put my files in one folder in hdfs
hadoop fs -copyFromLocal /home/rania/Downloads/matrices/*.csv /user/rania/genetic/data/
This is the map function
# Map function for the mapreduce job: runs genetic-algorithm feature selection
# (caret::gafs) and emits the selected feature names as key and the matching
# columns of the data as value.
# NOTE(review): the `data` argument supplied by mapreduce is overwritten on the
# first line with a file read from a fixed local path, so the HDFS input is
# not what gets processed.
mon.map <- function(.,data){
data <- read.csv("/home/rania/Downloads/dataset.csv", header = T, sep = ";")
# Hard-coded outcome vector with 13 entries.
y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
method = "cv") # cross validation
set.seed(10)
lev <- c("1","0")
rf_ga3 <- gafs(x = data, y = y,
iters = 10, # number of generations of the algorithm (10 here)
popSize = 4, # population size for each generation
levels = lev,
gafsControl = ga_ctrl)
keyval(rf_ga3$ga$final, data[names(data) %in% rf_ga3$ga$final] )
}
This is the reduce function
# Identity reducer: re-emits each key with its values unchanged.
mon.reduce <- function(k, v) {
  keyval(k, v)
}
Now i apply the mapreduce job
# HDFS locations (relative paths) for the job input and output.
hdfs.root <- "genetic"
hdfs.data <- file.path(hdfs.root, "data")
hdfs.out <- file.path(hdfs.root, "out")
csv.format <- make.output.format("csv")

# Run the feature-selection mapreduce job: CSV in, CSV out, using mon.map and
# mon.reduce. Returns whatever mapreduce() returns for the given output.
genetic <- function(input, output) {
  mapreduce(
    input = input,
    output = output,
    input.format = "csv",
    output.format = csv.format,
    map = mon.map,
    reduce = mon.reduce
  )
}
out <- genetic(hdfs.data, hdfs.out)
Then we print the result from hdfs
results <- from.dfs(out, format="csv")
print(results)
OR
hdfs.cat("/genetic/out/part-00000")
I tried to change the map function to make it work for many files but it failed
# Second attempt at the map function: reads every CSV in a local directory
# into a list of data frames and runs gafs on the first four of them.
# NOTE(review): as in the first version, the `data` argument supplied by
# mapreduce is overwritten with files read from a fixed local path.
mon.map <- function(.,data){
data <- list.files(path="/home/rania/Downloads/matrices/", full.names=TRUE, pattern="\\.csv") %>% lapply(read.csv, header=TRUE, sep=",")
y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
# NOTE(review): the loop is fixed at 4 iterations regardless of how many files
# were read, and rf_ga3 is overwritten each time, so only the result for
# data[[4]] reaches keyval() below.
for (i in 1:4){
ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
method = "cv") # cross validation
set.seed(10)
lev <- c("1","0")
rf_ga3 <- gafs(x = data[[i]], y = y,
iters = 10, # number of generations of the algorithm (10 here)
popSize = 4, # population size for each generation
levels = lev,
gafsControl = ga_ctrl)
}
# Selects the final feature names from every data frame in the list and
# column-binds the selections into the emitted value.
keyval(rf_ga3$ga$final, do.call(cbind, Map(`[`, data, c(rf_ga3$ga$final))) )
}
what can i change in the previous map function to make it work for many input files? thanks

R: how to split dataframe in foreach %dopar%

This is a very simple example.
df = c("already ","miss you","haters","she's cool")
df = data.frame(df)
library(doParallel)
cl = makeCluster(4)
registerDoParallel(cl)
foreach(i = df[1:4,1], .combine = rbind, .packages='tm') %dopar% classification(i)
stopCluster(cl)
In real case I have dataframe with n=400000 rows.
I don't know how to send nrow/ncluster data for each cluster in one step, i = ?
I tried with isplitRows from library(itertools) without success.
You should try to work with indices to create subsets of your data.
# Iterate over every row index. `i = nrow(df)` would pass a single value (the
# last row number) and run the body only once.
foreach(i = seq_len(nrow(df)), .combine = rbind, .packages = "tm") %dopar% {
  # take row i of the data.frame and classify it
  tmp <- df[i, ]
  classification(tmp)
}
This will take a new row of the data.frame each iteration.
Furthermore, you should notice that the result of a foreach loop will be written to a new variable. Thus, you should assign it like this:
res <- foreach(i = 1:10, .combine = c, ....) %dopar% {
# things you want to do
x <- someFancyFunction()
# the last value will be returned and combined by the .combine function
x
}
Try using a combination of split and mclapply as proposed in Aproach 1 here: https://www.r-bloggers.com/trying-to-reduce-the-memory-overhead-when-using-mclapply/
split lets you split data into groups defined by a factor, or you can just use 1:nrow(df) if you want to do the operation on each row separately.
My solution after your comments:
n <- 8 # number of workers in the cluster
library(foreach)
library(doParallel)
cl <- makeCluster(n)
registerDoParallel(cl)

z <- nrow(df)  # total number of rows
y <- z %/% n   # rows per worker
x <- z %% n    # leftover rows that do not fill a complete chunk

# Parallel part: the first z - x rows are split into n consecutive chunks of
# y rows each, and every chunk is processed by one task.
ris <- foreach(
  i = split(df[1:(z - x), ], rep(seq_len(n), each = y)),
  .combine = rbind, .packages = "tm"
) %dopar% someFancyFunction(i)
stopCluster(cl)

# Sequential part: process the x leftover rows and append them at the end.
if (x != 0) {
  ris <- rbind(ris, someFancyFunction(df[(z - x + 1):z, 1]))
}
Note:
I used sequential execution at the end because, if "x" is not zero, the split function puts the remaining rows (z-(z-x)) in the first cluster, which changes the order of the result.

problems with cross validation code - r -

I'm writing a function to perform logistic regression on two columns of a dataframe. I can't get around the errors... I am trying to use 10-fold cross validation. Here's the code I'm using:
SAdata = read.table("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data",
sep=",",head=T,row.names=1)
# Intended to run 10-fold cross-validated logistic regression of x on y.
# NOTE(review): as written, the last expression is the for loop, so the
# function returns NULL and the accumulated `prediction` is never returned.
log.fun = function(x,y) {
prediction = data.frame()
tset = data.frame()
dframe = cbind(x,y)
dframe = as.data.frame(dframe)
# NOTE(review): no object named `data` is defined in this function, so
# nrow(data) does not give the number of rows of dframe; this is the call that
# raises the "invalid 'size' argument" error from sample().
dframe$fold = sample(1:10, nrow(data), replace = TRUE)
# Fold ids 1..10 (this local variable shares its name with base::list).
list = 1:10
for (i in 1:10) {
# Train on every fold except i, test on fold i.
train = subset(dframe, fold %in% list[-i])
test = subset(dframe, fold %in% c(i))
model = glm(x~y, data=train, family=binomial)
# test[,-1] drops the first column (x) before predicting.
pred = as.data.frame(predict(model, test[,-1]))
prediction <- rbind(prediction, pred)
}
}
log.fun(SAdata$chd,SAdata$obesity)
The error I get is "Error in sample.int(length(x), size, replace, prob) :
invalid 'size' argument"
Any ideas?
This is a somewhat sub-optimal use of for loops, especially for modelling... if you want to develop good models, try the 'caret' package.
If you still want to use that function here is a workaround
SAdata = read.table("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data",sep=",",head=T,row.names=1)
# 10-fold cross-validated logistic regression of x on y.
#
# x: binary response vector; y: predictor vector of the same length.
# Each row is randomly assigned to one of 10 folds. For fold i the model is
# trained on all other folds and used to predict fold i.
# Returns a list named Fold1..Fold10; each element is a list with
#   train, test  - the data frames used for fitting / prediction,
#   model        - the fitted glm,
#   pred         - predictions for the test rows (glm's default scale),
#   prediction   - pred row-bound onto an empty data.frame.
log.fun <- function(x, y) {
  prediction <- data.frame()
  dframe <- as.data.frame(cbind(x, y))
  dframe$fold <- sample(1:10, nrow(dframe), replace = TRUE)
  # Named fold_ids rather than `list` so base::list is not shadowed.
  fold_ids <- 1:10
  results <- list()
  for (i in 1:10) {
    fold_name <- paste0("Fold", i)
    results[[fold_name]]$train <- subset(dframe, fold %in% fold_ids[-i])
    results[[fold_name]]$test <- subset(dframe, fold %in% c(i))
    # results[[i]] (positional) refers to the same element as
    # results[[fold_name]] because folds are appended in order 1..10.
    results[[fold_name]]$model <- glm(x ~ y, data = results[[i]]$train, family = binomial)
    # test[, -1] drops the first column (x) before predicting.
    results[[fold_name]]$pred <- as.data.frame(predict(results[[i]]$model, results[[i]]$test[, -1]))
    results[[fold_name]]$prediction <- rbind(prediction, results[[i]]$pred)
  }
  results
}
your_results<-log.fun(SAdata$chd,SAdata$obesity)
head(your_results$Fold1$prediction)
In fact, the problem was in the call to 'sample': you were passing 'data', and that object did not exist... I replaced it with dframe and added names to each part of your results.
Hope it helps

Using lapply and !is.na to subset list vectors in R

I'm trying to apply the solution I found here to generate machine learning models:
Best way to name objects programmatically using R?
Here's a dummy data set:
data_pred <- data.frame(x1 = 1:10, x2 = 11:20, x3 = 21:30)
data_resp <- data.frame(y1 = c(1:5, NA, 7:10), y2 = c(NA, 2, NA, 4:10))
Here was my for() loop method of modeling the predictors in data_pred on each individual column of measured responses in data_resp using the caret package:
# data_pred contains predictors
# data_resp contains one column per measurement
# 1 matching row per observation in both data_pred and data_resp
# Fit one model per response column, using only the rows where that response
# is observed. (`...` stands for the remaining train() arguments.)
for (i in seq_len(ncol(data_resp))) {
  observed <- !is.na(data_resp[, i])
  train(x = data_pred[observed, ],
        y = data_resp[observed, i],
        ... )
}
Now I'm trying to do the same with lapply, which I think has numerous advantages. I'm having an issue with translating the !is.na() criteria on the fly so that I'm only modeling with non-NA cases for each response. Here was my initial function to test the lapply method:
# Fit a caret random forest of the response vector y on the predictors in
# data_pred, tuning mtry over 3:6. Uses data_pred and trControl from the
# calling environment. Returns the object produced by train().
rf_func <- function(y) {
  train(
    x = data_pred,
    y = y,
    method = "rf",
    tuneGrid = data.frame(.mtry = 3:6),
    nodesize = 3,
    # randomForest's argument is `ntree`; `ntrees` is not one of its
    # arguments and would be silently ignored
    ntree = 500,
    trControl = trControl
  )
}
Then create an empty list to store results and apply the function to data_resp:
models <- list(NULL)
models$rf <- lapply(as.list(data_resp), rf_func)
That works fine since randomForest can handle NAs, but other methods cannot, so I need to remove those rows from each data_resp element as well as the corresponding rows from my predictors.
I tried this without success:
train(x = data_pred_scale[!is.na(y), ],
y = y[!is.na(y)],
... }
I also tried y[[!is.na(y)]]
How do I translate the data.frame method (df[!is.na(df2), ]) to lapply?
There are several different ways to go about it. A simple approach is with an anonymous function:
lapply(data_resp, function(x) rf_func(x[!is.na(x)]))
In fiddling around quite a bit with a single element of my as.list(data_frame) to simulate what lapply would be passing, I came up with this, which I think is working:
# Fit a caret random forest of y on the predictors in data_pred_scale, using
# only the observations where y is not NA. Uses data_pred_scale and trControl
# from the calling environment. Returns the object produced by train().
rf_func <- function(y) {
  train(
    # keep only the predictor rows whose response is observed
    x = data_pred_scale[!(unlist(lapply(y, is.na))), ],
    # and the matching non-NA responses
    y = y[!(unlist(lapply(y, is.na)))],
    method = "rf",
    tuneGrid = data.frame(.mtry = 3:6),
    nodesize = 3,
    # randomForest's argument is `ntree`; `ntrees` is not one of its
    # arguments and would be silently ignored
    ntree = 500,
    trControl = trControl
  )
}
# One model per response column of data_resp.
models$rf <- lapply(as.list(data_resp), rf_func)
It does seem to be working. I [hackishly] compared the non-NA data set to the trainingData results in each caret model like so:
# Number of non-NA observations in each response column.
nas <- unname(vapply(
  data_resp,
  function(resp) length(resp[!is.na(resp)]),
  integer(1)
))
# Number of rows of training data stored in each fitted caret model.
model_nas <- vapply(
  seq_along(nas),
  function(k) nrow(models$rf[[k]]$trainingData),
  integer(1)
)
# TRUE when every model was trained on exactly the non-NA cases.
identical(nas, model_nas)
[1] TRUE
So, is y[!unlist(lapply(y, is.na))] the best/most elegant way to do this sort of thing? It's pretty ugly...
Edit: Based on @Ricardo Saporta's answer, I was able to come up with this (probably obvious to the veterans, but bear with me):
# Fit a caret random forest of response y on predictors x, tuning mtry over
# 3:6. Uses trControl from the calling environment. Returns the object
# produced by train().
rf_func <- function(x, y) {
  train(
    x = x,
    y = y,
    method = "rf",
    tuneGrid = data.frame(.mtry = 3:6),
    nodesize = 3,
    # randomForest's argument is `ntree`; `ntrees` is not one of its
    # arguments and would be silently ignored
    ntree = 500,
    trControl = trControl
  )
}

# One model per response column, trained only on the rows where that response
# is observed (and the matching predictor rows).
models$rf <- lapply(data_resp, function(y) {
  observed <- !is.na(y)
  rf_func(data_pred_scale[observed, ], y[observed])
})
Is there still a better way, or is that fairly decent? (Certainly prettier than my first mess up above.)

Resources