I have the following code that is running fine:
# first code: works fine
# Step 1 : Create Data for Example:
library(dplyr)
library(ranger)
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
The actual code starts here:
Step 2:
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
Step 3:
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
Step 4:
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
I am planning on running this code on a dataset of about 200 million rows. I would like to speed this code up (Step 2, Step 3, Step 4) - I tried looking at different ways to do this, and came across "parallelization". Apparently, this can be done using libraries such as "future"/"foreach". Here was my attempt to parallelize the above code:
# second code: takes a long time to run
library(doParallel)
library(foreach)
registerDoParallel(cores = detectCores())
foreach(i = 1:100, .packages = 'ranger') %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
}
stopImplicitCluster()
For some reason, it seems that contrary to what I would have thought - parallelization is making this code take a lot longer to run.
My Question: Does anyone know if there are any other ways to speed up this code? I have a feeling I have not correctly understood the concepts behind parallelization - can someone please show me how to do this?
Parallel processing comes with the overhead of launching parallel tasks and putting together the results : it isn't always faster.
Before thinking about parallelizing, you could first identify the most time consuming parts of your code.
profvis package is a way of profiling code:
library(profvis)
profvis({
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
})
According to profvis, the most time consuming step is saveRDS:
However, this only accounts for 1.3 seconds, whereas using system.time() instead of profvis shows that the code needs about 6 seconds to complete.
Reading profvis FAQ explains that :
Calls to external programs and libraries also may not show up in the profiling data. If you call functions from a package to fetch data from external sources, keep in mind that time spent in those functions may not show in the profiler.
Timing each step alone shows that step 4 takes around 3 seconds and isn't accounted for by profvis.
This leads to the function which is called there : predict.ranger
?ranger::predict.ranger shows that this function is multithreaded :
num.threads : Number of threads. Default is number of CPUs available.
Meaning that the CPU is already using all it's processors most of the time, so that extra parallel processing won't help much, or might even be slower!
This can be seen on the task manager (x-axis = time, y-axis = CPU use from 0 to 100%):
with for loop :
with foreach loop :
You'll find hereafter the parallelized (or not) code used to compare performance. I put all loops together in one single loop.
Note that under Windows, you should use makeCluster instead of registerDoParallel to setup number of cores used.
library(doParallel)
library(foreach)
cl <- makeCluster( detectCores()-1)
registerDoParallel(cl)
# Step 2: Create "Balanced" Random Subsets:
results <- list()
results_1 <- list()
results_2 <- list()
wd = getwd()
# Measure performance
system.time({
foreach (i = 1:1000,.packages='ranger') %dopar% # Parallel version
# for (i in 1:1000) # non parallel version
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
# not necessary in loop
# results_df <- do.call(rbind.data.frame, results)
# X <- split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
model_i <- ranger(class ~ height + weight + salary, data = results_tmp, probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
predict_i <- data.frame(predict(model_i, data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
list(i,model_i,predict_i)
}
})
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
stopCluster(cl)
A few notes:
When running on 2E8 rows, you may want to make sure not to keep everything in memory and use fast operations. The data.table package may be useful here due to its performance and in-memory replacements. Maybe you do not need to export all training sets into the Global environment in step 2; I do not see where you use that, and it will take up a lot of memory (memory usage may become a primary concern here).
Looking purely at performance, saving all the models as RDS objects is quite time consuming. Unless required later, skipping this step might speed things up quite a bit. If you have memory issues and need to spill to disk, you may consider saving the predicted output, perhaps with data.table::fwrite and subsequently read it in with data.table::fread.
For some reason, despite the ranger and predict functions using multiple threads, running these steps in parallel may still give some speed improvements, depending on the way you can parallelize. In a linux environment, mclapply forks the process and does not copy data to all the nodes, so YMMV using other parallelization options. A few good suggestions for alternative ways to schedule in parallel are already in other comments/replies.
Unless I overlooked it, it seems to me that you could sample your training set once and then split into multiple parallel sets, as I did not see where you would use multiple iterations that feed sequentially into each other.
Below is one example that probably could be optimized further, depending on the memory profile
library(data.table)
library(parallel)
ncores <- floor(detectCores()/2)-1 # set no. of cores for parallel processing
trs <- setDT(train_set, keep.rownames = TRUE) # turn into data.table
n <- 1e2 # number of sampling iterations
# sample once, then label as iterations
results <- trs[c(sample(which(trs$class==0), 50*n, replace = TRUE),
sample(which(trs$class==1), 60*n, replace = TRUE))]
results[, iteration:=NA_character_]
results[class==0, iteration := as.character(cut(1:(50*n), n, labels = 1:n))]
results[class==1, iteration := as.character(cut(1:(60*n), n, labels = 1:n))]
results[, iteration := factor(iteration, order(unique(as.numeric(iteration))))]
# Step 3: Train Models on Each Subset:
calc_model <- \(x) ranger(class ~ height + weight + salary, data = x, probability = TRUE)
predict_model <- \(x) data.table(predict(calc_model(x), data = test_set)$predictions)[, id:=.I]
# save time and memory not saving model as RDS file; potentially, the list of models could
# be saved in one write operation, which could also be faster
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
# for some reason, despite predict using multiple threads, I still profit
# from parallelization here; skipping generation of X to save memory
results_2 <- mclapply(results[, unique(iteration)],
\(x){predict_model(results[iteration == x])}, mc.cores=ncores)
final_predictions <- rbindlist(results_2)[, lapply(.SD, mean), .SDcols=c("0", "1"), by="id"]
Created on 2022-06-27 by the reprex package (v2.0.1)
Related
I'm facing a real problem in the analysis of my data. I hope someone of you can help me.
I have 18 variables and a binary outcome. I want to try all the possible combinations of variables (i.e. 2^18 -1 = 262'143) and for each one of them to build three models (Logistic, SVM, RandomForest) and to compute the AUC.
A single iteration lasts about 10s and I have a fairly good computer (12 cores + HT, 160GB RAM), so I implemented the following parfor loop (fitData is a dataframe with all the covariates and the outcome)
cl <- makeCluster(15, outfile="")
registerDoParallel(cl)
clinicalSweep3 <- foreach (kei= 1:(2^(length(varNames)) -1), .combine='rbind', .packages=c('e1071','pROC','randomForest')) %dopar% {
varNamesKei <- varNames[as.logical(intToBits(kei)[1:(length(varNames))])]
yString <- (paste(as.integer(intToBits(kei)[1:(length(varNames))]),collapse=""))
yLabel <- as.character(paste(varNamesKei,collapse="+"))
covPacket <- data.frame(t(strsplit(yString,"")[[1]]))
colnames(covPacket) <- varNames
AUC_Logit <- NA
AUC_SVM <- NA
AUC_RF <- NA
AUC_Logit2CV <- NA
AUC_SVM2CV <- NA
AUC_RF2CV <- NA
predLogit <- 1:dim(fitData)[1]
predSVM <- 1:dim(fitData)[1]
predRF <- 1:dim(fitData)[1]
for (k in 1:dim(fitData)[1]){
predLogit[k] <- predict(glm(as.formula(paste("outcome ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-k,], family=binomial(link='logit')),type="response", newdata = fitData[k,,drop=F])
predSVM[k] <- predict(svm(as.formula(paste("outcome ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-k,], kernel = "linear", cost = 10, type = "C-classification"), newdata = as.data.frame(fitData[k,,drop=F]))
predRF[k] <- predict(randomForest(as.formula(paste("outcome ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-k,], ntree=100),type="prob", newdata = fitData[k,,drop=F])[2]
}
AUC_Logit <- auc(fitData$outcome,predLogit,direction="<", levels = c(F,T))
AUC_SVM <- auc(fitData$outcome,predSVM,direction="<", levels = c(F,T))
AUC_RF <- auc(fitData$outcome,predRF,direction="<", levels = c(F,T))
predLogit <- 1:dim(fitData)[1]
predSVM <- 1:dim(fitData)[1]
predRF <- 1:dim(fitData)[1]
for (k in 1:(dim(fitData)[1]/2)){
predLogit[c(2*k-1,2*k)] <- predict(glm(as.formula(paste("outcome ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-c(2*k-1,2*k),], family=binomial(link='logit')),type="response", newdata = fitData[c(2*k-1,2*k),,drop=F])
predSVM[c(2*k-1,2*k)] <- predict(svm(as.formula(paste("as.numeric(outcome)-1 ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-c(2*k-1,2*k),], kernel = "linear", cost = 10, type = "eps-regression"), newdata = as.data.frame(fitData[c(2*k-1,2*k),,drop=F]))
predRF[c(2*k-1,2*k)] <- predict(randomForest(as.formula(paste("outcome ~ ", paste(varNamesKei, collapse= "+"))),data = fitData[-c(2*k-1,2*k),], ntree=100),type="prob", newdata = fitData[c(2*k-1,2*k),,drop=F])[,2]
}
AUC_Logit2CV <- auc(fitData$outcome,predLogit,direction="<", levels = c(F,T))
AUC_SVM2CV <- auc(fitData$outcome,predSVM,direction="<", levels = c(F,T))
AUC_RF2CV <- auc(fitData$outcome,predRF,direction="<", levels = c(F,T))
}
cat(as.character(Sys.time()),"combo",yString,"done! (",kei,")\n")
data.frame( yString, covPacket,
AUC_Logit, AUC_SVM,AUC_RF ,
AUC_Logit2CV, AUC_SVM2CV, AUC_RF2CV ,
yLabel)
}
Now, the code run for about 30h without problems, even though I noticed the RAM was steadily increasing. Yesterday I decided to create a 100GB swap area and, indeed, when the last worker finished the situation was the following.
The CPU was no more used but the R session was still blocked and working.
I hoped it finishes overnight, but this morning the situation is not better.
The swappiness is set to 50.
Now I got that foreach should not be used with so many iterations.
Now the questions:
1) Should I stop and rewrite the entire code nesting a normal loop inside the foreach? (about 3000 iterations in foreach * 100 in normal for). Or should I wait (maybe increasing the swap)
clinicalSweep3 <- foreach (kei1= 0:floor((2^(length(varNames)) -1)/100), .combine='rbind', .packages=c('e1071','pROC','randomForest')) %dopar% {
for (kei2 in 1:min((2^(length(varNames)) -1) - kei1*100,100 ) ) {
kei <- kei1*100+kei2
# EDIT: I found that probably foreach( kei=idiv(2^(length(varNames)) -1, chunks=100),...) should do the work
2) Why this massive RAM usage and how to avoid it? (it's all in the main rsession, not in the workers). Should I run gc() at the beginning/end of each iteration? ( default .export = NULL should not import everything in the main thread form workers )
3) Why is my kernel (from ubuntu 20.04 Desktop) not swapping enough? Maybe because it's a single thread with 300GB?
4) If you have suggestions on how to improve the overall performance of the code I'll be glad to hear them!
Thank you!
I have a very large dataframe with > 5 millions obs. and 7 var.
I have to do thousands of bootstrap resampling and create a model with each sample, make predictions and save the results in a dataframe where each column is the prediction of one of the models
here is my code:
library(caret)
library(randomForest)
library(foreach)
library(doParallel)
#very small database example
db <-
data.frame(Y = as.factor(sample(c(0, 1), 1000, replace = T)),
X1 = sample(c(100:2000), 1000, replace = T),
X2=sample(c(100:2000), 1000, replace = T),
X3 = sample(c(100:2000), 1000, replace = T),
X4 = sample(c(100:2000), 1000, replace = T))
#bootstrap 100 times
times <- 100
set.seed(5)
#create 1000 resample indexes
res <- createResample(db$Y,times = times)
db_list <- list()
#list of database resampled
for (i in 1:times) {
db_list[[i]] <- db[res[[i]],]
}
#create 100 models from the db_list and make prediction
model_list <- list()
pred <- data.frame(Y=rep(NA,nrow(db)))
#on my machin this loop take 28.10 sec elapsed
system.time(for (l in seq_len(times)) {
set.seed(5)
model_list[[l]] <- randomForest(Y~.,data=db_list[[l]])
pred[,l] <- predict(model_list[[l]],newdata = db[,-1])#predict from the original Xn variables
db_pred <- db
db_pred <- cbind(db_pred,pred)
})
#make cluster with 11 core in my machine
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
#on my machin this loop take 23.25 sec elapsed
system.time(foreach(l = seq_len(times),.packages = "randomForest")%dopar% {
set.seed(5)
model_list[[l]] <- randomForest(Y~.,data=db_list[[l]])
pred[,l] <- predict(model_list[[l]],newdata = db[,-1])#predict from the original Xn variables
db_pred <- db
db_pred <- cbind(db_pred,pred)
})
stopCluster(cl)
the code work well with a small dataframe but take forever with my data, despite the foreach loop.
I've tried with caret::train() but I have not found the way to store the predictions of all the models in a dataframe.
The question are:
how can I speed up the process?
Are there classifiers faster than randomforest that could be used with this data?
Thanks in advance for any help
I have the following function:
CFC_GLM <- function(data, frequency_bins){
adj_mat <- matrix(0, nrow = dim(data)[1], ncol = dim(data)[1])
bf_filters <- list()
combs <- combinations(length(frequency_bins), 2, repeats.allowed = T)
all_adj_mat <- list()
for(z in 1:length(frequency_bins)){
bf_filters[[z]] <- butter(3, c(frequency_bins[[z]][1]/1200,
frequency_bins[[z]][2]/1200), type = "pass")
}
for(f in 1:nrow(combs)){
for(i in 1:dim(data)[1]){
for(j in 1:dim(data)[1]){
sensor_1 <- data[i,]
sensor_2 <- data[j,]
sensor_1_filt = filtfilt(bf_filters[[combs[f,1]]], sensor_1)
sensor_2_filt = filtfilt(bf_filters[[combs[f,2]]], sensor_2)
a_y <- abs(hilbert(sensor_2_filt, 1200))
a_x <- abs(hilbert(sensor_1_filt, 1200))
theta_x <- angle(hilbert(sensor_1_filt, 1200)) %% 2*pi
a_x_norm <- (a_x - mean(a_x))/std(a_x)
a_y_norm <- (a_y - mean(a_y))/std(a_y)
theta_x_norm <- (theta_x - mean(theta_x))/std(theta_x)
fit <- lm(a_y_norm ~ sin(theta_x_norm) + cos(theta_x_norm) +
a_x_norm)
summ <- summary(fit)
r <- sqrt(summ$r.squared)
adj_mat[i,j] <- r
}
}
all_adj_mat[[f]] <- adj_mat
}
return(all_adj_mat)
}
Just to summarize, the function takes a matrix of signals (246 sensors by 2400 samples), performs some signal processing, and then performs a GLM between every possible pairs of sensors. This process is repeated for 4 frequency bandwidths and their combinations (within and cross-frequency coupling). Right now, this code seems terribly inefficient and takes a really long time to run. Is there a way to vectorize/parallelize this function? I have researched this question extensively and cannot seem to find an answer.
I am not sure whether to make some of the tasks within the function parallel or just make the whole function able to be called by parApply (vectorized). My intuition is the latter but I am not sure how to approach this. Any help is greatly appreciated.
Reproducible Example
test_data <- c(-347627.104358097, 821947.421444641, 496824.676355433,
-178091.364312102, -358842.250713998, 234666.210462063,
-1274153.04141668,
1017066.42839987, -158388.137875357, 191691.279588641,
-16231.2106151229,
378249.600546794, 1080850.88212858, -688841.640871254,
-616713.991288002,
639401.465180969, -1625802.44142751, 472370.867686569,
-631863.239075449,
-598755.248911174, 276422.966753179, -44010.9403226763,
1569374.08537143,
-1138797.2585617, -824232.849278583, 955783.332556046,
-1943384.98409094,
-54443.829280377, -1040354.44654998, -1207674.05255178,
496481.331429747,
-417435.356472725, 1886817.1254085, -1477199.59091112,
-947353.716505171,
1116336.49812969, -2173805.84111182, -574875.152250742,
-1343996.2219146,
-1492260.06197604, 626856.67540728, -713761.48191904, 1987730.27341334,
-1673384.77863935, -968522.886481198, 1089458.71433614,
-2274932.19262517,
-1096749.79392427, -1520842.86946059, -1390794.61065106,
669864.477272507,
-906096.822125892, 1863506.59188299, -1720956.06310511,
-889359.420058576,
885300.628410276, -2224340.54992297, -1619386.88041896,
-1570131.07127786,
-934848.556063722, 644671.113108699, -973418.329437102,
1541962.53750178,
-1636863.31666018, -728992.972371437, 551297.997356909,
-2026413.5471505,
-2129730.49230266, -1511423.25789691, -236962.889589694,
580683.399845852,
-906261.700784793, 1080101.95011954, -1455931.89179814,
-518630.187846405,
158846.288141661, -1715610.22092989, -2601349.5081924,
-1380068.64260811,
541310.557194977, 509125.333244057, -711696.682554995,
551748.792106809,
-1222430.29467688, -293847.487823853, -215078.751157158,
-1354005.89576504,
-2997647.23289805, -1220136.14918605, 1231169.98678596,
455388.081391798,
-415489.975542684, 32724.7895795912, -980848.930757441,
-86618.5594163355,
-506333.915891838, -1022235.58829567, -3279232.01820961,
-1076344.95091665,
1696655.88400158), .Dim = c(10L, 10L))
frequency_bins <- list(band1 = c(2,4), band2 = c(4,12), band3 =
c(12,30), band4 = c(30,100))
system.time(test_result <- CFC_GLM(test_data, frequency_bins))
user system elapsed
1.839 0.009 1.849
I'm not sure how to include the result in a manageable way. Sorry for the naivety. This is only with 10 sensors by 10 samples, to have a manageable test set.
Right off the bat I would suggest predeclaring the length of your lists.
bf_filters <- rep(list(NA), length(frequency_bins))
all_adj_mat <- rep(list(NA), nrow(combos))
#this is your function to be applied
i_j_fun <- function ( perms ) {
sensor_1_filt = filtfilt(bf_filters[[combos[f,1]]], data[perms[1],])
sensor_2_filt = filtfilt(bf_filters[[combos[f,2]]], data[persm[2],])
a_y <- abs(hilbert(sensor_2_filt, 1200))
a_x <- abs(hilbert(sensor_1_filt, 1200))
theta_x <- angle(hilbert(sensor_1_filt, 1200)) %% 2*pi
a_x_norm <- (a_x - mean(a_x))/std(a_x)
a_y_norm <- (a_y - mean(a_y))/std(a_y)
theta_x_norm <- (theta_x - mean(theta_x))/std(theta_x)
fit <- lm(a_y_norm ~ sin(theta_x_norm) + cos(theta_x_norm) +
a_x_norm)
summ <- summary(fit)
r <- sqrt(summ$r.squared)
return(r)
}
Your i and j for loops can be turned into a function and used with apply.
#perms acts like the for loop
perms <- permuations(dim(data)[1], 2, seq_len(dim(data)[1]))
for(f in 1:nrow(combs)){
all_adj_mat[[f]] <- matrix(apply(perms, 1, i_j_fun),
nrow = dim(data)[1], ncol = dim(data[2]), byrow = TRUE)
}
That should do it.
A critical difference (CD) plot for comparing classifiers over multiple data sets (Demšar2006) can be generated with the mlr package like this:
# THIS WORKS
library(mlr)
lrns = list(makeLearner("classif.knn"), makeLearner("classif.svm"))
tasks = list(iris.task, sonar.task)
rdesc = makeResampleDesc("CV", iters = 2L)
meas = list(acc)
bmr = benchmark(lrns, tasks, rdesc, measures = meas)
cd = generateCritDifferencesData(bmr)
plotCritDifferences(cd)
This requires the evaluation results to reside in a rather complex BenchmarkResult object, although the data is basically a matrix (where M[i, j] holds the score of classifier i for data set j).
I have previously generated such data in a Python workflow and imported in R into a data.frame (as there seems to be no Python package for such plots).
How can I generate a CD plot from this data?
I thought about creating a BenchmarkResult from the data.frame, but didn't know where to start:
# THIS DOES NOT WORK
library(mlr)
# Here I would import results from my experiments instead of using random data
# e.g. scores for 5 classifiers and 30 data sets, each
results = data.frame(replicate(5, runif(30, 0, 1)))
# This is the functionality I'm looking for
bmr = benchmarkResultFromDataFrame(results)
cd = generateCritDifferencesData(bmr)
plotCritDifferences(cd)
I finally managed to create the plot. It is necessary to set only a handful of the BenchmarkResult's attributes:
leaners with id and short.name for each classifier
measures
results with aggr for each dataset/classifier combination
The code may then look like this (smaller example of 5 datasets):
library(mlr)
# Here I would import results from my experiments instead of using random data
# e.g. scores for 5 classifiers and 30 data sets, each
results <- data.frame(replicate(5, runif(30, 0, 1)))
clf <- c('clf1', 'clf2', 'clf3', 'clf4', 'clf5')
clf.short.name <- c('c1', 'c2', 'c3', 'c4', 'c5')
dataset <- c('dataset1', 'dataset2', 'dataset3', 'dataset4', 'dataset5')
score <- list(acc)
# Setting up the learners: id, short.name
bmr <- list()
for (i in 1:5){
bmr$learners[[clf[i]]]$id <- clf[i]
bmr$learners[[clf[i]]]$short.name <- clf.short.name[i]
}
# Setting up the measures
bmr$measures <- list(acc)
# Setting up the results
for (i in 1:5){
bmr$results$`dataset1`[[clf[i]]]$aggr <- list('acc.test.mean' = results[1, i])
}
for (i in 1:5){
bmr$results$`dataset2`[[clf[i]]]$aggr <- list('acc.test.mean' = results[2, i])
}
for (i in 1:5){
bmr$results$`dataset3`[[clf[i]]]$aggr <- list('acc.test.mean' = results[3, i])
}
for (i in 1:5){
bmr$results$`dataset4`[[clf[i]]]$aggr <- list('acc.test.mean' = results[4, i])
}
for (i in 1:5){
bmr$results$`dataset5`[[clf[i]]]$aggr <- list('acc.test.mean' = results[5, i])
}
# Set BenchmarkResult class
class(bmr) <- "BenchmarkResult"
# Statistics and plot
cd = generateCritDifferencesData(bmr)
plotCritDifferences(cd)
Anyone who could teach me better R to avoid these for loops and code duplication would still be very welcome!
I am currently trying parallel computing in R.
I am trying to train a logistic ridge model , and I currently have 4 Cores on my computer. I would like to split my data set equally into 4 pieces, and use each core to train model (on the training data) and save the result of each core into a single vector . the problem is that i have no clue how to do it, right now I tried to parallel with the foreach package, but the problem is the each core sees the same training data. here is the code with the foreach package (which doesn't split the data) :
library(ridge)
library(parallel)
library(foreach)
num_of_cores <- detectCores()
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
data_per_core <- floor(nrow(mydata)/num_of_cores)
result <- data.frame()
r <- foreach(icount(4), .combine = cbind) %dopar% {
result <- logisticRidge(admit~ gre + gpa + rank,data = mydata)
coefficients(result)
}
any idea how to simultaneously split the data into x chunks and train the models in parallel ?
How about something like this? It uses snowfall instead of the foreach-library, but should give the same results.
library(snowfall)
library(ridge)
# for reproducability
set.seed(123)
num_of_cores <- parallel::detectCores()
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
data_per_core <- floor(nrow(mydata)/num_of_cores)
# we take random rows to each cluster, by sampleid
mydata$sampleid <- sample(1:num_of_cores, nrow(mydata), replace = T)
# create a small function that calculates the coefficients
regfun <- function(dat) {
library(ridge) # this has to be in the function, otherwise snowfall doesnt know the logistic ridge function
result <- logisticRidge(admit~ gre + gpa + rank, data = dat)
coefs <- as.numeric(coefficients(result))
return(coefs)
}
# prepare the data
datlist <- lapply(1:num_of_cores, function(i){
dat <- mydata[mydata$sampleid == i, ]
})
# initiate the clusters
sfInit(parallel = T, cpus = num_of_cores)
# export the function and the data to the cluster
sfExport("regfun")
# calculate, (sfClusterApply is very similar to sapply)
res <- sfClusterApply(datlist, function(datlist.element) {
regfun(dat = datlist.element)
})
#stop the cluster
sfStop()
# convert the list to a data.frame. data.table::rbindlist(list(res)) does the same job
res <- data.frame(t(matrix(unlist(res), ncol = num_of_cores)))
names(res) <- c("intercept", "gre", "gpa", "rank")
res
# res
# intercept gre
# 1 -3.002592 1.558363e-03
# 2 -4.142939 1.060692e-03
# 3 -2.967130 2.315487e-03
# 4 -1.176943 4.786894e-05
# gpa rank
# 1 0.7048146997 -0.382462408
# 2 0.9978841880 -0.314589628
# 3 0.6797382218 -0.464219036
# 4 -0.0004576679 -0.007618317
The itertools package provides a number of functions for iterating over various data structures with foreach loops. In this case, you could use the isplitRows function to split the data frame row-wise into one chunk per worker:
library(ridge)
library(doParallel)
library(itertools)
num_of_cores <- detectCores()
cl <- makePSOCKcluster(num_of_cores)
registerDoParallel(cl)
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
r <- foreach(d=isplitRows(mydata, chunks=num_of_cores),
.combine = cbind, .packages="ridge") %dopar% {
result <- logisticRidge(admit~ gre + gpa + rank, data = d)
coefficients(result)
}
isplitRows also takes a chunkSize argument if you want to control the maximum size of each chunk.
Note that using this technique, each worker only receives an appropriate fraction of mydata. This is particularly important for larger data frames with a PSOCK cluster.