Explaining Random Forest Variable Selection Sample Code

I have sample code for random forest variable selection. We want to choose the combination of variables with the most importance and build the random forest model with the lowest OOB error. Can anyone explain the for-loop part of the code for me?
clinical_variables <- c("Age","location", "smoke", "perianal_disease","upper_tract", "LnASCA
IgA","LnASCA IgG", "LnANCA", "LnCbir", "LnOMPC", "CRP", "Albumin", "African American Race")
variable_selected_progress_biomarkers <- vector("list", 50)
error_rate_min_progress_biomarkers <- rep(NA, 50)
for (j in 1:50){
risk_progress_biomarker_variables <- risk_full %>%
select(names(risk), clinical_variables) %>%
select(-c("STRICTURE", "TIM2STRICTURE", "PENETRATING", "TIM2PENETRATING","BDNF","LASTFOLLOWUPDAYSPROGRESS", "PROGRESSED")) %>% names
risk_progress_biomarker_variables_total <- vector("list",104)
names(risk_progress_biomarker_variables_total) <- 104:1
error_rate_tail_progress_biomarker <- rep(NA, 104)
for (i in 1:104){
set.seed(4182019)
risk_progress_biomarker_variables_total[[i]] <- risk_progress_biomarker_variables
rf_risk_progress_biomarker <- rfsrc(
Surv(LASTFOLLOWUPDAYSPROGRESS, PROGRESSED) ~ .,
data = risk_full %>% select(risk_progress_biomarker_variables, LASTFOLLOWUPDAYSPROGRESS, PROGRESSED)%>%
mutate_if(is.factor, as.numeric),
ntree=1000,
importance = TRUE
)
error_rate_tail_progress_biomarker[i] <- tail(rf_risk_progress_biomarker$err.rate,n =1)
rf_risk_progress_biomarker_importance <- rf_risk_progress_biomarker$importance %>%
as.data.frame() %>%
rownames_to_column() %>%
as.tibble() %>%
dplyr::rename(VIMP = ".") %>%
arrange(desc(VIMP))
risk_progress_biomarker_variables <- rf_risk_progress_biomarker_importance %>%
head((dim(rf_risk_progress_biomarker_importance)[1]-1)) %>%
# top_n((dim(rf_risk_progress_biomarker_importance)[1]-1)) %>%
pull(rowname)
print(i)
}
tibble_error_rate_tail_progress_biomarker <- tibble(n = 104:1, error_rate = error_rate_tail_progress_biomarker)
suppressMessages(n_min_progress_biomarker <- tibble_error_rate_tail_progress_biomarker %>% top_n(-1) %>% pull(n))
suppressMessages(error_rate_min_progress_biomarker <- tibble_error_rate_tail_progress_biomarker %>% top_n(-1) %>% pull(error_rate))
variable_selected_progress_biomarkers[[j]] <- str_replace_all(risk_progress_biomarker_variables_total[[105-n_min_progress_biomarker]], "_", "")
error_rate_min_progress_biomarkers[j] <- error_rate_min_progress_biomarker
print(paste("Finish", j))
}
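In short, the inner loop is backward elimination: fit a survival forest on the current variable set, record its OOB error, drop the least important variable, and repeat until one variable is left; the outer loop repeats the whole procedure and keeps the variable set whose model had the lowest OOB error. A minimal self-contained sketch of the same idea, using the veteran dataset that ships with randomForestSRC (the variable names here are illustrative, not from the original data):
library(randomForestSRC)
data(veteran)  # survival data bundled with the package

vars <- setdiff(names(veteran), c("time", "status"))
oob  <- numeric(length(vars))
sets <- vector("list", length(vars))

for (i in seq_along(vars)) {
  sets[[i]] <- vars
  fit <- rfsrc(Surv(time, status) ~ .,
               data = veteran[, c(vars, "time", "status")],
               ntree = 500, importance = TRUE)
  oob[i] <- tail(fit$err.rate, 1)        # OOB error of the full forest
  vimp <- sort(fit$importance, decreasing = TRUE)
  vars <- names(vimp)[-length(vimp)]     # drop the least important variable
}

best <- which.min(oob)
sets[[best]]  # variable set whose model had the lowest OOB error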

Related

Predict in workflow throws that column doesn't exist

Given the following code:
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)

df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
  mutate(Date = dmy(Date),
         Change_Percent = str_replace(Change_Percent, "%", ""),
         Change_Percent = as.double(Change_Percent)) %>%
  filter(year(Date) > 2017)

int <- interval(ymd("2020-01-20"),
                ymd("2022-01-15"))

df <- df %>%
  mutate(covid = ifelse(Date %within% int, T, F))

df %>%
  ggplot(aes(x = Date, y = Price, color = covid)) +
  geom_line()

df <- df %>%
  arrange(Date) %>%
  mutate(lag1 = lag(Price),
         lag2 = lag(lag1),
         lag3 = lag(lag2),
         profit_next_day = lead(Profit))

# modeling
df_mod <- df %>%
  select(-covid, -Date, -Vol_K, -Profit) %>%
  mutate(profit_next_day = as.factor(profit_next_day))

set.seed(42)
data_split <- initial_split(df_mod)  # 3/4
train_data <- training(data_split)
test_data <- testing(data_split)

bitcoin_rec <-
  recipe(profit_next_day ~ ., data = train_data) %>%
  step_naomit(all_outcomes(), all_predictors()) %>%
  step_normalize(all_numeric_predictors())

bitcoin_prep <-
  prep(bitcoin_rec)

bitcoin_train <- juice(bitcoin_prep)
bitcoin_test <- bake(bitcoin_prep, test_data)

rf_spec <-
  rand_forest(trees = 200) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

bitcoin_wflow <-
  workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(bitcoin_prep)

bitcoin_fit <-
  bitcoin_wflow %>%
  fit(data = train_data)

final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)

final_model %>%
  extract_workflow() %>%
  predict(test_data)
The last chunk of code, which extracts the workflow and predicts on test_data, is throwing the error:
Error in stop_subscript(): ! Can't subset columns that don't exist.
x Column profit_next_day doesn't exist.
But profit_next_day does exist in test_data, as I have checked multiple times, so I don't know what is happening. I've never had this error before when working with tidymodels.
The problem here comes from using step_naomit() on the outcome. In general, steps that change rows (such as removing them) can be pretty tricky when it comes time to resample or predict on new data. You can read more in detail in our book, but I would suggest that you remove step_naomit() altogether from your recipe and change your earlier code to:
df_mod <- df %>%
  select(-covid, -Date, -Vol_K, -Profit) %>%
  mutate(profit_next_day = as.factor(profit_next_day)) %>%
  na.omit()
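With the NAs removed up front, the recipe no longer needs a row-removing step; a minimal sketch of the revised recipe, reusing the names from the question:
bitcoin_rec <-
  recipe(profit_next_day ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors())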

Optimizing large db merge using split() function

I need to perform a conceptually straightforward double left-merge followed by a simple series of matching functions (see: Straightforward Solution). However, given that the DBs I have to merge are large, I tried to unpack the merging procedure with a for-loop that does the trick but is inefficient, to say the least (see: For-loops Solution).
Is there a solution that splits and names at least the largest db?
Below is a toy example.
For reference, in my data:
db_m1: ~50k lines (for ~5k unique m1)
db_m2: ~25k lines (for ~5k unique m1 and m2)
db_p: ~100m lines
set.seed(0)
db_m1 <- data.frame(
  y = rep(1, 20),
  id = sort(rep(paste0("id_", c(letters[1:4])), 5)),
  m1 = rep(c(1, 2), 10),
  x1 = sample(LETTERS, 20, TRUE),
  x2 = sample(LETTERS, 20, TRUE))

set.seed(0)
db_m2 <- data.frame(y = rep(1, 20),
                    m1 = sample(c(1:5), 20, TRUE),
                    m2 = sample(c(6:10), 20, TRUE))

set.seed(0)
db_p <- data.frame(m2 = sample(c(6:10), 100, TRUE),
                   y1 = sample(LETTERS, 100, TRUE),
                   y2 = sample(LETTERS, 10, TRUE))
Straightforward Solution:
final_dplyr <- db_m1 %>%
  dplyr::left_join(db_m2) %>%
  dplyr::left_join(db_p) %>%
  dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                sum_matches = mapply(sum, match_1, match_2),
                final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
  group_by(id, m2) %>%
  dplyr::mutate(n_p = n(),
                n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
  group_by(y, id, m1, m2, n_p) %>%
  dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                   match_2 = sum(match_2, na.rm = T),
                   final_1 = sum(final_1),
                   final_2 = sum(final_2))
For-loops Solution:
fn_final <- function(db_m1, db_m2, db_p) {
  matches_final <- vector("list", length = length(unique(db_m1$y)))
  for (i in 1:length(unique(db_m1$y))) {
    matches <- vector("list", length = length(unique(db_m1$m1)))
    for (j in 1:length(unique(db_m1$m1))) {
      # Subset each table to the current (y, m1) cell before joining:
      temp_db_m1 <- db_m1 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      temp_db_m2 <- db_m2 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      m_vector <- unique(temp_db_m2$m2)
      temp_db_p <- db_p %>%
        dplyr::filter(m2 %in% m_vector)
      final <- temp_db_m1 %>%
        dplyr::left_join(temp_db_m2) %>%
        dplyr::left_join(temp_db_p) %>%
        dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                      match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                      sum_matches = mapply(sum, match_1, match_2),
                      final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                      final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
        group_by(id, m2) %>%
        dplyr::mutate(n_p = n(),
                      n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
        group_by(y, id, m1, m2, n_p) %>%
        dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                         match_2 = sum(match_2, na.rm = T),
                         final_1 = sum(final_1),
                         final_2 = sum(final_2))
      matches[[j]] <- final
    }
    matches_all <- do.call(rbind, matches)
    matches_final[[i]] <- matches_all
  }
  final <- do.call(rbind, matches_final) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(final)
}
final_for <- fn_final(db_m1, db_m2, db_p)
This is a possible solution; can it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y, db_m1$m1))
db_m2_s <- split(db_m2, f = list(db_m2$y, db_m2$m1))
db_p_s <- split(db_p, f = list(db_p$m2))

match_fn <- function(temp_db_m1, temp_db_m2, temp_db_p) {
  final <- temp_db_m1 %>%
    dplyr::left_join(temp_db_m2) %>%
    dplyr::left_join(temp_db_p) %>%
    dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                  match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                  sum_matches = mapply(sum, match_1, match_2),
                  final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                  final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
    group_by(id, m2) %>%
    dplyr::mutate(n_p = n(),
                  n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
    group_by(y, id, m1, m2, n_p) %>%
    dplyr::summarise(match_1 = sum(match_1, na.rm = T),
                     match_2 = sum(match_2, na.rm = T),
                     final_1 = sum(final_1),
                     final_2 = sum(final_2))
  return(final)
}

fn_final <- function(db_m1, db_m1_s, db_m2_s, db_p_s) {
  m <- names(db_m1_s)
  matches_1 <- vector("list", length = length(m))
  for (i in 1:length(m)) {
    temp_db_m1 <- db_m1_s[[m[i]]]
    temp_db_m2 <- db_m2_s[[m[i]]]
    n <- as.character(sort(unique(temp_db_m2$m2)))
    matches_2 <- vector("list", length = length(n))
    for (j in 1:length(n)) {
      temp_db_p <- db_p_s[[n[j]]]
      final <- match_fn(temp_db_m1, temp_db_m2, temp_db_p)
      matches_2[[j]] <- final
    }
    matches_all <- do.call(rbind, matches_2)
    matches_1[[i]] <- matches_all
  }
  matches_0 <- do.call(rbind, matches_1) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(matches_0)
}

final_for <- fn_final(db_m1, db_m1_s, db_m2_s, db_p_s)
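Another option worth benchmarking against the split() approach is a keyed data.table join; this is a swapped-in technique, not part of the original code, and the dt_*/step* names are illustrative. A minimal sketch of the two joins on the same toy data:
library(data.table)

dt_m1 <- as.data.table(db_m1)
dt_m2 <- as.data.table(db_m2)
dt_p  <- as.data.table(db_p)

# X[Y, on = ...] is a right join of X by Y, i.e. a left join of Y onto X:
step1 <- dt_m2[dt_m1, on = c("y", "m1"), allow.cartesian = TRUE]
step2 <- dt_p[step1, on = "m2", allow.cartesian = TRUE]
# the match_1/match_2 scoring and grouping from above can then be applied to step2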

How to handle forecast data (melt and "unmelt") generated by modeltime prediction - lost variables

Below I created some fake forecast data using the tidyverse and modeltime packages. I have monthly data from 2016 onward and want to produce a test forecast for 2020. As you can see, the data I load comes in wide format. For use in modeltime I transform it to long format. After the modeling phase, I want to create a dataframe for the 2020 prediction values. For this purpose I need to somehow "unmelt" the data. In this process I am unfortunately losing a lot of variables: of the 240 variables that I want to forecast, only 49 remain in the end result. Maybe I am blind, or I don't know how to configure the modeltime functions correctly. I would very much appreciate some help. Thanks in advance!
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))

## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01") + months(0:59)
fake_values_base <-
  c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)
fake_values <- rep(fake_values_base, 4)  # 240 values in total

replicate <- rep(1, 60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)
df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down
data <- reshape2::melt(df, id.var = 'dates')

## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)

model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj))

## forecast
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj),
    actual_data = data,
    keep_data = TRUE
  )

## "unmelt" that bastard again
fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
fc_prophet <- fc_prophet[, c(4, 9, 10)]
fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
# fc_prophet <- fc_prophet %>% subset(fc_prophet, as.character(.index) > "2020-01-01" & as.character(.index) < "2020-12-01")

fc_wide_prophet <- fc_prophet %>%
  pivot_wider(names_from = variable, values_from = value)
Here is my full solution. I have also provided background on what I'm doing here: https://github.com/business-science/modeltime/issues/133
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
library(timetk)

## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01") + months(0:59)
fake_values_base <-
  c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)
fake_values <- rep(fake_values_base, 4)  # 240 values in total

replicate <- rep(1, 60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)
df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down
data <- reshape2::melt(df, id.var = 'dates')
data %>% as_tibble() -> data

data %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_time_series(dates, value, .facet_ncol = 3, .smooth = F)

## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)
split_obj %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(dates, value)

split_obj_2 <- time_series_split(data, assess = "1 year", cumulative = TRUE)
split_obj_2 %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(dates, value)

model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj_2))

## forecast
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj_2),
    actual_data = data,
    keep_data = TRUE
  )

fc_prophet %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_modeltime_forecast(.facet_ncol = 3)

## "unmelt" that bastard again
# fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
# fc_prophet <- fc_prophet[, c(4, 9, 10)]
# fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
# fc_wide_prophet <- fc_prophet %>%
#   pivot_wider(names_from = variable, values_from = value)

# Make a future forecast
refit_tbl_prophet <- calibration_tbl_prophet %>%
  modeltime_refit(data = data)

future_fc_prophet <- refit_tbl_prophet %>%
  modeltime_forecast(
    new_data = data %>% group_by(variable) %>% future_frame(.length_out = "1 year"),
    actual_data = data,
    keep_data = TRUE
  )

future_fc_prophet %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_modeltime_forecast(.facet_ncol = 3)

# Reformat as wide
future_wide_tbl <- future_fc_prophet %>%
  filter(.key == "prediction") %>%
  select(.model_id, .model_desc, dates, variable, .value) %>%
  pivot_wider(
    id_cols = c(.model_id, .model_desc, dates),
    names_from = variable,
    values_from = .value
  )

# Reorder the wide result to match the original column order of df
future_wide_tbl[names(df)]

furrr package in R doesn't keep spreading the jobs across all cores?

My task is to compute cosine dissimilarities.
Given a dataframe of user observations, I perform a cosine dissimilarity between each pair of rows.
Long story short, I am using the furrr::future_map2_dfr function to spread the calculations across all the cores I have.
For some reason, when some cores are free while others are still working hard, the remaining work doesn't keep spreading across the idle cores.
For example (CPU-usage screenshots from the start and the middle of the calculation omitted): why don't cores 1, 2, 5, 6, 8, 11, 12, and 15 participate and share the remaining jobs?
The same happens with other calculations.
Am I missing any furrr settings that could change the current behavior?
P.S. At this point there are 5 cores working "hard", and for some reason furrr doesn't spread their work across all 16 cores to make it faster.
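For reference, the cosine dissimilarity computed row-wise in the functions below is just 1 minus the cosine similarity; a minimal sketch for two numeric vectors:
cosine_dissim <- function(a, b) {
  1 - sum(a * b) / sqrt(sum(a^2) * sum(b^2))
}
cosine_dissim(c(1, 0, 1), c(0, 1, 1))  # 0.5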
Functions:
dissimilarity_wrapper <- function(n_users,
                                  train_data,
                                  train_data_std,
                                  test_data,
                                  std_thresh = 0.5) {
  # NOTE:
  # n_users must be set to the maximum number of users in order to make
  # this function work properly.
  # Generating the options:
  user_combinations <- expand.grid(i = seq_len(n_users),
                                   j = seq_len(n_users))
  plan(strategy = multicore)
  expand_grid_options <- furrr::future_map2_dfr(.x = user_combinations$i,
                                                .y = user_combinations$j,
                                                function(x, y) {
                                                  expand.grid(test_idx = which(test_data$user_id == x),
                                                              train_idx = which(train_data$user_id == y))
                                                })
  drop <- c("user_id", "row_num",
            "obs_id", "scroll_id",
            "time_stamp", "seq_label",
            "scroll_length")
  test <- test_data[expand_grid_options$test_idx, !names(test_data) %in% drop]
  train <- train_data[expand_grid_options$train_idx, !names(train_data) %in% drop]
  train_std <- train_data_std[expand_grid_options$train_idx, ]
  # Calculate the different D's:
  D_manhattan_scaled <- (abs(test - train) / train_std) %>% rowSums()
  D_cosinus <- 1 - (rowSums(test * train) / (sqrt(rowSums(test^2) * rowSums(train^2))))
  train_std[train_std < std_thresh] <- 1
  D_manhattan_scaled_adj_std <- (abs(test - train) / train_std) %>% rowSums()
  D_manhattan <- (abs(test - train)) %>% rowSums()
  return(expand_grid_options %>%
           dplyr::mutate(
             D_manhattan_scaled = D_manhattan_scaled,
             D_cosinus = D_cosinus,
             D_manhattan_scaled_adj_std = D_manhattan_scaled_adj_std,
             D_manhattan = D_manhattan,
             isSame = test_data[test_idx, ]$user_id == train_data[train_idx, ]$user_id))
}
train_test_std_split <- function(data,
                                 train_size,
                                 test_size,
                                 feature_selection) {
  train_set <- data %>%
    dplyr::ungroup() %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::group_by(user_id) %>%
    dplyr::filter(row_number() <= train_size) %>%
    dplyr::ungroup()
  if (length(feature_selection) > 1) {
    # Manual:
    # scaling_param_est <- scale_param_est_total_UG
    scaling_param_est <- train_set %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarize_at(vars(feature_selection), funs(mean, sd))
  } else if (length(feature_selection) == 1) {
    scaling_param_est <- train_set %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
      dplyr::rename_at(vars("mean", "sd"),
                       funs(paste(feature_selection, ., sep = "_")))
  }
  train_set <- train_set %>%
    dplyr::group_by(user_id) %>%
    dplyr::mutate_at(vars(feature_selection), scale) %>%
    data.table::as.data.table() %>%
    dplyr::ungroup() %>%
    dplyr::as_tibble() %>%
    dplyr::arrange(time_stamp)
  train_set_std <- train_set %>%
    dplyr::left_join(train_set %>%
                       dplyr::group_by(user_id) %>%
                       dplyr::summarize_at(feature_selection, sd) %>%
                       dplyr::rename_at(vars(-"user_id"),
                                        funs(paste0(feature_selection, "_sd"))),
                     by = "user_id") %>%
    dplyr::ungroup() %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::select(matches("_sd"))
  test_set_unscaled <- data %>%
    dplyr::ungroup() %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::filter(!(obs_id %in% train_set$obs_id)) %>%
    dplyr::group_by(user_id) %>%
    dplyr::filter(row_number() <= test_size) %>%
    dplyr::ungroup()
  # Manual:
  # test_set_joined_with_scaling_params <- cbind(test_set_unscaled, scaling_param_est)
  test_set_unscaled_joined_with_scaling_params <- test_set_unscaled %>%
    dplyr::left_join(scaling_param_est, by = "user_id")
  test_set_unscaled_joined_with_scaling_params[, feature_selection] <-
    (test_set_unscaled_joined_with_scaling_params[, feature_selection] -
       test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
    test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
  test_set <- test_set_unscaled_joined_with_scaling_params %>%
    dplyr::select(user_id, obs_id, scroll_id,
                  time_stamp, row_num, scroll_length,
                  feature_selection)
  # Validate:
  # intersect(unique(test_set$obs_id), unique(train_set$obs_id))
  # compute_std <- train_set %>%
  #   dplyr::group_by(user_id) %>%
  #   dplyr::select(-row_num) %>%
  #   dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
  #                         -time_stamp, -scroll_length),
  #                    funs(paste(., "std", sep = "_"))) %>%
  #   dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
  #   dplyr::ungroup()
  return(list("train_set" = train_set,
              "train_set_std" = train_set_std,
              "test_set" = test_set,
              "test_set_unscaled" = test_set_unscaled))
}
build_dissimilarity_rank <- function(n_users,
                                     train_set,
                                     train_set_std,
                                     test_set,
                                     D_type = "D_cosinus") {
  return(dissimilarity_wrapper(n_users, train_set, train_set_std, test_set) %>%
           dplyr::mutate(train_user_id = train_set[train_idx, ]$user_id,
                         test_user_id = test_set[test_idx, ]$user_id) %>%
           dplyr::select(test_idx,
                         train_user_id,
                         test_user_id,
                         train_idx,
                         D_manhattan_scaled,
                         D_cosinus,
                         D_manhattan_scaled_adj_std,
                         D_manhattan,
                         isSame) %>%
           dplyr::group_by(test_idx, train_user_id) %>%
           dplyr::arrange(train_user_id, !!rlang::sym(D_type)) %>%
           dplyr::mutate(D_manhattan_rank = rank(D_manhattan),
                         D_manhattan_scaled_rank = rank(D_manhattan_scaled, ties.method = "first"),
                         D_cosinus_rank = rank(D_cosinus, ties.method = "first")) %>%
           dplyr::ungroup())
}
build_param_est <- function(dissimilarity_rank,
                            K,
                            D_type_rank = "D_manhattan_scaled") {
  return(dissimilarity_rank %>%
           dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
           dplyr::group_by(train_user_id) %>%
           dplyr::summarise_at(vars(D_manhattan_scaled,
                                    D_cosinus,
                                    D_manhattan_scaled_adj_std,
                                    D_manhattan),
                               funs(mean, median, sd, quantile(., probs = .9))) %>%
           dplyr::rename_at(vars(matches("_quantile")),
                            funs(str_replace(., "_quantile", "_percentile_90"))) %>%
           dplyr::rename_at(vars(matches("_sd")),
                            funs(str_replace(., "_sd", "_std"))))
}
build_dissimilarity_table <- function(dissimilarity_rank,
                                      param_est,
                                      K,
                                      i,
                                      D_type_rank = "D_manhattan_scaled",
                                      D_s = c("D_manhattan_scaled",
                                              "D_cosinus",
                                              "D_manhattan_scaled_adj_std",
                                              "D_manhattan")) {
  dissimilarity_table <- dissimilarity_rank %>%
    dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
    dplyr::left_join(param_est, by = c("train_user_id")) %>%
    dplyr::ungroup()
  dissimilarity_table[paste0(D_s, "_norm_standard")] <-
    (dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_mean")]) /
    dissimilarity_table[paste0(D_s, "_std")]
  dissimilarity_table[paste0(D_s, "_norm_median")] <-
    (dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_median")]) /
    (dissimilarity_table[paste0(D_s, "_percentile_90")] - dissimilarity_table[paste0(D_s, "_median")])
  # dplyr::mutate(experiment = i))
  return(dissimilarity_table)
}
k_fold_data_prepare <- function(df, min_scroll_len = 3) {
  # Given the data, split it by user id:
  return(df %>%
           dplyr::filter(scroll_length >= min_scroll_len) %>%
           dplyr::arrange(time_stamp) %>%
           dplyr::ungroup() %>%
           split(.$user_id))
}
k_fold_engine <- function(df,
                          obs,
                          n_users,
                          K = 2,
                          feature_selection,
                          D_type = "D_cosinus") {
  # Train - Test Split:
  train_set <- df %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::filter(obs_id != obs)
  if (length(feature_selection) > 1) {
    # Manual:
    # scaling_param_est <- scale_param_est_total_UG
    scaling_param_est <- train_set %>%
      dplyr::arrange(time_stamp) %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarize_at(vars(feature_selection),
                          funs(mean, sd))
  } else if (length(feature_selection) == 1) {
    scaling_param_est <- train_set %>%
      dplyr::arrange(time_stamp) %>%
      dplyr::group_by(user_id) %>%
      dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
      dplyr::rename_at(vars("mean", "sd"),
                       funs(paste(feature_selection, ., sep = "_")))
  }
  train_set <- train_set %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::group_by(user_id) %>%
    dplyr::mutate_at(vars(feature_selection), scale) %>%
    as.data.table() %>%
    dplyr::ungroup() %>%
    as_tibble()
  # Compute std for each train variable:
  train_set_std <- train_set %>%
    dplyr::left_join(train_set %>%
                       dplyr::group_by(user_id) %>%
                       dplyr::summarize_at(feature_selection, sd) %>%
                       dplyr::rename_at(vars(-"user_id"),
                                        funs(paste0(feature_selection, "_sd"))),
                     by = "user_id") %>%
    dplyr::select(matches("_sd"))
  test_set <- df %>%
    dplyr::filter(obs_id == obs)
  test_set_joined_with_scaling_params <- test_set %>%
    dplyr::left_join(scaling_param_est, by = "user_id") %>%
    dplyr::arrange(time_stamp)
  # Manual:
  # test_set_joined_with_scaling_params <- cbind(test_set, scaling_param_est)
  test_set_joined_with_scaling_params[, feature_selection] <-
    (test_set_joined_with_scaling_params[, feature_selection] -
       test_set_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
    test_set_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
  test_set <- test_set_joined_with_scaling_params %>%
    dplyr::arrange(time_stamp) %>%
    dplyr::select(user_id, obs_id, scroll_id,
                  time_stamp, row_num, scroll_length,
                  feature_selection)
  # Compute std for each train variable:
  # compute_std <- train_set %>%
  #   dplyr::group_by(user_id) %>%
  #   dplyr::select(-row_num) %>%
  #   dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
  #                         -time_stamp, -scroll_length),
  #                    funs(paste(., "std", sep = "_"))) %>%
  #   dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
  #   dplyr::ungroup()
  #
  # train_set_std <- dplyr::left_join(train_set,
  #                                   compute_std,
  #                                   by = "user_id") %>%
  #   dplyr::ungroup() %>%
  #   dplyr::select(matches("_std$"))
  # Compute the dissimilarities:
  return(build_dissimilarity_rank(n_users,
                                  train_set,
                                  train_set_std,
                                  test_set,
                                  D_type))
}
k_fold_wrapper <- function(data_df,
                           K = 2,
                           D_type_rank = "D_cosinus",
                           feature_selection) {
  data_seqed <- k_fold_data_prepare(data_df)
  # Given the data split by user id, split it by observation id:
  data_seqed_by_obs <- future_imap(data_seqed, ~ split(., .$obs_id))
  # Get the observation ids for each sub-dataframe:
  obs_ids <- future_imap(data_seqed_by_obs, ~ as.integer(names(.)))
  # Feed the k-fold engine with the data split by user id and the observation names:
  plan(strategy = multicore)
  dissimilarity_rank <- furrr::future_map_dfr(data_seqed, function(x) {
    furrr::future_map_dfr(obs_ids[[as.character(x$user_id[1])]],
                          function(df,
                                   obs,
                                   n_users,
                                   K,
                                   feature_selection,
                                   D_type_rank) {
                            k_fold_engine(df,
                                          obs,
                                          n_users,
                                          K,
                                          feature_selection,
                                          D_type_rank)
                          },
                          df = x, n_users = x$user_id[1],
                          K = K, feature_selection = feature_selection,
                          D_type = D_type_rank)
  })
  if (nrow(dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) > 0), ])) {
    dissimilarity_rank <- dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) == 0), ] %>%
      dplyr::mutate(row_num = row_number())
  }
  param_estimations <- dissimilarity_rank %>%
    build_param_est(K, D_type_rank = D_type_rank)
  # Summarize and return final param estimation (average):
  # return(param_estimations %>%
  #          dplyr::group_by(train_user_id) %>%
  #          summarize_at(vars(-"train_user_id"), mean))
  return(list("dissimilarity_rank" = dissimilarity_rank,
              "param_estimations" = param_estimations))
}
The final script that causes the issues:
n_users <- max(unique(data$user_id))
train_df <- data %>%
  dplyr::group_by(user_id) %>%
  dplyr::filter(row_number() <= 50)
filter_users_low_amount_obs <- train_df %>%
  dplyr::group_by(user_id) %>%
  dplyr::summarise(n_obs = length(unique(obs_id))) %>%
  dplyr::arrange(n_obs) %>%
  dplyr::filter(n_obs >= 3) %>%
  select(user_id)
train_df <- train_df %>%
  filter(user_id %in% filter_users_low_amount_obs$user_id)
k_fold_d_rank_param_est <- k_fold_wrapper(train_df, K, D_type_rank = D_type, feature_selection)
dissimilarity_rank_1 <- k_fold_d_rank_param_est$dissimilarity_rank
param_est <- k_fold_d_rank_param_est$param_estimations
train_test_std_split_2 <- train_test_std_split(data,
                                               train_size_2,
                                               test_size = Inf,
                                               feature_selection)
dissimilarity_rank_2 <- build_dissimilarity_rank(n_users,
                                                 train_test_std_split_2$train_set,
                                                 train_test_std_split_2$train_set_std,
                                                 train_test_std_split_2$test_set)
I believe the option you are missing is furrr's scheduling option. By default your data is split into as many chunks as you have workers specified at the beginning of the future_map call, and each worker is then assigned one chunk to work on. Once a worker is done with its chunk, it will look for another chunk and start working on that. If there are no more chunks left, the worker will go idle.
With the scheduling option you can specify how many chunks your data should be split into per worker. For example, .options = furrr_options(scheduling = 2) will create two chunks per worker, and workers that finish early will start working on another chunk.
For more information, here is a vignette on chunking:
https://davisvaughan.github.io/furrr/articles/articles/chunking.html
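For example, reusing the future_map2_dfr call from dissimilarity_wrapper() above, the only change needed is the .options argument (scheduling = 2 means two chunks per worker):
expand_grid_options <- furrr::future_map2_dfr(
  .x = user_combinations$i,
  .y = user_combinations$j,
  function(x, y) {
    expand.grid(test_idx = which(test_data$user_id == x),
                train_idx = which(train_data$user_id == y))
  },
  .options = furrr_options(scheduling = 2)  # two chunks per worker
)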
PS: You have some nested future calls in your code; depending on your specified future::plan(), this will only slow the code down.

Append Shapley reason codes on all observations to the entire data

Here is my code to get the top 5 Shapley reason codes on the mtcars dataset.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages(""iml)
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor = Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapley = Shapley$new(predictor, x.interest = mtcars1[1,])
shapleyresults <- as_tibble(shapley$results) %>% arrange(desc(phi)) %>% slice(1:5) %>% select(feature.value, phi)
How can I get the reason codes for all the observations, instead of one at a time via mtcars1[1, ] in the second-to-last line of the code above?
And how can I append/left_join the shapleyresults onto the entire dataset using id?
The resulting dataset would be 5 times longer. Should we use purrr to do that?
I found the solution.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapelyresults <- map_dfr(1:nrow(mtcars), ~(Shapley$new(predictor, x.interest = mtcars1[.x,]) %>%
.$results %>%
as_tibble() %>%
arrange(desc(phi)) %>%
slice(1:5) %>%
select(feature.value, phi) %>%
mutate(id = .x)))
final_data <- mtcars1 %>% left_join(shapelyresults, by = "id")
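As a quick sanity check (assuming the code above ran as written), every id should now contribute exactly five rows, one per top feature:
final_data %>% count(id) %>% distinct(n)  # expect a single value: 5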
