How to make this R function dynamic?

I have a function in R that generates a table/graph from a dataframe, and every time I want to pass a different variable (a column name from the dataframe) I have to repeat the code. Sometimes it is variablea, sometimes variableb, other times variablec, and so on.
generates_table_variablea <- function(data) { ## how to pass the column = variablea here like this
  ####### function(data, column = variablea) .. ???
  big_data <- data %>%
    group_by(a, b, c, d) %>%
    mutate(total_categoria_abs = sum(abs(f))) %>%
    mutate(volume_negativo = if_else(variablea < 0, f, 0)) %>%
    mutate(volume_positivo = if_else(variablea > 0, f, 0)) %>%
    mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
    mutate(e = if_else(variablea < 0, sum(variablea), 0)) %>%
    ungroup() %>%
    filter(variablea < 0) %>%
    group_by(a, b, c, d) %>%
    summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
    arrange(e) %>%
    ungroup()
  big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
  big_data$e <- round(big_data$e, digits = 0)
}
If I want to change the variable, I have to do the following:
generates_table_variableb <- function(data) { ## HERE IT WILL BE function(data, column = variableb)...
  big_data <- data %>%
    group_by(a, b, c, d) %>%
    mutate(total_categoria_abs = sum(abs(f))) %>%
    mutate(volume_negativo = if_else(variableb < 0, f, 0)) %>% #### HERE I NEED TO CHANGE ALWAYS TO VARIABLEA, VARIABLEB, VARIABLEC...
    mutate(volume_positivo = if_else(variableb > 0, f, 0)) %>%
    mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
    mutate(e = if_else(variableb < 0, sum(variableb), 0)) %>%
    ungroup() %>%
    filter(variableb < 0) %>%
    group_by(a, b, c, d) %>%
    summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
    arrange(e) %>%
    ungroup()
  big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
  big_data$e <- round(big_data$e, digits = 0)
}
Having multiple functions doing the same thing is slowing down my code. How could this be done better? All I want is to pass this column dynamically.

This is one way to do it:
library(dplyr)
x <- data.frame(v1 = 1:3, v2 = 4:6)
f <- function(data, var1) {
  data %>% select(!!var1)
}
f(x, quo(v1))
You can find more explanation at https://adv-r.hadley.nz/quasiquotation.html
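With rlang 0.4 or newer the same idea is usually written with the embrace operator {{ }}, which captures and injects the argument in one step. A minimal sketch on the same toy data (the name f2 is just for illustration):
library(dplyr)
x <- data.frame(v1 = 1:3, v2 = 4:6)
f2 <- function(data, var) {
  data %>% select({{ var }})  # {{ }} injects the unquoted column name
}
f2(x, v1)  # no quo() needed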

I found another way that works too:
generates_table_variablea <- function(dataframe, variable) { ## Here pass the variable (as a string)
  big_data <- dataframe %>%
    group_by(a, b, c, d) %>%
    mutate(total_categoria_abs = sum(abs(f))) %>%
    mutate(volume_negativo = if_else(.data[[variable]] < 0, f, 0)) %>%
    mutate(volume_positivo = if_else(.data[[variable]] > 0, f, 0)) %>%
    mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
    mutate(e = if_else(.data[[variable]] < 0, sum(.data[[variable]]), 0)) %>%
    ungroup() %>%
    filter(.data[[variable]] < 0) %>%
    group_by(a, b, c, d) %>%
    summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
    arrange(e) %>%
    ungroup()
  big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
  big_data$e <- round(big_data$e, digits = 0)
}
Just replace the bare variable with .data[[variable]] and you can pass any column name (as a string) into the function.
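For example, assuming my_data is a dataframe containing the columns used above (my_data is a placeholder name), the calls would look like:
generates_table_variablea(my_data, "variablea")
generates_table_variablea(my_data, "variableb")  # same function, different column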

Related

Macro function in R

I'm new to R, coming from SAS. I am trying to repeat this code for a set of datasets called mort_1969, mort_1970, ..., mort_n. How can I modify it so I don't have to rerun it each time, replacing the year by hand?
mrt <- read_csv("mort1985.csv")
mrt <- mrt %>%
rename(nchs_code = countyrs)
crosswalk <- read_csv("crosswalk.csv")
crosswalk <- crosswalk %>%
select(-X1)
mrt_cw <- left_join(mrt, crosswalk, by = "nchs_code")
mrt_cw1 <- mrt_cw %>%
filter(cityrs == 999) %>%
select(fips, racer3) %>%
mutate(count = 1) %>%
group_by(fips, racer3) %>%
summarise(mrt_c = sum(count)) %>%
mutate(racer3=replace(racer3, racer3==1, "white_pop"),
racer3=replace(racer3, racer3==2, "black_pop"),
racer3=replace(racer3, racer3==3, "other_race_pop"))
mrt_cw2 <- pivot_wider(
data=mrt_cw1,
names_from = racer3,
values_from = mrt_c
)
#Convert NAs to 0 and drop original variables
mrt_cw3 <- mrt_cw2 %>%
mutate(white_m = ifelse(is.na(white_pop), 0, white_pop),
black_m = ifelse(is.na(black_pop), 0, black_pop),
other_race_m = ifelse(is.na(other_race_pop), 0, other_race_pop)) %>%
select(-white_pop, -black_pop, -other_race_pop) %>%
mutate(total_m = white_m + black_m + other_race_m)
pop <- read_csv('population_file.csv')
##########################ADD YEAR
pop <- pop %>%
filter(year == 1985) %>%
select(-X1, -year)
mrt_cw4 <- left_join(mrt_cw3, pop, by = "fips")
mrt_cw5 <- mrt_cw4 %>%
mutate(ttl_rate = (total_m/total_pop)*100,
blk_rate = (black_m/black_pop)*100,
wht_rate = (white_m/white_pop)*100,
otr_rate = (other_race_m/other_races_pop)*100)
mrt_cw6 <- mrt_cw5 %>%
mutate(high_flag = ifelse(ttl_rate >= 100 | blk_rate >= 100 |
wht_rate >= 100 | otr_rate >= 100, 1, 0))
##########################ADD YEAR
#Save file
write.csv(mrt_cw6, "mort_85.csv")
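A common way to avoid rerunning this by hand is to wrap the pipeline in a function of the year and map over the years. A minimal sketch, assuming the yearly files are named mortYYYY.csv as above; process_year and its simplified body are placeholders standing in for the full mrt_cw1..mrt_cw6 pipeline:
library(readr)
library(dplyr)
library(purrr)
process_year <- function(yr, pop_all) {
  mrt <- read_csv(paste0("mort", yr, ".csv"))  # year-dependent input file
  pop <- pop_all %>%
    filter(year == yr) %>%                     # yr is the function argument, not the column
    select(-year)
  out <- left_join(mrt, pop, by = "fips")      # stand-in for the full cleaning pipeline above
  write_csv(out, paste0("mort_", yr, ".csv"))  # year-dependent output file
  invisible(out)
}
pop_all <- read_csv("population_file.csv")
walk(1969:1985, process_year, pop_all = pop_all)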

Do tidygraph/igraph have a random forest crawling algorithm built in?

set.seed(1)
play_erdos_renyi(n = 1000,m =20000) %>%
mutate(name = cur_group_rows()) -> g
I would like to "crawl" within a tidygraph starting from a set of roots.
For example, I could sample a forest of Galton-Watson Trees with:
a %>% as_tibble() %>% sample_n(10) %>%
pull(name) -> roots
a %>% group_by(name) %>%
mutate(crawl_stage = ifelse(name %in% roots,0,NA),
parent = NA,
k = ifelse(crawl_stage == 0,
rpois(1,.5),
0)
) %>% ungroup() -> a
a %>% as_tibble %>%
filter(crawl_stage == 0,
k>0) %>% select(name,ego,k) %>% rowwise() %>%
mutate(picks = list(sample(ego,k))) %>%
select(name = picks, parent = name) %>%
unnest(cols = c(name)) -> stage
a %>% group_by(name) %>%
mutate(crawl_stage = ifelse(name %in% stage$name,1,crawl_stage),
parent = ifelse(name %in% stage$name,
stage$parent[stage$name == name],
parent),
k = ifelse(crawl_stage == 1,
rpois(1,.5),
k)
) %>% ungroup() -> a
Then I just have to loop over the stages i until sum(k[crawl_stage == i]) == 0.
But I am curious whether this algorithm is already implemented in these packages. I think it might be, though maybe not with the tracking column parent that I set up.

An error keeps appearing saying that sample() doesn't work in RStudio Cloud, and I don't know why

sharp_null_thought_experiment <-
function() {
final_data %>%
mutate(
OUTCOME_Z_0 = rnorm(n(), sd = 0.5007117),
OUTCOME_Z_1 = OUTCOME_Z_0,
Z = sample(rep(c(0, 1), times = c(sum(final_data$treatment_group=="control"), sum(final_data$treatment_group=="treatment"))), size = n()),
OUTCOME = if_else(Z == 0, OUTCOME_Z_0, OUTCOME_Z_1)
) %>%
difference_in_means(OUTCOME ~ Z, data = .) %>%
tidy
}
sampling_distribution_sharp_null <- rerun(1000, sharp_null_thought_experiment()) %>%
bind_rows
sampling_distribution_sharp_null %>%
summarise(mean(estimate>=results$estimate))
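One common cause of sample() errors in this pattern is a length mismatch: rep() builds the 0/1 labels from the counts of "control" and "treatment", and if any rows hold another value (or NA) that vector is shorter than size = n(), so sample() fails. A quick check, using final_data and treatment_group from the code above:
# These two numbers should match; if the first is smaller, sample() will error.
length(rep(c(0, 1),
           times = c(sum(final_data$treatment_group == "control"),
                     sum(final_data$treatment_group == "treatment"))))
nrow(final_data)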

Combine columns into list column

Is there a more tidyverse-idiomatic way to combine several columns into a list column than using mapply?
For example given the following
tibble(.rows = 9) %>%
mutate(foo = runif(n()),
a_1 = runif(n()),
a_2 = runif(n()),
a_3 = runif(n())) ->
Z
(where Z might contain other columns, and might also contain more than three a_ columns), one can do
Z %>% mutate(A = mapply(c, a_1, a_2, a_3, SIMPLIFY = FALSE))
which works fine, although it would be nice to be able to say starts_with('a_') instead of a_1, a_2, a_3.
Another possibility is
Z %>%
rowid_to_column() %>%
pivot_longer(cols = starts_with('a_')) %>%
group_by(rowid) %>%
summarise(foo = unique(foo),
A = list(value)) %>%
select(-rowid)
which technically works, but introduces other problems (e.g., it uses an ugly foo = unique(foo); furthermore if instead of just one foo there were many foos it would become a bit more involved).
Based on a previous answer (now deleted) and the comments, I made a comparison of different solutions:
FUN_mapply <- function() { Z %>% mutate(A = mapply(c, a_1, a_2, a_3, SIMPLIFY = FALSE)) }
FUN_asplit <- function() { Z %>% mutate(A = asplit(.[,grepl("^a", colnames(.))], 1)) }
FUN_pmap <- function() { Z %>% mutate(A = pmap(.[,grepl("^a", colnames(.))], c)) }
FUN_transpose <- function() { Z %>% mutate(A = transpose(.[,grepl("^a", colnames(.))])) }
FUN_asplit_tidy <- function() { Z %>% mutate(A = asplit(select(., starts_with("a")), 1)) }
FUN_pmap_tidy <- function() { Z %>% mutate(A = pmap(select(., starts_with("a")), c)) }
FUN_transpose_tidy <- function() { Z %>% mutate(A = transpose(select(., starts_with("a")))) }
all(unlist(pmap(list(FUN_mapply()$A, FUN_asplit()$A, FUN_pmap()$A, FUN_transpose()$A), ~all(mapply(all.equal, .x, .y, MoreArgs = list(attributes = F)))))) # All A columns are equal?
mb <- microbenchmark::microbenchmark(
FUN_mapply(),
FUN_asplit(),
FUN_pmap(),
FUN_transpose(),
FUN_asplit_tidy(),
FUN_pmap_tidy(),
FUN_transpose_tidy(),
times = 1000L
)
ggplot2::autoplot(mb)
Edit: Replace select(., starts_with("a")) with Z[,grepl("^a", colnames(Z))]
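For reference, dplyr 1.0+ also allows the tidyselect helper directly via rowwise() and c_across(); a minimal sketch on the Z defined above (not part of the benchmark):
library(dplyr)
Z %>%
  rowwise() %>%
  mutate(A = list(c_across(starts_with("a_")))) %>%  # one vector of the a_ values per row
  ungroup()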

Why doesn't the furrr package in R keep spreading the jobs across all cores?

My task is to compute cosine dissimilarities.
Given a dataframe of user observations, I compute a cosine dissimilarity between each pair of rows.
Long story short, I am using the furrr::future_map2_dfr function to spread the calculations across all the cores I have.
For some reason, when some cores go idle while others are still working hard, the remaining work is not redistributed across the free cores.
For example, here is the starting point (screenshot omitted), and here it is in the middle of the calculation (screenshot omitted).
Why don't cores 1, 2, 5, 6, 8, 11, 12, and 15 participate and take over some of the remaining jobs?
The same happens with other calculations.
Am I missing any furrr settings that could change this behavior?
P.S. At this point there are 5 cores working "hard", and for some reason furrr doesn't spread their work across all 16 cores to make it faster.
Functions:
dissimilarity_wrapper <- function(n_users,
train_data,
train_data_std,
test_data,
std_thresh = 0.5) {
# NOTE:
# n_users must be set to maximum users in order to make this function
# work properly.
# Generating the options:
user_combinations <- expand.grid(i = seq_len(n_users),
j = seq_len(n_users))
plan(strategy = multicore)
expand_grid_options <- furrr::future_map2_dfr(.x = user_combinations$i,
.y = user_combinations$j,
function(x, y) {
expand.grid(test_idx = which(test_data$user_id == x),
train_idx = which(train_data$user_id == y))})
drop <- c("user_id", "row_num",
"obs_id", "scroll_id",
"time_stamp", "seq_label",
"scroll_length")
test <- test_data[expand_grid_options$test_idx, !names(test_data) %in% drop]
train <- train_data[expand_grid_options$train_idx, !names(train_data) %in% drop]
train_std <- train_data_std[expand_grid_options$train_idx, ]
# Calculate different D's:
D_manhattan_scaled <- (abs(test - train) / train_std) %>% rowSums()
D_cosinus <- 1 - (rowSums(test * train) / (sqrt(rowSums(test^2) * rowSums(train^2))))
train_std[train_std < std_thresh] <- 1
D_manhattan_scaled_adj_std <- (abs(test - train) / train_std) %>% rowSums()
D_manhattan <- (abs(test - train)) %>% rowSums()
return(expand_grid_options %>%
dplyr::mutate(
D_manhattan_scaled = D_manhattan_scaled,
D_cosinus = D_cosinus,
D_manhattan_scaled_adj_std = D_manhattan_scaled_adj_std,
D_manhattan = D_manhattan,
isSame = test_data[test_idx, ]$user_id == train_data[train_idx, ]$user_id))
}
train_test_std_split <- function(data,
train_size,
test_size,
feature_selection) {
train_set <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= train_size) %>%
dplyr::ungroup()
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
data.table::as.data.table() %>%
dplyr::ungroup() %>%
dplyr::as_tibble() %>%
dplyr::arrange(time_stamp)
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(matches("_sd"))
test_set_unscaled <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(!(obs_id %in% train_set$obs_id)) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= test_size) %>%
dplyr::ungroup()
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set_unscaled, scaling_param_est)
test_set_unscaled_joined_with_scaling_params <- test_set_unscaled %>%
dplyr::left_join(scaling_param_est, by = "user_id")
test_set_unscaled_joined_with_scaling_params[, feature_selection] <-
(test_set_unscaled_joined_with_scaling_params[, feature_selection] -
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_unscaled_joined_with_scaling_params %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Validate:
# intersect(unique(test_set$obs_id), unique(train_set$obs_id))
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
return(list("train_set" = train_set,
"train_set_std" = train_set_std,
"test_set" = test_set,
"test_set_unscaled" = test_set_unscaled))
}
build_dissimilarity_rank <- function(n_users,
train_set,
train_set_std,
test_set,
D_type = "D_cosinus") {
return(dissimilarity_wrapper(n_users, train_set, train_set_std, test_set) %>%
dplyr::mutate(train_user_id = train_set[train_idx, ]$user_id,
test_user_id = test_set[test_idx, ]$user_id) %>%
dplyr::select(test_idx,
train_user_id,
test_user_id,
train_idx,
D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan,
isSame) %>%
dplyr::group_by(test_idx, train_user_id) %>%
dplyr::arrange(train_user_id, !!rlang::sym(D_type)) %>%
dplyr::mutate(D_manhattan_rank = rank(D_manhattan),
D_manhattan_scaled_rank = rank(D_manhattan_scaled, ties.method = "first"),
D_cosinus_rank = rank(D_cosinus, ties.method = "first")) %>%
dplyr::ungroup())
}
build_param_est <- function(dissimilarity_rank,
K,
D_type_rank = "D_manhattan_scaled") {
return(dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::group_by(train_user_id) %>%
dplyr::summarise_at(vars(D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan),
funs(mean, median, sd, quantile(., probs = .9))) %>%
dplyr::rename_at(vars(matches("_quantile")),
funs(str_replace(., "_quantile", "_percentile_90"))) %>%
dplyr::rename_at(vars(matches("_sd")),
funs(str_replace(., "_sd", "_std")))
)
}
build_dissimilarity_table <- function(dissimilarity_rank,
param_est,
K,
i,
D_type_rank = "D_manhattan_scaled",
D_s = c("D_manhattan_scaled",
"D_cosinus",
"D_manhattan_scaled_adj_std",
"D_manhattan")) {
dissimilarity_table <- dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::left_join(param_est, by = c("train_user_id")) %>%
dplyr::ungroup()
dissimilarity_table[paste0(D_s, "_norm_standard")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_mean")]) /
dissimilarity_table[paste0(D_s, "_std")]
dissimilarity_table[paste0(D_s, "_norm_median")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_median")]) /
(dissimilarity_table[paste0(D_s, "_percentile_90")] - dissimilarity_table[paste0(D_s, "_median")])
# dplyr::mutate(experiment = i))
return(dissimilarity_table)
}
k_fold_data_prepare <- function(df, min_scroll_len = 3) {
# Given the data, split it by user id:
return(df %>%
dplyr::filter(scroll_length >= min_scroll_len) %>%
dplyr::arrange(time_stamp) %>%
dplyr::ungroup() %>%
split(.$user_id))
}
k_fold_engine <- function(df,
obs,
n_users,
K = 2,
feature_selection,
D_type = "D_cosinus") {
# Train - Test Split:
train_set <- df %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(obs_id != obs)
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection),
funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
as.data.table() %>%
dplyr::ungroup() %>%
as_tibble()
# Compute std for each train variable:
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::select(matches("_sd"))
test_set <- df %>%
dplyr::filter(obs_id == obs)
test_set_joined_with_scaling_params <- test_set %>%
dplyr::left_join(scaling_param_est, by = "user_id") %>%
dplyr::arrange(time_stamp)
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set, scaling_param_est)
test_set_joined_with_scaling_params[, feature_selection] <-
(test_set_joined_with_scaling_params[, feature_selection] -
test_set_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_joined_with_scaling_params %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Compute std for each train variable:
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
#
# train_set_std <- dplyr::left_join(train_set,
# compute_std,
# by = "user_id") %>%
# dplyr::ungroup() %>%
# dplyr::select(matches("_std$"))
# Compute the dissimilarities:
return(build_dissimilarity_rank(n_users,
train_set,
train_set_std,
test_set,
D_type))
}
k_fold_wrapper <- function(data_df,
K = 2,
D_type_rank = "D_cosinus",
feature_selection) {
data_seqed <- k_fold_data_prepare(data_df)
# Given the data splitted by user id, split it by observation id:
data_seqed_by_obs <- future_imap(data_seqed, ~split(., .$obs_id ))
# Get the observation ids per each splitted sub dataframe:
obs_ids <- future_imap(data_seqed_by_obs, ~as.integer(names(.)))
# Feed kfold engine with splitted data by user id and observations names:
plan(strategy = multicore)
dissimilarity_rank <- furrr::future_map_dfr(data_seqed, function(x) {
furrr::future_map_dfr(obs_ids[[as.character(x$user_id[1])]],
function(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) {
k_fold_engine(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) },
df = x, n_users = x$user_id[1],
K = K, feature_selection = feature_selection,
D_type = D_type_rank) } )
if(nrow(dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) > 0), ])) {
dissimilarity_rank <- dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) == 0), ] %>%
dplyr::mutate(row_num = row_number())
}
param_estimations <- dissimilarity_rank %>%
build_param_est(K, D_type_rank = D_type_rank)
# Summarize and return final param estimation (average):
# return(param_estimations %>%
# dplyr::group_by(train_user_id) %>%
# summarize_at(vars(-"train_user_id"), mean))
return(list("dissimilarity_rank" = dissimilarity_rank,
"param_estimations" = param_estimations))
}
The final script that causes the issues:
n_users <- max(unique(data$user_id))
train_df <- data %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= 50)
filter_users_low_amount_obs <- train_df %>%
dplyr::group_by(user_id) %>%
dplyr::summarise(n_obs = length(unique(obs_id))) %>%
dplyr::arrange(n_obs) %>%
dplyr::filter(n_obs >= 3) %>%
select(user_id)
train_df <- train_df %>%
filter(user_id %in% filter_users_low_amount_obs$user_id)
k_fold_d_rank_param_est <- k_fold_wrapper(train_df, K, D_type_rank = D_type, feature_selection)
dissimilarity_rank_1 <- k_fold_d_rank_param_est$dissimilarity_rank
param_est <- k_fold_d_rank_param_est$param_estimations
train_test_std_split_2 <- train_test_std_split(data,
train_size_2,
test_size = Inf,
feature_selection)
dissimilarity_rank_2 <- build_dissimilarity_rank(n_users,
train_test_std_split_2$train_set,
train_test_std_split_2$train_set_std,
train_test_std_split_2$test_set)
I believe the option you are missing is furrr's scheduling option. By default, your data is split into as many chunks as you have workers specified at the beginning of the future_map call, and each worker is assigned one chunk to work on. Once a worker is done with its chunk, it will look for another chunk and start working on that. If there are no chunks left, the worker goes idle.
With the scheduling option you can specify how many chunks your data should be split into per worker. For example, .options = furrr_options(scheduling = 2) will create two chunks per worker, and workers that finish early will start working on another chunk.
For more information, here is a vignette on chunking:
https://davisvaughan.github.io/furrr/articles/articles/chunking.html
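A minimal sketch of passing that option (the inputs xs, ys and the function fn are placeholders, not taken from the question):
library(furrr)
library(future)
plan(multisession, workers = 16)
xs <- 1:64
ys <- 65:128
fn <- function(x, y) data.frame(x = x, y = y, s = x + y)
res <- future_map2_dfr(
  xs, ys, fn,
  .options = furrr_options(scheduling = 2)  # two chunks per worker instead of one
)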
PS: You have some nested future calls in your code; depending on your specified future::plan(), this may only slow the code down.
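If the nesting is intentional, future lets you declare how each nesting level should be resolved (by default, nested futures are resolved sequentially); a minimal sketch:
library(future)
# Outer level: 8 multisession workers; inner level: sequential.
plan(list(tweak(multisession, workers = 8), sequential))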
