I need to perform a conceptually straightforward double left-merge followed by a simple series of matching functions (See: Straightforward Solution). However, given the DBs I have to merge are large in size I tried to unpack the merging procedure by considering a for-loop that does the trick but is inefficient to say the least (See: For-loops Solution).
Is there a solution splitting and naming at least the largest db?
Below there is a toy example.
For reference, in my data:
db_m1 ~50k lines (for ~5k unique m1)
db_m2 ~25k lines (for ~5k unique m1 and m2)
db_p ~100m lines
set.seed(0)
db_m1 <- data.frame(
y=rep(1,20),
id=sort(rep(paste0("id_",c(letters[1:4])),5)),
m1=rep(c(1,2),10),
x1=sample(LETTERS, 20, TRUE),
x2=sample(LETTERS, 20, TRUE))
set.seed(0)
db_m2 <- data.frame(y=rep(1,20),
m1=sample(c(1:5),20,TRUE),
m2=sample(c(6:10),20,TRUE))
set.seed(0)
db_p <- data.frame(m2=sample(c(6:10),100,TRUE),
y1=sample(LETTERS, 100,TRUE),
y2=sample(LETTERS, 10,TRUE))
Straightforward Solution :
final_dplyr <- db_m1 %>%
dplyr::left_join(db_m2) %>%
dplyr::left_join(db_p) %>%
dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
For-loops Solution:
fn_final <- function(db_m1,db_m2,db_p) {
matches_final <- vector("list",length = length(unique(db_m1$y)))
for(i in 1:length(unique(db_m1$y))){
matches <- vector("list",length = length(unique(db_m1$m1)))
for(j in 1:length(unique(db_m1$m1))){
temp_db_m1 <- db_m1 %>% dplyr::filter(y==unique(db_m1$y)[i], m1==unique(db_m1$m1)[j])
temp_db_m2 <- db_m2 %>% dplyr::filter(y==unique(db_m1$y)[i], m1==unique(db_m1$m1)[j])
m_vector <- unique(temp_db_m2$m2)
temp_db_p <- db_p %>%
dplyr::filter(m2 %in% m_vector)
final <- db_m1 %>%
dplyr::left_join(db_m2) %>%
dplyr::left_join(db_p) %>% dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
matches[[j]] <- final
}
matches_all <- do.call(rbind, matches)
matches_final[[i]] <- matches_all
}
final <- do.call(rbind, matches_final) %>%
dplyr::filter(!is.na(n_p)) %>%
unique()
return(final)
}
final_for <- fn_final(db_m1,db_m2,db_p)
This is a possible solution, should it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y,db_m1$m1))
db_m2_s <- split(db_m2, f = list(db_m2$y,db_m2$m1))
db_p_s <- split(db_p, f = list(db_p$m2))
match_fn <- function(temp_db_m1,temp_db_m2,temp_db_p){
final <- temp_db_m1 %>%
dplyr::left_join(temp_db_m2) %>%
dplyr::left_join(temp_db_p) %>%
dplyr::mutate(match_1=ifelse(x1==y1|x1==y2,1,0),
match_2=ifelse(x2==y1|x2==y2,1,0),
sum_matches=mapply(sum,match_1,match_2),
final_1 = ifelse(as.numeric(sum_matches)>=1,1,0),
final_2 = ifelse(as.numeric(sum_matches)>=2,1,0)) %>%
group_by(id,m2) %>%
dplyr::mutate(n_p=n(),
n_p=ifelse(all(is.na(y1)),NA,n_p)) %>%
group_by(y,id,m1,m2,n_p) %>%
dplyr::summarise(match_1=sum(match_1,na.rm = T),
match_2=sum(match_2,na.rm = T),
final_1 = sum(final_1),
final_2 = sum(final_2))
return(final)
}
fn_final <- function(db_m1,db_m1_s,db_m2_s,db_p_s) {
m <- names(db_m1_s)
matches_1 <- vector("list",length = length(m))
for(i in 1:length(m)){
temp_db_m1 <- db_m1_s[[m[i]]]
temp_db_m2 <- db_m2_s[[m[i]]]
n <- as.character(sort(unique(temp_db_m2$m2)))
matches_2 <- vector("list",length = length(n))
for(j in 1:length(n)){
temp_db_p <- db_p_s[[n[j]]]
final <- match_fn(temp_db_m1,temp_db_m2,temp_db_p)
matches_2[[j]] <- final
}
matches_all <- do.call(rbind, matches_2)
matches_1[[i]] <- matches_all
}
matches_0 <- do.call(rbind, matches_1) %>%
dplyr::filter(!is.na(n_p)) %>%
unique()
return(matches_0)
}
final_for <- fn_final(db_m1,db_m1_s,db_m2_s,db_p_s)
below I created some fake forecast data using the tidyverse modeltime packages. I have got monthly data from 2016 and want to produce a test fc for 2020. As you can see, the data I load comes in wide format. For usage in modeltime I transform it to long data. After the modeling phase, I want to create a dataframe for the 2020 prediction values. For this purpose I need to somehow "unmelt" the data. In this process I am unfortunately losing a lot of variables. From 240 variables that I want to forecast I get only 49 in the end result. Maybe I am blind, or I do not know how to configure the modeltime functions correctly. I would really much appreciate some help. Thanks in advance!
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01")+ months(0:59)
fake_values <-
c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)
replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)
df <- bind_cols(replicate, dates) %>%
rename(c(dates = ...241))
## melt it down
data <- reshape2::melt(df, id.var='dates')
## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)
model_fit_prophet <- prophet_reg() %>%
set_engine(engine = "prophet") %>%
fit(value ~ dates, data = training(split_obj))
## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)
## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
modeltime_calibrate(new_data = testing(split_obj))
## forecast
fc_prophet <- calibration_tbl_prophet %>%
modeltime_forecast(
new_data = testing(split_obj),
actual_data = data,
keep_data = TRUE
)
## "unmelt" that bastard again
fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
fc_prophet <- fc_prophet[,c(4,9,10)]
fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
#fc_prophet <- fc_prophet %>% subset(fc_prophet, as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )
fc_wide_prophet <- fc_prophet %>%
pivot_wider(names_from = variable, values_from = value)
Here is my full solution. I also have provided background on what I'm doing here: https://github.com/business-science/modeltime/issues/133
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
library(timetk)
## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01")+ months(0:59)
fake_values <-
c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)
replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)
df <- bind_cols(replicate, dates) %>%
rename(c(dates = ...241))
## melt it down
data <- reshape2::melt(df, id.var='dates')
data %>% as_tibble() -> data
data %>%
filter(as.numeric(variable) %in% 1:9) %>%
group_by(variable) %>%
plot_time_series(dates, value, .facet_ncol = 3, .smooth = F)
## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)
split_obj %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(dates, value)
split_obj_2 <- time_series_split(data, assess = "1 year", cumulative = TRUE)
split_obj_2 %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(dates, value)
model_fit_prophet <- prophet_reg() %>%
set_engine(engine = "prophet") %>%
fit(value ~ dates, data = training(split_obj))
## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)
## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
modeltime_calibrate(new_data = testing(split_obj_2))
## forecast
fc_prophet <- calibration_tbl_prophet %>%
modeltime_forecast(
new_data = testing(split_obj_2),
actual_data = data,
keep_data = TRUE
)
fc_prophet %>%
filter(as.numeric(variable) %in% 1:9) %>%
group_by(variable) %>%
plot_modeltime_forecast(.facet_ncol = 3)
## "unmelt" that bastard again
# fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
# fc_prophet <- fc_prophet[,c(4,9,10)]
# fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
# #fc_prophet <- fc_prophet %>% subset(fc_prophet, as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )
#
# fc_wide_prophet <- fc_prophet %>%
# pivot_wider(names_from = variable, values_from = value)
# Make a future forecast
refit_tbl_prophet <- calibration_tbl_prophet %>%
modeltime_refit(data = data)
future_fc_prophet <- refit_tbl_prophet %>%
modeltime_forecast(
new_data = data %>% group_by(variable) %>% future_frame(.length_out = "1 year"),
actual_data = data,
keep_data = TRUE
)
future_fc_prophet %>%
filter(as.numeric(variable) %in% 1:9) %>%
group_by(variable) %>%
plot_modeltime_forecast(.facet_ncol = 3)
# Reformat as wide
future_wide_tbl <- future_fc_prophet %>%
filter(.key == "prediction") %>%
select(.model_id, .model_desc, dates, variable, .value) %>%
pivot_wider(
id_cols = c(.model_id, .model_desc, dates),
names_from = variable,
values_from = .value
)
future_wide_tbl[names(df)]
My task is to compute Cosine dissimilarities.
Given a dataframe of user observations I perform a cosine dissimilarity between each pair of rows.
Long story short I am using furrr::future_map2_dfr function to spread the calculations across all cores I have.
For some reason when some cores are free while others are working hard their work doesn't keep spreading across other cores.
For example:
Here is the start point:
Now it's in the middle of the calculation:
Why cores 1, 2, 5, 6, 8, 11, 12, 15 doesn't participate and share the left jobs?
Same with other calculations.
Do I miss any settings of furrr that can change current behavior?
P.S
Now there are 5 cores that work "hard" and for some reason furrr doesn't spread their work to all 16 cores to make it faster.
Functions:
dissimilarity_wrapper <- function(n_users,
train_data,
train_data_std,
test_data,
std_thresh = 0.5) {
# NOTE:
# n_users must be set to maximum users in order to make this function
# work properly.
# Generating the options:
user_combinations <- expand.grid(i = seq_len(n_users),
j = seq_len(n_users))
plan(strategy = multicore)
expand_grid_options <- furrr::future_map2_dfr(.x = user_combinations$i,
.y = user_combinations$j,
function(x, y) {
expand.grid(test_idx = which(test_data$user_id == x),
train_idx = which(train_data$user_id == y))})
drop <- c("user_id", "row_num",
"obs_id", "scroll_id",
"time_stamp", "seq_label",
"scroll_length")
test <- test_data[expand_grid_options$test_idx, !names(test_data) %in% drop]
train <- train_data[expand_grid_options$train_idx, !names(train_data) %in% drop]
train_std <- train_data_std[expand_grid_options$train_idx, ]
# Calculate different D's:
D_manhattan_scaled <- (abs(test - train) / train_std) %>% rowSums()
D_cosinus <- 1 - (rowSums(test * train) / (sqrt(rowSums(test^2) * rowSums(train^2))))
train_std[train_std < std_thresh] <- 1
D_manhattan_scaled_adj_std <- (abs(test - train) / train_std) %>% rowSums()
D_manhattan <- (abs(test - train)) %>% rowSums()
return(expand_grid_options %>%
dplyr::mutate(
D_manhattan_scaled = D_manhattan_scaled,
D_cosinus = D_cosinus,
D_manhattan_scaled_adj_std = D_manhattan_scaled_adj_std,
D_manhattan = D_manhattan,
isSame = test_data[test_idx, ]$user_id == train_data[train_idx, ]$user_id))
}
train_test_std_split <- function(data,
train_size,
test_size,
feature_selection) {
train_set <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= train_size) %>%
dplyr::ungroup()
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
data.table::as.data.table() %>%
dplyr::ungroup() %>%
dplyr::as_tibble() %>%
dplyr::arrange(time_stamp)
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(matches("_sd"))
test_set_unscaled <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(!(obs_id %in% train_set$obs_id)) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= test_size) %>%
dplyr::ungroup()
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set_unscaled, scaling_param_est)
test_set_unscaled_joined_with_scaling_params <- test_set_unscaled %>%
dplyr::left_join(scaling_param_est, by = "user_id")
test_set_unscaled_joined_with_scaling_params[, feature_selection] <-
(test_set_unscaled_joined_with_scaling_params[, feature_selection] -
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_unscaled_joined_with_scaling_params %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Validate:
# intersect(unique(test_set$obs_id), unique(train_set$obs_id))
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
return(list("train_set" = train_set,
"train_set_std" = train_set_std,
"test_set" = test_set,
"test_set_unscaled" = test_set_unscaled))
}
build_dissimilarity_rank <- function(n_users,
train_set,
train_set_std,
test_set,
D_type = "D_cosinus") {
return(dissimilarity_wrapper(n_users, train_set, train_set_std, test_set) %>%
dplyr::mutate(train_user_id = train_set[train_idx, ]$user_id,
test_user_id = test_set[test_idx, ]$user_id) %>%
dplyr::select(test_idx,
train_user_id,
test_user_id,
train_idx,
D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan,
isSame) %>%
dplyr::group_by(test_idx, train_user_id) %>%
dplyr::arrange(train_user_id, !!rlang::sym(D_type)) %>%
dplyr::mutate(D_manhattan_rank = rank(D_manhattan),
D_manhattan_scaled_rank = rank(D_manhattan_scaled, ties.method = "first"),
D_cosinus_rank = rank(D_cosinus, ties.method = "first")) %>%
dplyr::ungroup())
}
build_param_est <- function(dissimilarity_rank,
K,
D_type_rank = "D_manhattan_scaled") {
return(dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::group_by(train_user_id) %>%
dplyr::summarise_at(vars(D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan),
funs(mean, median, sd, quantile(., probs = .9))) %>%
dplyr::rename_at(vars(matches("_quantile")),
funs(str_replace(., "_quantile", "_percentile_90"))) %>%
dplyr::rename_at(vars(matches("_sd")),
funs(str_replace(., "_sd", "_std")))
)
}
build_dissimilarity_table <- function(dissimilarity_rank,
param_est,
K,
i,
D_type_rank = "D_manhattan_scaled",
D_s = c("D_manhattan_scaled",
"D_cosinus",
"D_manhattan_scaled_adj_std",
"D_manhattan")) {
dissimilarity_table <- dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::left_join(param_est, by = c("train_user_id")) %>%
dplyr::ungroup()
dissimilarity_table[paste0(D_s, "_norm_standard")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_mean")]) /
dissimilarity_table[paste0(D_s, "_std")]
dissimilarity_table[paste0(D_s, "_norm_median")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_median")]) /
(dissimilarity_table[paste0(D_s, "_percentile_90")] - dissimilarity_table[paste0(D_s, "_median")])
# dplyr::mutate(experiment = i))
return(dissimilarity_table)
}
k_fold_data_prepare <- function(df, min_scroll_len = 3) {
# Given the data, split it by user id:
return(df %>%
dplyr::filter(scroll_length >= min_scroll_len) %>%
dplyr::arrange(time_stamp) %>%
dplyr::ungroup() %>%
split(.$user_id))
}
k_fold_engine <- function(df,
obs,
n_users,
K = 2,
feature_selection,
D_type = "D_cosinus") {
# Train - Test Split:
train_set <- df %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(obs_id != obs)
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection),
funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
as.data.table() %>%
dplyr::ungroup() %>%
as_tibble()
# Compute std for each train variable:
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::select(matches("_sd"))
test_set <- df %>%
dplyr::filter(obs_id == obs)
test_set_joined_with_scaling_params <- test_set %>%
dplyr::left_join(scaling_param_est, by = "user_id") %>%
dplyr::arrange(time_stamp)
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set, scaling_param_est)
test_set_joined_with_scaling_params[, feature_selection] <-
(test_set_joined_with_scaling_params[, feature_selection] -
test_set_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_joined_with_scaling_params %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Compute std for each train variable:
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
#
# train_set_std <- dplyr::left_join(train_set,
# compute_std,
# by = "user_id") %>%
# dplyr::ungroup() %>%
# dplyr::select(matches("_std$"))
# Compute the dissimilarities:
return(build_dissimilarity_rank(n_users,
train_set,
train_set_std,
test_set,
D_type))
}
k_fold_wrapper <- function(data_df,
K = 2,
D_type_rank = "D_cosinus",
feature_selection) {
data_seqed <- k_fold_data_prepare(data_df)
# Given the data splitted by user id, split it by observation id:
data_seqed_by_obs <- future_imap(data_seqed, ~split(., .$obs_id ))
# Get the observation ids per each splitted sub dataframe:
obs_ids <- future_imap(data_seqed_by_obs, ~as.integer(names(.)))
# Feed kfold engine with splitted data by user id and observations names:
plan(strategy = multicore)
dissimilarity_rank <- furrr::future_map_dfr(data_seqed, function(x) {
furrr::future_map_dfr(obs_ids[[as.character(x$user_id[1])]],
function(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) {
k_fold_engine(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) },
df = x, n_users = x$user_id[1],
K = K, feature_selection = feature_selection,
D_type = D_type_rank) } )
if(nrow(dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) > 0), ])) {
dissimilarity_rank <- dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) == 0), ] %>%
dplyr::mutate(row_num = row_number())
}
param_estimations <- dissimilarity_rank %>%
build_param_est(K, D_type_rank = D_type_rank)
# Summarize and return final param estimation (average):
# return(param_estimations %>%
# dplyr::group_by(train_user_id) %>%
# summarize_at(vars(-"train_user_id"), mean))
return(list("dissimilarity_rank" = dissimilarity_rank,
"param_estimations" = param_estimations))
}
The final script that causes the issues:
n_users <- max(unique(data$user_id))
train_df <- data %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= 50)
filter_users_low_amount_obs <- train_df %>%
dplyr::group_by(user_id) %>%
dplyr::summarise(n_obs = length(unique(obs_id))) %>%
dplyr::arrange(n_obs) %>%
dplyr::filter(n_obs >= 3) %>%
select(user_id)
train_df <- train_df %>%
filter(user_id %in% filter_users_low_amount_obs$user_id)
k_fold_d_rank_param_est <- k_fold_wrapper(train_df, K, D_type_rank = D_type, feature_selection)
dissimilarity_rank_1 <- k_fold_d_rank_param_est$dissimilarity_rank
param_est <- k_fold_d_rank_param_est$param_estimations
train_test_std_split_2 <- train_test_std_split(data,
train_size_2,
test_size = Inf,
feature_selection)
dissimilarity_rank_2 <- build_dissimilarity_rank(n_users,
train_test_std_split_2$train_set,
train_test_std_split_2$train_set_std,
train_test_std_split_2$test_set)
I believe that the option you are missing is the scheduling option for furrr. By default your data is split up into as many chunks as you have workers specified at the beginning of the future_map call and then each worker gets assigned one chunk to work on. Once a worker is done with it's chunk, it will look for another chunk and start working on that. If there are no more chunks left, the worker will go idle.
You can specify with the scheduling option into how many chunks your data should be split up per worker. For example .options = furrr_options(scheduling = 2) will create two chunks per worker and workers that finish early will start working on another chunk.
For more information here is a vignette on chunking
https://davisvaughan.github.io/furrr/articles/articles/chunking.html
PS: You have some nested future calls in your code, depending on your specified future::plan() this will only slow down the code