R dplyr across: Dynamically specifying arguments to functions t.test and varTest - r

Am writing some dplyr across statements. Want to create some p-values using the functions t.test and varTest. The x= columns for calculations are in df_vars and the mu= and sigma.squared= parameter values are in df_mu_sigma.
A hard-coded version of the data I need are in df_sumry. If the variable names were always the same when code is run, something like this would suffice. That's not the case, however.
The beginnings of a non-hard-coded version of what I need are in df_sumry2. That doesn't yield a correct result yet though, because values of mu= and sigma.squared= are not dynamically specified. Only the first two p-values are correct in df_sumry2. They are always wrong after that because the code always uses values for the mpg variable.
How can I consistently get the right values inserted for mu and sigma.squared?
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
set.seed(9302)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
std = sd,
std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
)
)

I appear to have come up with a solution to my own problem. I'd be happy to see alternative solutions though as they may be better than mine.
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
library(glue)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) {
mu_name <- glue("{ensym(x)}_mean")
t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
},
std = sd,
std_prob = function(x) {
sigma_name <- glue("{ensym(x)}_std")
varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
}
)
)
)
all.equal(df_sumry, df_sumry2)

This is not much better than your solution, but I would use cur_column() instead of ensym() to avoid quosures handling.
Also, putting the query in a separate function makes things a bit tidier.
Finally, I would use lambda functions instead of anonymous functions for clarity.
get_mu = function(suffix){
df_mu_sigma[[paste0(cur_column(), suffix)]] #you could use glue() as well here
}
df_vars %>%
summarize(
across(
all_of(vars_num),
list(
mean = mean,
mean_prob = ~t.test(.x, mu = get_mu("_mean"))$p.value,
std = sd,
std_prob = ~varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
)
)
) %>% t() #just to format the output
# [,1]
# mpg_mean 20.09062500
# mpg_mean_prob 0.01808550
# mpg_std 6.02694805
# mpg_std_prob 0.96094601
# cyl_mean 6.18750000
# cyl_mean_prob 0.10909740
# cyl_std 1.78592165
# cyl_std_prob 0.77092484
# disp_mean 230.72187500
# disp_mean_prob 0.17613878
# disp_std 123.93869383
# disp_std_prob 0.96381507
# hp_mean 146.68750000
# hp_mean_prob 0.03914858
# hp_std 68.56286849
# hp_std_prob 0.03459963

Related

create multiple cross tables with one-line code function with gtsummary

i'm having the following problem:
Context:
I'm using gtsummary to explore frequencies in a dataframe using cross variables.
Here's my desire output:
So that i have a main variable tobgp and its cross by multiple variables like agegp and algp
Attempt:
this is what i've done so far. Using the esoph data from the package The R Datasets Package (datasets).
pacman::p_load(tidyverse, gt, gtsummary)
multiple_table<-function(data, var){
t0<- data %>%
select({{var}}) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
#agep
t1<-data %>%
select({{var}}, agegp) %>%
gtsummary::tbl_summary(by = agegp, statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
#alcgp
t2<-data %>%
select({{var}}, alcgp) %>%
gtsummary::tbl_summary(by = alcgp, statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
#MERGE
tbl_merge(tbls = list(t0,t1,t2),
tab_spanner = c("**Total**", "**agegp**", "**algp**")) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
esoph %>%
multiple_table(tobgp)
The problem with my code so far is that is specific for the crosses, to add more cross variables i have to modify the function i created which is not so friendly.
Request:
Create a function so that you can create the desire output with one line of code. Like this for example:
multiple_table(data, main, by)
esoph %>%
multiple_table(main=tobgp, by=c(agegp, algp)
So that if i want to use other variables to cross by i only have to change the by=c() argument.
In order to be easy to do something like:
esoph %>%
multiple_table(main=tobgp, by=c(agegp, algp, variable1, variable2)
Notes:
I've tried other functions inside gtsummary like tbl_strata which can use two variables as crosses, but doesn't suit my needs because it mixes the two cross variables like this:
This is not what i'm looking for. As you can see, Grade divides the percentage of Drug test by each Grade. This example is taken from gtsummary vignette: https://www.danieldsjoberg.com/gtsummary/reference/tbl_strata.html
I think the solution for my problem could involve some workaround with purrr, or apply, i've tried some but i'm not very good using lists and iterations.
That's it. Thanks very much for listening and i hope i've been very clear. If not, feel free to ask.
Answers 28/03/22
Since i posted my question i've recieve to different approach answers which both work perfectly. Feel free to use the one that suits you. Thanks Mike for the answer in StackOverflow and thanks Tan, June C, Tyler Grant Smith for the answer in the Slack R4DS Community. In my case i would stick with the approach 3.
Approach 1: The Mike approach
library(gtsummary)
library(dplyr)
esoph <- mutate(esoph,
ncases = ifelse(ncases > 2, "High","Low"))
multiple_table<-function(data, var, vars){
t0 <- data %>%
select( var ) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
tlist <- lapply(vars,function(y){
data %>%
select( var , y ) %>%
gtsummary::tbl_summary(by = y , statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
})
tabspannername <- c("**Total**", paste0("**",vars,"**"))
tlist2 <- append(list(t0), tlist,1)
tbl_merge(tbls = tlist2
,tab_spanner = tabspannername
) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
multiple_table(data = esoph, var = "tobgp", vars = c("agegp", "alcgp","ncases"))
Approach 2: The Tan approach
library(tidyverse)
library(gt)
library(gtsummary)
esoph
fn_subtable <- function(data, main, sub){
data %>%
dplyr::select({{main}},{{sub}}) %>%
gtsummary::tbl_summary(
by = {{sub}},
statistic = gtsummary::all_categorical()~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0)))
}
fn_table <-function(data, main_var, sub_vars){
t0 <- data %>%
dplyr::select({{main_var}}) %>%
gtsummary::tbl_summary(statistic = gtsummary::all_categorical() ~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0))) %>%
gtsummary::modify_header(label ~ "") %>%
gtsummary::bold_labels()
sub_tables <- purrr::map(sub_vars, ~fn_subtable(data = data, main = main_var, sub = .x))
#MERGE
tbls <- c(list(t0), sub_tables) %>%
gtsummary::tbl_merge(tab_spanner = c("**Total**", paste0("**",sub_vars,"**"))) %>%
gtsummary::as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
tbls
}
esoph %>% fn_table("tobgp", c("agegp", "alcgp"))
Approach 3: The June C - Tyler Grant Smith approach
library(tidyverse)
library(gt)
library(gtsummary)
fn_subtable <- function(data, main, sub){
data %>%
dplyr::select({{main}},{{sub}}) %>%
gtsummary::tbl_summary(
by = {{sub}},
statistic = gtsummary::all_categorical()~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0)))
}
fn_table3 <- function(data, main_var, sub_vars){
main_var <- rlang::enexpr(main_var)
sub_vars_expr <- rlang::enexpr(sub_vars) # 1. Capture `list(...)` call as expression
sub_vars_args <- rlang::call_args(sub_vars_expr) # 2. Pull out the arguments (they're now also exprs)
sub_vars_fn <- rlang::call_fn(sub_vars_expr) # 3. Pull out the fn call
# 4. Evaluate the fn with expr-ed arguments (this becomes `list( expr(agegp), expr(alcgp) )` )
sub_vars_reconstructed <- rlang::exec(sub_vars_fn, !!!sub_vars_args)
# --- sub_vars replaced with sub_vars_reconstructed from here onwards ---
t0 <- data %>%
dplyr::select({{main_var}}) %>%
gtsummary::tbl_summary(statistic = gtsummary::all_categorical() ~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0))) %>%
gtsummary::modify_header(label ~ "") %>%
gtsummary::bold_labels()
sub_tables <- purrr::map(sub_vars_reconstructed, ~fn_subtable(data = data, main = main_var, sub = .x))
tbls <- c(list(t0), sub_tables) %>%
gtsummary::tbl_merge(tab_spanner = c("**Total**", paste0("**",sub_vars_reconstructed,"**"))) %>%
gtsummary::as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
tbls
}
fn_table3(esoph,tobgp,list(agegp,alcgp))
Thanks very much and i hope this could be implemented as a function inside the gtsummary package because is very useful to explore frequencies with different cross variables.
you are pretty close and only needed a few modifications. the major change is adding in an lapply() to loop through the vars input to create a list of tbl_summary objects. Then I create the tab spanner names from the inputs of vars and append the t0 table to the list created by the lapply(). then you can pass tlist2 to tbl_merge() with the names created with tabspannername to dynamically label the tables.
library(gtsummary)
library(dplyr)
esoph <- mutate(esoph,
ncases = ifelse(ncases > 2, "High","Low"))
multiple_table<-function(data, var, vars){
t0 <- data %>%
select( var ) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
tlist <- lapply(vars,function(y){
esoph %>%
select( var , y ) %>%
gtsummary::tbl_summary(by = y , statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
})
tabspannername <- c("**Total**", paste0("**",vars,"**"))
tlist2 <- append(list(t0), tlist,1)
tbl_merge(tbls = tlist2
,tab_spanner = tabspannername
) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
x <- multiple_table(data = esoph, var = "tobgp", vars = c("agegp", "alcgp","ncases"))

An error keeps appearing that the sample doesn't work in R Studio cloud and I don't know why

sharp_null_thought_experiment <-
function() {
final_data %>%
mutate(
OUTCOME_Z_0 = rnorm(n(), sd = 0.5007117),
OUTCOME_Z_1 = OUTCOME_Z_0,
Z = sample(rep(c(0, 1), times = c(sum(final_data$treatment_group=="control"), sum(final_data$treatment_group=="treatment"))), size = n()),
OUTCOME = if_else(Z == 0, OUTCOME_Z_0, OUTCOME_Z_1)
) %>%
difference_in_means(OUTCOME ~ Z, data = .) %>%
tidy
}
sampling_distribution_sharp_null <- rerun(1000, sharp_null_thought_experiment()) %>%
bind_rows
sampling_distribution_sharp_null %>%
summarise(mean(estimate>=results$estimate))

mapping with an if statement based on data/class type

I have created a series of models which I would like to make predictions using new data. The code is the following:
The code below works but and is just put here to reproduce the data (but I do not have any question on this part of the code):
############## Pre-define some function ###########
logit2prob <- function(logit){
odds <- exp(logit)
prob <- odds / (1 + odds)
return(prob)
}
###################################################
data(iris)
df <- iris %>%
filter(Species != "setosa") %>%
mutate(Species = +(Species == "virginica")) %>%
sample_n(10)
##########################################
var_combos <- expand.grid(colnames(df[,1:4]), colnames(df[,1:4])) %>%
filter(!Var1 == Var2)
boundary_lists <- map2(
.x = var_combos$Var1,
.y = var_combos$Var2,
~select(df, .x, .y) %>%
summarise(
minX = min(.[[1]], na.rm = TRUE),
maxX = max(.[[1]], na.rm = TRUE),
minY = min(.[[2]], na.rm = TRUE),
maxY = max(.[[2]], na.rm = TRUE)
)
) %>%
map(.,
~tibble(
x = seq(.x$minX, .x$maxX, length.out = 200),
y = seq(.x$minY, .x$maxY, length.out = 200),
)
) %>%
map(.,
~tibble(
xx = rep(.x$x, each = 200),
yy = rep(.x$y, time = 200)
)
) %>%
map2(.,
asplit(var_combos, 1), ~ .x %>%
set_names(.y))
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(df[, 1:2]),
label = as.matrix(df[, 5]), # binary variable
nrounds = 10
)
models_list <- map2(
var_combos$Var1,
var_combos$Var2,
~df %>%
select(Species, .x, .y) %>%
group_by(grp = 'grp') %>%
nest() %>%
mutate(
models = map(
data,
~{
list(
glm(Species ~ ., data = .x, family = binomial(link='logit')),
#e1071::svm(Species ~ ., data = .x, type = 'C-classification', kernel = 'linear'),
#randomForest::randomForest(formula = as.factor(Species) ~ ., data = .),
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(df[, 1:2]),
label = as.matrix(df[, 5]), # binary variable
nrounds = 10
)
)
}
)
)
) %>%
map(
., ~unlist(., recursive = FALSE)
)
I now have a number of lists and models and the next step is to make predictions using the predict() function. However certain models require certain data structures. The glm model can take in a data.frame but the xgboost model requires a matrix.
If I remove the xgboost model in the models_list part of the code I can run the following:
models_predict <- models_list %>%
map(., pluck, 'models') %>%
map2(
.x = .,
.y = boundary_lists,
~predict(
object = .,
newdata = .y
)
)
Which works, however when the xgboost model is in the list the code breaks since xgboost requires a matrix.
The following two predict functions work.
predict(
object = models_list[[1]]$models[[1]],
newdata = df[, 1:2] # for the glm model
)
predict(
object = models_list[[1]]$models[[2]],
newdata = as.matrix(df[, 1:2]), # for the xgboost model
type = 'prob'
)
How can I pass to the map function, conditions. I know of the conditional map map_if() function in purrr but I am not sure how to pass the conditions to it.
If the class = xgb.Booster then go to predict_xgb else predict.
models_list[[1]]$models[[1]] %>% attributes()
models_list[[1]]$models[[2]] %>% attributes()
I would like then to put the predictions into the same data frame list once the predict function has been applied.
EDIT:
In order to get multiple trained models I think I need to change the models_list part of the code to:
models_list <- map2(
var_combos$Var1,
var_combos$Var2,
~df %>%
select(Species, .x, .y) %>%
group_by(grp = 'grp') %>%
nest() %>%
mutate(
models = map(
data,
~{
list(
glm(Species ~ ., data = .x, family = binomial(link='logit')),
#e1071::svm(Species ~ ., data = .x, type = 'C-classification', kernel = 'linear'),
#randomForest::randomForest(formula = as.factor(Species) ~ ., data = .),
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(.x[, 2:3]),
label = as.matrix(.x$Species), # binary variable
nrounds = 10
)
)
}
)
)
) %>%
map(
., ~unlist(., recursive = FALSE)
)
Then running:
models_list[[1]]$models[[2]]$feature_names
models_list[[2]]$models[[2]]$feature_names
models_list[[3]]$models[[2]]$feature_names
Gives me:
> models_list[[1]]$models[[2]]$feature_names
[1] "Sepal.Width" "Sepal.Length"
> models_list[[2]]$models[[2]]$feature_names
[1] "Petal.Length" "Sepal.Length"
> models_list[[3]]$models[[2]]$feature_names
[1] "Petal.Width" "Sepal.Length"
So now each XGBoost model is trained on a different combination of the iris variables.
The attrs "class" can extract the "class" and then we do a comparison with if/else
library(xgboost)
library(purrr)
If we are passing the corresponding features_list dataset for each element of 'models_list, then usemap2` as outer most loop
out1 <- map2_dfr(models_list, boundary_lists, ~ {
mods <- pluck(.x, "models")
dat <- .y
map_dfr(mods, ~if(attr(.x, "class")[1] == "xgb.Booster") {
tibble(modelname = attr(.x, "class")[1],
prediction = predict(.x, newdata = as.matrix(dat), type = 'prob'))
} else {
tibble(modelname = attr(.x, "class")[1],
prediction = predict(.x, newdata = dat))} )
}, .id = 'grp'
)
head(out1)
# A tibble: 6 x 3
# modelname prediction grp
# <chr> <dbl> <chr>
#1 glm 1.95 1
#2 glm 1.97 1
#3 glm 1.98 1
#4 glm 1.99 1
#5 glm 2.00 1
#6 glm 2.01 1

Aggregate based on elements in the formula

I have a rate formula, which changes with each model. So, I want the aggregate function to check the number of elements in the rate formula and then run the aggregate function accordingly.
mtcars$rate_1 = mtcars$mpg - mtcars$cyl - mtcars$disp
rate_1 = "mpg - cyl - disp"
open.rate = unlist(strsplit(rate_1,"-",fixed = TRUE))
or
mtcars$rate_1 = mtcars$mpg - mtcars$cyl - mtcars$disp - mtcars$hp
rate_1 = "mpg - cyl - disp - hp"
open.rate = unlist(strsplit(rate_1,"-",fixed = TRUE))
if(length(open.rate == 3)){
data_plot = mtcars %>% group_by(carb) %>% summarise(
####### Rates #######
mpg = weighted.mean(eval(parse(text=mpg)),wt, na.rm = TRUE),
cyl = weighted.mean(eval(parse(text=cyl)),wt, na.rm = TRUE),
disp = weighted.mean(eval(parse(text=disp)),wt, na.rm = TRUE)
)
} else {
data_plot = mtcars %>% group_by(carb) %>% summarise(
####### Rates #######
mpg = weighted.mean(eval(parse(text=mpg)),wt, na.rm = TRUE),
cyl = weighted.mean(eval(parse(text=cyl)),wt, na.rm = TRUE),
disp = weighted.mean(eval(parse(text=disp)),wt, na.rm = TRUE),
hp = weighted.mean(eval(parse(text=hp)),wt, na.rm = TRUE)
)
}
The rate equation can have any number of elements. Depending on the number of elements, the aggregation needs to have the same number of elements. My aim is to write one function which takes care of that, instead of writing multiple if-else conditions.
Is there a way I can do it, without having to write the code twice for two different rates?
My solution is:
aggdata <- mtcars[, "carb"]
for(i in compratelist){
print(i)
data = mtcars %>% group_by(carb) %>% summarise(
eval(parse(text = paste0(i,"_mean = weighted.mean(eval(parse(text = ", i,")),wt, na.rm = TRUE)")))
)
colnames(data)[2] <- i
data <- as.data.frame(data)
aggdata <- left_join(aggdata, data, by = "carb")
}
aggdata <- mtcars[, "carb"]
for(i in compratelist){
print(i)
data = mtcars %>% group_by(carb) %>% summarise(
eval(parse(text = paste0(i,"_mean = weighted.mean(eval(parse(text = ", i,")),wt, na.rm = TRUE)")))
)
colnames(data)[2] <- i
data <- as.data.frame(data)
aggdata <- left_join(aggdata, data, by = "carb")
}

Append Shapley reason codes on all observations to the entire data

Here is my code to get the top 5 Shaply reason codes on mtcars dataset.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages(""iml)
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor = Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapley = Shapley$new(predictor, x.interest = mtcars1[1,])
shapleyresults <- as_tibble(shapley$results) %>% arrange(desc(phi)) %>% slice(1:5) %>% select(feature.value, phi)
How can I get the reason codes for all the observations (instead of one at a time in the 2nd last line in the above code: mtcars[1,])?
And, append/left_join the shapleyresults using id on to the entire dataset?
The dataset would be 5-times longer. Should we use purrr here to do that?
I found the solution.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapelyresults <- map_dfr(1:nrow(mtcars), ~(Shapley$new(predictor, x.interest = mtcars1[.x,]) %>%
.$results %>%
as_tibble() %>%
arrange(desc(phi)) %>%
slice(1:5) %>%
select(feature.value, phi) %>%
mutate(id = .x)))
final_data <- mtcars1 %>% left_join(shapelyresults, by = "id")

Resources