I have a rate formula, which changes with each model. So, I want the aggregate function to check the number of elements in the rate formula and then run the aggregate function accordingly.
mtcars$rate_1 = mtcars$mpg - mtcars$cyl - mtcars$disp
rate_1 = "mpg - cyl - disp"
open.rate = unlist(strsplit(rate_1,"-",fixed = TRUE))
or
mtcars$rate_1 = mtcars$mpg - mtcars$cyl - mtcars$disp - mtcars$hp
rate_1 = "mpg - cyl - disp - hp"
open.rate = unlist(strsplit(rate_1,"-",fixed = TRUE))
if(length(open.rate == 3)){
data_plot = mtcars %>% group_by(carb) %>% summarise(
####### Rates #######
mpg = weighted.mean(eval(parse(text=mpg)),wt, na.rm = TRUE),
cyl = weighted.mean(eval(parse(text=cyl)),wt, na.rm = TRUE),
disp = weighted.mean(eval(parse(text=disp)),wt, na.rm = TRUE)
)
} else {
data_plot = mtcars %>% group_by(carb) %>% summarise(
####### Rates #######
mpg = weighted.mean(eval(parse(text=mpg)),wt, na.rm = TRUE),
cyl = weighted.mean(eval(parse(text=cyl)),wt, na.rm = TRUE),
disp = weighted.mean(eval(parse(text=disp)),wt, na.rm = TRUE),
hp = weighted.mean(eval(parse(text=hp)),wt, na.rm = TRUE)
)
}
The rate equation can have any number of elements. Depending on the number of elements, the aggregation needs to have the same number of elements. My aim is to write one function which takes care of that, instead of writing multiple if-else conditions.
Is there a way I can do it, without having to write the code twice for two different rates?
My solution is:
aggdata <- mtcars[, "carb"]
for(i in compratelist){
print(i)
data = mtcars %>% group_by(carb) %>% summarise(
eval(parse(text = paste0(i,"_mean = weighted.mean(eval(parse(text = ", i,")),wt, na.rm = TRUE)")))
)
colnames(data)[2] <- i
data <- as.data.frame(data)
aggdata <- left_join(aggdata, data, by = "carb")
}
aggdata <- mtcars[, "carb"]
for(i in compratelist){
print(i)
data = mtcars %>% group_by(carb) %>% summarise(
eval(parse(text = paste0(i,"_mean = weighted.mean(eval(parse(text = ", i,")),wt, na.rm = TRUE)")))
)
colnames(data)[2] <- i
data <- as.data.frame(data)
aggdata <- left_join(aggdata, data, by = "carb")
}
Related
I would like to put the number of observations included in set of regression models at the bottom of a gtsummary table, in the same columns as the coefficient estimates. It is straightforward to put the numbers of observations in columns:
library(dplyr)
library(gtsummary)
df <- mtcars %>%
mutate(cyl_miss = if_else(
cyl == 6,
NA_real_,
cyl
))
model_1 <- lm(
data = df,
formula = mpg ~ cyl + disp
)
model_2 <- lm(
data = df,
formula = mpg ~ cyl_miss + disp
)
table_1 <- tbl_regression(model_1) %>%
add_significance_stars(
pattern = "{estimate}{stars}",
thresholds = c(0.001, 0.01, 0.05),
hide_ci = TRUE,
hide_p = TRUE,
hide_se = FALSE
) %>%
add_n()
table_2 <- tbl_regression(model_2) %>%
add_significance_stars(
pattern = "{estimate}{stars}",
thresholds = c(0.001, 0.01, 0.05),
hide_ci = TRUE,
hide_p = TRUE,
hide_se = FALSE
) %>%
add_n()
tbl_merge(
list(table_1, table_2)
)
How can I put the numbers (here 32 and 25) in the Beta columns, in a row labelled "N"?
To add the N to a new row of the table, you'll want to use the add_glance_table() function. Example below!
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.2'
df <-
mtcars %>%
dplyr::mutate(
cyl_miss = ifelse(cyl == 6, NA_real_, cyl)
)
model_1 <- lm(data = df, formula = mpg ~ cyl + disp)
model_2 <- lm(data = df, formula = mpg ~ cyl_miss + disp)
table_1 <-
tbl_regression(model_1) %>%
add_significance_stars() %>%
add_glance_table(include = nobs)
table_2 <-
tbl_regression(model_2) %>%
add_significance_stars() %>%
add_glance_table(include = nobs)
table_final <-
tbl_merge(list(table_1, table_2)) %>%
# ensure the glance statistics are at the bottom of table
modify_table_body(~.x %>% dplyr::arrange(row_type == "glance_statistic"))
Created on 2021-09-14 by the reprex package (v2.0.1)
Am writing some dplyr across statements. Want to create some p-values using the functions t.test and varTest. The x= columns for calculations are in df_vars and the mu= and sigma.squared= parameter values are in df_mu_sigma.
A hard-coded version of the data I need are in df_sumry. If the variable names were always the same when code is run, something like this would suffice. That's not the case, however.
The beginnings of a non-hard-coded version of what I need are in df_sumry2. That doesn't yield a correct result yet though, because values of mu= and sigma.squared= are not dynamically specified. Only the first two p-values are correct in df_sumry2. They are always wrong after that because the code always uses values for the mpg variable.
How can I consistently get the right values inserted for mu and sigma.squared?
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
set.seed(9302)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
std = sd,
std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
)
)
I appear to have come up with a solution to my own problem. I'd be happy to see alternative solutions though as they may be better than mine.
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
library(glue)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) {
mu_name <- glue("{ensym(x)}_mean")
t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
},
std = sd,
std_prob = function(x) {
sigma_name <- glue("{ensym(x)}_std")
varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
}
)
)
)
all.equal(df_sumry, df_sumry2)
This is not much better than your solution, but I would use cur_column() instead of ensym() to avoid quosures handling.
Also, putting the query in a separate function makes things a bit tidier.
Finally, I would use lambda functions instead of anonymous functions for clarity.
get_mu = function(suffix){
df_mu_sigma[[paste0(cur_column(), suffix)]] #you could use glue() as well here
}
df_vars %>%
summarize(
across(
all_of(vars_num),
list(
mean = mean,
mean_prob = ~t.test(.x, mu = get_mu("_mean"))$p.value,
std = sd,
std_prob = ~varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
)
)
) %>% t() #just to format the output
# [,1]
# mpg_mean 20.09062500
# mpg_mean_prob 0.01808550
# mpg_std 6.02694805
# mpg_std_prob 0.96094601
# cyl_mean 6.18750000
# cyl_mean_prob 0.10909740
# cyl_std 1.78592165
# cyl_std_prob 0.77092484
# disp_mean 230.72187500
# disp_mean_prob 0.17613878
# disp_std 123.93869383
# disp_std_prob 0.96381507
# hp_mean 146.68750000
# hp_mean_prob 0.03914858
# hp_std 68.56286849
# hp_std_prob 0.03459963
I have created a series of models which I would like to make predictions using new data. The code is the following:
The code below works but and is just put here to reproduce the data (but I do not have any question on this part of the code):
############## Pre-define some function ###########
logit2prob <- function(logit){
odds <- exp(logit)
prob <- odds / (1 + odds)
return(prob)
}
###################################################
data(iris)
df <- iris %>%
filter(Species != "setosa") %>%
mutate(Species = +(Species == "virginica")) %>%
sample_n(10)
##########################################
var_combos <- expand.grid(colnames(df[,1:4]), colnames(df[,1:4])) %>%
filter(!Var1 == Var2)
boundary_lists <- map2(
.x = var_combos$Var1,
.y = var_combos$Var2,
~select(df, .x, .y) %>%
summarise(
minX = min(.[[1]], na.rm = TRUE),
maxX = max(.[[1]], na.rm = TRUE),
minY = min(.[[2]], na.rm = TRUE),
maxY = max(.[[2]], na.rm = TRUE)
)
) %>%
map(.,
~tibble(
x = seq(.x$minX, .x$maxX, length.out = 200),
y = seq(.x$minY, .x$maxY, length.out = 200),
)
) %>%
map(.,
~tibble(
xx = rep(.x$x, each = 200),
yy = rep(.x$y, time = 200)
)
) %>%
map2(.,
asplit(var_combos, 1), ~ .x %>%
set_names(.y))
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(df[, 1:2]),
label = as.matrix(df[, 5]), # binary variable
nrounds = 10
)
models_list <- map2(
var_combos$Var1,
var_combos$Var2,
~df %>%
select(Species, .x, .y) %>%
group_by(grp = 'grp') %>%
nest() %>%
mutate(
models = map(
data,
~{
list(
glm(Species ~ ., data = .x, family = binomial(link='logit')),
#e1071::svm(Species ~ ., data = .x, type = 'C-classification', kernel = 'linear'),
#randomForest::randomForest(formula = as.factor(Species) ~ ., data = .),
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(df[, 1:2]),
label = as.matrix(df[, 5]), # binary variable
nrounds = 10
)
)
}
)
)
) %>%
map(
., ~unlist(., recursive = FALSE)
)
I now have a number of lists and models and the next step is to make predictions using the predict() function. However certain models require certain data structures. The glm model can take in a data.frame but the xgboost model requires a matrix.
If I remove the xgboost model in the models_list part of the code I can run the following:
models_predict <- models_list %>%
map(., pluck, 'models') %>%
map2(
.x = .,
.y = boundary_lists,
~predict(
object = .,
newdata = .y
)
)
Which works, however when the xgboost model is in the list the code breaks since xgboost requires a matrix.
The following two predict functions work.
predict(
object = models_list[[1]]$models[[1]],
newdata = df[, 1:2] # for the glm model
)
predict(
object = models_list[[1]]$models[[2]],
newdata = as.matrix(df[, 1:2]), # for the xgboost model
type = 'prob'
)
How can I pass to the map function, conditions. I know of the conditional map map_if() function in purrr but I am not sure how to pass the conditions to it.
If the class = xgb.Booster then go to predict_xgb else predict.
models_list[[1]]$models[[1]] %>% attributes()
models_list[[1]]$models[[2]] %>% attributes()
I would like then to put the predictions into the same data frame list once the predict function has been applied.
EDIT:
In order to get multiple trained models I think I need to change the models_list part of the code to:
models_list <- map2(
var_combos$Var1,
var_combos$Var2,
~df %>%
select(Species, .x, .y) %>%
group_by(grp = 'grp') %>%
nest() %>%
mutate(
models = map(
data,
~{
list(
glm(Species ~ ., data = .x, family = binomial(link='logit')),
#e1071::svm(Species ~ ., data = .x, type = 'C-classification', kernel = 'linear'),
#randomForest::randomForest(formula = as.factor(Species) ~ ., data = .),
xgboost(
objective='binary:logistic',
eval_metric = 'auc',
data = as.matrix(.x[, 2:3]),
label = as.matrix(.x$Species), # binary variable
nrounds = 10
)
)
}
)
)
) %>%
map(
., ~unlist(., recursive = FALSE)
)
Then running:
models_list[[1]]$models[[2]]$feature_names
models_list[[2]]$models[[2]]$feature_names
models_list[[3]]$models[[2]]$feature_names
Gives me:
> models_list[[1]]$models[[2]]$feature_names
[1] "Sepal.Width" "Sepal.Length"
> models_list[[2]]$models[[2]]$feature_names
[1] "Petal.Length" "Sepal.Length"
> models_list[[3]]$models[[2]]$feature_names
[1] "Petal.Width" "Sepal.Length"
So now each XGBoost model is trained on a different combination of the iris variables.
The attrs "class" can extract the "class" and then we do a comparison with if/else
library(xgboost)
library(purrr)
If we are passing the corresponding features_list dataset for each element of 'models_list, then usemap2` as outer most loop
out1 <- map2_dfr(models_list, boundary_lists, ~ {
mods <- pluck(.x, "models")
dat <- .y
map_dfr(mods, ~if(attr(.x, "class")[1] == "xgb.Booster") {
tibble(modelname = attr(.x, "class")[1],
prediction = predict(.x, newdata = as.matrix(dat), type = 'prob'))
} else {
tibble(modelname = attr(.x, "class")[1],
prediction = predict(.x, newdata = dat))} )
}, .id = 'grp'
)
head(out1)
# A tibble: 6 x 3
# modelname prediction grp
# <chr> <dbl> <chr>
#1 glm 1.95 1
#2 glm 1.97 1
#3 glm 1.98 1
#4 glm 1.99 1
#5 glm 2.00 1
#6 glm 2.01 1
Here is my code to get the top 5 Shaply reason codes on mtcars dataset.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages(""iml)
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor = Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapley = Shapley$new(predictor, x.interest = mtcars1[1,])
shapleyresults <- as_tibble(shapley$results) %>% arrange(desc(phi)) %>% slice(1:5) %>% select(feature.value, phi)
How can I get the reason codes for all the observations (instead of one at a time in the 2nd last line in the above code: mtcars[1,])?
And, append/left_join the shapleyresults using id on to the entire dataset?
The dataset would be 5-times longer. Should we use purrr here to do that?
I found the solution.
#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)
set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapelyresults <- map_dfr(1:nrow(mtcars), ~(Shapley$new(predictor, x.interest = mtcars1[.x,]) %>%
.$results %>%
as_tibble() %>%
arrange(desc(phi)) %>%
slice(1:5) %>%
select(feature.value, phi) %>%
mutate(id = .x)))
final_data <- mtcars1 %>% left_join(shapelyresults, by = "id")
My question is how to put in multiple actions after the if statement. For example:
vartest <- var.test(var1 ~ group, alternative='two.sided', conf.level=.95, data=data1)
ttest <- t.test(var1~group, alternative='two.sided', conf.level=.95, var.equal=FALSE, data=data1)
if (vartest$p.value>0.05) {
if (ttest$p.value<=0.05) {
cat(ttest$p.value)
ggboxplot(data1, x="group", y="var1", color="group", palette=c("#00AFBB", "#E7B800"), ylab="var1", xlab="group")
group_by(data1, group) %>% summarise(count = n(), mean = mean(var1, na.rm = TRUE), sd = sd(var1, na.rm = TRUE))
} else{
cat("text1.")
}
} else{
cat("text2.")
}
It isn't giving me my desired result, instead only the first and the last part of the expressions was printed to the console.
If I change the order of the actions for that:
vartest <- var.test(var1 ~ group, alternative='two.sided', conf.level=.95, data=data1)
ttest <- t.test(var1~group, alternative='two.sided', conf.level=.95, var.equal=FALSE, data=data1)
if (vartest$p.value>0.05) {
if (ttest$p.value<=0.05) {
cat(ttest$p.value)
group_by(data1, group) %>% summarise(count = n(), mean = mean(var1, na.rm = TRUE), sd = sd(var1, na.rm = TRUE))
ggboxplot(data1, x="group", y="var1", color="group", palette=c("#00AFBB", "#E7B800"), ylab="var1", xlab="group")
} else{
cat("text1.")
}
} else{
cat("text2.")
}
Only the first and the last part of the expressions are printed to the console.
Any ideas how can I run all actions between if and else?
Please help!
Implicit printing is disabled inside braces, and the value of the last expression is returned.
{
"hello"
cat("world\n")
"how are you"
"today?"
}
## world
## [1] "today?"
If you want to print or otherwise display something from inside braces you can use explicit print, or message or cat.
{
print("hello")
cat("world\n")
message("how are you")
"today?"
}
## [1] "hello"
## world
## how are you
## [1] "today?"
I don't have the data you're working with, but I recreated your problem as follows:
Example:
library(ggplot2)
library(dplyr)
p.value <- 0.02
if (p.value<=0.05) {
cat(p.value)
mtcars %>% group_by(gear) %>% summarise(count = n(), mean = mean(hp))
qplot(x = wt, y = mpg, data = mtcars, geom = "point")
} else{
cat("text1.")
}
The example above will only print p.value and the plot. Switching the order of the plot and the summary table will return only the p.value and summary table.
To solve this, we can assign back and explicitly print.
library(ggplot2)
library(dplyr)
p.value <- 0.02
if (p.value<=0.05) {
cat(p.value)
my_sum <- mtcars %>% group_by(gear) %>% summarise(count = n(), mean = mean(hp))
print(my_sum)
my_plot <- qplot(x = wt, y = mpg, data = mtcars, geom = "point")
print(my_plot)
} else{
cat("text1.")
}