I am using the pROC package in R to calculate and compare the AUCs of multiple tests, to see which test has the best ability to discriminate between patients and controls. However, I have a large number of tests, and essentially I want to run a series of pairwise comparisons of each test's AUC with every other test's, then correct for multiple comparisons. This is as far as I've gotten with my code (example with a simulated and replicable dataset below):
#load pROC
library(pROC)
#generate df with random numbers
set.seed(123)
df <- data.frame(disease_status = rbinom(n=100, size=1, prob=0.20),
test1 = rnorm(100, mean=15, sd=4),
test2 = rnorm(100, mean=30, sd=2),
test3 = rnorm(100, mean=50, sd=3))
#create roc object for test1, test2, test3
roc.out_test1<-roc(df$disease_status, df$test1, plot=TRUE, smooth = FALSE)
roc.out_test2<-roc(df$disease_status, df$test2, plot=TRUE, smooth = FALSE)
roc.out_test3<-roc(df$disease_status, df$test3, plot=TRUE, smooth = FALSE)
#compare the AUC of test1 and test2
roc.test(roc.out_test1, roc.out_test2, reuse.auc=TRUE, method="delong", na.rm=TRUE)
#DeLong's test for two correlated ROC curves
#data: roc.out_test1 and roc.out_test2
#Z = 0.60071, p-value = 0.548
#alternative hypothesis: true difference in AUC is not equal to 0
#sample estimates:
#AUC of roc1 AUC of roc2
#0.5840108 0.5216802
#create a function to do above for all comparisons
vec_ROCs1 <- c("roc.out_test1,", "roc.out_test2,", "roc.out_test3,")
vec_ROCs2 <- c("roc.out_test1", "roc.out_test2", "roc.out_test3")
ROCs2_specifications <- paste0(vec_ROCs2, ",", "reuse.auc=TRUE")
test <- unlist(lapply(ROCs2_specifications, function(x) paste0(vec_ROCs1, x)))
test2 <- lapply(test, function(x) roc.test(x))
#Error in roc.test.default(x) :
# argument "predictor1" is missing, with no default
Please let me know your thoughts and suggestions on how to fix this!
Thank you.
The following should work; please check it. I haven't written out all the details, but you can ask me follow-up questions if anything in the code is unclear.
#load pROC
library(pROC)
#> Type 'citation("pROC")' for a citation.
#>
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#>
#> cov, smooth, var
#generate df with random numbers
set.seed(123)
df <- data.frame(disease_status = rbinom(n=100, size=1, prob=0.20),
test1 = rnorm(100, mean=15, sd=4),
test2 = rnorm(100, mean=30, sd=2),
test3 = rnorm(100, mean=50, sd=3))
#create roc object for test1, test2, test3
roc.out_test1<-roc(df$disease_status, df$test1, plot=TRUE, smooth = FALSE)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc.out_test2<-roc(df$disease_status, df$test2, plot=TRUE, smooth = FALSE)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
roc.out_test3<-roc(df$disease_status, df$test3, plot=TRUE, smooth = FALSE)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
# compare the AUC of test1 and test2
roc.test(roc.out_test1, roc.out_test2, reuse.auc = TRUE, method = "delong", na.rm = TRUE)
#>
#> DeLong's test for two correlated ROC curves
#>
#> data: roc.out_test1 and roc.out_test2
#> Z = 0.60071, p-value = 0.548
#> alternative hypothesis: true difference in AUC is not equal to 0
#> sample estimates:
#> AUC of roc1 AUC of roc2
#> 0.5840108 0.5216802
Now we generate a list of all possible combinations of the three tests and run the roc.test function using the same parameters that you set.
all_tests <- combn(
list(
"test1" = roc.out_test1,
"test2" = roc.out_test2,
"test3" = roc.out_test3
),
FUN = function(x, ...) roc.test(x[[1]], x[[2]]),
m = 2,
simplify = FALSE,
reuse.auc = TRUE,
method = "delong",
na.rm = TRUE
)
The output is a list of choose(3, 2) = 3 elements (i.e. the number of combinations of n elements taken 2 at a time), and each element of the list is a test. For example, this one is the same as your previous test:
all_tests[[1]]
#>
#> DeLong's test for two correlated ROC curves
#>
#> data: x[[1]] and x[[2]]
#> Z = 0.60071, p-value = 0.548
#> alternative hypothesis: true difference in AUC is not equal to 0
#> sample estimates:
#> AUC of roc1 AUC of roc2
#> 0.5840108 0.5216802
The only problem here is that it's difficult to recognise which tests are used in the comparisons, so we can also add a list of names:
tests_names <- combn(
list("test1", "test2", "test3"),
m = 2,
FUN = paste,
simplify = TRUE,
collapse = "_"
)
all_tests <- setNames(all_tests, tests_names)
This is the result.
names(all_tests)
#> [1] "test1_test2" "test1_test3" "test2_test3"
The names of the objects flag the tests that are used in the comparison.
all_tests$test1_test2
#>
#> DeLong's test for two correlated ROC curves
#>
#> data: x[[1]] and x[[2]]
#> Z = 0.60071, p-value = 0.548
#> alternative hypothesis: true difference in AUC is not equal to 0
#> sample estimates:
#> AUC of roc1 AUC of roc2
#> 0.5840108 0.5216802
Created on 2020-03-14 by the reprex package (v0.3.0)
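Since the end goal is to correct for multiple comparisons, here is a minimal sketch of that last step: each element of all_tests is an htest object, so the p-values can be pulled out directly and adjusted (Holm shown; see ?p.adjust for alternatives).
# extract the raw p-values and adjust them for multiple comparisons
p_raw <- sapply(all_tests, function(x) x$p.value)
p_adj <- p.adjust(p_raw, method = "holm")
data.frame(comparison = names(all_tests), p_raw, p_adj)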
The roc.test() function expects a roc object as input. Your list test just contains character strings of all the arguments, which the function does not know what to do with. The list also includes comparisons of the tests with themselves, i.e. "roc.out_test1,roc.out_test1,reuse.auc=TRUE"; I assume you don't actually need those, and that there are only 3 comparisons you need: 1v2, 1v3, 2v3.
The purrr package provides map functions similar to lapply, and map2 allows you to iterate over 2 lists at the same time. You need to create 2 lists of the actual roc objects and iterate over those.
#load pROC
library(pROC)
library(dplyr)
library(purrr) #For map2 function
#generate df with random numbers
set.seed(123)
df <- data.frame(disease_status = rbinom(n=100, size=1, prob=0.20),
test1 = rnorm(100, mean=15, sd=4),
test2 = rnorm(100, mean=30, sd=2),
test3 = rnorm(100, mean=50, sd=3))
#create roc object for test1, test2, test3
roc.out_test1<-roc(df$disease_status, df$test1, plot=TRUE, smooth = FALSE)
roc.out_test2<-roc(df$disease_status, df$test2, plot=TRUE, smooth = FALSE)
roc.out_test3<-roc(df$disease_status, df$test3, plot=TRUE, smooth = FALSE)
#compare the AUC of test1 and test2
roc.test(roc.out_test1, roc.out_test2, reuse.auc=TRUE, method="delong", na.rm=TRUE)
roc_new <- function(test1, test2){
roc.test(test1, test2, reuse.auc=TRUE, method="delong", na.rm=TRUE)
}
#List of all tests
all_tests <- list(roc.out_test1,
roc.out_test2,
roc.out_test3)
#Create unique combos of tests
unique_combos <- expand.grid(1:3, 1:3) %>%
filter(Var1 < Var2) %>% #excludes duplicate comparisons,
#each col provides the index for the 2 lists to iterate over
mutate(names = paste(Var1, " V ", Var2)) #Create col to name final output list
#Create 2 lists to iterate over
#Create list 1
(test1 <- all_tests[as.numeric(unique_combos$Var1)])
#Create list 2
(test2 <- all_tests[as.numeric(unique_combos$Var2)])
#Iterate over both lists
output <- map2(test1, test2, roc_new)
names(output) <- unique_combos$names
I understand how to bootstrap using the "boot" package in R, from the package's PDF and also from these two Stack Overflow examples: Bootstrapped correlation with more than 2 variables in R and Bootstrapped p-value for a correlation coefficient on R.
However, those are for small datasets (2 variables, or a matrix with 5 variables). I have a very large matrix (1000+ columns), and the code I use to compute the correlation between every metabolite pair (removing duplicates and correlations of a metabolite with itself) is:
x <- colnames(dat)
GetCor = function(x,y) cor(dat[,x], dat[,y], method="spearman")
GetCor = Vectorize(GetCor)
out <- data.frame(t(combn(x,2)), stringsAsFactors = F) %>%
mutate(v = GetCor(X1,X2))
I'm not sure how I can then alter this into the function I pass to statistic in boot, i.e.
boot_res<- boot(dat, ?, R=1000)
Or would I just need to obtain a matrix of the bootstrapped p-values or estimates, depending on the function code (colMeans(boot_res$t)), and get rid of the upper or lower triangle?
I was curious to know the most efficient way of going about this problem.
Something like this? It follows more or less the same lines as my answer to the 2nd question you link to in your question.
Note that I have simplified the correlation code: cor accepts a data.frame or a matrix, so pass a two-column one and keep one of the off-diagonal correlation matrix elements.
library(boot)
bootPairwiseCor <- function(data, i) {
d <- data[i,]
combn(d, 2, \(x) cor(x, method="spearman")[1,2])
}
dat <- iris[-5]
nms <- combn(colnames(dat), 2, paste, collapse = "_")
R <- 100L
b <- boot(dat, bootPairwiseCor, R)
b
#>
#> ORDINARY NONPARAMETRIC BOOTSTRAP
#>
#>
#> Call:
#> boot(data = dat, statistic = bootPairwiseCor, R = R)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.1667777 0.0037142908 0.070552718
#> t2* 0.8818981 -0.0002851683 0.017783297
#> t3* 0.8342888 0.0006306610 0.021509280
#> t4* -0.3096351 0.0047809612 0.075976067
#> t5* -0.2890317 0.0045689001 0.069929108
#> t6* 0.9376668 -0.0014838117 0.009632318
data.frame(variables = nms, correlations = colMeans(b$t))
#> variables correlations
#> 1 Sepal.Length_Sepal.Width -0.1630634
#> 2 Sepal.Length_Petal.Length 0.8816130
#> 3 Sepal.Length_Petal.Width 0.8349194
#> 4 Sepal.Width_Petal.Length -0.3048541
#> 5 Sepal.Width_Petal.Width -0.2844628
#> 6 Petal.Length_Petal.Width 0.9361830
Created on 2023-01-28 with reprex v2.0.2
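If you also want interval estimates per pair rather than just the bootstrap means, boot.ci() gives a percentile CI for each statistic via its index argument; a minimal sketch reusing b and nms from above (with only R = 100 replications the intervals will be rough):
# percentile bootstrap CI for each variable pair
cis <- t(sapply(seq_along(nms), function(i)
  boot.ci(b, type = "perc", index = i)$percent[4:5]))
data.frame(variables = nms, lower = cis[, 1], upper = cis[, 2])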
You may want to use cor.test to get theoretical t-values. We will use them for comparison with the B bootstrap t-values. (Recall: The p-value is the probability of obtaining test results at least as extreme as the result actually observed, under the assumption that the null hypothesis is correct.)
Here is a similar function to yours, but applying cor.test and extracting statistics.
corr_cmb <- \(X, boot=FALSE) {
stts <- c('estimate', 'statistic', 'p.value')
cmbn <- combn(colnames(X), 2, simplify=FALSE)
a <- lapply(cmbn, \(x) as.data.frame(cor.test(X[, x[1]], X[, x[2]])[stts])) |>
do.call(what=rbind) |>
`rownames<-`(sapply(cmbn, paste, collapse=':'))
if (boot) {
a <- a[, 'statistic']
}
a
}
We run it once on the data to get the theoretical solution.
rhat <- corr_cmb(dat)
head(rhat, 3)
# estimate statistic p.value
# V1:V2 0.06780426 2.1469547 0.03203729
# V1:V3 0.03471587 1.0973752 0.27274212
# V1:V4 0.05301563 1.6771828 0.09381987
Bootstrap
We can assume from the start that the bootstrap with 1000 columns will run for a while (choose(1000, 2) returns 499500 combinations). That's why we think about a multithreaded solution right away.
To bootstrap, we simply apply corr_cmb repeatedly to samples of the data drawn with replacement.
We also measure the elapsed time, in order to extrapolate the time needed for 1000 variables.
## setup clusters
library(parallel)
CL <- makeCluster(detectCores() - 1)
clusterExport(CL, c('corr_cmb', 'dat'))
t0 <- Sys.time() ## timestamp before run
B <- 1099L
clusterSetRNGStream(CL, 42)
boot_res <- parSapply(CL, 1:B, \(i) corr_cmb(dat[sample.int(nrow(dat), replace=TRUE), ], boot=TRUE))
t1 <- Sys.time() ## timestamp after run
stopCluster(CL)
After the bootstrap, we calculate the ratios of how many times the absolute bootstrap test statistics exceeded the theoretical ones (Ref.),
boot_p <- rowMeans(abs(boot_res - rowMeans(boot_res)) > abs(rhat$statistic))
and cbind the bootstrap p-values to the theoretical result.
cbind(rhat, boot_p)
# estimate statistic p.value boot_p
# V1:V2 0.06780426 2.1469547 0.03203729 0.03003003
# V1:V3 0.03471587 1.0973752 0.27274212 0.28028028
# V1:V4 0.05301563 1.6771828 0.09381987 0.08208208
# V1:V5 -0.01018682 -0.3218300 0.74764890 0.73473473
# V2:V3 0.03730133 1.1792122 0.23859474 0.23323323
# V2:V4 0.07203911 2.2817257 0.02271539 0.01201201
# V2:V5 0.03098230 0.9792363 0.32770055 0.30530531
# V3:V4 0.02364486 0.7471768 0.45513283 0.47547548
# V3:V5 -0.02864165 -0.9051937 0.36558126 0.38938939
# V4:V5 0.03415689 1.0796851 0.28054328 0.29329329
Note that the data used here is fairly normally distributed. If the data were not normally distributed, the bootstrap p-values would diverge more from the theoretical ones.
To conclude, an estimate of the time needed for your 1000 variables.
d <- as.numeric(difftime(t1, t0, units='mins'))
n_est <- 1000
t_est <- d/(choose(m, 2))*choose(n_est, 2)
cat(sprintf('est. runtime for %s variables: %s mins\n', n_est, round(t_est, 1)))
# est. runtime for 1000 variables: 1485.8 mins
(Perhaps for sake of completeness, a single-threaded version for smaller problems:)
## singlethreaded version
# set.seed(42)
# B <- 1099L
# boot_res <- replicate(B, corr_cmb(dat[sample.int(nrow(dat), replace=TRUE), ], boot=TRUE))
Data:
library(MASS)
n <- 1e3; m <- 5
Sigma <- matrix(.5, m, m)
diag(Sigma) <- 1
set.seed(42)
M <- mvrnorm(n, runif(m), Sigma)
M <- M + rnorm(length(M), sd=6)
dat <- as.data.frame(M)
I am trying to manually pool results from quantile regression models run on multiply imputed data in R using mice. I make use of a bootstrapping procedure to get 95% CIs and P values of the model terms, in which model parameters and their standard errors are obtained after sampling a number of rows equal to the number of unique participants in my data set. This procedure is repeated 500 times for each of the m imputed data sets.
Then, as a last step, I pool the estimated coefficients and their standard errors of the resulting 500 * m regression models according to Rubin's rules (1987) (see e.g. https://bookdown.org/mwheymans/bookmi/rubins-rules.html). To speed things up, I use foreach to split up the analyses over multiple processor cores and for to loop over the m imputed data sets.
However, there seems to be a flaw in the part wherein the results are pooled. When I look at the pooled results, I observe that the P values are not in accordance with the 95% CIs (e.g. P < 0.05 when 0 is included in the 95% CI).
To illustrate this issue, I have made a reproducible example, using these publicly available data: https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv
Because there are no missing data in this data set, I introduce them myself and impute the data (m = 10 multiply imputed data sets with 20 iterations). I use set.seed for reproducibility.
Note that I use lm instead of quantreg::rq in this example.
# load data
projdir <- "my_directory"
d <- read.csv(file = file.path(projdir, 'heart_failure_clinical_records_dataset.csv'))
#### introduce missing values
set.seed(1)
# age
age_miss_tag <- rbinom(nrow(d), 1, 0.3)
d$age[age_miss_tag == 1] <- NA # MCAR
# serum creatinine
creat_miss_tag <- rbinom(nrow(d), 1, 0.3)
d$serum_creatinine[creat_miss_tag == 1 & d$anaemia == 0] <- NA # MAR
# CK
CK_miss_tag <- rbinom(nrow(d), 1, 0.3)
d$creatinine_phosphokinase[CK_miss_tag & d$platelets > median(d$platelets)] <- NA # MAR
# platelets
platelets_miss_tag <- rbinom(nrow(d), 1, 0.3)
d$platelets[platelets_miss_tag == 1] <- NA # MCAR
library(mice); library(mitml); library(miceadds); library(splines); library(foreach); library(doParallel)
# impute data
imp <- mice(d, maxit = 20, m = 10, seed = 2)
# log creatinine
implong <- complete(imp, 'long', include = FALSE)
implong$log_creat <- log(implong$serum_creatinine)
imp <- miceadds::datlist2mids(split(implong, implong$.imp))
# compute values for Boundary.knots
temp <- complete(imp, 'long', include = FALSE)
B_knots <- rowMeans(sapply(split(temp, temp$.imp), function(x) {
quantile(x$age, c(0.1, 0.9))
}))
# Convert mids object into a datlist
longlist <- miceadds::mids2datlist(imp)
# fit model based on original data and use the terms in the below foreach loop
# in order to fix the position of the inner knots
fit_orig <- lm(log_creat ~
# Main effects
ns(age, df = 2, B = c(B_knots[1], B_knots[2])) * sex,
data = longlist[[1]])
To further speed things up, I use OLS instead of quantile regression here and parallelize the process.
# make cluster used in foreach
cores_2_use <- detectCores() - 1
cl <- makeCluster(cores_2_use)
clusterSetRNGStream(cl, iseed = 9956)
registerDoParallel(cl)
# No. of bootstrap samples to be taken
n_iter <- 500
boot.1 <- c()
for(k in seq_along(longlist)){
boot.1[[k]] <- foreach(i = seq_len(n_iter),
.combine = rbind,
.packages = c('mice', 'mitml', 'splines')) %dopar% {
# store data from which rows can be sampled
longlist0 <- longlist[[k]]
# set seed for reproducibility
set.seed(i)
# sample rows
boot_dat <- longlist0[sample(1:nrow(longlist0), replace = TRUE), ]
# linear regression model based on sampled rows
fit1 <- lm(terms(fit_orig), data = boot_dat)
# save coefficients
fit1$coef
}
}
stopCluster(cl)
As a last step, I pool the results according to Rubin's rules.
n_cols <- dim(boot.1[[1]])[2]
list <- c()
for(i in seq_len(n_cols)) {
# extract coefficients
parameter <- lapply(boot.1, function(x){
x[,i]
})
m <- length(parameter)
for(k in seq_len(m)) {
names(parameter[[k]]) <- NULL
}
Q <- sapply(parameter, mean)
U <- sapply(parameter, var) # (standard error of estimate)^2
#### Pooling
# Pooled univariate estimate
qbar <- mean(Q)
# Mean of the variances (i.e. the pooled within-imputation variance)
ubar <- mean(U)
# Between-imputation variance
btw_var <- var(Q)
# Total variance of the pooled estimate
tot_var <- ubar + btw_var + (btw_var / m)
# Relative increase in variance due to non-response
r_var <- (btw_var + (btw_var / m)) / ubar
# Fraction of missing information
lambda <- (btw_var + (btw_var / m)) / tot_var
# degrees of freedom for the t-distribution according to Rubin (1987)
df_old <- (m - 1) / lambda^2
# sample size in the imputed data sets
n_sample <- nrow(longlist[[1]])
# observed degrees of freedom
df_observed <- (((n_sample - n_cols) + 1) / ((n_sample - n_cols) + 3)) *
(n_sample - n_cols) * (1 - lambda)
# adjusted degrees of freedom according to Barnard & Rubin (1999)
df_adjusted <- (df_old * df_observed) / (df_old + df_observed)
# 95% confidence interval of qbar
lwr <- qbar - qt(0.975, df_adjusted) * sqrt(tot_var)
upr <- qbar + qt(0.975, df_adjusted) * sqrt(tot_var)
# F statistic
q <- ((0 - qbar)^2 / tot_var)^2
# Significance level associated with the null value Q[0]
p_value <- pf(q, df1 = 1, df2 = df_adjusted, lower.tail = FALSE)
list[[i]] <- cbind(qbar, lwr, upr, p_value)
}
names(list) <- colnames(boot.1[[1]])
list
Obviously, the P value shown below is not in accordance with the 95% CI (as 0 is included in the CI, so the P value should be ≥0.05).
> list
$`(Intercept)`
qbar lwr upr p_value
[1,] 0.06984595 -0.02210231 0.1617942 0.008828337
EDIT (29 Dec 2021)
As @Gerko Vink notes in his answer, multiple imputation and bootstrapping both induce variance. The variance induced by imputation is taken care of by Rubin's rules; the bootstrap variance is not. Unfortunately, mice::pool will not work with the output returned by quantreg::rq.
I am aware of constructing bootstrap CIs based on a naive percentile-based approach as shown in this post, but I am inclined to think this is not the correct approach to proceed with.
Does anyone know how to appropriately take care of the extra variance induced by bootstrapping when using rq?
EDIT (30 Dec 2021)
Inspired by this recent post, I decided not to hit the road of bootstrapping anymore, but instead manually extract the point estimates and variances from each of the imputed data sets and pool them using Rubin's rules. I have posted this approach as answer below. Any input on how to appropriately take care of the extra variance induced by bootstrapping when using rq is still very welcome though!
Bootstrapping and multiple imputation both induce variance. The imputation variance is taken care of by Rubin's rules for parameters with normal sampling distributions. The bootstrap variance is not.
Two remarks:
First, there is a small error in your code. You're calculating the bootstrap variance about Q in U <- sapply(parameter, var). No need for U <- U/n_iter. U is already the variance and sapply(parameter, sd) would yield the bootstrapped standard error.
Second, you're using bootstrap parameters to calculate a parametric interval and p-value. That seems needlessly complicated and, as you can see, potentially problematic. Why not calculate the bootstrap CI?
See also this link for some inspiration with respect to different means of calculating the CIs and their respective validity.
A small simulation demonstrates that you cannot expect the two to be identical for a finite set of bootstrap replications.
library(purrr)
library(magrittr)
#fix seed
set.seed(123)
#some data
n = 1000
d <- rnorm(n, 0, 1)
# ci function
fun <- function(x){
se <- var(x)/length(x)
lwr <- mean(x) - 1.96 * se
upr <- mean(x) + 1.96 * se
ci <- c(lwr, upr)
return(ci)
}
# bootstrap
boot <- replicate(500,
d[sample(1:1000, 1000, replace = TRUE)],
simplify = FALSE)
# bootstrapped ci's based on parameters
boot.param.ci <- boot %>%
map(~.x %>% fun) %>%
do.call("rbind", args = .)
# bootstrap CI
boot.ci <- boot %>%
map(~.x %>% mean) %>%
unlist %>%
quantile(c(.025, .975))
# Overview
data.frame(param = fun(d),
boot.param = boot.param.ci %>% colMeans,
boot.ci = boot.ci)
#> param boot.param boot.ci
#> 2.5% 0.01420029 0.01517527 -0.05035913
#> 97.5% 0.01805545 0.01904181 0.07245449
Created on 2021-12-22 by the reprex package (v2.0.1)
The following reprex also demonstrates that the bootstrap applied to the imputed data yields different variance estimates under the same pooling rules.
library(purrr)
library(magrittr)
library(mice)
#fix seed
set.seed(123)
imp <- mice(boys,
m = 10,
printFlag = FALSE)
imp %>%
complete("all") %>%
map(~.x %$%
lm(age ~ hgt + hc)) %>%
pool %>%
summary(conf.int = TRUE)
#> term estimate std.error statistic df p.value 2.5 %
#> 1 (Intercept) -1.9601179 0.809167659 -2.422388 682.5182 0.01567825 -3.5488747
#> 2 hgt 0.1690468 0.002784939 60.700342 572.1861 0.00000000 0.1635768
#> 3 hc -0.2138941 0.021843724 -9.792018 639.0432 0.00000000 -0.2567883
#> 97.5 %
#> 1 -0.3713610
#> 2 0.1745167
#> 3 -0.1710000
imp %>%
complete("all") %>%
map(~.x %>%
.[sample(1:748, 748, replace = TRUE), ] %$%
lm(age ~ hgt + hc)) %>%
pool %>%
summary(conf.int = TRUE)
#> term estimate std.error statistic df p.value 2.5 %
#> 1 (Intercept) -1.9810146 1.253312293 -1.580623 22.57546 1.278746e-01 -4.5763892
#> 2 hgt 0.1689181 0.004124538 40.954423 24.47123 0.000000e+00 0.1604141
#> 3 hc -0.2133606 0.033793045 -6.313743 22.29686 2.217445e-06 -0.2833890
#> 97.5 %
#> 1 0.6143599
#> 2 0.1774221
#> 3 -0.1433322
Created on 2021-12-22 by the reprex package (v2.0.1)
For quantile regression, mice::pool will not work with the output returned by quantreg::rq, because (according to this post) there is no agreed upon method to calculate standard errors, which are required to pool results under multiple imputation.
An ad hoc solution would be to manually extract the point estimates and variances from each of the imputed data sets and pool them using Rubin's rules.
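For reference, these are the pooled quantities computed in the code below (qbar, ubar, btw_var and tot_var): with $m$ imputations, point estimates $\hat{Q}_l$ and within-imputation variances $U_l$,
$$\bar{Q} = \frac{1}{m}\sum_{l=1}^{m}\hat{Q}_l, \qquad \bar{U} = \frac{1}{m}\sum_{l=1}^{m}U_l, \qquad B = \frac{1}{m-1}\sum_{l=1}^{m}\bigl(\hat{Q}_l - \bar{Q}\bigr)^2, \qquad T = \bar{U} + \Bigl(1 + \frac{1}{m}\Bigr)B.$$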
First, a reprex using lm to verify whether results from the manual approach and mice::pool are identical.
library(mice)
imp <- mice(nhanes, print = FALSE, seed = 123)
# fit linear model
fit <- with(imp, lm(bmi ~ chl + hyp))
# manually pool univariate estimates using Rubin's rules
pool_manual <- \(model_object) {
m <- length(model_object$analyses)
Q <- sapply(model_object$analyses, \(x) summary(x)$coefficients[, 'Estimate'])
U <- sapply(model_object$analyses, \(x) (summary(x)$coefficients[, 'Std. Error'])^2)
qbar <- rowMeans(Q)
ubar <- rowMeans(U)
btw_var <- apply(Q, 1, var)
tot_var <- ubar + btw_var + (btw_var / m)
lambda <- (btw_var + (btw_var / m)) / tot_var
df_old <- (m - 1) / lambda^2
n_sample <- length(residuals(model_object$analyses[[1]]))
n_cols <- dim(Q)[1]
df_com <- n_sample - n_cols
df_observed <- ((df_com + 1) / (df_com + 3)) * df_com * (1 - lambda)
df_adjusted <- (df_old * df_observed) / (df_old + df_observed)
lwr <- qbar - qt(0.975, df_adjusted) * sqrt(tot_var)
upr <- qbar + qt(0.975, df_adjusted) * sqrt(tot_var)
q <- (0 - qbar)^2 / tot_var
p_value <- pf(q, df1 = 1, df2 = df_adjusted, lower.tail = FALSE)
df <- data.frame(noquote(rownames(Q)), qbar, lwr, upr, p_value)
rownames(df) <- NULL
names(df) <- c('term', 'estimate', '2.5 %', '97.5 %', 'p.value')
return(df)
}
Verify.
> pool_manual(fit)
term estimate 2.5 % 97.5 % p.value
1 (Intercept) 21.78583831 8.99373786 34.57793875 0.004228746
2 chl 0.03303449 -0.02812005 0.09418903 0.254696358
3 hyp -1.07291395 -5.57406829 3.42824039 0.624035769
> extract <- c('term', 'estimate', '2.5 %', '97.5 %', 'p.value')
> summary(pool(fit), conf.int = TRUE)[, extract]
term estimate 2.5 % 97.5 % p.value
1 (Intercept) 21.78583831 8.99373786 34.57793875 0.004228746
2 chl 0.03303449 -0.02812005 0.09418903 0.254696358
3 hyp -1.07291395 -5.57406829 3.42824039 0.624035769
Quantile regression
Now, let's pool results from rq for the expected median of the outcome.
library(quantreg)
# fit quantile regression model
fit <- with(imp, rq(bmi ~ chl + hyp, tau = 0.5))
To be able to pool results from rq, only the summary method used to extract point estimates and variances from each of the imputed data sets needs to be adjusted in pool_manual, i.e. the two sapply lines become:
Q <- sapply(model_object$analyses, \(x) summary.rq(x, covariance = TRUE)$coefficients[, 'Value'])
U <- sapply(model_object$analyses, \(x) (summary.rq(x, covariance = TRUE)$coefficients[, 'Std. Error'])^2)
Result
> pool_manual(fit)
term estimate 2.5 % 97.5 % p.value
1 (Intercept) 22.23452856 0.8551626 43.6138945 0.04461337
2 chl 0.03487894 -0.0857199 0.1554778 0.47022312
3 hyp -1.43636147 -6.0666990 3.1939761 0.52455041
> summary(pool(fit), conf.int = TRUE)[, extract]
Error in rq.fit.br(x, y, tau = tau, ci = TRUE, ...) :
unused arguments (effects = "fixed", parametric = TRUE, exponentiate = FALSE)
I have a binary classification problem:
How would it be possible to get the SHAP contributions of the variables for a ranger model?
Sample data:
library(ranger)
library(tidyverse)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
I have tried several libraries (DALEX, shapr, fastshap, shapper), but I didn't find a solution with any of them.
I would like to get results like those of SHAPforxgboost for xgboost, e.g.:
the output of shap.values which is the shap contribution of variables
the shap.plot.summary
Good morning!
According to what I have found, you can use ranger() with fastshap() as follows:
library(fastshap)
library(ranger)
library(tidyverse)
data(iris)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
x <- df %>% select(-Target)
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
# Prediction wrapper
pfun <- function(object, newdata) {
predict(object, data = newdata)$predictions
}
# Compute fast (approximate) Shapley values using 10 Monte Carlo repetitions
system.time({ # estimate run time
set.seed(5038)
shap <- fastshap::explain(model, X = x, pred_wrapper = pfun, nsim = 10)
})
# Load required packages
library(ggplot2)
theme_set(theme_bw())
# Aggregate Shapley values
shap_imp <- data.frame(
Variable = names(shap),
Importance = apply(shap, MARGIN = 2, FUN = function(x) sum(abs(x)))
)
Then for example, for variable importance, you can do:
# Plot Shap-based variable importance
ggplot(shap_imp, aes(reorder(Variable, Importance), Importance)) +
geom_col() +
coord_flip() +
xlab("") +
ylab("mean(|Shapley value|)")
Also, if you want individual predictions, the following is possible:
# Plot individual explanations
expl <- fastshap::explain(model, X = x ,pred_wrapper = pfun, nsim = 10, newdata = x[1L, ])
autoplot(expl, type = "contribution")
All this information was found here, and there is more to it: https://bgreenwell.github.io/fastshap/articles/fastshap.html
Check the link to resolve any remaining doubts! :)
I released two R packages to perform such tasks: "kernelshap" for the crunching, and "shapviz" for the plotting.
library(randomForest)
library(kernelshap)
library(shapviz)
set.seed(1)
fit <- randomForest(Sepal.Length ~ ., data = iris)
# bg_X is usually a small (50-200 rows) subset of the data
# Step 1: Calculate Kernel SHAP values
s <- kernelshap(fit, iris[-1], bg_X = iris)
# Step 2: Turn them into a shapviz object
sv <- shapviz(s)
# Step 3: Gain insights...
sv_importance(sv, show_numbers = TRUE)
sv_dependence(sv, v = "Petal.Length", color_var = "auto")
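If you also want the equivalent of shap.plot.summary from SHAPforxgboost, sv_importance has a beeswarm variant (assuming a reasonably recent shapviz; see ?sv_importance):
# beeswarm summary plot of the SHAP values
sv_importance(sv, kind = "beeswarm")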
I was wondering why lm() says 5 coefficients are not defined because of singularities and then gives NA for all 5 of them in the summary output.
Note that all my predictors are categorical.
Is there anything wrong with my data on these 5 coefficients or code? How can I possibly fix this?
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
summary(vv)
First 6 lines of output:
Coefficients: (5 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.17835 0.63573 0.281 0.779330
Age1 -0.04576 0.86803 -0.053 0.958010
Age2 0.46431 0.87686 0.530 0.596990
Age99 -1.64099 1.04830 -1.565 0.118949
genre2 1.57015 0.55699 2.819 0.005263 **
genre4 NA NA NA NA ## for example, this one is all `NA`s; there are 4 more!
As others noted, one problem is that you seem to have multicollinearity. Another is that there are missing values in your dataset. The missing values should probably just be removed. As for the correlated variables, you should inspect your data to identify this collinearity and remove it. Deciding which variables to remove and which to retain is a very domain-specific topic. However, you could, if you wish, use regularisation and fit a model while retaining all variables. This also allows you to fit a model when n (the number of samples) is less than p (the number of predictors).
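Incidentally, before dropping anything, base R can tell you exactly which coefficients are aliased (i.e. the five reported as NA):
# show the linear dependencies among the terms of your fitted model
alias(vv)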
The code below demonstrates how to examine the correlation structure within your data and identify which variables are most correlated (thanks to this answer). I've also included an example of fitting such a model, using L2 regularisation (commonly known as ridge regression).
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
df <- d
df[] <- lapply(df, as.numeric)
cor_mat <- cor(as.matrix(df), use = "complete.obs")
library("gplots")
heatmap.2(cor_mat, trace = "none")
## https://stackoverflow.com/questions/22282531/how-to-compute-correlations-between-all-columns-in-r-and-detect-highly-correlate
library("tibble")
library("dplyr")
library("tidyr")
d2 <- df %>%
as.matrix() %>%
cor(use = "complete.obs") %>%
## Set diag (a vs a) to NA, then remove
(function(x) {
diag(x) <- NA
x
}) %>%
as.data.frame %>%
rownames_to_column(var = 'var1') %>%
gather(var2, value, -var1) %>%
filter(!is.na(value)) %>%
## Sort by decreasing absolute correlation
arrange(-abs(value))
## 2 pairs of variables are almost exactly correlated!
head(d2)
#> var1 var2 value
#> 1 id study.name 0.9999430
#> 2 study.name id 0.9999430
#> 3 Location timed 0.9994082
#> 4 timed Location 0.9994082
#> 5 Age ed.level 0.7425026
#> 6 ed.level Age 0.7425026
## Remove some variables here, or maybe try regularized regression (see below)
library("glmnet")
## glmnet requires matrix input
X <- d[, c("Age", "genre", "Length", "cf.training", "error.type", "cf.scope", "cf.type", "cf.revision")]
X[] <- lapply(X, as.numeric)
X <- as.matrix(X)
ind_na <- apply(X, 1, function(row) any(is.na(row)))
X <- X[!ind_na, ]
y <- d[!ind_na, "dint"]
glmnet <- glmnet(
x = X,
y = y,
## alpha = 0 is ridge regression
alpha = 0)
plot(glmnet)
Created on 2019-11-08 by the reprex package (v0.3.0)
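If you do go the ridge route, cross-validation is the usual way to pick the penalty strength; a minimal sketch reusing the X and y built above:
# choose lambda by 10-fold cross-validation, then inspect the coefficients
cv <- cv.glmnet(x = X, y = y, alpha = 0)
coef(cv, s = "lambda.min")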
In such a situation you can use the "olsrr" package in R for stepwise regression analysis. Here is sample code to do a stepwise regression analysis in R:
library("olsrr")
#Load the data
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T)
# stepwise regression
vv <- lm(dint ~ Age + genre + Length + cf.training + error.type + cf.scope + cf.type + cf.revision, data = d)
summary(vv)
k <- ols_step_both_p(vv, pent = 0.05, prem = 0.1)
# stepwise regression plot
plot(k)
# final model
k$model
It will give you exactly the same output as SPSS.
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relation to the target variable. There are two approaches that I know of:
bin the continuous variables such that their IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the woeBinning package, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replaced WoE values that you can then pass into your regression instead of Variable_name (see the sketch below)
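A hedged sketch of that last step, using the placeholder names from above:
# fit the logistic regression on the WoE-replaced column
fit <- glm(target_name ~ Variable_name.woe.binned,
           data = dataset_woe, family = binomial)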
For the second point, on chi-square, the discretization package seems to handle it, but I haven't tested it.
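An untested sketch of that route, going by the discretization docs: chiM() implements ChiMerge and expects a data frame whose last column is the class label:
library(discretization)
# ChiMerge-discretize the continuous column(s); the class column must come last
disc <- chiM(dataset[, c("Variable_name", "target_name")], alpha = 0.05)
disc$cutp             # cut points found for each continuous column
head(disc$Disc.data)  # data with the binned values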
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics, because this approach implicitly hides the degrees of freedom expended in moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV = 1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantile( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf,test))