Apply a stats test by group with dplyr in R

I have the following data setup:
library(dplyr)
library(broom)

pop.mean = 0.10
set.seed(24)  # the seed used for the outputs shown below (see NOTE)
df = data.frame(
  trial = as.integer(runif(1000, min = 5, max = 20)),
  success = as.integer(runif(1000, min = 0, max = 20)),
  my.group = factor(rep(c("a", "b", "c", "d"), each = 250))
)
I want to group on my.group and apply binom.test:
bi.test <- df %>%
  group_by(my.group) %>%
  do(test = binom.test(sum(success),
                       sum(trial),
                       pop.mean,
                       alternative = "two.sided",
                       conf.level = 0.95))
I get an error message that success cannot be found. What am I doing wrong here?

We need to extract the columns with $ inside do():
res <- df %>%
  group_by(my.group) %>%
  do(test = binom.test(sum(.$success),
                       sum(.$trial),
                       pop.mean,
                       alternative = "two.sided",
                       conf.level = 0.95))
If we are using the broom functions, then:
res1 <- df %>%
  group_by(my.group) %>%
  do(test = tidy(binom.test(sum(.$success),
                            sum(.$trial),
                            pop.mean,
                            alternative = "two.sided",
                            conf.level = 0.95)))
res1$test %>%
  bind_rows %>%
  bind_cols(res1[1], .)
# A tibble: 4 x 9
#   my.group  estimate statistic p.value parameter  conf.low conf.high              method alternative
#     <fctr>     <dbl>     <dbl>   <dbl>     <dbl>     <dbl>     <dbl>              <fctr>      <fctr>
# 1        a 0.7908251      2310       0      2921 0.7756166 0.8054487 Exact binomial test   two.sided
# 2        b 0.7525138      2320       0      3083 0.7368831 0.7676640 Exact binomial test   two.sided
# 3        c 0.8446337      2479       0      2935 0.8310152 0.8575612 Exact binomial test   two.sided
# 4        d 0.7901683      2395       0      3031 0.7752305 0.8045438 Exact binomial test   two.sided
NOTE: The dataset was created with a seed of 24, i.e. set.seed(24).

Thanks @akrun. I came up with a solution using tidyr::nest and purrr::map after reading your answer:
res <- df %>%
  group_by(my.group) %>%
  tidyr::nest() %>%
  mutate(bi.test =
           purrr::map(data, function(df) broom::tidy(
             binom.test(sum(df$success),
                        sum(df$trial),
                        pop.mean,
                        alternative = "two.sided",
                        conf.level = 0.95)))) %>%
  select(my.group, bi.test) %>%
  tidyr::unnest()
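On current dplyr (1.1.0 or later), do() is superseded; the same result can be written with reframe(), which returns one tidied row per group. A minimal sketch, assuming the df and pop.mean defined above:
# Sketch only: reframe() (dplyr >= 1.1.0) replaces the superseded do().
# Because the tidy() result is an unnamed data frame, its columns are
# spliced directly alongside my.group.
res2 <- df %>%
  group_by(my.group) %>%
  reframe(broom::tidy(binom.test(sum(success),
                                 sum(trial),
                                 pop.mean,
                                 alternative = "two.sided",
                                 conf.level = 0.95)))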

Related

How to use %>% and calculate multiple metrics in R?

I have a tibble and I am trying to calculate multiple metrics.
library(tidymodels)

price = 1:50
prediction = price * 0.9
My_tibble = tibble(price = price, prediction = prediction)

# The following code can calculate the rmse
My_tibble %>%
  rmse(truth = price, estimate = prediction)

# Is it possible to calculate `rmse` and `rsq` at the same time?
# The following code reports an error: object '.pred' not found
My_tibble %>%
  rmse(truth = price, estimate = prediction) %>%
  rsq(truth = price, estimate = prediction)
To extend the question a little bit, is it possible to calculate rmse and cor at the same time?
My_tibble %>%
  rmse(truth = price, estimate = prediction)

# An error occurs: the condition has length > 1 and only the first element will be used
My_tibble %>%
  cor(x = price, y = prediction, method = "kendall")
Thanks to jpsmith, is it possible to bundle rmse and cor into a single summarise call?
# An error occurs: no applicable method for 'rmse' applied to an object of class "c('integer', 'numeric')"
My_tibble %>%
  summarize(
    COR = cor(x = price, y = prediction),
    RMSE = rmse(truth = price, estimate = prediction))
I've done this before by specifying desired metrics in metric_set and then passing it through:
mets <- metric_set(rmse, rsq)
My_tibble %>%
  mets(price, prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rmse    standard        2.93
# 2 rsq     standard        1
Which gives the same as:
My_tibble %>%
  rmse(truth = price, estimate = prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rmse    standard        2.93
My_tibble %>%
  rsq(truth = price, estimate = prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rsq     standard        1
For cor, you need to wrap it in summarize():
My_tibble %>%
  summarize(cor = cor(x = price, y = prediction))
#     cor
#   <dbl>
# 1     1
Not sure how to combine both the functions defined in mets and cor elegantly, but defining your own function can do it:
met_fun <- function(df){
  mets <- metric_set(rmse, rsq)
  a <- df %>%
    mets(price, prediction) %>%
    tidyr::pivot_wider(values_from = .estimate, names_from = .metric) %>%
    select(-.estimator)
  b <- df %>%
    summarize(cor = cor(x = price, y = prediction))
  cbind(a, b)
}
met_fun(My_tibble)
#       rmse rsq cor
# 1 2.930017   1   1
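One more sketch, not from the original answer: yardstick also exports vector versions of its metrics, rmse_vec() and rsq_vec(), which take bare vectors rather than a data frame and therefore sit next to cor() inside a single summarize():
# Sketch: *_vec() metrics work on plain vectors, so they compose with
# cor() in one summarize() call; values should match those shown above.
My_tibble %>%
  summarize(
    rmse = rmse_vec(truth = price, estimate = prediction),
    rsq  = rsq_vec(truth = price, estimate = prediction),
    cor  = cor(x = price, y = prediction, method = "kendall")
  )
# # A tibble: 1 x 3
# #    rmse   rsq   cor
# #   <dbl> <dbl> <dbl>
# # 1  2.93     1     1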
Good luck!

Prediction Intervals from Quantile Regression Forests have higher coverage than expected?

Question:
What factors may cause a prediction interval to have wider empirical coverage than expected, particularly for quantile regression forests fit with the ranger package?
Specific Context + REPREX:
I am using quantile regression forests through parsnip and the tidymodels suite of packages with ranger to generate prediction intervals. I was reviewing an example using the ames housing data and was surprised to see in the example below that my 90% prediction intervals had an empirical coverage of ~97% when evaluated on a hold-out dataset (coverage on the training data was even higher).
This was even more surprising given that my model performance is substantially worse on the hold-out set than on the training set, hence I would have guessed the coverage would be less than expected, not greater.
Load libraries, data, set-up split:
```{r}
library(tidyverse)
library(tidymodels)
library(AmesHousing)

ames <- make_ames() %>%
  mutate(Years_Old = Year_Sold - Year_Built,
         Years_Old = ifelse(Years_Old < 0, 0, Years_Old))

set.seed(4595)
data_split <- initial_split(ames, strata = "Sale_Price", prop = 0.75)
ames_train <- training(data_split)
ames_test <- testing(data_split)
```
Specify model workflow:
```{r}
rf_recipe <-
  recipe(
    Sale_Price ~ Lot_Area + Neighborhood + Years_Old + Gr_Liv_Area + Overall_Qual + Total_Bsmt_SF + Garage_Area,
    data = ames_train
  ) %>%
  step_log(Sale_Price, base = 10) %>%
  step_other(Neighborhood, Overall_Qual, threshold = 50) %>%
  step_novel(Neighborhood, Overall_Qual) %>%
  step_dummy(Neighborhood, Overall_Qual)

rf_mod <- rand_forest() %>%
  set_engine("ranger", importance = "impurity", seed = 63233, quantreg = TRUE) %>%
  set_mode("regression")

set.seed(63233)
rf_wf <- workflows::workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(rf_recipe) %>%
  fit(ames_train)
```
Make predictions on training and hold-out datasets:
```{r}
rf_preds_train <- predict(
  rf_wf$fit$fit$fit,
  workflows::pull_workflow_prepped_recipe(rf_wf) %>% bake(ames_train),
  type = "quantiles",
  quantiles = c(0.05, 0.50, 0.95)
) %>%
  with(predictions) %>%
  as_tibble() %>%
  set_names(paste0(".pred", c("_lower", "", "_upper"))) %>%
  mutate(across(contains(".pred"), ~10^.x)) %>%
  bind_cols(ames_train)

rf_preds_test <- predict(
  rf_wf$fit$fit$fit,
  workflows::pull_workflow_prepped_recipe(rf_wf) %>% bake(ames_test),
  type = "quantiles",
  quantiles = c(0.05, 0.50, 0.95)
) %>%
  with(predictions) %>%
  as_tibble() %>%
  set_names(paste0(".pred", c("_lower", "", "_upper"))) %>%
  mutate(across(contains(".pred"), ~10^.x)) %>%
  bind_cols(ames_test)
```
Show that the coverage rate is far higher for both the training and hold-out data than the expected 90% (empirically ~98% and ~97%, respectively):
```{r}
rf_preds_train %>%
  mutate(covered = ifelse(Sale_Price >= .pred_lower & Sale_Price <= .pred_upper, 1, 0)) %>%
  summarise(n = n(),
            n_covered = sum(covered),
            covered_prop = n_covered / n,
            stderror = sd(covered) / sqrt(n)) %>%
  mutate(min_coverage = covered_prop - 2 * stderror,
         max_coverage = covered_prop + 2 * stderror)
# # A tibble: 1 x 6
# #       n n_covered covered_prop stderror min_coverage max_coverage
# #   <int>     <dbl>        <dbl>    <dbl>        <dbl>        <dbl>
# # 1  2199      2159        0.982  0.00285        0.976        0.988

rf_preds_test %>%
  mutate(covered = ifelse(Sale_Price >= .pred_lower & Sale_Price <= .pred_upper, 1, 0)) %>%
  summarise(n = n(),
            n_covered = sum(covered),
            covered_prop = n_covered / n,
            stderror = sd(covered) / sqrt(n)) %>%
  mutate(min_coverage = covered_prop - 2 * stderror,
         max_coverage = covered_prop + 2 * stderror)
# # A tibble: 1 x 6
# #       n n_covered covered_prop stderror min_coverage max_coverage
# #   <int>     <dbl>        <dbl>    <dbl>        <dbl>        <dbl>
# # 1   731       706        0.966  0.00673        0.952        0.979
```
Guesses:
1. Something about the ranger package or quantile regression forests is overly extreme in the way it estimates quantiles, or I am overfitting in the 'extreme' direction somehow, leading to highly conservative prediction intervals.
2. This is a quirk specific to this dataset / model.
3. I am missing something or setting something up incorrectly.
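One quick diagnostic for these guesses (a sketch, not from the original post, reusing rf_wf and ames_test from the reprex above): sweep several nominal interval levels and check whether the over-coverage is uniform. The comparison is done on the log10 scale, which is equivalent because the transform is monotone.
```{r}
# Sketch: empirical vs nominal coverage on the hold-out set at several levels.
# Assumes rf_wf and ames_test from above; Sale_Price in the baked data is log10.
baked_test <- workflows::pull_workflow_prepped_recipe(rf_wf) %>%
  bake(ames_test)

coverage_at <- function(level) {
  a <- (1 - level) / 2
  q <- predict(rf_wf$fit$fit$fit, baked_test,
               type = "quantiles", quantiles = c(a, 1 - a))$predictions
  mean(baked_test$Sale_Price >= q[, 1] & baked_test$Sale_Price <= q[, 2])
}

tibble(nominal = c(0.50, 0.80, 0.90, 0.95)) %>%
  mutate(empirical = purrr::map_dbl(nominal, coverage_at))
```
If coverage is inflated at every level, that points toward guess 1 (systematically wide quantile estimates) rather than a quirk of the 90% interval.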

Conduct Multiple T-Tests in R, Condensed

I wish to conduct multiple t-tests in R without having to copy-paste each test. Each test will assess whether differences exist between the "Type" groups ("Left" vs. "Right") for a given "Level_#". Currently, I might have:
t.test(Level_1 ~ Type, alternative="two.sided", conf.level=0.99)
t.test(Level_2 ~ Type, alternative="two.sided", conf.level=0.99)
Type  Level_1 Level_2 Level_3
Left       17      50      98
Right      18      65      65
Left       23       7      19
Left       65       7     100
Right       9      13      17
The issue is that I have hundreds of "Level_#" and would like to know how to automate this process and output a data frame of the results. My thought is to somehow incorporate an apply function.
You can do it using the tidyverse approach, with the purrr and broom packages:
require(tidyverse)
require(broom)

df %>%
  gather(var, level, -type) %>%
  nest(-var) %>%
  mutate(model = purrr::map(data, function(x) {
           t.test(level ~ type, alternative = "two.sided", conf.level = 0.99,
                  data = x)
         }),
         value = purrr::map(model, tidy),
         conf.low = purrr::map(value, "conf.low"),
         conf.high = purrr::map(value, "conf.high"),
         pvalue = purrr::map(value, "p.value")) %>%
  select(-data, -model, -value)
Output:
     var  conf.low conf.high    pvalue
1 level1 -3.025393  4.070641 0.6941518
2 level2 -3.597754  3.356125 0.9260015
3 level3 -3.955293  3.673493 0.9210724
Sample data:
set.seed(123)
df <- data.frame(type = rep(c("left", "right"), 25),
                 level1 = rnorm(50, mean = 85, sd = 5),
                 level2 = rnorm(50, mean = 75, sd = 5),
                 level3 = rnorm(50, mean = 65, sd = 5))
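On current tidyr, gather() is retired and nest(-var) uses superseded syntax; a sketch of the same pipeline with pivot_longer() and named nest(), using the sample df above:
# Sketch with current tidyr verbs: pivot_longer() replaces gather(), and
# broom::tidy() yields one row per test that unnest() can flatten.
df %>%
  pivot_longer(-type, names_to = "var", values_to = "level") %>%
  nest(data = -var) %>%
  mutate(tidied = purrr::map(data, ~ broom::tidy(
    t.test(level ~ type, alternative = "two.sided", conf.level = 0.99,
           data = .x)))) %>%
  unnest(tidied) %>%
  select(var, conf.low, conf.high, p.value)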

Unnest fitted glm models

I have a tibble with nested glm models. I nest over a variable (region) and run a function region_model that fits the model.
library(tidyverse)

# toy data
test_data = data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
                       x = sample(0:1, 1000, replace = TRUE),
                       y = sample(1:100, 1000, replace = TRUE),
                       z = sample(0:1, 1000, replace = TRUE)) %>%
  arrange(region)

# nest
by_region = test_data %>%
  group_by(region) %>%
  nest()

# glm function
region_model <- function(df) {
  glm(x ~ y + z, data = df, family = "binomial")
}

# run the model
by_region = by_region %>% mutate(mod_rat = data %>% map(region_model))
The resulting tibble looks like this:
> by_region
# A tibble: 3 x 3
  region               data mod_rat
  <fctr>             <list>  <list>
1      a <tibble [352 x 3]> <S3: glm>
2      b <tibble [329 x 3]> <S3: glm>
3      c <tibble [319 x 3]> <S3: glm>
My purpose is to unnest the models to calculate marginal effects. I tried it and got this error:
> unnest(by_region, mod_rat)
Error: Each column must either be a list of vectors or a list of data frames [mod_rat]
I wonder whether it is possible to use unnest on this type of object (<S3: glm>) and, if not, whether there is an alternative way to get these estimates.
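(For context on the error itself: unnest() can only flatten list columns whose elements are data frames or atomic vectors, and a glm object is neither. A minimal sketch of the generic workaround is to convert each model to a data frame first, e.g. with broom::tidy(); note this yields coefficients rather than the marginal effects asked about.)
# Sketch: tidy each glm into a data frame so unnest() has something to flatten
by_region %>%
  mutate(tidied = purrr::map(mod_rat, broom::tidy)) %>%
  select(region, tidied) %>%
  tidyr::unnest(tidied)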
As it happens, the margins package has had some recent updates which will help you do this in a tidy fashion. In particular a margins_summary() function has been added that can be mapped onto nested model objects.
This issue on GitHub has the details.
Here is some code that works with your example, using the data from above:
library(tidyverse)
library(magrittr)
library(margins)

# toy data
test_data <- data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
                        x = sample(0:1, 1000, replace = TRUE),
                        y = sample(1:100, 1000, replace = TRUE),
                        z = sample(0:1, 1000, replace = TRUE)) %>%
  arrange(region)

# nest
by_region <-
  test_data %>%
  group_by(region) %>%
  nest()

# glm function
region_model <- function(df) {
  glm(x ~ y + z, data = df, family = "binomial")
}

# run the model
by_region %<>%
  mutate(mod_rat = map(data, region_model))
Use the margins_summary() function via purrr::map2() to compute marginal effects (I have included both methods for calculating the marginal effects with logistic regression, as described in the package vignette):
by_region %<>%
  mutate(marginals = map2(mod_rat, data, ~margins_summary(.x, data = .y)),
         marginals_link = map2(mod_rat, data, ~margins_summary(.x, data = .y, type = "link")))
We can now unnest either of the created list columns with the marginal effect data
by_region %>%
  unnest(marginals) -> region_marginals
region_marginals
# A tibble: 6 x 8
#   region factor      AME      SE      z     p
#   <fct>  <chr>     <dbl>   <dbl>  <dbl> <dbl>
# 1 a      y      -9.38e-4 9.71e-4 -0.966 0.334
# 2 a      z       3.59e-2 5.55e-2  0.647 0.517
# 3 b      y       1.14e-3 9.19e-4  1.24  0.215
# 4 b      z      -2.93e-2 5.38e-2 -0.545 0.586
# 5 c      y       4.67e-4 9.77e-4  0.478 0.633
# 6 c      z      -3.32e-2 5.49e-2 -0.604 0.546
# ... with 2 more variables: lower <dbl>, upper <dbl>
And plot nicely:
region_marginals %>%
  ggplot(aes(reorder(factor, AME), AME, ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 0, colour = "#AAAAAA") +
  geom_pointrange() +
  facet_wrap(~region) +
  coord_flip()

Create t.test table with dplyr?

Suppose I have data that looks like this:
set.seed(031915)
myDF <- data.frame(
  Name = rep(c("A", "B"), times = c(10, 10)),
  Group = rep(c("treatment", "control", "treatment", "control"), times = c(5, 5, 5, 5)),
  X = c(rnorm(n = 5, mean = .05, sd = .001), rnorm(n = 5, mean = .02, sd = .001),
        rnorm(n = 5, mean = .08, sd = .02), rnorm(n = 5, mean = .03, sd = .02))
)
I want to create a t-test table with a row for "A" and one for "B".
I can write my own function that does that:
ttestbyName <- function(Name) {
  b <- t.test(myDF$X[myDF$Group == "treatment" & myDF$Name == Name],
              myDF$X[myDF$Group == "control" & myDF$Name == Name],
              conf.level = 0.90)
  dataNameX <- data.frame(Name = Name,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(myDF[myDF$Group == "treatment" & myDF$Name == Name, ]),
                          ncontrol = nrow(myDF[myDF$Group == "control" & myDF$Name == Name, ]))
}
library(parallel)
Test_by_Name <- mclapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- do.call("rbind", Test_by_Name)
and the output looks like this:
  Name treatment control               CI pvalue ntreatment ncontrol
1    A    0.0500  0.0195 (0.0296, 0.0314) 0.0000          5        5
2    B    0.0654  0.0212  (0.0174, 0.071) 0.0161          5        5
I'm wondering if there is a cleaner way of doing this with dplyr. I thought about using group_by, but I'm a little lost.
Thanks!
Not much cleaner, but here's an improvement:
library(dplyr)
ttestbyName <- function(myName) {
  bt <- filter(myDF, Group == "treatment", Name == myName)
  bc <- filter(myDF, Group == "control", Name == myName)
  b <- t.test(bt$X, bc$X, conf.level = 0.90)
  dataNameX <- data.frame(Name = myName,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(bt),  # changes only in
                          ncontrol = nrow(bc))    # these 2 nrow() args
}
You should really replace the do.call function with rbindlist from data.table:
library(data.table)
Test_by_Name <- lapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- rbindlist(Test_by_Name)
or, even better, use the %>% pipes:
Test_by_Name <- myDF$Name %>%
  unique %>%
  lapply(., ttestbyName) %>%
  rbindlist
> Test_by_Name
   Name treatment control               CI pvalue ntreatment ncontrol
1:    A    0.0500  0.0195 (0.0296, 0.0314) 0.0000          5        5
2:    B    0.0654  0.0212  (0.0174, 0.071) 0.0161          5        5
An old question, but the broom package has since been made available for this exact purpose (as well as other statistical tests):
library(broom)
library(dplyr)
myDF %>%
  group_by(Name) %>%
  do(tidy(t.test(X ~ Group, data = .)))
Source: local data frame [2 x 9]
Groups: Name [2]
    Name    estimate  estimate1  estimate2  statistic      p.value
  (fctr)       (dbl)      (dbl)      (dbl)      (dbl)        (dbl)
1      A -0.03050475 0.01950384 0.05000860 -63.838440 1.195226e-09
2      B -0.04423181 0.02117864 0.06541046  -3.104927 1.613625e-02
Variables not shown: parameter (dbl), conf.low (dbl), conf.high (dbl)
library(tidyr)
library(dplyr)
myDF %>%
  group_by(Group) %>%
  mutate(rowname = 1:n()) %>%
  spread(Group, X) %>%
  group_by(Name) %>%
  do(b = t.test(.$control, .$treatment)) %>%
  mutate(
    treatment = round(b[['estimate']][[2]], digits = 4),
    control = round(b[['estimate']][[1]], digits = 4),
    CI = paste0("(", paste(b[['conf.int']], collapse = ", "), ")"),
    pvalue = b[['p.value']]
  )
#   Name treatment control                                         CI       pvalue
# 1    A    0.0500  0.0195 (-0.031677109707283, -0.0293323994902097) 1.195226e-09
# 2    B    0.0654  0.0212 (-0.0775829100729602, -0.010880719830447) 1.613625e-02
You can add ncontrol, ntreatment manually.
You can do it with a custom t.test function and do:
my.t.test <- function(data, formula, ...) {
  tt <- t.test(formula = formula, data = data, ...)
  ests <- tt$estimate
  # strip the "mean in group " prefix so the group labels become column names
  names(ests) <- sub("mean in group ", "", names(ests))
  # formula[c(1, 3)] is ~ Group, so xtabs() gives the per-group counts
  counts <- xtabs(formula[c(1, 3)], data)
  names(counts) <- paste0("n", names(counts))
  cbind(
    as.list(ests),
    data.frame(
      CI = paste0("(", paste(tt$conf.int, collapse = ", "), ")"),
      pvalue = tt$p.value,
      stringsAsFactors = FALSE
    ),
    as.list(counts)
  )
}
myDF %>% group_by(Name) %>% do(my.t.test(., X ~ Group))
Source: local data frame [2 x 7]
Groups: Name
  Name    control  treatment                                         CI       pvalue ncontrol ntreatment
1    A 0.01950384 0.05000860 (-0.031677109707283, -0.0293323994902097) 1.195226e-09        5          5
2    B 0.02117864 0.06541046 (-0.0775829100729602, -0.010880719830447) 1.613625e-02        5          5
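For completeness, on recent dplyr versions (0.8+) where do() is superseded, group_modify() applies a data-frame-returning function per group and row-binds the results. A sketch using broom again:
# Sketch: group_modify() is the modern replacement for do() here.
library(broom)
library(dplyr)
myDF %>%
  group_by(Name) %>%
  group_modify(~ tidy(t.test(X ~ Group, data = .x, conf.level = 0.90)))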
