Obtain CLD from TukeyHSD test - r

I have been struggling for a while to make my own CLDs from the output of a TukeyHSD test.
First I've done a two-way ANOVA:
aov2_arbuscular <- aov(arbuscular_count ~ block + pesticide*fertilizer, data = main_trial)
And did a TukeyHSD test as a post hoc test:
tk_arbuscular <- TukeyHSD(aov2_arbuscular)
Because I could not generate the CLD with the TukeyHSD output I used the emmeans() and cld() function.
tk_arbuscular_model <- emmeans(aov2_arbuscular,
pairwise ~ pesticide*fertilizer,
adjust = "tukey")
tk_arbuscular_model_cld <- cld(tk_arbuscular_model$emmeans,
alpha = .05,
Letters = letters)
I thought that both the TukeyHSD and emmeans with adjust = "tukey" result in the same output. Which they do for the most results, with unfortunately a few exceptions.
I have already written my result part and do not want to adjust all the p-values again. Therefore, can someone help me to generate the CLDs with the TukeyHSD output, so I can integrate them in a ggplot?

You did not provide your data, so I am creating my own reprex that can also be analyzed as a two-factorial block design, i.e. with a model similar to yours.
As you can see, I could not reproduce your problem - all p-values are basically identical. I noticed that you added the adjust = "Tukey" to the emmeans() statement, but I usually add it to the cld() statement instead - however that shouldn't be the problem.
library(tidyverse)
library(emmeans)
library(multcomp)
library(multcompView)
dataURL <- "https://raw.githubusercontent.com/SchmidtPaul/DSFAIR/master/data/Gomez%26Gomez1984.csv"
dat <- read_csv(dataURL) %>%
filter(G %in% c("A", "B") & N %in% c("N1", "N2")) %>%
mutate_at(vars(rep:N), as.factor)
aov <- aov(yield ~ G + N + G:N + rep, data = dat)
# get contrasts via 3 options ---------------------------------------------
option1 <- stats::TukeyHSD(aov) %>%
pluck("G:N")
option2 <- emmeans::emmeans(aov, ~ G:N) %>%
emmeans::pairs(adjust = "Tukey")
option3 <- emmeans::emmeans(aov, ~ G:N) %>%
multcomp::cld(adjust = "Tukey", details = TRUE)
# uniform format ----------------------------------------------------------
option1 <- option1 %>%
as_tibble(rownames = "contrast") %>%
transmute(contrast = contrast,
estimate = diff,
p.value = `p adj`)
option2 <- option2 %>%
as_tibble() %>%
dplyr::select(contrast, estimate, p.value)
option3 <- option3 %>%
pluck("comparisons") %>%
as_tibble() %>%
dplyr::select(contrast, estimate, p.value)
# compare -----------------------------------------------------------------
option1
#> # A tibble: 6 x 3
#> contrast estimate p.value
#> <chr> <dbl> <dbl>
#> 1 B:N1-A:N1 53.3 0.999
#> 2 A:N2-A:N1 1419. 0.0860
#> 3 B:N2-A:N1 1729. 0.0401
#> 4 A:N2-B:N1 1366 0.0984
#> 5 B:N2-B:N1 1676 0.0455
#> 6 B:N2-A:N2 310 0.910
option2
#> # A tibble: 6 x 3
#> contrast estimate p.value
#> <chr> <dbl> <dbl>
#> 1 A N1 - B N1 -53.3 0.999
#> 2 A N1 - A N2 -1419. 0.0860
#> 3 A N1 - B N2 -1729. 0.0401
#> 4 B N1 - A N2 -1366 0.0984
#> 5 B N1 - B N2 -1676 0.0455
#> 6 A N2 - B N2 -310. 0.910
option3
#> # A tibble: 6 x 3
#> contrast estimate p.value
#> <chr> <dbl> <dbl>
#> 1 B N1 - A N1 53.3 0.999
#> 2 A N2 - A N1 1419. 0.0860
#> 3 A N2 - B N1 1366 0.0984
#> 4 B N2 - A N1 1729. 0.0401
#> 5 B N2 - B N1 1676 0.0455
#> 6 B N2 - A N2 310. 0.910
tibble(
o1_p = option1$p.value,
o2_p = option2$p.value,
o3_p = option3$p.value
) %>% cor()
#> o1_p o2_p o3_p
#> o1_p 1.0000000 1.0000000 0.9967731
#> o2_p 1.0000000 1.0000000 0.9967731
#> o3_p 0.9967731 0.9967731 1.0000000
Created on 2021-12-02 by the reprex package (v2.0.1)
Does this help? If not, can you try to create a reproducible example with your data so that we have all the information?
Also, here is a chapter I wrote on using and interpreting the compact letter display.

Related

Using a function in R with multiple outcomes to create multiple columns in mutate

I am using a self declared function that runs a regression analysis. I want to run this for thousands of companies for multiple years, thus speed is essential. My function creates three outputs (a coefficient, the p value and r-squared). The function runs fine individually, however when I use mutate() to let it run through the whole dataset, it only gives the same values for all rows. The weirdest thing is that I can't reproduce those particular values by running the function individually. I made an reproducible example below. I have used lapply successfully before with this data, but I would like to keep it in mutate and above all I would like to know what's exactly happening here.
So my question is: how can I make this function work for each individual row for the companies dataset using mutate?
library(tidyverse)
companies <- data.frame(comp_id = 1:5)
individuals <- data.frame(id = 1:100,
comp_id = sample(1:5, 100, replace = T),
age = sample(18:67, 100, replace = T),
wage = sample(1700:10000, 100, replace = T))
regger <- function(x){
df <- individuals %>% filter(comp_id == x)
formula <- wage ~ age
regression <- lm(formula, df)
res <- list(coeff = summary(regression)$coefficient[2,1],
p = summary(regression)$coefficients[2,4],
r2 = summary(regression)$r.squared)
return(res)
}
companies %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
output:
# A tibble: 5 x 4
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 -4.92 0.916 0.000666
2 2 -4.92 0.916 0.000666
3 3 -4.92 0.916 0.000666
4 4 -4.92 0.916 0.000666
5 5 -4.92 0.916 0.000666
Use map from the purrr package if a function is not vectorized:
library(tidyverse)
set.seed(1337)
companies <- data.frame(comp_id = 1:5)
individuals <- data.frame(
id = 1:100,
comp_id = sample(1:5, 100, replace = T),
age = sample(18:67, 100, replace = T),
wage = sample(1700:10000, 100, replace = T)
)
regger <- function(x) {
df <- individuals %>% filter(comp_id == x)
formula <- wage ~ age
regression <- lm(formula, df)
res <- list(
coeff = summary(regression)$coefficient[2, 1],
p = summary(regression)$coefficients[2, 4],
r2 = summary(regression)$r.squared
)
return(res)
}
companies %>%
mutate(data = comp_id %>% map(regger)) %>%
unnest_wider(data)
#> # A tibble: 5 x 4
#> comp_id coeff p r2
#> <int> <dbl> <dbl> <dbl>
#> 1 1 67.1 0.108 0.218
#> 2 2 23.7 0.466 0.0337
#> 3 3 31.2 0.292 0.0462
#> 4 4 18.4 0.582 0.0134
#> 5 5 0.407 0.994 0.00000371
Created on 2021-09-09 by the reprex package (v2.0.1)
I'm not sure what the output should look like, but could it be that you need to work on a row-by-row basis?
companies %>%
rowwise() %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 21.6 0.470 0.0264
2 2 13.5 0.782 0.00390
3 3 0.593 0.984 0.0000175
4 4 -9.33 0.824 0.00394
5 5 64.9 0.145 0.156

conditionally mutating column values using `dplyr`

I am using WRS2 to carry out robust pairwise comparisons. But one problem is that it removes the group level names from the output dataframes and saves it in a different object.
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# comparisons
x$comp
#> Group Group psihat ci.lower ci.upper p.value
#> [1,] 1 2 -1.0 -3.440879 1.44087853 0.25984505
#> [2,] 1 3 -2.8 -5.536161 -0.06383861 0.04914871
#> [3,] 2 3 -1.8 -4.536161 0.93616139 0.17288911
# vector with group level names
x$fnames
#> [1] "placebo" "low" "high"
I can convert it to a tibble:
# converting to tibble
suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 -1 -3.44 1.44 0.260
#> 2 1 3 -2.8 -5.54 -0.0638 0.0491
#> 3 2 3 -1.8 -4.54 0.936 0.173
I would then like to replace the group column numeric values with actual names included in fnames (so map fnames[1] -> 1, fnames[2] -> 2, and so on).
So the final dataframe should look something like the following-
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
In this case, it was easy to just copy-paste the three values, but I want to have a generalizable approach where no matter the number of levels, it works. How can I do this using dplyr?
Using a named vector to match with tidyverse. This matches by value and not by the sequence of index i.e. if the value in 'Group' columns are not in a sequence or character, this would still work
library(dplyr)
as_tibble(x$comp, .name_repair = 'unique') %>%
mutate(across(starts_with("Group"),
~ setNames(x$fnames, seq_along(x$fnames))[as.character(.)]))
Does this fullfil your needs :
names <- c("A","B","C")
df = data.frame(group=c(1,2,3))
library(dplyr)
df %>% mutate(group = names[group])
group
1 A
2 B
3 C
Here's an approach using the recode function, with the recoding vector built programmatically from the data:
# Setup
set.seed(123)
library(WRS2)
library(tidyverse)
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# Create recoding vector
recode.vec = x$fnames %>% set_names(1:length(x$fnames))
# Recode columns
x.comp = x$comp %>%
as_tibble(.name_repair=make.unique) %>%
mutate(across(starts_with("Group"), ~recode(., !!!recode.vec)))
Output:
x.comp
#> # A tibble: 3 x 6
#> Group Group.1 psihat ci.lower ci.upper p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
Try this tidyverse approach formating data to long after extracting the objects as tibbles. You can use left_join() to get your groups as you want. Here the code to get something close to what you want:
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
#Transform to tibble
df1 <- suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
#Extract labels
df2 <- tibble(treat=x$fnames) %>% mutate(value=1:n())
#Format to long df1
df1 <- df1 %>%
mutate(id=1:n()) %>%
pivot_longer(cols = c(group1,group2)) %>%
rename(group=name) %>% left_join(df2) %>% select(-value) %>%
pivot_wider(names_from = group,values_from=treat) %>% select(-id)
Output:
# A tibble: 3 x 6
psihat ci.lower ci.upper p.value group1 group2
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 -1 -3.44 1.44 0.260 placebo low
2 -2.8 -5.54 -0.0638 0.0491 placebo high
3 -1.8 -4.54 0.936 0.173 low high

Using values in one dataframe as arguments for model estimated on another dataframe

I wish to estimate models in one dataframe, but the formula for each model has some "moving parts" which come from another dataframe. For example, say I wish to estimate the following model (I can't post picture and found no way to type latex equations):
mpg = a + b*log(w_1 * drat + w_2 * hp)
where w_1 and w_2 are weights, which for example are either 0.5 or 1. I use expand.grid() to create a dataframe of weights, then mutate() a formula using paste() or paste0() with the variable names and the value of the weights, and then pass it to the lm() function.
However, the model estimated is just using the formula found in the first row of the weights dataframe. This is solved if I use group_by() before estimating the models.
The question is - why? why doesn't the first code work? what does group_by() achieve here that makes it possible?
library(tidyverse)
cars <- mtcars
w <- seq(from=0.5, to=1, by=0.5)
weights <- as_tibble(expand.grid(w1=w,w2=w))
#Doesn't work - the lm model is fit using the formula from the first row only
weights %>%
mutate(formula_weights = paste0("mpg~log(",w1,"*drat+",w2,"*hp)")) %>%
mutate(r2 = summary(lm(data=cars, formula = formula_weights))$r.squared)
#Does work - model is fit using the w1 and w2 values from each row (formula_weights)
weights %>%
mutate(formula_weights = paste0("mpg~log(",w1,"*drat+",w2,"*hp)")) %>%
group_by(formula_weights) %>%
mutate(r2 = summary(lm(data=cars, formula = formula_weights))$r.squared)
The output without group_by():
# A tibble: 4 x 4
w1 w2 formula_weights r2
<dbl> <dbl> <chr> <dbl>
1 0.5 0.5 mpg~log(0.5*drat+0.5*hp) 0.715
2 1 0.5 mpg~log(1*drat+0.5*hp) 0.715
3 0.5 1 mpg~log(0.5*drat+1*hp) 0.715
4 1 1 mpg~log(1*drat+1*hp) 0.715
The output with group_by():
# A tibble: 4 x 4
# Groups: formula_weights [4]
w1 w2 formula_weights r2
<dbl> <dbl> <chr> <dbl>
1 0.5 0.5 mpg~log(0.5*drat+0.5*hp) 0.715
2 1 0.5 mpg~log(1*drat+0.5*hp) 0.709
3 0.5 1 mpg~log(0.5*drat+1*hp) 0.718
4 1 1 mpg~log(1*drat+1*hp) 0.715
We can add rowwise
library(dplyr)
weights %>%
mutate(formula_weights = paste0("mpg~log(",w1,"*drat+",w2,"*hp)")) %>%
rowwise() %>%
mutate(r2 = summary(lm(data=cars, formula = formula_weights))$r.squared)
#Source: local data frame [4 x 4]
#Groups: <by row>
# A tibble: 4 x 4
# w1 w2 formula_weights r2
# <dbl> <dbl> <chr> <dbl>
#1 0.5 0.5 mpg~log(0.5*drat+0.5*hp) 0.715
#2 1 0.5 mpg~log(1*drat+0.5*hp) 0.709
#3 0.5 1 mpg~log(0.5*drat+1*hp) 0.718
#4 1 1 mpg~log(1*drat+1*hp) 0.715
Or use map
library(purrr)
weights %>%
mutate(r2 = map_dbl(paste0("mpg~log(",w1,"*drat+",w2,"*hp)"), ~
summary(lm(data = cars, formula = .x))$r.squared))
# A tibble: 4 x 3
# w1 w2 r2
# <dbl> <dbl> <dbl>
#1 0.5 0.5 0.715
#2 1 0.5 0.709
#3 0.5 1 0.718
#4 1 1 0.715
use sapply inside your mutate. summary/lm are not vectorized
weights %>%
mutate(formula_weights = paste0("mpg~log(",w1,"*drat+",w2,"*hp)")) %>%
mutate(r2 = sapply(formula_weights,
function(fw) summary(lm(data=cars, formula =))$r.squared))

R summarise with multiple evalution metric functions that use actual and predicted from a data frame

I want to calculate multiple model evaluation metrics by groups for a data set. Each metric requires the input of actual (observed) and predicted values. These are columns in my data frame. My groups are represented by the variables iTime and an_id.
I can do the necessary calculations with summarise and much redundant typing, but there must be a purrr way to do this. I am trying to master purrr. I have tried invoke_map and pmap but could not figure out how to refer to the columns "actual" and "predicted" in my data frame.
A short example - there are more metrics needed:
library(Metrics)
df <- data.frame(an_id = c('G','J','J', 'J', 'G','G','J','G'),
iTime = c(1,1,2,2,1,2,1,2),
actual = c(1.28, 2.72,.664,.927,.711,1.16,.727,.834),
predicted = c(1.14,1.61,.475,.737,.715,1.15,.725,.90))
dataMetrics <- df %>%
group_by(an_id, iTime) %>%
summarise(vmae = mae(actual, predicted),
rae = rae(actual, predicted),
vrmse = rmse(actual, predicted))
> dataMetrics
A tibble: 4 x 5
an_id iTime vmae rae vrmse
<chr> <dbl> <dbl> <dbl> <dbl>
1 G 1 0.072 0.253 0.0990
2 G 2 0.038 0.233 0.0472
3 J 1 0.556 0.558 0.785
4 J 2 0.190 1.44 0.190
I don't know where mae, mase and rmse come from, which regrettably makes your example not reproducible. It's important to always explicitly state which packages you're using.
invoke_map is the way to map multiple functions to the same data. We can then combine that with nesting data and mapping invoke_map over the nested data.
I'll demonstrate with the sample data you give and by defining two functions f1 and f2:
f1 <- function(x, y) sum(abs(x - y))
f2 <- function(x, y) sum((x - y)^2)
library(tidyverse)
df %>%
group_by(an_id, iTime) %>%
nest() %>%
mutate(tmp = map(data, ~invoke_map_dfc(
list(f1 = f1, f2 = f2),
x = .x$actual, y = .x$predicted))) %>%
select(-data) %>%
unnest()
## A tibble: 4 x 4
# an_id iTime f1 f2
# <fct> <int> <dbl> <dbl>
#1 G 1 0.144 0.0196
#2 J 1 1.11 1.23
#3 J 2 0.381 0.0718
#4 G 2 0.01 0.0001
Explanation: We group observations by an_id and iTime, then nest the remaining data and use invoke_map_dfc inside map to apply f1 and f2 to data and store the result in columns of a nested tibble. The last step is removing the data column and un-nesting the summary stats.
Update
To reproduce your expected output
library(Metrics)
df %>%
group_by(an_id, iTime) %>%
nest() %>%
mutate(tmp = map(data, ~invoke_map_dfc(
list(vmae = mae, rae = rae, vrmse = rmse),
actual = .x$actual, predicted = .x$predicted))) %>%
select(-data) %>%
unnest()
## A tibble: 4 x 5
# an_id iTime vmae rae vrmse
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 G 1 0.072 0.253 0.0990
#2 J 1 0.556 0.558 0.785
#3 J 2 0.190 1.44 0.190
#4 G 2 0.038 0.233 0.0472
Sample data
df <- read.table(text =
"an_id iTime actual predicted
G 1 1.28 1.14
J 1 2.72 1.61
J 2 0.664 0.475
J 2 0.927 0.737
G 1 0.711 0.715
G 2 1.16 1.15
J 2 0.727 0.725", header = T)

Tidyverse syntax for calculating precision and recall

I am trying to calculate AUC, Precision, Recall, Accuracy for every group in my data frame (i have a single data frame that has predicted data from three different models concatenated).
What is the tidyverse syntax to do it? I want to use the yardstick package by Max Kuhn to calculate these metrics.
Here is a sample df and here is where I got so far:
> library(tidyverse)
> library(yardstick)
>
> sample_df <- data_frame(
+ group_type = rep(c('a', 'b', 'c'), each = 5), # repeats each element 5 times
+ true_label = as.factor(rbinom(15, 1, 0.3)), # generates 1 with 30% prob
+ pred_prob = runif(15, 0, 1) # generates 15 decimals between 0 and 1 from uniform dist
+ ) %>%
+ mutate(pred_label = as.factor(if_else(pred_prob > 0.5, 1, 0)))
>
> sample_df
# A tibble: 15 x 4
group_type true_label pred_prob pred_label
<chr> <fct> <dbl> <fct>
1 a 1 0.327 0
2 a 1 0.286 0
3 a 0 0.0662 0
4 a 0 0.993 1
5 a 0 0.835 1
6 b 0 0.975 1
7 b 0 0.436 0
8 b 0 0.585 1
9 b 0 0.478 0
10 b 1 0.541 1
11 c 1 0.247 0
12 c 0 0.608 1
13 c 0 0.215 0
14 c 0 0.937 1
15 c 0 0.819 1
>
Metrics:
> # metrics for the full data
> precision(sample_df, truth = true_label, estimate = pred_label)
[1] 0.5714286
> recall(sample_df, truth = true_label, estimate = pred_label)
[1] 0.3636364
> accuracy(sample_df, truth = true_label, estimate = pred_label)
[1] 0.3333333
> roc_auc(sample_df, truth = true_label, pred_prob)
[1] 0.7727273
>
Now how do i get these metrics for each group in my dataset??
sample_df %>%
group_by(group_type) %>%
summarize(???)
An example using unnest:
sample_df %>%
group_by(group_type) %>%
do(auc = roc_auc(., true_label, pred_prob),
acc = accuracy(., true_label, pred_label),
recall = recall(., true_label, pred_label),
precision = precision(., true_label, pred_label)) %>% unnest
HOWEVER,
I would actually suggest to not use yardstick because it doesn't play nice with dplyr summarize. Actually, it just uses the ROCR package under the hood. I would just make your own functions that take in two variables.
yardstick is flawed because it requires a data.frame as it's first input, it is trying to be too clever. Under the dplyr framework, that isn't necessary because of summarize and mutate as functions already see the variables inside a data.frame without an explicit data parameter.
As others have noted, the functions in yardstick don't really play nice with grouped data frames (at least as of yet). A workaround could be to work with nested data.
In order to reduce replication, it's probably also a good idea to write a simple wrapper function that calculates all of the summary metrics you want in one call. Here's an example of how you could go about doing just that:
reprex::reprex_info()
#> Created by the reprex package v0.1.1.9000 on 2018-02-09
Set up first:
library(tidyverse)
library(yardstick)
set.seed(1)
# Given sample data
sample_df <- data_frame(
group_type = rep(c('a', 'b', 'c'), each = 5), # repeats each element 5 times
true_label = as.factor(rbinom(15, 1, 0.3)), # generates 1 with 30% prob
pred_prob = runif(15, 0, 1) # generates 15 decimals between 0 and 1 from uniform dist
) %>%
mutate(pred_label = as.factor(if_else(pred_prob > 0.5, 1, 0)))
#> Warning: package 'bindrcpp' was built under R version 3.3.3
Here are the wrappers:
# Wrapper to calculate several metrics from same data
performance_metrics <- function(data, truth, estimate, prob) {
metrics <- lst(precision, recall, accuracy) # these all share arguments
values <- invoke_map_df(metrics, list(list(data)), truth, estimate)
roc <- roc_auc(sample_df, truth, prob) # bit different here
bind_cols(values, roc_auc = roc)
}
# Wrap the wrapper with default arguments
metrics <- partial(performance_metrics,
truth = "true_label",
estimate = "pred_label",
prob = "pred_prob")
And apply to groups by nesting the data:
sample_df %>%
nest(-group_type) %>%
mutate(metrics = map(data, metrics)) %>%
unnest(metrics)
#> # A tibble: 3 x 6
#> group_type data precision recall accuracy roc_auc
#> <chr> <list> <dbl> <dbl> <dbl> <dbl>
#> 1 a <tibble [5 x 3]> 0.5000000 0.2500000 0.2 0.5909091
#> 2 b <tibble [5 x 3]> 0.6666667 0.6666667 0.6 0.5909091
#> 3 c <tibble [5 x 3]> 0.7500000 0.7500000 0.6 0.5909091
I managed to do it by spiting a data frame to a list and mapping the function to each list element:
library(tidyverse)
library(yardstick)
sample_df %>%
split(.$group_type) %>%
map_dfr(precision, true_label, pred_label)
#output
## A tibble: 1 x 3
a b c
<dbl> <dbl> <dbl>
1 0.500 0.667 1.00
it seems group_by is not supported by yardstick functions yet
This also works:
sample_df %>%
split(.$group_type) %>%
map_dfr(function(x){
prec = precision(x, true_label, pred_label)
rec = recall(x, true_label, pred_label)
return(data.frame(prec, rec))
})
I used the example in http://r4ds.had.co.nz/many-models.html
It uses nest but also uses precision as you requested.
library(tidyverse)
library(yardstick)
sample_df <- data_frame(group_type = rep(c('a', 'b', 'c'), each = 5), # repeats each element 5 times
true_label = as.factor(rbinom(15, 1, 0.3)), # generates 1 with 30% prob
pred_prob = runif(15, 0, 1) # generates 15 decimals between 0 and 1 from uniform dist
) %>%
mutate(pred_label = as.factor(if_else(pred_prob > 0.5, 1, 0)))
by_group_type <- sample_df %>% group_by(group_type) %>% nest()
stick_m_1 <- function(df){
precision(df,truth = true_label, estimate = pred_label)
}
models <- map(by_group_type$data,stick_m_1)
models

Resources