Combine results from multiple lme4 objects in R

I am running a mixed effects model on my dataset:
library(lme4)
data(cake)
Each dataset is a subset of a larger dataset:
subset(cake, recipe=="A")
subset(cake, recipe=="B")
subset(cake, recipe=="C")
I am using dlply to run my mixed effects model on each subset
MxM1 <- plyr::dlply(
  cake,
  "recipe",
  function(x)
    lmer(angle ~ 1 + (1 | replicate) + temperature, data = x)
)
This gives me a list of models, one per subset of the data.
I know how to display the summaries one at a time using the gtsummary package:
lm_cake$A %>%
  tbl_regression() %>%
  modify_column_hide(columns = ci) %>%
  modify_column_unhide(columns = std.error)

lm_cake$B %>%
  tbl_regression() %>%
  modify_column_hide(columns = ci) %>%
  modify_column_unhide(columns = std.error)

lm_cake$C %>%
  tbl_regression() %>%
  modify_column_hide(columns = ci) %>%
  modify_column_unhide(columns = std.error)
I am not sure how to combine the results from all 3 objects (lm_cake$A, lm_cake$B, lm_cake$C) to display them as one summary table.
              Model: A     Model: B     Model: C
Temperature   Beta   SE    Beta   SE    Beta   SE
temperature.L
temperature.Q
temperature.C
temperature^4
temperature^5
Any suggestions or help is much appreciated. Thanks.

You can also merge two or more gtsummary tables using the gtsummary::tbl_merge() function. Example below!
library(gtsummary)
#> #StandWithUkraine
library(lme4)
#> Loading required package: Matrix
data(cake)
MxM1 <-
  plyr::dlply(
    cake,
    "recipe",
    function(x) {
      lmer(angle ~ 1 + (1 | replicate) + temperature, data = x) %>%
        tbl_regression() %>%
        modify_column_hide(columns = ci) %>%
        modify_column_unhide(columns = std.error)
    }
  )
# Merge all model summaries together with `tbl_merge()`
tbl <-
MxM1 %>%
tbl_merge(
tab_spanner = c("**A**", "**B**", "**C**")
)
Created on 2022-12-17 with reprex v2.0.2
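If you then want to further style or export the merged table, it can be handed off to the gt package (a sketch; the header text is just illustrative):
# Convert the merged gtsummary table to a gt object for styling or export
tbl %>%
  as_gt() %>%
  gt::tab_header(title = "Mixed-effects models by recipe")  # hypothetical title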

Update:
While the answer by @Daniel D. Sjoberg is perfect and the desired one, here is the answer to the OP's question in the comments:
"How can i convert the final results from long format to wide, by each recipe?"
After filtering the temperature terms we could use pivot_wider and some tweaking thereafter.
Note that we have to use the broom.mixed package to tidy our lmer models:
library(lme4)
library(tidyverse)
#library(broom)
library(broom.mixed)
cake %>%
  mutate(recipe = as_factor(recipe)) %>%
  group_by(recipe) %>%
  group_split() %>%
  map_dfr(.f = function(df) {
    lmer(angle ~ 1 + (1 | replicate) + temperature, data = df) %>%
      tidy() %>%
      add_column(recipe = unique(df$recipe), .before = 1)
  }) %>%
  filter(str_detect(term, "temperature")) %>%
  select(recipe, term, Beta = estimate, SE = std.error) %>%
  pivot_wider(names_from = recipe,
              values_from = c(Beta, SE)) %>%
  rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
  select(term, sort(colnames(.)))
term A_Beta A_SE B_Beta B_SE C_Beta C_SE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 temperature.L 6.43 1.22 6.88 1.16 6.52 1.12
2 temperature.Q -0.713 1.22 -0.946 1.16 0.502 1.12
3 temperature.C -2.33 1.22 0.368 1.16 0.313 1.12
4 temperature^4 -3.35 1.22 -0.328 1.16 -0.214 1.12
5 temperature^5 -0.151 1.22 -0.815 1.16 -1.78 1.12
First answer:
Do you need something like this?
library(lme4)
data(cake)
library(dplyr)
library(purrr)     # for map_dfr()
library(forcats)   # for as_factor()
library(tibble)    # for add_column()
library(broom)
library(broom.mixed)
cake %>%
  mutate(recipe = as_factor(recipe)) %>%
  group_by(recipe) %>%
  group_split() %>%
  map_dfr(.f = function(df) {
    lmer(angle ~ 1 + (1 | replicate) + temperature, data = df) %>%
      tidy() %>%
      add_column(recipe = unique(df$recipe), .before = 1)
  })
# A tibble: 24 × 7
recipe effect group term estimate std.error statistic
<fct> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 A fixed NA (Intercept) 33.1 1.42 23.3
2 A fixed NA temperature.L 6.43 1.22 5.26
3 A fixed NA temperature.Q -0.713 1.22 -0.583
4 A fixed NA temperature.C -2.33 1.22 -1.90
5 A fixed NA temperature^4 -3.35 1.22 -2.74
6 A fixed NA temperature^5 -0.151 1.22 -0.124
7 A ran_pars replicate sd__(Intercept) 5.16 NA NA
8 A ran_pars Residual sd__Observation 4.73 NA NA
9 B fixed NA (Intercept) 31.6 1.81 17.5
10 B fixed NA temperature.L 6.88 1.16 5.93
# … with 14 more rows
# ℹ Use `print(n = ...)` to see more rows

Related

How to create a tidy table of t-tests?

I'm trying to figure out if there is a straightforward way to create a table of paired t-tests using tidyverse packages. There are already Q&As addressing this topic (e.g., here), but the existing answers all seem pretty convoluted.
Here's a reproducible example showing what I'm trying to accomplish -- a column of variable names, columns with the means for both items in the pair for each variable, and a column of p-values:
library(dplyr)
library(infer)
library(tidyr)
df <- mtcars %>%
mutate(engine = if_else(vs == 0, "V-shaped", "straight"))
v_shaped <- df %>%
filter(engine == "V-shaped") %>%
summarise(across(c(mpg, disp), mean)) %>%
pivot_longer(cols = everything()) %>%
rename(V_shaped = value)
straight <- df %>%
filter(engine == "straight") %>%
summarise(across(c(mpg, disp), mean)) %>%
pivot_longer(cols = everything()) %>%
rename(straight = value)
mpg <- df %>%
t_test(formula = mpg ~ engine, alternative = "two-sided") %>%
select(p_value) %>%
mutate(name = "mpg")
disp <- df %>%
t_test(formula = disp ~ engine, alternative = "two-sided") %>%
select(p_value) %>%
mutate(name = "disp")
p_values <- bind_rows(mpg, disp)
table <- v_shaped %>%
full_join(straight, by = "name") %>%
full_join(p_values, by = "name")
table
#> # A tibble: 2 × 4
#> name V_shaped straight p_value
#> <chr> <dbl> <dbl> <dbl>
#> 1 mpg 16.6 24.6 0.000110
#> 2 disp 307. 132. 0.00000248
Obviously, this is not a good way to address this problem even for two variables, and it certainly does not scale well. But it does illustrate the intended outcome. Is there a way to do this in one pipeline? My actual use case involves many more variables, so -- ideally -- I'd be able to feed a vector of variable names into the pipe.
Here's one way of doing it in a single pipe -
library(tidyverse)
library(infer)
df %>%
#select the columns you are interested in
select(mpg, disp, engine) %>%
#get them in long format
pivot_longer(cols = -engine) %>%
#Divide the data in a list of dataframes
split(.$name) %>%
#For each dataframe
map_df(~{
#Get the mean value for each engine
.x %>%
group_by(engine) %>%
summarise(value = mean(value)) %>%
#get the data in wide format
pivot_wider(names_from = engine) %>%
#Combine it with t.test result
bind_cols(t_test(.x, formula = value ~ engine, alternative = "two-sided"))
}, .id = "name")
# name `V-shaped` straight statistic t_df p_value alternative lower_ci upper_ci
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
#1 disp 307. 132. -5.94 27.0 0.00000248 two.sided -235. -114.
#2 mpg 16.6 24.6 4.67 22.7 0.000110 two.sided 4.42 11.5
Obviously, you can keep only the columns (using select) that are relevant to you.
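If you'd rather feed a vector of variable names into the pipe, as the question asks, the selection step can be parameterized (a sketch using tidyselect's all_of(); vars is a hypothetical vector):
library(tidyverse)
library(infer)
vars <- c("mpg", "disp", "hp")  # hypothetical vector of variable names
df %>%
  # keep only the requested columns plus the grouping variable
  select(all_of(vars), engine) %>%
  pivot_longer(cols = -engine) %>%
  split(.$name) %>%
  map_df(~ {
    .x %>%
      group_by(engine) %>%
      summarise(value = mean(value)) %>%
      pivot_wider(names_from = engine) %>%
      bind_cols(t_test(.x, formula = value ~ engine, alternative = "two-sided"))
  }, .id = "name")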
This can be achieved with a call to summarise() and a flattening of the results. If we use stats::t.test() the group means are returned as part of the test and don't need to be calculated separately. broom::tidy() ensures that each result is returned in a 1-row tibble.
library(dplyr)
library(purrr)
library(broom)
mtcars %>%
summarise(across(-vs, \(x)
list(tidy(
t.test(x ~ vs, data = ., alternative = "two.sided")
)))) %>%
flatten_dfr(.id = "names") %>%
rename("V-shaped" = estimate1, straight = estimate2)
# A tibble: 10 × 11
names estimate `V-shaped` straight statistic p.value parameter conf.low conf.high method alternative
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 mpg -7.94 16.6 24.6 -4.67 0.000110 22.7 -11.5 -4.42 Welch Two Sample t-test two.sided
2 cyl 2.87 7.44 4.57 7.79 0.0000000112 29.9 2.12 3.63 Welch Two Sample t-test two.sided
3 disp 175. 307. 132. 5.94 0.00000248 27.0 114. 235. Welch Two Sample t-test two.sided
4 hp 98.4 190. 91.4 6.29 0.00000182 23.6 66.1 131. Welch Two Sample t-test two.sided
5 drat -0.467 3.39 3.86 -2.66 0.0129 27.1 -0.827 -0.107 Welch Two Sample t-test two.sided
6 wt 1.08 3.69 2.61 3.76 0.000728 30.0 0.493 1.66 Welch Two Sample t-test two.sided
7 qsec -2.64 16.7 19.3 -5.94 0.00000352 24.6 -3.56 -1.72 Welch Two Sample t-test two.sided
8 am -0.167 0.333 0.5 -0.927 0.362 27.1 -0.535 0.202 Welch Two Sample t-test two.sided
9 gear -0.302 3.56 3.86 -1.22 0.232 28.8 -0.807 0.204 Welch Two Sample t-test two.sided
10 carb 1.83 3.61 1.79 3.98 0.000413 29.6 0.888 2.76 Welch Two Sample t-test two.sided
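A compatibility note: flatten_dfr() was deprecated in purrr 1.0.0. One alternative with the same output shape (a sketch) is to pull the one-row tidy tibbles out of the list-columns and bind them:
mtcars %>%
  summarise(across(-vs, \(x)
    list(tidy(t.test(x ~ vs, data = ., alternative = "two.sided"))))) %>%
  as.list() %>%            # named list of length-1 list-columns
  map(1) %>%               # extract the tidy tibble from each column
  bind_rows(.id = "names") %>%
  rename("V-shaped" = estimate1, straight = estimate2)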

Tidymodels Error: Can't rename variables in this context

I recently picked up tidymodels after having used R for a few months at school.
I was trying to make my first model using the Titanic Dataset on Kaggle, but ran into some issues when fitting the model. Could someone help me?
titanic_rec <- recipe(Survived ~ Sex + Age + Pclass + Embarked + Family_Size + Name, data = titanic_train) %>%
step_impute_knn(all_predictors(), k = 3) %>%
step_dummy(Sex, Pclass, Embarked, Family_Size, Name) %>%
step_interact(~ Sex:Age + Sex:Pclass + Pclass:Age)
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
fitted_log_model <- workflow() %>%
add_model(log_model) %>%
add_recipe(titanic_rec) %>%
fit(data = titanic_train) %>%
pull_workflow_fit() %>%
tidy()
Every feature has a factor data type except Age and Survived, which are doubles. The error seems to come up when I include fit(data = ...) onwards.
Error: Can't rename variables in this context. Run `rlang::last_error()` to see where the error occurred.
24. stop(fallback)
23. signal_abort(cnd)
22. abort("Can't rename variables in this context.")
21. eval_select_recipes(to_impute, training, info)
20. impute_var_lists(to_impute = x$terms, impute_using = x$impute_with, training = training, info = info)
19. prep.step_impute_knn(x$steps[[i]], training = training, info = x$term_info)
18. prep(x$steps[[i]], training = training, info = x$term_info)
17. prep.recipe(blueprint$recipe, training = data, fresh = blueprint$fresh)
16. recipes::prep(blueprint$recipe, training = data, fresh = blueprint$fresh)
15. blueprint$mold$process(blueprint = blueprint, data = data)
14. run_mold.recipe_blueprint(blueprint, data)
13. run_mold(blueprint, data)
12. mold.recipe(recipe, data, blueprint = blueprint)
11. hardhat::mold(recipe, data, blueprint = blueprint)
10. fit.action_recipe(action, workflow = workflow, data = data)
9. fit(action, workflow = workflow, data = data)
8. .fit_pre(workflow, data)
7. fit.workflow(., data = titanic_train)
6. fit(., data = titanic_train)
5. is_workflow(x)
4. validate_is_workflow(x)
3. pull_workflow_fit(.)
2. tidy(.)
1. workflow() %>% add_model(log_model) %>% add_recipe(titanic_rec) %>% fit(data = titanic_train) %>% pull_workflow_fit() %>% tidy()
The posted error comes from step_impute_knn(), where the number of neighbors should be specified with neighbors. Secondly, I would advise against using Name as a predictor, since it creates a separate dummy variable for each name, which would mess with the fit.
The final error comes in step_interact(). You can't use step_interact(~ Sex:Age) after step_dummy(Sex) because there won't be any columns named Sex after step_dummy() is done; instead there will be Sex_male (since female is part of the intercept). A way to catch all the created dummy variables is to use starts_with() inside step_interact().
library(tidymodels)
titanic_train <- readr::read_csv("your/path/to/data/train.csv")
titanic_train <- titanic_train %>%
mutate(Survived = factor(Survived),
Pclass = factor(Pclass),
Family_Size = SibSp + Parch + 1)
titanic_rec <- recipe(Survived ~ Sex + Age + Pclass + Embarked + Family_Size,
data = titanic_train) %>%
step_impute_knn(all_predictors(), neighbors = 3) %>%
step_dummy(Sex, Pclass, Embarked) %>%
step_interact(~ starts_with("Sex_"):Age +
starts_with("Sex_"):starts_with("Pclass_") +
starts_with("Pclass_"):Age)
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
fitted_log_model <- workflow() %>%
add_model(log_model) %>%
add_recipe(titanic_rec) %>%
fit(data = titanic_train) %>%
pull_workflow_fit() %>%
tidy()
fitted_log_model
#> # A tibble: 13 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 3.85 0.921 4.18 0.0000289
#> 2 Age 0.0117 0.0226 0.516 0.606
#> 3 Family_Size -0.226 0.0671 -3.36 0.000769
#> 4 Sex_male -2.22 0.886 -2.50 0.0124
#> 5 Pclass_X2 1.53 1.16 1.31 0.189
#> 6 Pclass_X3 -2.42 0.884 -2.74 0.00615
#> 7 Embarked_Q -0.0461 0.368 -0.125 0.900
#> 8 Embarked_S -0.548 0.243 -2.26 0.0241
#> 9 Sex_male_x_Age -0.0488 0.0199 -2.46 0.0140
#> 10 Sex_male_x_Pclass_X2 -1.28 0.879 -1.46 0.144
#> 11 Sex_male_x_Pclass_X3 1.48 0.699 2.11 0.0347
#> 12 Age_x_Pclass_X2 -0.0708 0.0263 -2.69 0.00714
#> 13 Age_x_Pclass_X3 -0.0341 0.0209 -1.63 0.103
Created on 2021-07-01 by the reprex package (v2.0.0)
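As an aside: in newer releases of workflows, pull_workflow_fit() is superseded by extract_fit_parsnip(). A sketch of the same pipeline with the newer extractor (assuming workflows >= 0.2.3):
fitted_log_model <- workflow() %>%
  add_model(log_model) %>%
  add_recipe(titanic_rec) %>%
  fit(data = titanic_train) %>%
  extract_fit_parsnip() %>%  # newer replacement for pull_workflow_fit()
  tidy()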

Retrieving tidy results from regression by group with broom

The answer to this question clearly explains how to retrieve tidy regression results by group when running a regression through a dplyr pipe, but the solution is no longer reproducible.
How can one use dplyr and broom in combination to run a regression by group and retrieve tidy results using R 4.0.2, dplyr 1.0.0, and broom 0.7.0?
Specifically, the example answer from the question linked above,
library(dplyr)
library(broom)
df.h = data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
dfHour = df.h %>% group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .))
# get the coefficients by group in a tidy data_frame
dfHourCoef = tidy(dfHour, fitHour)
returns the following error (and three warnings) when I run it on my system:
Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
Calling var(x) on a factor x is defunct.
Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
In addition: Warning messages:
1: Data frame tidiers are deprecated and will be removed in an upcoming release of broom.
2: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
3: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
If I reformat df.h$hour as a character rather than factor,
df.h <- df.h %>%
mutate(
hour = as.character(hour)
)
re-run the regression by group, and again attempt to retrieve the results using broom::tidy,
dfHour = df.h %>% group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .))
# get the coefficients by group in a tidy data_frame
dfHourCoef = tidy(dfHour, fitHour)
I get this error:
Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
is.atomic(x) is not TRUE
I assume that the problem has to do with the fact that the group-level regression results are stored as lists in dfHour$fitHour, but I am unsure how to correct the error and once again tidily and quickly compile the regression results, as used to work in the originally posted code/answer.
****** Updated with more succinct code pulled from the dplyr 1.0.0 release notes ******
Thank you. I was struggling with a similar question with the update to dplyr 1.0.0 related to using the examples in the provided link. This was both a helpful question and answer.
One note as an FYI: do() has been superseded as of dplyr 1.0.0, so you may consider using the updated language (now very efficient with my update):
dfHour = df.h %>%
  # replace group_by() with nest_by()
  # to convert your model data to a vector of lists
  nest_by(hour) %>%
  # change do() to mutate(), then add list() before your model
  # make sure to change data = . to data = data
  mutate(fitHour = list(lm(price ~ wind + temp, data = data))) %>%
  summarise(tidy(fitHour))
Done!
This gives a very efficient df with select output stats. The last line replaces the following code (from my original response), which does the same thing, but less easily:
ungroup() %>%
# then leverage the feedback from @akrun
transmute(hour, HourCoef = map(fitHour, tidy)) %>%
unnest(HourCoef)
dfHour
Which gives the output:
# A tibble: 72 x 6
hour term estimate std.error statistic p.value
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 (Intercept) 68.6 21.0 3.27 0.00428
2 1 wind 0.000558 0.0124 0.0450 0.965
3 1 temp -0.866 0.907 -0.954 0.353
4 2 (Intercept) 31.9 17.4 1.83 0.0832
5 2 wind 0.00950 0.0113 0.838 0.413
6 2 temp 1.69 0.802 2.11 0.0490
7 3 (Intercept) 85.5 22.3 3.83 0.00122
8 3 wind -0.0210 0.0165 -1.27 0.220
9 3 temp 0.276 1.14 0.243 0.811
10 4 (Intercept) 73.3 15.1 4.86 0.000126
# ... with 62 more rows
Thanks for the patience, I am working through this myself!
The issue is that there is a rowwise grouping attribute after the do() call, and the column 'fitHour' is a list. We can ungroup, loop over the list with map(), and tidy it into a list column:
library(dplyr)
library(purrr)
library(broom)
df.h %>%
group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .)) %>%
ungroup %>%
mutate(HourCoef = map(fitHour, tidy))
Or use unnest after the mutate:
df.h %>%
group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .)) %>%
ungroup %>%
transmute(hour, HourCoef = map(fitHour, tidy)) %>%
unnest(HourCoef)
# A tibble: 72 x 6
# hour term estimate std.error statistic p.value
# <fct> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 (Intercept) 89.8 20.2 4.45 0.000308
# 2 1 wind 0.00493 0.0151 0.326 0.748
# 3 1 temp -1.84 1.08 -1.71 0.105
# 4 2 (Intercept) 75.6 23.7 3.20 0.00500
# 5 2 wind -0.00910 0.0146 -0.622 0.542
# 6 2 temp 0.192 0.853 0.225 0.824
# 7 3 (Intercept) 44.0 23.9 1.84 0.0822
# 8 3 wind -0.00158 0.0166 -0.0953 0.925
# 9 3 temp 0.622 1.19 0.520 0.609
#10 4 (Intercept) 57.8 18.9 3.06 0.00676
# … with 62 more rows
If we wanted a single dataset, pull the 'fitHour' column, loop over the list with map, and condense it to a single dataset by row binding (suffix _dfr):
df.h %>%
group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .)) %>%
ungroup %>%
pull(fitHour) %>%
map_dfr(tidy, .id = 'grp')
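If you want the actual hour values in the .id column rather than a sequential index, you can name the list before mapping (a sketch using purrr::set_names()):
df.h %>%
  group_by(hour) %>%
  do(fitHour = lm(price ~ wind + temp, data = .)) %>%
  ungroup() %>%
  # name each fitted model with its hour so map_dfr() uses it as .id
  { set_names(.$fitHour, as.character(.$hour)) } %>%
  map_dfr(tidy, .id = "hour")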
NOTE: The OP's error message was able to be replicated with R 4.0.2, dplyr 1.0.0, and broom 0.7.0:
tidy(dfHour,fitHour)
Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x),
na.rm = na.rm) :
Calling var(x) on a factor x is defunct.
Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
In addition: Warning messages:
1: Data frame tidiers are deprecated and will be removed in an upcoming release of broom.
2: In mean.default(X[[i]], ...) :
Your code actually works. Maybe updating the package versions or restarting a fresh R session could help:
library(dplyr)
library(broom)
df.h = data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
dfHour = df.h %>% group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .))
tidy(dfHour,fitHour)
# A tibble: 72 x 6
# Groups: hour [24]
hour term estimate std.error statistic p.value
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 (Intercept) 66.4 14.8 4.48 0.000288
2 1 wind 0.000474 0.00984 0.0482 0.962
3 1 temp 0.0691 0.945 0.0731 0.943
4 2 (Intercept) 66.5 20.4 3.26 0.00432
5 2 wind -0.00540 0.0127 -0.426 0.675
6 2 temp -0.306 0.944 -0.324 0.750
7 3 (Intercept) 86.5 17.3 5.00 0.0000936
8 3 wind -0.0119 0.00960 -1.24 0.232
9 3 temp -1.18 0.928 -1.27 0.221
10 4 (Intercept) 59.8 17.5 3.42 0.00304
# ... with 62 more rows

How to pipe the tidied lm model CIs into ggplot2?

I have the following code that computes, for every year between 1961 and 2018, the effects of both predictor variables, base on balls per game (BB) and home runs per game (HR), on the response variable runs per game (R):
rm(list = ls())
library(dplyr)
library(tidyverse)
library(broom)
library(Lahman)
fit <- Teams %>%
filter(yearID %in% 1961:2018) %>%
mutate(BB = BB / G,
HR = HR / G,
R = R / G) %>%
group_by(yearID) %>%
do(tidy(lm(R ~ BB + HR, data = .), conf.int = TRUE)) %>% filter(term=="BB")
fit
# A tibble: 58 x 8
# Groups: yearID [58]
yearID term estimate std.error statistic p.value conf.low conf.high
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1961 BB 0.0845 0.168 0.502 0.623 -0.274 0.443
2 1962 BB 0.142 0.273 0.520 0.610 -0.434 0.718
3 1963 BB 0.339 0.242 1.40 0.178 -0.171 0.849
4 1964 BB -0.105 0.302 -0.349 0.731 -0.742 0.532
5 1965 BB 0.235 0.253 0.928 0.366 -0.299 0.768
6 1966 BB 0.104 0.216 0.482 0.636 -0.351 0.559
7 1967 BB 0.0660 0.223 0.296 0.771 -0.405 0.537
8 1968 BB -0.199 0.203 -0.983 0.340 -0.627 0.229
9 1969 BB 0.153 0.163 0.942 0.357 -0.185 0.492
10 1970 BB 0.239 0.157 1.52 0.143 -0.0874 0.566
# ... with 48 more rows
I would now like to pass this "fit" (which is actually a tibble, or modernized data frame) into ggplot to show the estimates per year as points, but also the regression line along with the CIs computed by the lm model, rather than simply recomputing them with geom_smooth(method = "lm").
I have tried the following without success. I know that augment() from broom should operate on the lm model output directly, so the following code is wrong, but it illustrates what I'm trying to achieve:
augment(fit) %>%
ggplot() +
geom_point(aes(yearID, estimate)) +
geom_line(aes(yearID, .fitted), col = "blue")
How can I do that without "cheating" (computing the lm twice, once myself and once inside ggplot), as in:
fit %>% ggplot(aes(yearID,estimate)) + geom_point() + geom_smooth(method = "lm")
I took a similar route to Patrick, using map() and nest():
library(tidyverse)
library(broom)
library(Lahman)
library(magrittr)
fit <- Teams %>%
filter(yearID %in% 1961:2018) %>%
mutate(
BB = BB / G,
HR = HR / G,
R = R / G
) %>%
nest(data = -yearID) %>%
mutate(
model = map(data, ~ lm(R ~ BB + HR, .x)), # apply model to all nested groups
m_tidy = map(model, tidy), # tidy up
est = map_dbl(m_tidy, ~ .x %>% # pull BB estimate from each group
filter(term == "BB") %>%
pull(estimate)),
)
Now at this point you could just %$% right into this next portion, but I've kept them separate here to talk about mimicking the confidence interval properly. The geom_smooth() confidence interval is based on the t-distribution and not the normal distribution. Thus, we'll have to do a bit of extra work to get our intervals to work:
fit %$%
lm(est ~ yearID) %>%
augment() %>%
mutate(m.se.fit = .se.fit * qt(1 - (1-0.95)/2, nrow(fit))) %>% # 95% conf int calc
ggplot(aes(yearID, est)) +
geom_point() +
geom_line(aes(y = .fitted), col = "blue") +
geom_ribbon(aes(ymin = .fitted - m.se.fit, ymax = .fitted + m.se.fit), alpha = .2)
This plot essentially mirrors the desired plot:
fit %>% ggplot(aes(yearID, est)) +
geom_point() +
geom_smooth(method = "lm")
Created on 2019-10-23 by the reprex package (v0.3.0)
You can try the map functions from the purrr package, which is included in the tidyverse. A possible code for your described problem is listed below. It should also be possible with lapply if you are not that familiar with the purrr package.
library(tidyverse)
library(broom)
library(Lahman)
fit <- Teams %>%
filter(yearID %in% 1961:2018) %>%
mutate(BB = BB / G,
HR = HR / G,
R = R / G) %>%
group_by(yearID) %>%
# consolidate your data
nest() %>%
# creates new nested column with your regression data
mutate(model = map(data, function(df)
tidy(lm(R ~ BB + HR, data = df), conf.int = TRUE) %>%
filter(term=="BB")
),
# extract the column estimate
model_est = map_dbl(model, function(df)
df %>% pull(estimate)
),
# extract the column conf.low
model_conf.low = map_dbl(model, function(df)
df %>% pull(conf.low)
),
# extract the column conf.high
model_conf.high = map_dbl(model, function(df)
df %>% pull(conf.high)
)
)
fit %>% ggplot(aes(yearID,model_est)) + geom_point() +
geom_line(aes(yearID, model_conf.low)) +
geom_line(aes(yearID, model_conf.high))
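A shaded band may read better than two boundary lines; a sketch of the same plot with geom_ribbon():
fit %>%
  ggplot(aes(yearID, model_est)) +
  geom_point() +
  # draw the CI band from the extracted conf.low / conf.high columns
  geom_ribbon(aes(ymin = model_conf.low, ymax = model_conf.high), alpha = 0.2)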

Using dplyr and broom to compute kmeans on a training and test set

I am using dplyr and broom to compute kmeans for my data. My data contains a test and training set of X and Y coordinates and is grouped by some parameter value (lambda in this case):
mds.test = data.frame()
for(l in seq(0.1, 0.9, by=0.2)) {
new.dist <- run.distance.model(x, y, lambda=l)
mds <- preform.mds(new.dist, ndim=2)
mds.test <- rbind(mds.test, cbind(mds$space, design[,c(1,3,4,5)], lambda=rep(l, nrow(mds$space)), data="test"))
}
> head(mds.test)
Comp1 Comp2 Transcripts Genes Timepoint Run lambda data
7A_0_AAGCCTAGCGAC -0.06690476 -0.25519106 68125 9324 Day 0 7A 0.1 test
7A_0_AAATGACTGGCC -0.15292848 0.04310200 28443 6746 Day 0 7A 0.1 test
7A_0_CATCTCGTTCTA -0.12529445 0.13022908 27360 6318 Day 0 7A 0.1 test
7A_0_ACCGGCACATTC -0.33015913 0.14647857 23038 5709 Day 0 7A 0.1 test
7A_0_TATGTCGGAATG -0.25826098 0.05424976 22414 5878 Day 0 7A 0.1 test
7A_0_GAAAAAGGTGAT -0.24349387 0.08071162 21907 6766 Day 0 7A 0.1 test
I've shown the head of the test dataset above, but I also have one named mds.train which contains my training data coordinates. My ultimate goal here is to run k-means for both sets grouped by lambda, then compute the within.ss, between.ss and total.ss for the test data on the training centers. Thanks to a great resource on broom, I am able to run kmeans for each lambda for the test set by simply doing the following:
test.kclusts = mds.test %>%
group_by(lambda) %>%
do(kclust=kmeans(cbind(.$Comp1, .$Comp2), centers=length(unique(design$Timepoint))))
Then I can compute the centers of this data for each cluster within each lambda:
test.clusters = test.kclusts %>%
group_by(lambda) %>%
do(tidy(.$kclust[[1]]))
This is where I am stuck. How do I compute the feature assignments as shown on the reference page (e.g. kclusts %>% group_by(k) %>% do(augment(.$kclust[[1]], points.matrix))), where my points.matrix is mds.test, a data.frame with length(unique(mds.test$lambda)) times as many rows as it should have? And is there a way to somehow use the centers from the training set to compute glance() statistics based on the test assignments?
Any help would be greatly appreciated! Thank you!
EDIT: Updating progress. I have figured out how to aggregate the test/training assignments but am still having issues trying to compute kmeans statistics from both sets (training assignments on test centers and test assignments on training centers). Updated code is below:
test.kclusts = mds.test %>% group_by(lambda) %>% do(kclust=kmeans(cbind(.$Comp1, .$Comp2), centers=length(unique(design$Timepoint))))
test.clusters = test.kclusts %>% group_by(lambda) %>% do(tidy(.$kclust[[1]]))
test.clusterings = test.kclusts %>% group_by(lambda) %>% do(glance(.$kclust[[1]]))
test.assignments = left_join(test.kclusts, mds.test) %>% group_by(lambda) %>% do(augment(.$kclust[[1]], cbind(.$Comp1, .$Comp2)))
train.kclusts = mds.train %>% group_by(lambda) %>% do(kclust=kmeans(cbind(.$Comp1, .$Comp2), centers=length(unique(design$Timepoint))))
train.clusters = train.kclusts %>% group_by(lambda) %>% do(tidy(.$kclust[[1]]))
train.clusterings = train.kclusts %>% group_by(lambda) %>% do(glance(.$kclust[[1]]))
train.assignments = left_join(train.kclusts, mds.train) %>% group_by(lambda) %>% do(augment(.$kclust[[1]], cbind(.$Comp1, .$Comp2)))
test.assignments$data = "test"
train.assignments$data = "train"
merge.assignments = rbind(test.assignments, train.assignments)
merge.assignments %>% filter(., data=='test') %>% group_by(lambda) ... ?
I've attached a plot below which illustrates my progress to this point. Just to reiterate, I would like to compute kmeans statistics (within sum of squares, total sum of squares, and between sum of squares) for the training data centers on the test assignments/coordinates (the plots in which the centers look off):
One approach would be to:
1. Extract the table specifying the centroids of your clusters (built on the training set) via broom.
2. Calculate the distance of each point in the test set from each of the cluster centroids built using the training set. You could do this via the fuzzyjoin package.
3. Assign each test point to the cluster whose centroid it has the shortest Euclidean distance from.
From there you can calculate any metrics of interest.
See below, using a simpler dataset pulled from the clustering example in tidymodels.
library(tidyverse)
library(rsample)
library(broom)
library(fuzzyjoin)
# data and train / test set-up
set.seed(27)
centers <- tibble(
cluster = factor(1:3),
num_points = c(100, 150, 50), # number points in each cluster
x1 = c(5, 0, -3), # x1 coordinate of cluster center
x2 = c(-1, 1, -2) # x2 coordinate of cluster center
)
labelled_points <-
centers %>%
mutate(
x1 = map2(num_points, x1, rnorm),
x2 = map2(num_points, x2, rnorm)
) %>%
select(-num_points) %>%
unnest(cols = c(x1, x2))
points <-
labelled_points %>%
select(-cluster)
set.seed(1234)
split <- rsample::initial_split(points)
train <- rsample::training(split)
test <- rsample::testing(split)
# Fit kmeans on train then assign clusters to test
kclust <- kmeans(train, centers = 3)
clust_centers <- kclust %>%
tidy() %>%
select(-c(size, withinss))
test_clusts <- fuzzyjoin::distance_join(mutate(test, index = row_number()),
clust_centers,
max_dist = Inf,
method = "euclidean",
distance_col = "dist") %>%
group_by(index) %>%
filter(dist == min(dist)) %>%
ungroup()
#> Joining by: c("x1", "x2")
# resulting table
test_clusts
#> # A tibble: 75 x 7
#> x1.x x2.x index x1.y x2.y cluster dist
#> <dbl> <dbl> <int> <dbl> <dbl> <fct> <dbl>
#> 1 4.24 -0.946 1 5.07 -1.10 3 0.847
#> 2 3.54 0.287 2 5.07 -1.10 3 2.06
#> 3 3.71 -1.67 3 5.07 -1.10 3 1.47
#> 4 5.03 -0.788 4 5.07 -1.10 3 0.317
#> 5 6.57 -2.49 5 5.07 -1.10 3 2.04
#> 6 4.97 0.233 6 5.07 -1.10 3 1.34
#> 7 4.43 -1.89 7 5.07 -1.10 3 1.01
#> 8 5.34 -0.0705 8 5.07 -1.10 3 1.07
#> 9 4.60 0.196 9 5.07 -1.10 3 1.38
#> 10 5.68 -1.55 10 5.07 -1.10 3 0.758
#> # ... with 65 more rows
# calc within clusts SS on test
test_clusts %>%
group_by(cluster) %>%
summarise(size = n(),
withinss = sum(dist^2),
withinss_avg = withinss / size)
#> # A tibble: 3 x 4
#> cluster size withinss withinss_avg
#> <fct> <int> <dbl> <dbl>
#> 1 1 11 32.7 2.97
#> 2 2 35 78.9 2.26
#> 3 3 29 62.0 2.14
# compare to on train
tidy(kclust) %>%
mutate(withinss_avg = withinss / size)
#> # A tibble: 3 x 6
#> x1 x2 size withinss cluster withinss_avg
#> <dbl> <dbl> <int> <dbl> <fct> <dbl>
#> 1 -3.22 -1.91 40 76.8 1 1.92
#> 2 0.0993 1.06 113 220. 2 1.95
#> 3 5.07 -1.10 72 182. 3 2.53
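Since the OP also asked for total and between sum of squares on the test set, here is a sketch using the same objects as above: total SS is the summed squared distance of each test point from the grand centroid of the test set, and between SS can be taken as total minus within. Note the usual decomposition is exact only when within-SS is measured about the assigned points' own centroids, so treat it as approximate when using the training centers:
# grand centroid of the test points
grand <- test_clusts %>%
  summarise(x1 = mean(x1.x), x2 = mean(x2.x))
# total SS: squared distance of every test point from the grand centroid
totss <- with(test_clusts, sum((x1.x - grand$x1)^2 + (x2.x - grand$x2)^2))
# within SS about the training centers, and between SS as the remainder
tot_withinss <- sum(test_clusts$dist^2)
betweenss <- totss - tot_withinss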
# plot of test and train points
test_clusts %>%
select(x1 = x1.x, x2 = x2.x, cluster) %>%
mutate(type = "test") %>%
bind_rows(
augment(kclust, train) %>%
mutate(type = "train") %>%
rename(cluster = .cluster)
) %>%
ggplot(aes(x = x1,
y = x2,
color = as.factor(cluster)))+
geom_point()+
facet_wrap(~fct_rev(as.factor(type)))+
coord_fixed()+
labs(title = "Cluster Assignment on Training and Holdout Datasets",
color = "Cluster")+
theme_bw()
Created on 2021-08-19 by the reprex package (v2.0.0)
(See comment on OP for link to conversations on making this easier within tidymodels.)
