Passing a function and arguments to a function and purrr - r

Let's say I have the following function:
new_func <- function(.data, .x, .fns, ...){
# Arguments
value_var_expr <- rlang::enquo(.x)
func <- .fns
func_chr <- deparse(substitute(.fns))
passed_args <- list(...)
# New Param Args ----
# I do this because na.rm = TRUE when passed to say quantile gets
# converted to 1 or 100%
if ("na.rm" %in% names(passed_args)) {
tmp_args <- passed_args[!names(passed_args) == "na.rm"]
}
if (!exists("tmp_args")) {
args <- passed_args
} else {
args <- tmp_args
}
ret <- purrr::map(
.x = dplyr::as_tibble(.data),
.f = ~ func(.x, unlist(args)) %>%
purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
purrr::map_df(dplyr::as_tibble)
) %>%
purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
purrr::map_df(dplyr::as_tibble) %>%
dplyr::select(sim_number, name, .x) %>%
dplyr::mutate(.x = as.numeric(.x)) %>%
dplyr::mutate(sim_number = factor(sim_number)) %>%
dplyr::rename(value = .x)
cn <- c("sim_number", "name", func_chr)
names(ret) <- cn
return(ret)
}
Now try using IQR with no additional arguments passed
> new_func(mtcars, mpg, IQR)
Error in if (na.rm) x <- x[!is.na(x)] else if (anyNA(x)) stop("missing values and NaN's not allowed if 'na.rm' is FALSE") :
argument is of length zero
Passing only na.rm = TRUE
> new_func(mtcars, mpg, IQR, na.rm = TRUE)
Error in if (na.rm) x <- x[!is.na(x)] else if (anyNA(x)) stop("missing values and NaN's not allowed if 'na.rm' is FALSE") :
argument is of length zero
Passing type = 7
> new_func(mtcars, mpg, IQR, type = 7)
# A tibble: 11 × 3
sim_number name IQR
<fct> <dbl> <dbl>
1 mpg 1 7.38
2 cyl 1 4
3 disp 1 205.
4 hp 1 83.5
5 drat 1 0.84
6 wt 1 1.03
7 qsec 1 2.01
8 vs 1 1
9 am 1 1
10 gear 1 1
11 carb 1 2
Now I cannot pass simply ... like I could if I were doing say ret <- sapply(.data, .x, ...)
How can I correct this? I did try doing something like dots <- rlang::enquos(...) and then doing func(.x, !!!dots) which also fails.

I'm not sure if I'm understanding this completely, but I think the issue is that you're passing an empty list as an argument.
This might be a solution:
library(tidyverse)
new_func <- function(.data, .x, .fns, ...){
# Arguments
value_var_expr <- rlang::enquo(.x)
func <- .fns
func_chr <- deparse(substitute(.fns))
passed_args <- list(...)
# New Param Args ----
# I do this because na.rm = TRUE when passed to say quantile gets
# converted to 1 or 100%
if ("na.rm" %in% names(passed_args)) {
args <- passed_args[!names(passed_args) == "na.rm"]
} else {
args <- passed_args
}
if (length(args) < 0) {
args <- NULL
}
ret <- purrr::map(
.x = dplyr::as_tibble(.data),
.f = ~ func(.x, unlist(args)) %>%
purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
purrr::map_df(dplyr::as_tibble)
) %>%
purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
purrr::map_df(dplyr::as_tibble) %>%
dplyr::select(sim_number, name, .x) %>%
dplyr::mutate(.x = as.numeric(.x)) %>%
dplyr::mutate(sim_number = factor(sim_number)) %>%
dplyr::rename(value = .x)
cn <- c("sim_number", "name", func_chr)
names(ret) <- cn
return(ret)
}
new_func(mtcars, mpg, IQR, type = 7, na.rm = TRUE)
#> # A tibble: 11 × 3
#> sim_number name IQR
#> <fct> <dbl> <dbl>
#> 1 mpg 1 7.38
#> 2 cyl 1 4
#> 3 disp 1 205.
#> 4 hp 1 83.5
#> 5 drat 1 0.84
#> 6 wt 1 1.03
#> 7 qsec 1 2.01
#> 8 vs 1 1
#> 9 am 1 1
#> 10 gear 1 1
#> 11 carb 1 2

We may need to take care of the cases where args returns length 0
new_func <- function(.data, .x, .fns, ...){
# Arguments
value_var_expr <- rlang::enquo(.x)
func <- .fns
func_chr <- deparse(substitute(.fns))
passed_args <- list(...)
if(length(passed_args) > 0) {
# New Param Args ----
# I do this because na.rm = TRUE when passed to say quantile gets
# converted to 1 or 100%
if ("na.rm" %in% names(passed_args)) {
tmp_args <- passed_args[!names(passed_args) == "na.rm"]
}
if (!exists("tmp_args")) {
args <- passed_args
} else {
args <- tmp_args
}
} else {
args <- NULL
}
if(length(args) == 0) args <- NULL
ret <- purrr::map(
.x = dplyr::as_tibble(.data),
.f = ~ if(is.null(args)) func(.x) else func(.x, unlist(args)) %>%
purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
purrr::map_df(dplyr::as_tibble)
) %>%
purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
purrr::map_dfr(dplyr::as_tibble, .id = 'name') %>%
dplyr::select(sim_number, name, `.x`) %>%
dplyr::mutate(.x = as.numeric(.x)) %>%
dplyr::mutate(sim_number = factor(sim_number)) %>%
dplyr::rename(value = .x)
cn <- c("sim_number", "name", func_chr)
names(ret) <- cn
return(ret)
}
-testing
> new_func(mtcars, mpg, IQR, na.rm = TRUE)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2
> new_func(mtcars, mpg, IQR)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2
>
> new_func(mtcars, mpg, IQR, type = 7)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2

Related

How to calculate the distance of each pair of one column in r

I tried to calculate the lp norm of all pairs in one column. The answer just not right and I don't know why.
Here is my sample code.
a <- c(23,41,32,58,26,77,45,67,23,78,22,9,20)
lp_norm = function(x, y, p){
return(sum((abs(x-y))^p)^(1/p))
}
i = 1
while (i <= 13) {
for(j in i:12){
lp1 <- lp_norm(a[i],a[j+1],p=1)
}
i=i+1
print(lp1)
}
}
And I have a dataframe with 10 column need to do the same thing. How can I apply this to all column?
Here is one way to calculate this for different combinations of columns in a dataframe.
library(tidyverse)
lp_norm <- function(data, x, y, p){
data |>
select(v1:= !!sym(x), v2:= !!sym(y))|>
summarise(lp_norm = sum((abs(v1-v2))^p)^(1/p)) |>
pull(lp_norm)
}
calc_lp_norm <- function(data, vars, p){
combn(vars, 2) |>
t() |>
`colnames<-`(c("var1", "var2")) |>
as_tibble() |>
mutate(lp_norm = map2_dbl(var1, var2, ~lp_norm(x = .x, y = .y, data = data, p = p)))
}
#few columns
calc_lp_norm(mtcars, c("mpg", "cyl", "hp", "wt"), p = 1)
#> # A tibble: 6 x 3
#> var1 var2 lp_norm
#> <chr> <chr> <dbl>
#> 1 mpg cyl 445.
#> 2 mpg hp 4051.
#> 3 mpg wt 540.
#> 4 cyl hp 4496
#> 5 cyl wt 95.0
#> 6 hp wt 4591.
#all columns
calc_lp_norm(mtcars, colnames(mtcars), p = 1)
#> # A tibble: 55 x 3
#> var1 var2 lp_norm
#> <chr> <chr> <dbl>
#> 1 mpg cyl 445.
#> 2 mpg disp 6740.
#> 3 mpg hp 4051.
#> 4 mpg drat 528.
#> 5 mpg wt 540.
#> 6 mpg qsec 136.
#> 7 mpg vs 629.
#> 8 mpg am 630.
#> 9 mpg gear 525.
#> 10 mpg carb 553.
#> # ... with 45 more rows
We could either use combn (only returns pairwise combinations) in base R. Loop over the columns of data.frame 'dat', apply pair combinations of elements (assuming all are unique or else do combn(unique(u), 2) and apply the lp_norm function
lapply(dat, \(u) combn(u, 2, FUN = \(x) lp_norm(x[1], x[2], p = 1)))
Or if we need the output as a matrix (include pairwise combinations of mirror types as well i.e. 1 vs 2 and 2 vs 1 and 1 vs 1)
lapply(dat, \(u) outer(u, u, FUN = Vectorize(\(x, y) lp_norm(x, y, p = 1))))
But, as this is a distance function, using outer will be calculating the same distance twice + distance between the same element

How to efficiently apply multiple functions simultaneously to the same dataframe and save the results as a list of dataframes?

I want to apply several different functions simultaneously to one dataframe, then put the results into a list of dataframes. So, for example, I could arrange by one column, then save the output as a new dataframe. Or I could filter some data, then save as another new dataframe (and so on). I feel like there must be an easy way to do this with purrr or apply, but am unsure how to proceed. So, I'm wondering if there is a way to give a list of functions, then return a list of dataframes. Here are some example functions that I apply to mtcars:
library(tidyverse)
filter_df <- function(x, word) {
x %>%
tibble::rownames_to_column("ID") %>%
filter(str_detect(ID, word))
}
a <- filter_df(mtcars, "Merc")
mean_n_df <- function(x, grp, mean2) {
x %>%
group_by({{grp}}) %>%
summarise(mean = mean({{mean2}}), n = n())
}
b <- mean_n_df(mtcars, grp = cyl, mean2 = wt)
rating <- function(x, a, b, c) {
x %>%
rowwise %>%
mutate(rating = ({{a}}*2) + ({{b}}-5) * abs({{c}} - 30))
}
c <- rating(mtcars, a = cyl, b = drat, c = qsec)
pct <- function(data, var, round = 4){
var_expr <- rlang::enquo(var)
colnm_expr <- paste(rlang::get_expr(var_expr), "pct", sep = "_")
data %>%
mutate(!! colnm_expr := !!var_expr/sum(!!var_expr) %>%
round(round))
}
d <- pct(mtcars, mpg)
I know that I could run the code above, then just bind each dataframe into a list.
df_list <- list(mtcars, a, b, c, d)
str(df_list, 1)[[1]]
List of 5
$ :'data.frame': 32 obs. of 11 variables:
$ :'data.frame': 7 obs. of 12 variables:
$ : tibble [3 × 3] (S3: tbl_df/tbl/data.frame)
$ : rowwise_df [32 × 12] (S3: rowwise_df/tbl_df/tbl/data.frame)
..- attr(*, "groups")= tibble [32 × 1] (S3: tbl_df/tbl/data.frame)
$ :'data.frame': 32 obs. of 12 variables:
This seems a bit bespoke (since each function requires different parameters), but I'd use Map (or purrr::map2 or purrr::pmap), passing a function and the args for it:
filter_df <- function(x, word) {
x %>%
tibble::rownames_to_column("ID") %>%
filter(str_detect(ID, word))
}
mean_n_df <- function(x, grp, mean2) {
x %>%
group_by({{grp}}) %>%
summarise(mean = mean({{mean2}}), n = n())
}
rating <- function(x, a, b, c) {
x %>%
rowwise %>%
mutate(rating = ({{a}}*2) + ({{b}}-5) * abs({{c}} - 30))
}
pct <- function(data, var, round = 4){
var_expr <- rlang::enquo(var)
colnm_expr <- paste(rlang::get_expr(var_expr), "pct", sep = "_")
data %>%
mutate(!! colnm_expr := !!var_expr/sum(!!var_expr) %>%
round(round))
}
The call:
out <- Map(
function(fun, args) do.call(fun, c(list(mtcars), args)),
list(filter_df, mean_n_df, rating, pct),
list(list("Merc"), list(grp = quo(cyl), mean2 = quo(wt)),
list(a = quo(cyl), b = quo(drat), c = quo(qsec)),
list(quo(mpg)))
)
lapply(out, head, 3)
# [[1]]
# ID mpg cyl disp hp drat wt qsec vs am gear carb
# 1 Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2
# 2 Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
# 3 Merc 280 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
# [[2]]
# # A tibble: 3 x 3
# cyl mean n
# <dbl> <dbl> <int>
# 1 4 2.29 11
# 2 6 3.12 7
# 3 8 4.00 14
# [[3]]
# # A tibble: 3 x 12
# # Rowwise:
# mpg cyl disp hp drat wt qsec vs am gear carb rating
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 -2.89
# 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 -2.28
# 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 -5.10
# [[4]]
# mpg cyl disp hp drat wt qsec vs am gear carb mpg_pct
# Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 0.03266449
# Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 0.03266449
# Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 0.03546430
A few things:
Because you demonstrated using the unevaluated symbols (grp=cyl), we have to quote them first, otherwise they would be evaluated before reaching the functions.
You can general this out to arbitrary data by not hard-coding it in the Map anon-func, with:
out <- Map(
function(x, fun, args) do.call(fun, c(list(x), args)),
list(mtcars),
list(filter_df, mean_n_df, rating, pct),
list(list("Merc"), list(grp = quo(cyl), mean2 = quo(wt)),
list(a = quo(cyl), b = quo(drat), c = quo(qsec)),
list(quo(mpg)))
)
where the list(.) around mtcars is intentional: it appears as length-1 to Map, so it is recycled for the other args (length 4 each). Without list, Map would fail because the first function would see the first column (as a vector), second function second column (and/or warning with longer argument not a multiple of length of shorter ... I really wish mis-aligned recycling in R would fail harder than that).
This generalization allows this sequence of functions to be applied each to multiple datasets:
out2 <- lapply(list(mtcars[1:10,], mtcars[11:32,]), function(XYZ) {
Map(
function(x, fun, args) do.call(fun, c(list(x), args)),
list(XYZ),
list(filter_df, mean_n_df, rating, pct),
list(list("Merc"), list(grp = quo(cyl), mean2 = quo(wt)),
list(a = quo(cyl), b = quo(drat), c = quo(qsec)),
list(quo(mpg)))
)
})
Not sure if you're intending the inception of applying a list of functions to a list of datasets ...
Using invoke with map2 from purrr
library(purrr)
df_list2 <- c(list(mtcars), map2(list(filter_df, mean_n_df, rating, pct),
list("Merc", expression(grp = cyl, mean2 = wt),
expression(a = cyl, b= drat, c = qsec), quote(mpg)),
~ invoke(.x, c(list(mtcars), as.list(.y)))))
-checking
all.equal(df_list2, df_list, check.attributes = FALSE)
[1] TRUE

How to combine multiple variable descriptive stats in one table in R?

I have this code
data_2012 %>%
group_by(job2) %>%
filter(!is.na(job2)) %>%
summarise(mean = mean(persinc2, na.rm = T),
sd = sd(persinc2, na.rm = T))
Which gives me a little table for that specific variable which is perfect, however i have multiple variables that i want the mean and SD for but it all to be in the one table, how do i do that?
I am very new to R.
You can use across and have to choose your columns using the tidy_select format:
data_2012 %>%
group_by(job2) %>%
filter(!is.na(job2)) %>%
summarise(across(your_columns, list(mean = ~ mean(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE))))
With a toy dataset
iris %>%
group_by(Species) %>%
summarise(across(everything(), list(mean = ~ mean(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE))))
# A tibble: 3 x 9
Species Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean Sepal.Width_sd
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.01 0.352 3.43 0.379
2 versicolor 5.94 0.516 2.77 0.314
3 virginica 6.59 0.636 2.97 0.322
# ... with 4 more variables: Petal.Length_mean <dbl>, Petal.Length_sd <dbl>,
# Petal.Width_mean <dbl>, Petal.Width_sd <dbl>
With base R, we may use split() to split the data by some factor variable. This returns a list of a number of elements that is equal to the number of levels of that factor variable. We can then obtain the mean and sd (or any other statistic you like) per column per level using members of the *apply() family as follows:
# toy data
df <- mtcars[, 1:5]
# splitting by a factor variable
lapply(split(df, df$cyl), function(x) {
sapply(x, function(i) data.frame(Mean=mean(i), SD=sd(i)))
})
Output
$`4`
mpg cyl disp hp drat
Mean 26.66364 4 105.1364 82.63636 4.070909
SD 4.509828 0 26.87159 20.93453 0.3654711
$`6`
mpg cyl disp hp drat
Mean 19.74286 6 183.3143 122.2857 3.585714
SD 1.453567 0 41.56246 24.26049 0.4760552
$`8`
mpg cyl disp hp drat
Mean 15.1 8 353.1 209.2143 3.229286
SD 2.560048 0 67.77132 50.97689 0.3723618

R Loop Regressions

data=mtcars
data$group = rep(seq(from=1, to=4, by=1), 8)
model1 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model2 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model3 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model4 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
model5 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model6 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model7 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model8 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
Say you want to estimate a bunch of stratified models that are identical in every way except the stratified group (models 1-4) and also that you want to repeat this series of models for different outcomes (models 5-8).
That is what I have for the code above. However, is there a more efficient way to run this in terms of it not taking up as many lines of code? For example to specify the covariates, outcomes, and groups, and then loop over them?
You can for instance use data.table to run the model fitting by group, e.g.:
library(data.table)
dt = as.data.table(data)
models = dt[, .(fit_vs = list(glm(vs ~ mpg + cyl + disp + hp, family = "binomial")),
fit_am = list(glm(am ~ mpg + cyl + disp + hp, family = "binomial"))),
by = .(group)]
The result is then:
print(models)
# group fit_vs fit_am
# 1: 2 <glm> <glm>
# 2: 1 <glm> <glm>
# 3: 3 <glm> <glm>
# 4: 4 <glm> <glm>
You can access the fit for vs and group 3 using:
models[group == "3", fit_vs]
# [[1]]
#
# Call: glm(formula = vs ~ mpg + cyl + disp + hp, family = "binomial")
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 180.970664 -0.384760 -24.366394 -0.008435 -0.010799
#
# Degrees of Freedom: 9 Total (i.e. Null); 5 Residual
# Null Deviance: 13.46
# Residual Deviance: 3.967e-10 AIC: 10
First of all, seq(from=1, to=4, length=T) returns 1, so your code only creates 1 group. I thus modified your code as follows.
data=mtcars
data$group = rep(1:4, each = 8)
We can use the functions to apply glm to each combination as follows.
library(tidyverse)
data2 <- data %>%
gather(Y, Value, vs, am) %>%
group_split(Y, group) %>%
set_names(nm = map_chr(., ~str_c(unique(.x$Y), unique(.x$group), sep = "-"))) %>%
map(~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial"))
We can access the result by names
data2[["am-1"]]
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
data3 <- data %>%
gather(Y, Value, vs, am) %>%
group_by(Y, group) %>%
nest() %>%
mutate(Model = map(data, ~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial")))
data3
# # A tibble: 8 x 4
# # Groups: group, Y [8]
# group Y data Model
# <int> <chr> <list<df[,10]>> <list>
# 1 1 vs [8 x 10] <glm>
# 2 2 vs [8 x 10] <glm>
# 3 3 vs [8 x 10] <glm>
# 4 4 vs [8 x 10] <glm>
# 5 1 am [8 x 10] <glm>
# 6 2 am [8 x 10] <glm>
# 7 3 am [8 x 10] <glm>
# 8 4 am [8 x 10] <glm>
data3 %>%
filter(group == 1, Y == "am") %>%
pull(Model)
# [[1]]
#
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
You can extract the information with mutate and map, like below.
data4 <- data3 %>% mutate(Coef = map(Model, coef))
data4 %>%
filter(group == 1, Y == "am") %>%
pull(Coef)
# [[1]]
# (Intercept) mpg cyl disp hp
# 4.9179574 -0.5334823 17.2520829 -0.7974839 0.5191961
Or use the functions from the broom package.
library(broom)
data5 <- data3 %>%
mutate(Info = map(Model, tidy)) %>%
select(-Model, -data) %>%
unnest(cols = "Info")
data5
# # A tibble: 40 x 7
# # Groups: group, Y [8]
# group Y term estimate std.error statistic p.value
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 vs (Intercept) 397. 4682905. 0.0000849 1.000
# 2 1 vs mpg -8.95 176775. -0.0000507 1.000
# 3 1 vs cyl -41.9 141996. -0.000295 1.000
# 4 1 vs disp 0.525 1510. 0.000348 1.000
# 5 1 vs hp -0.610 8647. -0.0000705 1.000
# 6 2 vs (Intercept) 126. 2034044. 0.0000619 1.000
# 7 2 vs mpg -0.965 69501. -0.0000139 1.000
# 8 2 vs cyl 25.6 398854. 0.0000642 1.000
# 9 2 vs disp 0.266 3917. 0.0000680 1.000
# 10 2 vs hp -2.29 19162. -0.000120 1.000
# # ... with 30 more rows

Loosing group_by information when using dplyr::do for the second time

I am running multiple models on multiple sections of my data set, similar to (but with many more models)
library(tidyverse)
d1 <- mtcars %>%
group_by(cyl) %>%
do(mod_linear = lm(mpg ~ disp + hp, data = ., x = TRUE))
d1
# Source: local data frame [3 x 3]
# Groups: <by row>
#
# # A tibble: 3 x 3
# cyl mod_linear
# * <dbl> <list>
# 1 4. <S3: lm>
# 2 6. <S3: lm>
# 3 8. <S3: lm>
I then tidy this tibble and save my parameter estimates using tidy() in the broom package.
I also want to calculate the standard deviation of the predictors (stored in models above as I set x = TRUE) to create and then compare re-scaled parameters. I can do the former of these using
d1 %>%
# group_by(cyl) %>%
do(term = colnames(.$mod$x),
pred_sd = apply(X = .$mod$x, MARGIN = 2, FUN = sd)) %>%
unnest()
# # A tibble: 9 x 2
# term pred_sd
# <chr> <dbl>
# 1 (Intercept) 0.00000
# 2 disp 26.87159
# 3 hp 20.93453
# 4 (Intercept) 0.00000
# 5 disp 41.56246
# 6 hp 24.26049
# 7 (Intercept) 0.00000
# 8 disp 67.77132
# 9 hp 50.97689
However, the result is not a grouped tibble so I end up loosing the cyl column to tell me which terms belong to which model. How can avoid this loss? - Adding in group_by again seems to throw an error.
n.b. I want avoid using purrr for at least for the first part (fitting the models) as I run different types of models and then need to reshape the results (d1), and I like the progress bar with do.
n.b. I want to work with the $x component of the models rather than the raw data as they have the data on correct scale (I am experimenting with different transformations of the predictors)
We can do this by nesting initially and then do the unnest
mtcars %>%
group_by(cyl) %>%
nest(-cyl) %>%
mutate(mod_linear = map(data, ~ lm(mpg ~ disp + hp, data = .x, x = TRUE)),
term = map(mod_linear, ~ names(coef(.x))),
pred = map(mod_linear, ~ .x$x %>%
as_tibble %>%
summarise_all(sd) %>%
unlist )) %>%
select(-data, -mod_linear) %>%
unnest
# A tibble: 9 x 3
# cyl term pred
# <dbl> <chr> <dbl>
#1 6.00 (Intercept) 0
#2 6.00 disp 41.6
#3 6.00 hp 24.3
#4 4.00 (Intercept) 0
#5 4.00 disp 26.9
#6 4.00 hp 20.9
#7 8.00 (Intercept) 0
#8 8.00 disp 67.8
#9 8.00 hp 51.0
Or instead of calling the map multiple times, this can be further made compact with
mtcars %>%
group_by(cyl) %>%
nest(-cyl) %>%
mutate(mod_contents = map(data, ~ {
mod <- lm(mpg ~ disp + hp, data = .x, x = TRUE)
term <- names(coef(mod))
pred <- mod$x %>%
as_tibble %>%
summarise_all(sd) %>%
unlist
tibble(term, pred)
}
)) %>%
select(-data) %>%
unnest
# A tibble: 9 x 3
# cyl term pred
# <dbl> <chr> <dbl>
#1 6.00 (Intercept) 0
#2 6.00 disp 41.6
#3 6.00 hp 24.3
#4 4.00 (Intercept) 0
#5 4.00 disp 26.9
#6 4.00 hp 20.9
#7 8.00 (Intercept) 0
#8 8.00 disp 67.8
#9 8.00 hp 51.0
If we start from 'd1' (based on the OP's code)
d1 %>%
ungroup %>%
mutate(mod_contents = map(mod_linear, ~ {
pred <- .x$x %>%
as_tibble %>%
summarise_all(sd) %>%
unlist
term <- .x %>%
coef %>%
names
tibble(term, pred)
})) %>%
select(-mod_linear) %>%
unnest

Resources