I am trying to update my function using the new version of dplyr.
First, I had this function (old version):
slope.k <- function(data, Treatment, Replicate, Day, Ln.AFDMrem){
fitted_models <- data %>% group_by(Treatment, Replicate) %>%
do(model = lm(Ln.AFDMrem ~ Day, data = .))
broom::tidy(fitted_models,model) %>% print(n = Inf)
}
However, the do() function was superseded. Now, I am trying to update with this (new) version:
slope.k <- function(data, Treatment, Replicate, Day, Ln.AFDMrem){
mod_t <- data %>% nest_by(Treatment, Replicate) %>%
mutate(model = list(lm(Ln.AFDMrem ~ Day, data = data))) %>%
summarise(tidy_out = list(tidy(model)))
unnest(select(mod_t, Treatment, tidy_out)) %>% print(n = Inf)
}
However, it doesn't work properly, because I have the following warnings:
Warning messages:
1: `cols` is now required when using unnest().
Please use `cols = c(tidy_out)`
2: `...` is not empty.
We detected these problematic arguments:
* `needs_dots`
These dots only exist to allow future extensions and should be empty.
Did you misspecify an argument?
Thanks in advance!!!
The issue would be the use of select with unnest. It can be reproduced by changing the select to c
libary(dplyr)
library(broom)
library(tidyr)
mtcars %>%
nest_by(carb, gear) %>%
mutate(model = list(lm(mpg ~ disp + drat, data = data))) %>%
summarise(tidy_out = list(tidy(model)), .groups = 'drop') %>%
unnest(c(tidy_out))
-output
# A tibble: 33 x 7
# carb gear term estimate std.error statistic p.value
# <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 3 (Intercept) -8.50 NaN NaN NaN
# 2 1 3 disp 0.0312 NaN NaN NaN
# 3 1 3 drat 7.10 NaN NaN NaN
# 4 1 4 (Intercept) -70.5 302. -0.234 0.854
# 5 1 4 disp -0.0445 0.587 -0.0757 0.952
# 6 1 4 drat 25.5 62.4 0.408 0.753
# 7 2 3 (Intercept) -3.72 8.57 -0.434 0.739
# 8 2 3 disp 0.0437 0.0123 3.54 0.175
# 9 2 3 drat 1.90 2.88 0.661 0.628
#10 2 4 (Intercept) -10.0 226. -0.0443 0.972
# … with 23 more rows
Also, after the mutate, step, we can directly use the unnest on the 'tidy_out' column
If we use as a function, assuming that unquoted arguments are passed as column names
slope.k <- function(data, Treatment, Replicate, Day, Ln.AFDMrem){
ln_col <- rlang::as_string(ensym(Ln.AFDMrem))
day_col <- rlang::as_string(ensym(Day))
data %>%
nest_by({{Treatment}}, {{Replicate}}) %>%
mutate(model = list(lm(reformulate(day_col, ln_col), data = data))) %>%
summarise(tidy_out = list(tidy(model)), .groups = 'drop') %>%
unnest(tidy_out)
}
slope.k(mtcars, carb, gear, disp, mpg)
# A tibble: 22 x 7
carb gear term estimate std.error statistic p.value
<dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 3 (Intercept) 22.0 5.35 4.12 0.152
2 1 3 disp -0.00841 0.0255 -0.329 0.797
3 1 4 (Intercept) 52.6 8.32 6.32 0.0242
4 1 4 disp -0.279 0.0975 -2.86 0.104
5 2 3 (Intercept) 1.25 3.49 0.357 0.755
6 2 3 disp 0.0460 0.0100 4.59 0.0443
7 2 4 (Intercept) 36.6 6.57 5.57 0.0308
8 2 4 disp -0.0978 0.0529 -1.85 0.206
9 2 5 (Intercept) 47.0 NaN NaN NaN
10 2 5 disp -0.175 NaN NaN NaN
# … with 12 more rows
Related
Suppose I want to summarise a data frame after grouping with differing functions. How can I do that?
mtcars %>% group_by(cyl) %>% summarise(size = n())
# A tibble: 3 x 2
cyl size
<dbl> <int>
1 4 11
2 6 7
3 8 14
But if I try:
mtcars %>% group_by(cyl) %>% summarise(size = n()) %>% summarise_at(vars(c(mpg, am:carb)), mean)
Error in is_string(y) : object 'carb' not found
How can I get first the size of each group with n() and then the mean of the other chosen features?
Here is one way using a dplyr::inner_join() on the two summarize operations by the grouping variable:
mtcars %>%
group_by(cyl) %>%
summarise(size = n()) %>%
inner_join(
mtcars %>%
group_by(cyl) %>%
summarise_at(vars(c(mpg, am:carb)), mean),
by='cyl' )
Output is:
# A tibble: 3 x 6
cyl size mpg am gear carb
<dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 4 11 26.7 0.727 4.09 1.55
2 6 7 19.7 0.429 3.86 3.43
3 8 14 15.1 0.143 3.29 3.5
Since summarise removes the column which are not grouped or summarised, an alternative in this case would be to first add a new column with mutate (so that all other columns remain as it is) to count number of rows in each group and include that column in summarise_at calculation.
library(dplyr)
mtcars %>%
group_by(cyl) %>%
mutate(n = n()) %>%
summarise_at(vars(mpg, am:carb, n), mean)
# A tibble: 3 x 6
# cyl mpg am gear carb n
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 26.7 0.727 4.09 1.55 11
#2 6 19.7 0.429 3.86 3.43 7
#3 8 15.1 0.143 3.29 3.5 14
We can use data.table methods
library(data.table)
as.data.table(mtcars)[, n := .N, cyl][, lapply(.SD, mean), cyl,
.SDcols = c("mpg", "am", "gear", "carb", "n")]
#. yl mpg am gear carb n
#1: 6 19.74286 0.4285714 3.857143 3.428571 7
#2: 4 26.66364 0.7272727 4.090909 1.545455 11
#3: 8 15.10000 0.1428571 3.285714 3.500000 14
Or with tidyverse
library(tidyverse)
mtcars %>%
add_count(cyl) %>%
group_by(cyl) %>%
summarise_at(vars(mpg, am:carb, n), mean)
# A tibble: 3 x 6
# cyl mpg am gear carb n
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 26.7 0.727 4.09 1.55 11
#2 6 19.7 0.429 3.86 3.43 7
#3 8 15.1 0.143 3.29 3.5 14
Or using base R
nm1 <- c("mpg", "am", "gear", "carb", "cyl")
transform(aggregate(.~ cyl, mtcars[nm1], mean), n = as.vector(table(mtcars$cyl)))
# cyl mpg am gear carb n
#1 4 26.66364 0.7272727 4.090909 1.545455 11
#2 6 19.74286 0.4285714 3.857143 3.428571 7
#3 8 15.10000 0.1428571 3.285714 3.500000 14
Using mtcars data, I am testing map() to build some lm() models:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x)))
#> # A tibble: 3 x 3
#> cyl data fit
#> <dbl> <list> <list>
#> 1 6 <tibble [7 x 10]> <S3: lm>
#> 2 4 <tibble [11 x 10]> <S3: lm>
#> 3 8 <tibble [14 x 10]> <S3: lm>
The output shows that I have a new column, fit.
Now I wish to see the summary of each lm
When I try:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
map(fit,summary)
#> Error in as_mapper(.f, ...): object 'fit' not found
It gives the error:
Error in as_mapper(.f, ...) : object 'fit' not found
If I wish to calculate R2 or aic then I can using the following code without any problem:
library(tidyverse)
library(modelr)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
mutate(r2 = map_dbl(fit, ~rsquare(., data = mtcars)),
aic = map_dbl(fit, ~AIC(.))) %>%
arrange(aic)
#> # A tibble: 3 x 5
#> cyl data fit r2 aic
#> <dbl> <list> <list> <dbl> <dbl>
#> 1 6 <tibble [7 x 10]> <S3: lm> -8.96 -Inf
#> 2 4 <tibble [11 x 10]> <S3: lm> -26.4 56.4
#> 3 8 <tibble [14 x 10]> <S3: lm> -1.000 67.3
Created on 2019-06-18 by the reprex package (v0.3.0)
What am I missing?
As IceCreamToucan's comment laid out, purrr::map does not look into the data which has been made within your piping.
If you use it with dplyr::mutate then it has access to fit which you have created in the previous piping.
Another option would be explicitly referring to fit which you can see below, as my second suggestion.
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
mutate(fit_sum = map(fit,summary))
#> # A tibble: 3 x 4
#> cyl data fit fit_sum
#> <dbl> <list> <list> <list>
#> 1 6 <tibble [7 x 10]> <lm> <smmry.lm>
#> 2 4 <tibble [11 x 10]> <lm> <smmry.lm>
#> 3 8 <tibble [14 x 10]> <lm> <smmry.lm>
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
{map(.$fit, summary)} #or using pull: `pull(fit) %>% map(summary)`
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ ., data = .x)
#>
#> Residuals:
#> ALL 7 residuals are 0: no residual degrees of freedom!
#>
#> Coefficients: (3 not defined because of singularities)
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 32.78649 NA NA NA
#> disp 0.07456 NA NA NA
#> hp -0.04252 NA NA NA
#> drat 1.52367 NA NA NA
#> wt 5.12418 NA NA NA
#> qsec -2.33333 NA NA NA
#> vs -1.75289 NA NA NA
#> am NA NA NA NA
#> gear NA NA NA NA
#> carb NA NA NA NA
#>
#> Residual standard error: NaN on 0 degrees of freedom
#> Multiple R-squared: 1, Adjusted R-squared: NaN
#> F-statistic: NaN on 6 and 0 DF, p-value: NA
####truncated the results for the sake of space####
Created on 2019-06-17 by the reprex package (v0.3.0)
From the latest release of dplyr, tidyverse seems to be encouraging using group_modify functions instead of using purrr + nested dataframes.
In that workflow, here is how you can get both model summaries and estimates in the same dataframe via broom package:
# setup
set.seed(123)
library(tidyverse)
options(tibble.width = Inf)
# joining dataframes with regression estimates and model summaries
dplyr::full_join(
# to get a tidy dataframe of regression estimates
x = mtcars %>%
group_by(cyl) %>%
group_modify(.f = ~ broom::tidy(lm(mpg ~ ., data = .x), conf.int = TRUE)),
# to get a tidy dataframe of model summaries
y = mtcars %>%
group_by(cyl) %>%
group_modify(.f = ~ broom::glance(lm(mpg ~ ., data = .x))),
by = "cyl"
) %>%
dplyr::ungroup(x = .)
#> Warning in qt(a, object$df.residual): NaNs produced
#> # A tibble: 25 x 20
#> cyl term estimate std.error statistic.x p.value.x conf.low
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 (Intercept) 60.9 180. 0.338 0.793 -2229.
#> 2 4 disp -0.345 0.469 -0.735 0.596 -6.31
#> 3 4 hp -0.0332 0.364 -0.0915 0.942 -4.65
#> 4 4 drat -4.19 46.4 -0.0903 0.943 -594.
#> 5 4 wt 4.48 29.7 0.151 0.905 -373.
#> 6 4 qsec -0.106 7.82 -0.0136 0.991 -99.4
#> 7 4 vs -3.64 34.0 -0.107 0.932 -435.
#> 8 4 am -6.33 45.2 -0.140 0.912 -581.
#> 9 4 gear 4.07 29.1 0.140 0.912 -366.
#> 10 4 carb 3.22 28.2 0.114 0.928 -355.
#> conf.high r.squared adj.r.squared sigma statistic.y p.value.y df
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2351. 0.928 0.276 3.84 1.42 0.576 9
#> 2 5.62 0.928 0.276 3.84 1.42 0.576 9
#> 3 4.59 0.928 0.276 3.84 1.42 0.576 9
#> 4 586. 0.928 0.276 3.84 1.42 0.576 9
#> 5 382. 0.928 0.276 3.84 1.42 0.576 9
#> 6 99.2 0.928 0.276 3.84 1.42 0.576 9
#> 7 428. 0.928 0.276 3.84 1.42 0.576 9
#> 8 568. 0.928 0.276 3.84 1.42 0.576 9
#> 9 374. 0.928 0.276 3.84 1.42 0.576 9
#> 10 362. 0.928 0.276 3.84 1.42 0.576 9
#> logLik AIC BIC deviance df.residual nobs
#> <dbl> <dbl> <dbl> <dbl> <int> <int>
#> 1 -17.2 56.4 60.8 14.7 1 11
#> 2 -17.2 56.4 60.8 14.7 1 11
#> 3 -17.2 56.4 60.8 14.7 1 11
#> 4 -17.2 56.4 60.8 14.7 1 11
#> 5 -17.2 56.4 60.8 14.7 1 11
#> 6 -17.2 56.4 60.8 14.7 1 11
#> 7 -17.2 56.4 60.8 14.7 1 11
#> 8 -17.2 56.4 60.8 14.7 1 11
#> 9 -17.2 56.4 60.8 14.7 1 11
#> 10 -17.2 56.4 60.8 14.7 1 11
#> # ... with 15 more rows
Created on 2019-06-17 by the reprex package (v0.3.0)
I have a recollection that purrr::pmap_* can treat a data.frame as a list but the syntax eludes me.
Imagine we wanted to fit a separate lm object for each value of mtcars$vs and mtcars$am
library(tidyverse)
library(broom)
d1 <- mtcars %>%
group_by(
vs, am
) %>%
nest %>%
mutate(
coef = data %>%
map(
~lm(mpg ~ wt, data =.) %>%
tidy
)
)
If I wanted to extract the coefficient estimates as an un-nested data.frame, and append the values of am and vs, I might try
d1[, -3] %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
But this results in an error. More explicitly declaring these variables as separate lists has the desired effect
list(
d1$vs,
d1$am,
d1$coef
) %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
Is there a succinct way for pmap_* to treat a data.frame as a list?
We can use the standard option to extract the components (..1, ..2, etc)
d1[, -3] %>%
pmap_dfr(~ ..3 %>%
mutate(vs = ..1, am = ..2))
# A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
#2 wt -7.91 1.14 -6.93 0.00227 0 1
#3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
#4 wt -7.77 3.36 -2.31 0.0689 1 1
#5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
#6 wt -3.38 2.80 -1.21 0.281 1 0
#7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
#8 wt -2.44 0.842 -2.90 0.0159 0 0
This is because the second list has no names attribute. If you unname d1 it works. The fact that you used the list function in the second example doesn't make a difference (except that it removed the names), because both objects are lists (data frames are lists).
d1[, -3] %>%
unname %>%
pmap_dfr(
function(i, j, k)
k %>%
mutate(
vs = i,
am = j
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You can also name the arguments in your first code block's function to match (or use ..1 etc) for the same result
d1[, -3] %>%
pmap_dfr(
function(vs, am, coef)
coef %>%
mutate(
vs = vs,
am = am
)
)
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
You could also use wap from the experimental rap package
library(rap)
d1[, -3] %>%
wap( ~ coef %>%
mutate(
vs = vs,
am = am)) %>%
bind_rows
# # A tibble: 8 x 7
# term estimate std.error statistic p.value vs am
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 42.4 3.30 12.8 0.000213 0 1
# 2 wt -7.91 1.14 -6.93 0.00227 0 1
# 3 (Intercept) 44.1 6.96 6.34 0.00144 1 1
# 4 wt -7.77 3.36 -2.31 0.0689 1 1
# 5 (Intercept) 31.5 8.98 3.51 0.0171 1 0
# 6 wt -3.38 2.80 -1.21 0.281 1 0
# 7 (Intercept) 25.1 3.51 7.14 0.0000315 0 0
# 8 wt -2.44 0.842 -2.90 0.0159 0 0
I have dataset that looks like this:
Category Weekly_Date a b
<chr> <date> <dbl> <dbl>
1 aa 2018-07-01 36.6 1.4
2 aa 2018-07-02 5.30 0
3 bb 2018-07-01 4.62 1.2
4 bb 2018-07-02 3.71 1.5
5 cc 2018-07-01 3.41 12
... ... ... ... ...
I fitted linear regression for each group separately:
fit_linreg <- train %>%
group_by(Category) %>%
do(model = lm(Target ~ Unit_price + Unit_discount, data = .))
Now I have different models for each category:
aa model1
bb model2
cc model3
So, I need to apply each model to the appropriate category. How to achieve that? (dplyr is preferable)
if you nest the data of your test data, join it with the models, then you can use map2 to make predictions on the test data with the trained models. See below example with mtcars.
library(tidyverse)
x <- mtcars %>%
group_by(gear) %>%
do(model = lm(mpg ~ hp + wt, data = .))
x
Source: local data frame [3 x 2]
Groups: <by row>
# A tibble: 3 x 2
gear model
* <dbl> <list>
1 3 <S3: lm>
2 4 <S3: lm>
3 5 <S3: lm>
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data, predict)) %>%
unnest(preds)
Joining, by = "gear"
# A tibble: 32 x 2
gear preds
<dbl> <dbl>
1 4 22.0
2 4 21.2
3 4 25.1
4 4 26.0
5 4 22.2
6 4 17.8
7 4 17.8
8 4 28.7
9 4 32.3
10 4 30.0
# ... with 22 more rows
Here's one approach, I'm using data.table to filter but you can use dplyr instead as well, I just prefer the data.table syntax.
d <- as.data.table(mtcars)
cats <- unique(d$cyl)
m <- lapply(cats, function(z){
return(lm(formula = mpg ~ wt + hp + disp,
data = d[cyl == z, ] ))
})
names(m) <- cats
OUTPUT
> summary(m)
Length Class Mode
6 12 lm list
4 12 lm list
8 12 lm list
# Checking first model
> m[[1]]
Call:
lm(formula = mpg ~ wt + hp + disp, data = d[cyl == z, ])
Coefficients:
(Intercept) wt hp disp
30.27791 -3.89618 -0.01097 0.01610
> sapply(1:length(m), function(z) return(summary(m[[z]])$adj.r.squared))
[1] 0.4434228 0.5829574 0.3461900
I named the list because it might be easier to refer to models by name aa or bb in your case. Hope this helps!
I find the nesting and un-nesting very unnatural, so here's my attempt.
Let's say you want the quality of the model's fit.
library(dplyr)
mtcars %>%
group_by(cyl) %>%
do(data.frame(r2 = summary(lm(mpg ~ wt, data = .))$r.squared))
#> # A tibble: 3 x 2
#> # Groups: cyl [3]
#> cyl r2
#> <dbl> <dbl>
#> 1 4 0.509
#> 2 6 0.465
#> 3 8 0.423
Let's say you want the residuals:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
mtcars %>%
group_by(cyl) %>%
do(data.frame(resid = residuals(lm(mpg ~ wt, data = .))))
#> # A tibble: 32 x 2
#> # Groups: cyl [3]
#> cyl resid
#> <dbl> <dbl>
#> 1 4 -3.67
#> 2 4 2.84
#> 3 4 1.02
#> 4 4 5.25
#> 5 4 -0.0513
#> 6 4 4.69
#> 7 4 -4.15
#> 8 4 -1.34
#> 9 4 -1.49
#> 10 4 -0.627
#> # ... with 22 more rows
See ?do for why you need the embedded data.frame(). You'll probably want to include other columns in the result. Not just the grouping variable and the residuals. I can't find a neat way to do this, other than listing them!
library(dplyr)
mtcars %>%
group_by(cyl) %>%
do(data.frame(disp = .$disp,
qsec = .$qsec,
resid = residuals(lm(mpg ~ wt, data = .))))
#> # A tibble: 32 x 4
#> # Groups: cyl [3]
#> cyl disp qsec resid
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 108 18.6 -3.67
#> 2 4 147. 20 2.84
#> 3 4 141. 22.9 1.02
#> 4 4 78.7 19.5 5.25
#> 5 4 75.7 18.5 -0.0513
#> 6 4 71.1 19.9 4.69
#> 7 4 120. 20.0 -4.15
#> 8 4 79 18.9 -1.34
#> 9 4 120. 16.7 -1.49
#> 10 4 95.1 16.9 -0.627
#> # ... with 22 more rows
Something that doesn't work
For the first example, I thought the following would work:
library(dplyr)
mtcars %>%
group_by(cyl) %>%
summarise(r2 = summary(lm(mpg ~ wt, data = .))$r.squared)
#> # A tibble: 3 x 2
#> cyl r2
#> <dbl> <dbl>
#> 1 4 0.753
#> 2 6 0.753
#> 3 8 0.753
But you can see all models have the same r2. It's because the model is being fit to all the data, not per cyl. Looking at the authors' code, I believe this is because they've optimised the evaluation of mutate() and summarise() using Rcpp, and the optimisation doesn't work in this case. But do() works as expected. It subsets the data by group before passing it to the expression to be evaluated. I see they are pondering this, see Hyrbid Folding
I have a grouped data frame (using dplyr) with 50 numeric columns, which are split into groups using one of the columns. I want to calculate a matrix of correlation between all non grouping columns and one particular column.
An example with the mtcars dataset:
data(mtcars)
cor(mtcars[,2:11], mtcars[,2])
returns a list of correlations between miles per galleon and the other variables.
Let's say however, that I wish to calculate this same correlation for each group of cylinders, e.g.:
library(dplyr)
mtcars <-
mtcars %>%
group_by(cyl)
How would I do this? I am thinking something like
mtcars %>%
group_by(cyl) %>%
summarise_each(funs(cor(...))
But I do not know what to put in the ... as I don't know how to specify a column in the dplyr chain.
Related:
Linear model and dplyr - a better solution? has an answer which is very similar to #akrun's answer. Also, over on cross validated: https://stats.stackexchange.com/questions/4040/r-compute-correlation-by-group has other solutions using packages which are not dplyr.
We could use do.
library(dplyr)
mtcars %>%
group_by(cyl) %>%
do(data.frame(Cor=t(cor(.[,3:11], .[,3]))))
# A tibble: 3 x 10
# Groups: cyl [3]
# cyl Cor.disp Cor.hp Cor.drat Cor.wt Cor.qsec Cor.vs Cor.am Cor.gear Cor.carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.00 0.435 -0.500 0.857 0.328 -0.187 -0.734 -0.0679 0.490
#2 6 1.00 -0.514 -0.831 0.473 0.789 0.637 -0.637 -0.899 -0.942
#3 8 1 0.118 -0.0922 0.755 0.195 NA -0.169 -0.169 0.0615
NOTE: t part is contributed by #Alex
Or use group_modify
mtcars %>%
select(-mpg) %>%
group_by(cyl) %>%
group_modify(.f = ~ as.data.frame(t(cor(select(.x, everything()),
.x[['disp']]))))
# A tibble: 3 x 10
# Groups: cyl [3]
# cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.00 0.435 -0.500 0.857 0.328 -0.187 -0.734 -0.0679 0.490
#2 6 1.00 -0.514 -0.831 0.473 0.789 0.637 -0.637 -0.899 -0.942
#3 8 1 0.118 -0.0922 0.755 0.195 NA -0.169 -0.169 0.0615
Or another option is summarise with across. Created a new column 'disp1' as 'disp' then grouped by 'cyl', get the cor of columns 'disp' to 'carb' with 'disp1'
mtcars %>%
mutate(disp1 = disp) %>%
group_by(cyl) %>%
summarise(across(disp:carb, ~ cor(., disp1)))
# A tibble: 3 x 10
# cyl disp hp drat wt qsec vs am gear carb
#* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.00 0.435 -0.500 0.857 0.328 -0.187 -0.734 -0.0679 0.490
#2 6 1.00 -0.514 -0.831 0.473 0.789 0.637 -0.637 -0.899 -0.942
#3 8 1 0.118 -0.0922 0.755 0.195 NA -0.169 -0.169 0.0615
Or
library(data.table)
d1 <- copy(mtcars)
setnames(setDT(d1)[, as.list(cor(.SD, .SD[[1]])) , cyl,
.SDcols=3:11], names(d1)[2:11])[]