Add dynamic value to row by group - r

I'd like to add a row for each group, where the entry for a particular column is the mean of the values of that column for that group. It's easy to add a constant value
library(dplyr)
mtcars %>% group_by(cyl) %>% group_modify(~add_row(.x, .before=0, carb=2))
# A tibble: 35 x 11
# Groups: cyl [3]
cyl mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 NA NA NA NA NA NA NA NA NA 2
2 4 22.8 108 93 3.85 2.32 18.6 1 1 4 1
3 4 24.4 147. 62 3.69 3.19 20 1 0 4 2
4 4 22.8 141. 95 3.92 3.15 22.9 1 0 4 2
But when I try to dynamically add e.g. the mean of all carbs for that group, it doesn't recognise carb as a column:
mtcars %>% group_by(cyl) %>% group_modify(~add_row(.x, .before=0, carb=mean(carb)))
Error in mean(carb) : object 'carb' not found

Alternatively:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
summarise(carb = mean(carb)) %>%
bind_rows(mtcars) %>%
arrange(cyl)
#> # A tibble: 35 x 11
#> cyl carb mpg disp hp drat wt qsec vs am gear
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 1.55 NA NA NA NA NA NA NA NA NA
#> 2 4 1 22.8 108 93 3.85 2.32 18.6 1 1 4
#> 3 4 2 24.4 147. 62 3.69 3.19 20 1 0 4
#> 4 4 2 22.8 141. 95 3.92 3.15 22.9 1 0 4
#> 5 4 1 32.4 78.7 66 4.08 2.2 19.5 1 1 4
#> 6 4 2 30.4 75.7 52 4.93 1.62 18.5 1 1 4
#> 7 4 1 33.9 71.1 65 4.22 1.84 19.9 1 1 4
#> 8 4 1 21.5 120. 97 3.7 2.46 20.0 1 0 3
#> 9 4 1 27.3 79 66 4.08 1.94 18.9 1 1 4
#> 10 4 2 26 120. 91 4.43 2.14 16.7 0 1 5
#> # ... with 25 more rows

Related

Is slice_max(n=0) not possible?

I'm doing
df_sliced <- df %>% group_by(group) %>% slice_max(n=0, order_by=n, with_ties = FALSE)
but it's just ignored.
Meaning that df_sliced is equal to df.
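The problem appears to be with how n is used inside slice_max(): n = 0 sets the number of rows to keep, while order_by = n refers to a column called n (and, if no such column exists, n resolves to the function dplyr::n(), as the error below shows).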
For example
mtcars %>% group_by(cyl) %>% slice_max(n=0, order_by=n, with_ties=FALSE)
Error in `slice_max()`:
! Problem while computing indices.
ℹ The error occurred in group 1: cyl = 4.
Caused by error:
! `order_by` must be a vector, not a function.
Run `rlang::last_error()` to see where the error occurred.
and
mtcars %>% group_by(cyl) %>% add_column(n=0) %>% slice_max(n)
# A tibble: 32 × 12
# Groups: cyl [3]
mpg cyl disp hp drat wt qsec vs am gear carb n
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 0
2 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 0
3 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 0
4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1 0
5 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 0
6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 0
7 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 0
8 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1 0
9 26 4 120. 91 4.43 2.14 16.7 0 1 5 2 0
10 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 0
# … with 22 more rows
but
mtcars %>% group_by(cyl) %>% add_column(n=0) %>% slice_max(n, with_ties=FALSE)
# A tibble: 3 × 12
# Groups: cyl [3]
mpg cyl disp hp drat wt qsec vs am gear carb n
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 0
2 21 6 160 110 3.9 2.62 16.5 0 1 4 4 0
3 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 0
suggesting that
mtcars %>% group_by(cyl) %>% add_column(n=0) %>% head(0)
# A tibble: 0 × 12
# Groups: cyl [0]
# … with 12 variables: mpg <dbl>, cyl <dbl>, disp <dbl>, hp <dbl>, drat <dbl>, wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>,
# gear <dbl>, carb <dbl>, n <dbl>
(or mtcars %>% add_column(n=0) %>% head(0))
is a potential solution.

Create list-column of dataframes with cur_data() in R. Problems with accessing data

I do the following:
mtcars %>%
group_by(cyl) %>%
summarise(
model = list(lm(disp ~ mpg, data = cur_data())),
data = list(dat = cur_data())
) -> df
However, when I want to access the list-column data, it gives me this error:
> df$data
$dat
Error: Can't subset elements that don't exist.
x Locations 2, 3, 4, 5, 6, etc. don't exist.
ℹ There are only 1 element.
While the actual glimpse looks like this:
> glimpse(df)
Rows: 3
Columns: 3
$ cyl <dbl> 4, 6, 8
$ model <list> [<233.067448, -4.797961, -15.673940, 30.702798, 17.126060, 1.086485, -11.509437, 0.68342…
$ data <named list> [<tbl_df[11 x 11]>, <tbl_df[7 x 11]>, <tbl_df[14 x 11]>]
Not really sure what is going wrong here...
Change the order of operation.
library(dplyr)
mtcars %>%
group_by(cyl) %>%
summarise(
data = list(dat = cur_data()),
model = list(lm(disp ~ mpg, data = cur_data())),
) -> df
df$data
#$dat
# A tibble: 11 x 10
# mpg disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 22.8 108 93 3.85 2.32 18.6 1 1 4 1
# 2 24.4 147. 62 3.69 3.19 20 1 0 4 2
# 3 22.8 141. 95 3.92 3.15 22.9 1 0 4 2
# 4 32.4 78.7 66 4.08 2.2 19.5 1 1 4 1
# 5 30.4 75.7 52 4.93 1.62 18.5 1 1 4 2
#...
#...
#$dat
# A tibble: 7 x 10
# mpg disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 21 160 110 3.9 2.62 16.5 0 1 4 4
#2 21 160 110 3.9 2.88 17.0 0 1 4 4
#3 21.4 258 110 3.08 3.22 19.4 1 0 3 1
#4 18.1 225 105 2.76 3.46 20.2 1 0 3 1
#5 19.2 168. 123 3.92 3.44 18.3 1 0 4 4
#6 17.8 168. 123 3.92 3.44 18.9 1 0 4 4
#7 19.7 145 175 3.62 2.77 15.5 0 1 5 6
#$dat
# A tibble: 14 x 10
# mpg disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 18.7 360 175 3.15 3.44 17.0 0 0 3 2
# 2 14.3 360 245 3.21 3.57 15.8 0 0 3 4
# 3 16.4 276. 180 3.07 4.07 17.4 0 0 3 3
# 4 17.3 276. 180 3.07 3.73 17.6 0 0 3 3
# 5 15.2 276. 180 3.07 3.78 18 0 0 3 3
#...
I don't know the exact reason for this error, but my guess is that after evaluating model = list(lm(disp ~ mpg, data = cur_data())), cur_data() consists of the current group's data frame as well as the model, which causes problems when storing the data.

tidyr::pivot_wider() reorder column names grouping by `names_from`

I would like to reorder the columns grouping by names_from instead of values_from, here is my minimal example:
mtcars %>%
tidyr::pivot_wider(names_from = gear, values_from = c(vs, am, carb))
output:
mpg cyl disp hp drat wt qsec vs_4 vs_3 vs_5 am_4 am_3 am_5 carb_4 carb_3 carb_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 NA NA 1 NA NA 4 NA NA
2 21 6 160 110 3.9 2.88 17.0 0 NA NA 1 NA NA 4 NA NA
3 22.8 4 108 93 3.85 2.32 18.6 1 NA NA 1 NA NA 1 NA NA
Here is what I want the output:
mpg cyl disp hp drat wt qsec vs_4 am_4 carb_4 vs_3 am_3 carb_3 vs_5 am_5 carb_5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 NA NA NA NA NA NA
2 21 6 160 110 3.9 2.88 17.0 0 1 4 NA NA NA NA NA NA
Thanks in advance!
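As far as I know, this can't be accomplished with pivot_wider itself and must be done afterwards. (Note: tidyr 1.2.0 and later add a names_vary argument; pivot_wider(..., names_vary = "slowest") groups the new columns by the names_from value directly.)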
Here is a long-winded attempt, but it does the job:
library(tidyverse)
# Gear values in order of first appearance; these become the column suffixes.
suffixes <- unique(mtcars$gear)
pivoted <- mtcars %>%
  tidyr::pivot_wider(names_from = gear, values_from = c(vs, am, carb))
# For each suffix, collect the pivoted columns whose names end in "_<suffix>".
# The "$" anchor stops e.g. "_4" from also matching a name containing "_40".
names_to_order <- map(
  suffixes,
  ~ grep(paste0("_", .x, "$"), names(pivoted), value = TRUE)
) %>%
  unlist()
# Every remaining column is an id column and stays at the front.
names_id <- setdiff(names(pivoted), names_to_order)
# all_of() tells select() these are external character vectors of column
# names, avoiding the ambiguity warning for bare vectors.
pivoted %>%
  select(all_of(names_id), all_of(names_to_order))
#> # A tibble: 32 x 16
#> mpg cyl disp hp drat wt qsec vs_4 am_4 carb_4 vs_3 am_3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 NA NA
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 NA NA
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 1 NA NA
#> 4 21.4 6 258 110 3.08 3.22 19.4 NA NA NA 1 0
#> 5 18.7 8 360 175 3.15 3.44 17.0 NA NA NA 0 0
#> 6 18.1 6 225 105 2.76 3.46 20.2 NA NA NA 1 0
#> 7 14.3 8 360 245 3.21 3.57 15.8 NA NA NA 0 0
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 2 NA NA
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 2 NA NA
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 NA NA
#> # ... with 22 more rows, and 4 more variables: carb_3 <dbl>, vs_5 <dbl>,
#> # am_5 <dbl>, carb_5 <dbl>
Created on 2020-02-25 by the reprex package (v0.3.0)

Select top rows in R using add_tally and top_n functions

I would like to select the top n rows in a data frame for which I
calculated a column n that represents the sum of a variable. For example,
using the mtcars data, I would like to filter to keep only the two cyl
with the greatest sum of mpg. In the following example, I was expecting
to select all rows where cyl == 4 and cyl == 8. It must be simple, but
I can not figure out my mistake.
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
summarise(sum(mpg))
#> # A tibble: 3 x 2
#> cyl `sum(mpg)`
#> <dbl> <dbl>
#> 1 4 293.
#> 2 6 138.
#> 3 8 211.
mtcars %>%
group_by(cyl) %>% # Calculate the sum of mpg for each cyl
add_tally(mpg, sort = TRUE) %>%
ungroup() %>%
top_n(2, n)
#> # A tibble: 11 x 12
#> mpg cyl disp hp drat wt qsec vs am gear carb n
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 293.
#> 2 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 293.
#> 3 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 293.
#> 4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1 293.
#> 5 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 293.
#> 6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 293.
#> 7 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 293.
#> 8 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1 293.
#> 9 26 4 120. 91 4.43 2.14 16.7 0 1 5 2 293.
#> 10 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 293.
#> 11 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2 293.
Created on 2019-07-26 by the reprex package (v0.3.0)
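It seems that top_n orders the data frame and returns the top n rows, returning more than n rows if there are ties. It does not select the rows belonging to the top n distinct values. Here all 11 rows with cyl == 4 are tied for first place (n = 293), so all of them are returned and the cyl == 8 rows are never reached.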
From documentation -
Usage
top_n(x, n, wt)
Arguments
x: a tbl() to filter
n: number of rows to return. If x is grouped,
this is the number of rows per group. Will include more than n rows if
there are ties. If n is positive, selects the top n rows. If negative,
selects the bottom n rows.
You need, as suggested by @tmfmnk:
mtcars %>%
group_by(cyl) %>%
add_tally(mpg, sort = TRUE) %>%
ungroup() %>%
filter(dense_rank(desc(n)) < 3)

custom grouped dplyr function (sample_n)

I am trying to apply a sampling function in a grouped fashion to a data frame, where it should sample n samples from each group, or all group members if the group size is smaller than n.
Using dplyr, I first tried
library(dplyr)
mtcars %>% group_by(cyl) %>% sample_n(2)
This works when n is smaller than all the group sizes but does not take the full group when I choose n larger than the group size (note that there are 7 cars in one of the cyl groups):
mtcars %>% group_by(cyl) %>% sample_n(8)
Error: `size` must be less or equal than 7 (size of data),
set `replace` = TRUE to use sampling with replacement
I tried to solve this by creating an adapted sample_n function like so:
# Attempted helper: hand back `tbl` unchanged when it has fewer than `n` rows,
# otherwise draw `n` rows with sample_n().
# The size check uses nrow(tbl), i.e. the row count of the table as passed in.
sample_n_or_all <- function(tbl, n) {
  if (nrow(tbl) >= n) {
    sample_n(tbl, n)
  } else {
    tbl
  }
}
but using my custom function (mtcars %>% group_by(cyl) %>% sample_n_or_all(8)) generates the same error.
Any suggestions how I can adapt my function so I can apply it to each of the groups? Or another solution to the problem?
We could check the number of rows in the group and pass the value to sample_n accordingly.
library(dplyr)
n <- 8
temp <- mtcars %>% group_by(cyl) %>% sample_n(if(n() < n) n() else n)
temp
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
# 2 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
# 3 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
# 4 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# 5 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
# 6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
# 7 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
# 8 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
# 9 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#10 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
# … with 13 more rows
We can check number of rows in each group after that.
table(temp$cyl)
#4 6 8
#8 7 8
table(mtcars$cyl)
# 4 6 8
#11 7 14
We can do this without using a logical condition with pmin
library(dplyr)
tmp <- mtcars %>%
group_by(cyl) %>%
sample_n(pmin(n(), n))
# A tibble: 23 x 11
# Groups: cyl [3]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
# 2 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
# 3 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
# 4 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
# 5 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
# 6 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
# 7 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
# 8 26 4 120. 91 4.43 2.14 16.7 0 1 5 2
# 9 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
#10 21 6 160 110 3.9 2.62 16.5 0 1 4 4
# … with 13 more rows
Checking the group sizes:
table(tmp$cyl)
# 4 6 8
# 8 7 8

Resources