using mutate with purrr's map - r

how could I add a column in a tibble which is in a list?
The data looks like the following and I just want to add a column at the end which checks if column 2 is the same as 3
[[1]]
# A tibble: 620 x 4
DataName `Technischer Platz...4` `Technischer Platz...5` `Technischer Platz...307`
<chr> <chr> <chr> <lgl>
1 WrW_SAPP17_VIAUFKAFVC 1202362-001 1202362-001 NA
2 WrW_SAPP17_VIAUFKAFVC 1202362-002 037 1202362-002 037 NA
3 WrW_SAPP17_VIAUFKAFVC 1202362-011 1202362-011 NA
4 WrW_SAPP17_VIAUFKAFVC 1202362 1202362 NA
5 WrW_SAPP17_VIAUFKAFVC 1202362-004 052 1202362-004 052 NA
6 WrW_SAPP17_VIAUFKAFVC 1202362-018 005 1202362-018 005 NA
7 WrW_SAPP17_VIAUFKAFVC 1202362-017 1202362-017 NA
8 WrW_SAPP17_VIAUFKAFVC 1202362-012 1202362-012 NA
9 WrW_SAPP17_VIAUFKAFVC 1202362-002 002 1202362-002 002 NA
10 WrW_SAPP17_VIAUFKAFVC 1202362-002 039 1202362-002 039 NA

We can use :
library(dplyr)
library(purrr)
list_df <- map(list_df, ~.x %>%
mutate(is_2_same_as_3 = `Technischer Platz...4` == `Technischer Platz...5`))
If you want to do it based on position of columns use :
list_df <- map(list_df, ~.x %>%
mutate(is_2_same_as_3 = .[[2]] == .[[3]]))
Same in base R would be using lapply :
list_df <- lapply(list_df, function(x) transform(x, is_2_same_as_3 = `Technischer Platz...4` == `Technischer Platz...5`))
list_df <- lapply(list_df, function(x) transform(x, is_2_same_as_3 = x[[2]] == x[[3]]))

Let's use a simpler example
library(tidyverse)
honda <- tibble(brand = "HONDA",
disp = c(3, 8, 20),
hp = c(3, 90, 115))
volvo <- tibble(brand = "VOLVO",
disp = c(3, 8, 20),
hp = c(3, 34, 115))
list_df <- list(honda, volvo)
list_df %>% map(~ mutate(., check = disp == hp))
#> [[1]]
#> # A tibble: 3 x 4
#> brand disp hp check
#> <chr> <dbl> <dbl> <lgl>
#> 1 HONDA 3 3 TRUE
#> 2 HONDA 8 90 FALSE
#> 3 HONDA 20 115 FALSE
#>
#> [[2]]
#> # A tibble: 3 x 4
#> brand disp hp check
#> <chr> <dbl> <dbl> <lgl>
#> 1 VOLVO 3 3 TRUE
#> 2 VOLVO 8 34 FALSE
#> 3 VOLVO 20 115 FALSE
Created on 2021-02-18 by the reprex package (v1.0.0)

Related

For function over multiple rows (i+1)?

New to R, my apologies if there is an easy answer that I don't know of.
I have a dataframe with 127.124 observations and 5 variables
Head(SortedDF)
number Retention.time..min. Charge m.z Group
102864 6947 12.58028 5 375.0021 Pro
68971 60641 23.36693 2 375.1373 Pro
75001 104156 24.54187 3 375.1540 Pro
87435 146322 22.69630 3 375.1540 Pro
82658 88256 22.32042 3 375.1541 Pro
113553 97971 14.54600 3 375.1566 Pro
...
I want to compare every row with the row underneath it (so basically rownumber vs rownumber +1) and see if they match. After reading the For and if-else functions, I came up with this code:
for (i in 1:dim(SortedDF))
if(abs(m.z[i]-m.z[i+1])<0.01 | abs(Retention.time..min.[i]-Retention.time..min.[i+1])<1 | (Charge[i]=Charge[i+1]) | Group[i]!=Group[i+1])
print("Match")
else
print("No match")
However, this code does not work as it only prints out the first function function [1], and I'm not sure if i+1 is a thing. Is there any way to solve this not using i+1?
library(tidyverse)
data <- tibble(x = c(1, 1, 2), y = "a")
data
#> # A tibble: 3 × 2
#> x y
#> <dbl> <chr>
#> 1 1 a
#> 2 1 a
#> 3 2 a
same_rows <-
data %>%
# consider all columns
unite(col = "all") %>%
transmute(same_as_next_row = all == lead(all))
data %>%
bind_cols(same_rows)
#> # A tibble: 3 × 3
#> x y same_as_next_row
#> <dbl> <chr> <lgl>
#> 1 1 a TRUE
#> 2 1 a FALSE
#> 3 2 a NA
Created on 2022-03-30 by the reprex package (v2.0.0)
library(tidyverse)
data <- tibble::tribble(
~id, ~number, ~Retention.time..min., ~Charge, ~m.z, ~Group,
102864, 6947, 12.58028, 5, 375.0021, "Pro",
68971, 60641, 23.36693, 2, 375.1373, "Pro",
75001, 104156, 24.54187, 3, 375.1540, "Pro",
87435, 146322, 22.69630, 3, 375.1540, "Pro",
82658, 88256, 22.32042, 3, 375.1541, "Pro",
113553, 97971, 14.54600, 3, 375.1566, "Pro"
)
data %>%
mutate(
matches_with_next_row = (abs(m.z - lead(m.z)) < 0.01) |
(abs(Retention.time..min. - lead(Retention.time..min.)) < 1)
)
#> # A tibble: 6 × 7
#> id number Retention.time..min. Charge m.z Group matches_with_next_row
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <lgl>
#> 1 102864 6947 12.6 5 375. Pro FALSE
#> 2 68971 60641 23.4 2 375. Pro FALSE
#> 3 75001 104156 24.5 3 375. Pro TRUE
#> 4 87435 146322 22.7 3 375. Pro TRUE
#> 5 82658 88256 22.3 3 375. Pro TRUE
#> 6 113553 97971 14.5 3 375. Pro NA
Created on 2022-03-30 by the reprex package (v2.0.0)

How to left rows with the same values in columns using dplyr package in R?

I wonder how can we filter rows with the same values in columns using dplyr package? I tried doing something in opposite to what was asked and proposed in this question but nothing worked.
I used the approach with apply function but received the following error:
same_vals <- apply(mydata, 1, function(x) all(x == x[1]))
final <- mydata %>%
filter(same_vals)
Error: Can't subset elements that don't exist.
x Location 1 doesn't exist.
i There are only 0 elements.
apply drives me crazy every time I try to use it. Neither it worked on my sample data:
set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
var1 = ceiling(runif(10, min = 5, max = 10)),
var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))
# A tibble: 10 x 4
id var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 90 6 6 6
2 71 6 5 3
3 13 6 4 4
4 60 8 8 8
5 20 9 12 11
6 70 6 1223 45
7 8 9 14 56
8 4 8 1 78
9 41 8 90 0
10 84 10 1 9
test1 <- apply(test, 1, function(x) all(x == x[1]))
test %>%
filter(test1)
# A tibble: 0 x 4
# ... with 4 variables: id <dbl>, var1 <dbl>, var2 <dbl>, var3 <dbl>
Desirable output
# A tibble: 10 x 4
id var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 90 6 6 6
4 60 8 8 8
You could compare min and max of each row :
vars <- test[,-1]
test[do.call(pmin, vars) == do.call(pmax, vars),]
# A tibble: 2 x 4
id var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 90 6 6 6
2 60 8 8 8
Update: Again with the help of fabulous akrun: We can use if_all:
test %>% filter(if_all(var2:var3, ~ . == var1))
id var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 90 6 6 6
2 60 8 8 8
Updated benchmark: including Ronak's solution and akrun assisted tarjae if_all solution:
First answer:
This is a very good question! Whenever I need a bunch of time to solve a question I notice that the solution is not as obvious as I thought it should be.
In this case I completely agree with the OP, that sometimes things can drive someone crazy.
I would have expected that if_all should work proper for this task as it was created for such kind of situation (in my knowledge, to use filter across multiple columns). But I was not possible for me to apply it.
Therefore I decided to write a function (notice I am not very experienced in programming) Comments are welcomed to improve myself:
# function to filter only rows with same values across some (not all) columns!
library(dplyr)
library(tidyr)
tarjae <- function(x) {
x %>%
pivot_longer(-1) %>%
group_by(group = rep(row_number(), each=ncol(test[,-1]), length.out=n())) %>%
add_count(value) %>%
filter(ncol(test[,-1]) == n) %>%
pivot_wider(names_from = name, values_from = value) %>%
ungroup() %>%
select(-c(group, n))
}
tarjae(test)
id var1 var2 var3
<int> <int> <int> <int>
1 90 6 6 6
2 60 8 8 8
Benchmark: Waldi is first, tarjae is last :-(
library(microbenchmark)
mbm = microbenchmark(
tarjae = tarjae(test),
Yuriy = test %>%
rowwise() %>%
filter(n_distinct(c_across(-id)) == 1),
Mikko1 = test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)})),
Mikko2 = test %>%
rowwise() %>%
mutate(diff = sum(diff(var1:var3))) %>%
filter(diff == 0),
Waldi = test[do.call(pmin, test[,-1]) == do.call(pmax, vars),],
Claudiu1 = test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3))),
Claudiu2 = test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3), na.rm = TRUE)),
times=50
)
mbm
autoplot(mbm)
Unit: microseconds
expr min lq mean median uq max neval cld
tarjae 15099.000 15422.300 16147.84108 15672.5005 16134.201 22529.301 50 e
Yuriy 3995.102 4106.401 4228.88304 4173.9010 4331.300 4640.902 50 cd
Mikko1 1325.702 1369.702 1456.27518 1405.0010 1474.001 2071.201 50 b
Mikko2 4159.700 4256.300 4503.31502 4316.2010 4435.201 10515.101 50 d
Waldi 63.601 87.901 95.26902 92.9015 94.801 209.901 50 a
Claudiu1 3817.500 3877.801 4050.05700 3974.6515 4065.101 5529.501 50 c
Claudiu2 4003.502 4093.501 4252.04898 4170.2010 4272.501 6003.801 50 cd
df <- data.frame(
id = c(90, 71, 13, 60, 20, 70, 8, 4, 41, 84),
var1 = c(6, 6, 6, 8, 9, 6, 9, 8, 8, 10),
var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9)
)
library(tidyverse)
df %>%
rowwise() %>%
filter(n_distinct(c_across(-id)) == 1)
#> # A tibble: 2 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
Created on 2022-01-14 by the reprex package (v2.0.1)
Not quite native tidyverse, but does what you desire?
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
var1 = ceiling(runif(10, min = 5, max = 10)),
var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))
test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)}))
#> # A tibble: 2 × 4
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
Created on 2022-01-14 by the reprex package (v2.0.1)
EDIT:
Here is an entirely tidyverse way:
test %>%
rowwise() %>%
filter(all(diff(var1:var3) == 0))
You may select the rows whose rowwise standard deviation or variance is 0.
test[matrixStats::rowSds(as.matrix(test[-1])) == 0, ]
# A tibble: 2 x 4
# id var1 var2 var3
# <dbl> <dbl> <dbl> <dbl>
#1 90 6 6 6
#2 60 8 8 8
This is follow-up to #Yuriy Saraykin's answer, whose solution I think is the most representative for the tidyverse way and uses the robust n_distinct function. His answer achieves the outcome you are looking for in virtually all circumstances, but I wanted to expand a bit on using it with missing data because NA is always a unique value.
library(dplyr)
library(tidyr)
set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
var1 = c(6, 6, 6, 8, 9, NA, NA, 8, 8, NA),
var2 = c(6, 5, 4, 8, 7, 12, NA, 1, 9, NA),
var3 = c(6, 3, 4, 8, 7, 45, NA, 78, 0, 3))
# count number of distinct values by row across columns -- NAs are all counted as unique values
test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3)))
#> # A tibble: 10 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 70 NA 12 45 3
#> 7 8 NA NA NA 1
#> 8 4 8 1 78 3
#> 9 41 8 9 0 3
#> 10 84 NA NA 3 2
# Filter out rows with same value on colums -- NAs are all counted as unique values
test %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3)) == 1)
#> # A tibble: 3 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
#> 3 8 NA NA NA
# count number of distinct values by row across columns -- NAs excluded from count
test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 70 NA 12 45 2
#> 7 8 NA NA NA 0
#> 8 4 8 1 78 3
#> 9 41 8 9 0 3
#> 10 84 NA NA 3 1
# Filter out rows with same value on colums -- NAs excluded from count
test %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 3 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
#> 3 84 NA NA 3
# count number of distinct values by row across columns -- NAs excluded altogether
test %>%
tidyr::drop_na(var1:var3) %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 7 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 4 8 1 78 3
#> 7 41 8 9 0 3
# Filter out rows with same value on colums -- NAs excluded altogether
test %>%
tidyr::drop_na(var1:var3) %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 2 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
Created on 2022-01-14 by the reprex package (v2.0.1)
Also found for myself another solution which looks almost like Ronak's suggestion. data.table, a little bit of dplyr and matrixStats packages are needed:
setDT(test)
test[, sd_var := matrixStats::rowSds(as.matrix(.SD)),
.SDcols = c("var1", "var2", "var3"), by = 1:nrow(test)] %>%
.[sd_var == 0, , ]
id var1 var2 var3 sd_var
1: 90 6 6 6 0
2: 60 8 8 8 0

Map as.numeric to only specific columns of a dataframe

I have some data in the format below, where all columns are of type chr.
#> # A tibble: 3 x 4
#> id age name income
#> <chr> <chr> <chr> <chr>
#> 1 1 18 jim 100
#> 2 2 21 bob 200
#> 3 3 16 alice 300
I'd like to use as.numeric() on only some columns. Preferably, I'd like to define a vector of column names and then use purrr:map to map as.numeric() to only those columns:
numeric_variables <- c("id", "age", "income")
How can I map that?
My desired output would look like:
df
#> # A tibble: 3 x 4
#> id age name income
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 18 jim 100
#> 2 2 21 bob 200
#> 3 3 16 alice 300
Code for data entry below.
library(purrr)
df <- data.frame(stringsAsFactors=FALSE,
id = c(1, 2, 3),
age = c(18, 21, 16),
name = c("jim", "bob", "alice"),
income = c(100, 200, 300)
)
df <- map_df(df, as.character)
df
Created on 2020-02-15 by the reprex package (v0.3.0)
We can use mutate_at
library(dplyr)
df %>%
mutate_at(vars(numeric_variables), as.numeric) %>%
as_tibble
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300
Or more easily
df %>%
type.convert(as.is = TRUE)
Or with map
library(purrr)
df %>%
map_if(names(.) %in% numeric_variables, as.numeric) %>%
bind_cols
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300
Or if we use the compound assignment operator (%<>%), this can be assigned in place
library(magrittr)
df %<>%
map_if(names(.) %in% numeric_variables, as.numeric) %<>%
bind_cols
str(df)
#tibble [3 × 4] (S3: tbl_df/tbl/data.frame)
# $ id : num [1:3] 1 2 3
# $ age : num [1:3] 18 21 16
# $ name : chr [1:3] "jim" "bob" "alice"
# $ income: num [1:3] 100 200 300
You can use map_at
df[] <- purrr::map_at(df, numeric_variables, as.numeric)
df
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300

What is the process of applying a dplyr function to a list of values

I have created a dplyr function to evaluate counts of events for a population. The code works when used with explicit naming of variables within the dplyr::filter and dplyr::group_by functions.
I need to apply the function to 24 variables which are column headers within a data frame. Here they are referred to as x.
I have used !! as I understand that the variable is evaluated as a string rather than a column name.
The function
summary_table <- function(x){
assign(paste(x,"sum_tab", sep="_"),
envir = parent.frame(),
value = df %>%
filter(!is.na(!!x)) %>%
group_by(!!x) %>%
summarise(
'Variable name' = paste0(x),
Discharged = sum(admission_status == "Discharged"),
'Re-attended' = sum(!is.na(re_admission_status)),
'Admitted on Re-attendance' = sum(re_admission_status == "Admitted", na.rm = TRUE)))
}
I have used:
sapply(var_names, summary_table)
However this only outputs one row of the table for each variable in the list var_names
In summary I would like pointers to the correct mechanism to apply the function written above to a list of column names within the dplyr pipe.
Reproducible example
example <- mtcars %>%
group_by(vs) %>%
summarise(
'6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
})
In this example we would want to apply this function to the following list
cars_var <- c("vm", "am", "carb")
This would produce three tables for each column in the list.
As #eipi10 commented, it is usually unwise to automatically create variables. A better idea is to create a single variable that is a list of data frames.
It is also easier to let users apply the groups themselves with group_by() or group_by_at(), so that you don't have to worry about how they provide the names of the variables.
EDIT 2019-05-2
One way is to regard the names of the grouping variables as the 'data', and map over them, creating a copy of the actual data grouped by each one of the grouping variables.
library(dplyr)
library(purrr)
grouping_vars <- c("vs", "am", "carb")
map(grouping_vars, group_by_at, .tbl = mtcars) %>%
map(summarise,
'6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
#> [[1]]
#> # A tibble: 2 x 4
#> vs `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 3 18 14
#> 2 1 4 9 13
#>
#> [[2]]
#> # A tibble: 2 x 4
#> am `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 4 19 19
#> 2 1 3 8 8
#>
#> [[3]]
#> # A tibble: 6 x 4
#> carb `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 1 2 4 7
#> 2 2 0 8 8
#> 3 3 0 3 3
#> 4 4 4 10 9
#> 5 6 1 1 0
#> 6 8 0 1 0
Created on 2019-05-02 by the reprex package (v0.2.1)
Original answer
Here is a function that uses dplyr::groups() to find out which variables have been grouped. Then it iterates over each grouping variable, summarises, and appends the resulting data frame to a list.
library(dplyr)
margins <- function(.data, ...) {
groups <- dplyr::groups(.data)
n <- length(groups)
out <- vector(mode = "list", length = n)
for (i in rev(seq_len(n))) {
out[[i]] <-
.data %>%
dplyr::group_by(!!groups[[i]]) %>%
dplyr::summarise(...) %>%
dplyr::group_by(!!groups[[i]]) # Reapply the original group
}
out
}
mtcars %>%
group_by(vs, am, carb) %>%
margins('6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
#> [[1]]
#> # A tibble: 2 x 4
#> # Groups: vs [2]
#> vs `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 3 18 14
#> 2 1 4 9 13
#>
#> [[2]]
#> # A tibble: 2 x 4
#> # Groups: am [2]
#> am `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 4 19 19
#> 2 1 3 8 8
#>
#> [[3]]
#> # A tibble: 6 x 4
#> # Groups: carb [6]
#> carb `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 1 2 4 7
#> 2 2 0 8 8
#> 3 3 0 3 3
#> 4 4 4 10 9
#> 5 6 1 1 0
#> 6 8 0 1 0
Created on 2019-04-24 by the reprex package (v0.2.1.9000)
If you want to group with a vector of variable names, you can use dplyr::group_by_at() and dplyr::vars().
cars_var <- c("vs", "am", "carb")
mtcars %>%
group_by_at(vars(cars_var)) %>%
margins('6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
I am the author of a small package called armgin that implements this and a few similar ideas.

Passing a list of arguments to a function with quasiquotation

I am trying to write a function in R that summarizes a data frame according to grouping variables. The grouping variables are given as a list and passed to group_by_at, and I would like to parametrize them.
What I am doing now is this:
library(tidyverse)
d = tribble(
~foo, ~bar, ~baz,
1, 2, 3,
1, 3, 5
4, 5, 6,
4, 5, 1
)
sum_fun <- function(df, group_vars, sum_var) {
sum_var = enquo(sum_var)
return(
df %>%
group_by_at(.vars = group_vars) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(group_vars = c("foo", "bar"), baz)
However, I would like to call the function like so:
d %>% sum_fun(group_vars = c(foo, bar), baz)
Which means the grouping vars should not be evaluated in the call, but in the function. How would I go about rewriting the function to enable that?
I have tried using enquo just like for the summary variable, and then replacing group_vars with !! group_vars, but it leads to this error:
Error in !group_vars : invalid argument type
Using group_by(!!!group_vars) yields:
Column `c(foo, bar)` must be length 2 (the number of rows) or one, not 4
What would be the proper way to rewrite the function?
I'd just use vars to do the quoting. Here is an example using mtcars dataset
library(tidyverse)
sum_fun <- function(.data, .summary_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(.group_vars) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun(mtcars, disp, .group_vars = vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
You can also replace .group_vars with ... (dot-dot-dot)
sum_fun2 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(...) %>% # Forward `...`
summarise(mean = mean(!!summary_var))
}
sum_fun2(mtcars, disp, vars(cyl, am))
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
If you prefer to supply inputs as a list of columns, you will need to use enquos for the ...
sum_fun3 <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
group_var <- enquos(...)
print(group_var)
.data %>%
group_by_at(group_var) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun3(mtcars, disp, c(cyl, am))
#> [[1]]
#> <quosure>
#> expr: ^c(cyl, am)
#> env: global
#>
#> # A tibble: 6 x 3
#> # Groups: cyl [?]
#> cyl am mean
#> <dbl> <dbl> <dbl>
#> 1 4 0 136.
#> 2 4 1 93.6
#> 3 6 0 205.
#> 4 6 1 155
#> 5 8 0 358.
#> 6 8 1 326
Edit: append an .addi_var to .../.group_var.
sum_fun4 <- function(.data, .summary_var, .addi_var, .group_vars) {
summary_var <- enquo(.summary_var)
.data %>%
group_by_at(c(.group_vars, .addi_var)) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun4(mtcars, disp, .addi_var = vars(gear), .group_vars = vars(cyl, am))
#> # A tibble: 10 x 4
#> # Groups: cyl, am [?]
#> cyl am gear mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 3 120.
#> 2 4 0 4 144.
#> 3 4 1 4 88.9
#> 4 4 1 5 108.
#> 5 6 0 3 242.
#> 6 6 0 4 168.
#> 7 6 1 4 160
#> 8 6 1 5 145
#> 9 8 0 3 358.
#> 10 8 1 5 326
group_by_at() can also take input as a character vector of column names
sum_fun5 <- function(.data, .summary_var, .addi_var, ...) {
summary_var <- enquo(.summary_var)
addi_var <- enquo(.addi_var)
group_var <- enquos(...)
### convert quosures to strings for `group_by_at`
all_group <- purrr::map_chr(c(addi_var, group_var), quo_name)
.data %>%
group_by_at(all_group) %>%
summarise(mean = mean(!!summary_var))
}
sum_fun5(mtcars, disp, gear, cyl, am)
#> # A tibble: 10 x 4
#> # Groups: gear, cyl [?]
#> gear cyl am mean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 4 0 120.
#> 2 3 6 0 242.
#> 3 3 8 0 358.
#> 4 4 4 0 144.
#> 5 4 4 1 88.9
#> 6 4 6 0 168.
#> 7 4 6 1 160
#> 8 5 4 1 108.
#> 9 5 6 1 145
#> 10 5 8 1 326
Created on 2018-10-09 by the reprex package (v0.2.1.9000)
You can rewrite the function using a combination of dplyr::group_by(), dplyr::across(), and curly curly embracing {{. This works with dplyr version 1.0.0 and greater.
I've edited the original example and code for clarity.
library(tidyverse)
my_data <- tribble(
~foo, ~bar, ~baz,
"A", "B", 3,
"A", "C", 5,
"D", "E", 6,
"D", "E", 1
)
sum_fun <- function(.data, group, sum_var) {
.data %>%
group_by(across({{ group }})) %>%
summarize("sum_{{sum_var}}" := sum({{ sum_var }}))
}
sum_fun(my_data, group = c(foo, bar), sum_var = baz)
#> `summarise()` has grouped output by 'foo'. You can override using the `.groups` argument.
#> # A tibble: 3 x 3
#> # Groups: foo [2]
#> foo bar sum_baz
#> <chr> <chr> <dbl>
#> 1 A B 3
#> 2 A C 5
#> 3 D E 7
Created on 2021-09-06 by the reprex package (v2.0.0)
You could make use of the ellipse .... Take the following example:
sum_fun <- function(df, sum_var, ...) {
sum_var <- substitute(sum_var)
grps <- substitute(list(...))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
d %>% sum_fun(baz, foo, bar)
We take the additional arguments and create a list out of them. Afterwards we use non-standard evaluation (substitute) to get the variable names and prevent R from evaluating them. Since group_by_at expects an object of type character or numeric, we simply convert the vector of names into a vector of characters and the function gets evaluated as we would expect.
> d %>% sum_fun(baz, foo, bar)
# A tibble: 3 x 3
# Groups: foo [?]
foo bar `sum(baz)`
<dbl> <dbl> <dbl>
1 1 2 3
2 1 3 5
3 4 5 7
If you do not want to supply grouping variables as any number of additional arguments, then you can of course use a named argument:
sum_fun <- function(df, sum_var, grps) {
sum_var <- enquo(sum_var)
grps <- as.list(substitute(grps))[-1L]
return(
df %>%
group_by_at(.vars = as.character(grps)) %>%
summarize(sum(!! sum_var))
)
}
sum_fun(mtcars, sum_var = hp, grps = c(cyl, gear))
The reason why I use substitute is that it makes it easy to split the expression list(cyl, gear) in its components. There might be a way to use rlang but I have not digged into that package so far.

Resources