create function to pass into dplyr::summarise - r

In my data preparation, I want to create a function for repeated computations into the summarise function. So the idea is to create a function like so:
my_func <-
function(criteria){
sum(case_when(eval(rlang::parse_expr(criteria)))*100, na.rm = TRUE)
}
So then, I can use that function to parse different criteria:
DT %>%
group_by(group_var) %>%
summarise(
# Indicator A:
ia = my_func(var_x %in% c(1,2,3)~1,TRUE ~ 0),
# Indicator B:
ft = my_func(var_x %in% c(4,5)~1,TRUE ~ 0)
)
But, with the above code, I got an error. I really appreciate any idea on how to make this work.

IMHO there is no reason to use rlang::parse_expr. Instead you could use ... like so:
library(dplyr)
my_func <- function(...) {
sum(case_when(...) * 100, na.rm = TRUE)
}
mtcars %>%
group_by(am) %>%
summarise(
ia = my_func(cyl %in% c(4, 6) ~ 1, TRUE ~ 0)
)
#> # A tibble: 2 × 2
#> am ia
#> <dbl> <dbl>
#> 1 0 700
#> 2 1 1100
EDIT To pass a column to scale the result instead of the hard-coded 100 you could do:
my_func <- function(..., scale) {
sum(case_when(...) * {{ scale }}, na.rm = TRUE)
}
mtcars %>%
group_by(am) %>%
summarise(
ia = my_func(cyl %in% c(4, 6) ~ 1, TRUE ~ 0, scale = mpg)
)
#> # A tibble: 2 × 2
#> am ia
#> <dbl> <dbl>
#> 1 0 145.
#> 2 1 286.

Related

Different results of a full_join in arrow and dplyr

I get different results when using full_join on tibble and on arrow_table. Maybe somebody can give a hand on what is going on?
library(arrow)
library(dplyr)
xa1 <- arrow_table(x = 1L)
xa2 <- arrow_table(x = 2L)
x1 <- tibble(x = 1L)
x2 <- tibble(x = 2L)
full_join(xa1,xa2,on = c("x")) %>% collect() %>% compute()
full_join(x1,x2)
# A tibble: 2 × 1
x
<int>
1 1
2 NA
full_join(x1,x2)
Joining, by = "x"
# A tibble: 2 × 1
x
<int>
1 1
2 2
There is no on argument in dplyr::.*_join. Usage according to ?dplyr::full_join is
full_join(
x,
y,
by = NULL,
copy = FALSE,
suffix = c(".x", ".y"),
...,
keep = NULL
)
on is a data.table join argument. We need by here
library(arrow)
library(dplyr)
full_join(xa1, xa2, by = "x") %>%
collect() %>%
compute()
-output
# A tibble: 2 × 1
x
<int>
1 1
2 2
By looking at the methods and source code
> methods("full_join")
[1] full_join.arrow_dplyr_query* full_join.ArrowTabular* full_join.data.frame* full_join.Dataset* full_join.RecordBatchReader*
> getAnywhere(full_join.ArrowTabular)
function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"),
..., keep = FALSE)
{
query <- do_join(x, y, by, copy, suffix, ..., keep = keep,
join_type = "FULL_OUTER")
if (!keep) {
query$selected_columns <- post_join_projection(names(x),
names(y), handle_join_by(by, x, y), suffix)
}
query
}
by is used in the functions that are called inside

i want to write a custom function with tidyverse verbs/syntax that accepts the grouping parameters of my function as string

I want to write a function that has as parameters a data set, a variable to be grouped, and another parameter to be filtered. I want to write the function in such a way that I can afterwards apply map() to it and pass the variables to be grouped in to map() as a vector. Nevertheless, I don't know how my custom function rating() accepts the variables to be grouped as a string. This is what i have tried.
data = tibble(a = seq.int(1:10),
g1 = c(rep("blue", 3), rep("green", 3), rep("red", 4)),
g2 = c(rep("pink", 2), rep("hotpink", 6), rep("firebrick", 2)),
na = NA,
stat=c(23,43,53,2,43,18,54,94,43,87))
rating = function(data, by, no){
data %>%
select(a, {{by}}, stat) %>%
group_by({{by}}) %>%
mutate(rank = rank(stat)) %>%
ungroup() %>%
filter(a == no)
}
fn(data = data, by = g2, no = 5) #this works
And this is the way i want to use my function
map(.x = c("g1", "g2"), .f = ~rating(data = data, by = .x, no = 1))
... but i get
Error: Must group by variables found in `.data`.
* Column `.x` is not found.
As we are passing character elements, it would be better to convert to symbol and evaluate (!!)
library(dplyr)
library(purrr)
rating <- function(data, by, no){
by <- rlang::ensym(by)
data %>%
select(a, !! by, stat) %>%
group_by(!!by) %>%
mutate(rank = rank(stat)) %>%
ungroup() %>%
filter(a == no)
}
-testing
> map(.x = c("g1", "g2"), .f = ~rating(data = data, by = !!.x, no = 1))
[[1]]
# A tibble: 1 × 4
a g1 stat rank
<int> <chr> <dbl> <dbl>
1 1 blue 23 1
[[2]]
# A tibble: 1 × 4
a g2 stat rank
<int> <chr> <dbl> <dbl>
1 1 pink 23 1
It also works with unquoted input
> rating(data, by = g2, no = 5)
# A tibble: 1 × 4
a g2 stat rank
<int> <chr> <dbl> <dbl>
1 5 hotpink 43 3

Using group_by and summarise_all to create dummy indicators for categorical variable

I want to generate dummy indicators for each id for the given categorical variable fruit. I observe the following warning when using summarise_all and self defined function. I also tried to use summarise_all(any) and it gave me warning when coercing double to logical. Is there any efficient or updated way to implement this? Thanks a lot!
fruit = c("apple", "banana", "orange", "pear",
"strawberry", "blueberry", "durian",
"grape", "pineapple")
df_sample = data.frame(id = c(rep("a", 3), rep("b", 5), rep("c", 6)),
fruit = c(sample(fruit, replace = T, size = 3),
sample(fruit, replace = T, size = 5),
sample(fruit, replace = T, size = 6)))
fruit_indicator =
model.matrix(~ -1 + fruit, df_sample) %>%
as.data.frame() %>%
bind_cols(df_sample) %>%
select(-fruit) %>%
group_by(id) %>%
summarise_all(funs(ifelse(any(. > 0), 1, 0)))
# Warning message:
# `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
#
# # Simple named list:
# list(mean = mean, median = median)
#
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
#
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
You can use across which is available in dplyr 1.0.0 or higher.
library(dplyr)
model.matrix(~ -1 + fruit, df_sample) %>%
as.data.frame() %>%
bind_cols(df_sample) %>%
select(-fruit) %>%
group_by(id) %>%
summarise(across(.fns = ~as.integer(any(. > 0))))
# id fruitapple fruitbanana fruitdurian fruitgrape fruitpear
#* <chr> <int> <int> <int> <int> <int>
#1 a 0 1 1 0 1
#2 b 1 0 0 1 0
#3 c 0 1 0 1 1
# … with 1 more variable: fruitpineapple <int>

Mutate a column of models: "Error: Problem with `mutate()` input `model`. x Input `model` must be a vector, not a `lm` object."

I have a dataframe that contains as a column a model formula definition. I would like to mutate a new column where each row is a model based on the corresponding rows model definition.
Some data:
# Set up
library(tidyverse)
library(lubridate)
# Create data
mydf <- data.frame(
cohort = seq(ymd('2019-01-01'), ymd('2019-12-31'), by = '1 days'),
n = rnorm(365, 1000, 50) %>% round,
cohort_cost = rnorm(365, 800, 50)
) %>%
crossing(tenure_days = 0:365) %>%
mutate(activity_date = cohort + days(tenure_days)) %>%
mutate(daily_revenue = rnorm(nrow(.), 20, 1)) %>%
group_by(cohort) %>%
arrange(activity_date) %>%
mutate(cumulative_revenue = cumsum(daily_revenue)) %>%
arrange(cohort, activity_date) %>%
mutate(payback_velocity = round(cumulative_revenue / cohort_cost, 2)) %>%
select(cohort, n, cohort_cost, activity_date, tenure_days, everything())
## wider data
mydf_wide <- mydf %>%
select(cohort, n, cohort_cost, tenure_days, payback_velocity) %>%
group_by(cohort, n, cohort_cost) %>%
pivot_wider(names_from = tenure_days, values_from = payback_velocity, names_prefix = 'velocity_day_')
Now, the final problem code block. It fails on the very last line:
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
mutate(model = lm(as.formula(mod_formula), data = mydf_wide))
Error: Problem with mutate() input model.
x Input model must be a vector, not a lm object.
ℹ Input model is lm(as.formula(mod_formula), data = mydf_wide).
If I run the last code block minus the last line and take a look at the resulting data frame 'models' it looks like this:
models %>% head
from to mod_formula
1 1 2 velocity_day_2 ~ velocity_day_1
2 1 3 velocity_day_3 ~ velocity_day_1
3 1 4 velocity_day_4 ~ velocity_day_1
4 1 5 velocity_day_5 ~ velocity_day_1
5 1 6 velocity_day_6 ~ velocity_day_1
6 1 7 velocity_day_7 ~ velocity_day_1
I tried making it a list column, but to do that as far as I'm aware I need to group by. But in this case I need to group by everything. I amended the last code block:
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
group_by_all() %>%
nest() %>%
mutate(model = lm(as.formula(mod_formula), data = mydf_wide))
However this results in the same error.
How can I add a new column onto 'models' that contains a linear model for each row based on the formula in field 'mod_formula'?
lm is not vectorized. Add rowwise to create a model for each row.
library(dplyr)
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
rowwise() %>%
mutate(model = list(lm(as.formula(mod_formula), data = mydf_wide)))
models
# from to mod_formula model
# <int> <int> <chr> <list>
#1 1 2 velocity_day_2 ~ velocity_day_1 <lm>
#2 1 3 velocity_day_3 ~ velocity_day_1 <lm>
#3 1 4 velocity_day_4 ~ velocity_day_1 <lm>
#4 1 5 velocity_day_5 ~ velocity_day_1 <lm>
#5 1 6 velocity_day_6 ~ velocity_day_1 <lm>
#6 1 7 velocity_day_7 ~ velocity_day_1 <lm>
#...
#...
You can also use map instead of rowwise.
mutate(model = purrr::map(mod_formula, ~lm(.x, data = mydf_wide)))

multidplyr: trial custom function

I'm trying to learn to run a custom function through multidplyr::do() on a cluster. Consider this simple self contained example. For example's sake, I'm trying to apply my custom function myWxTest to each common_dest (destinations with more than 50 flights) in the flight dataset:
library(dplyr)
library(multidplyr)
library(nycflights13)
library(quantreg)
myWxTest <- function(x){
stopifnot(!is.null(x$dep_time))
stopifnot(!is.null(x$dep_delay))
stopifnot(!is.null(x$sched_dep_time))
stopifnot(!is.null(x$sched_arr_time))
stopifnot(!is.null(x$arr_time))
out_mat <- c('(Intercept)' = NA, dep_time = NA, dep_delay = NA, sched_dep_time = NA, sched_arr_time = NA)
if(length(x$arr_time)>5){
model_1 <- quantreg::rq(arr_time ~ dep_time + dep_delay + sched_dep_time + sched_arr_time, data = x, tau = .5)
out_mat[names(coef(model_1))] <- coef(model_1)
}
return(out_mat)
}
common_dest <- flights %>%
count(dest) %>%
filter(n >= 365) %>%
semi_join(flights, .) %>%
mutate(yday = lubridate::yday(ISOdate(year, month, day)))
cluster <- create_cluster(2)
set_default_cluster(cluster)
by_dest <- common_dest %>%
partition(dest, cluster = cluster)
cluster_library(by_dest, "quantreg")
So far so good (but I'm just reproducing the examples from the vignette). Now, I have to send my custom function to each node:
cluster %>% cluster_call(myWxTest)
But I get:
Error in checkForRemoteErrors(lapply(cl, recvResult)) :
2 nodes produced errors; first error: argument "x" is missing, with no default
eventually, I want to apply myWxTest to each subgroup:
models <- by_dest %>%
do(myWxTest(.))
I got it running with a couple tweaks:
library(dplyr)
library(multidplyr)
library(nycflights13)
library(quantreg)
myWxTest <- function(x){
stopifnot(!is.null(x$dep_time))
stopifnot(!is.null(x$dep_delay))
stopifnot(!is.null(x$sched_dep_time))
stopifnot(!is.null(x$sched_arr_time))
stopifnot(!is.null(x$arr_time))
out_mat <- c('(Intercept)' = NA, dep_time = NA, dep_delay = NA, sched_dep_time = NA, sched_arr_time = NA)
if(length(x$arr_time)>5){
model_1 <- quantreg::rq(arr_time ~ dep_time + dep_delay + sched_dep_time + sched_arr_time, data = x, tau = .5)
out_mat[names(coef(model_1))] <- coef(model_1)
}
return(as.data.frame(out_mat, stringsAsFactors = FALSE)) # change result to data.frame, not matrix
}
common_dest <- flights %>%
count(dest) %>%
filter(n >= 365) %>%
semi_join(flights, .) %>%
mutate(yday = lubridate::yday(ISOdate(year, month, day)))
by_dest <- common_dest %>% partition(dest)
cluster_library(by_dest, "quantreg")
cluster_copy(by_dest, myWxTest) # copy function to each node
models <- by_dest %>% do(myWxTest(.)) %>% collect() # collect data from clusters
...which returns a local data.frame:
models
#> Source: local data frame [390 x 2]
#> Groups: dest [78]
#>
#> dest out_mat
#> <chr> <dbl>
#> 1 CAK 156.5248953
#> 2 CAK 0.9904261
#> 3 CAK -0.0767928
#> 4 CAK -0.3523211
#> 5 CAK 0.3220386
#> 6 DCA 74.5959035
#> 7 DCA 0.2751917
#> 8 DCA 1.0712483
#> 9 DCA 0.2874165
#> 10 DCA 0.4344960
#> # ... with 380 more rows

Resources