Background
I have a simple helper function that applies left_join to any number of passed tables in other to gather them and return one object.
Example
# Settings ----------------------------------------------------------------
library("tidyverse")
set.seed(123)
# Data --------------------------------------------------------------------
sample_one <-
tibble(
column_a = c(1, 2),
column_b = runif(n = 2),
column_other = runif(n = 2)
)
sample_two <-
tibble(
column_a = c(1, 2),
column_b = runif(n = 2),
column_other = runif(n = 2)
)
sample_three <-
tibble(
column_a = c(1, 2),
column_b = runif(n = 2),
column_other = runif(n = 2)
)
# Function ----------------------------------------------------------------
left_join_on_column_a <- function(keep_var, ...) {
keep_var <- enquo(keep_var)
dots <- list(...)
clean_dfs <- map(dots, select, !!keep_var, "column_a")
reduce(.x = clean_dfs,
.f = left_join,
"column_a") %>%
gather(key = "model_type", !!keep_var, -column_a)
}
# Test --------------------------------------------------------------------
left_join_on_column_a(keep_var = column_b, sample_one, sample_two, sample_three)
Problem
I would like to be able to programmatically modify the suffix argument of left_join:
suffix
If there are non-joined duplicate variables in x and y, these
suffixes will be added to the output to disambiguate them. Should be a
character vector of length 2.
Current results
# A tibble: 6 x 3
column_a model_type column_b
<dbl> <chr> <dbl>
1 1 column_b.x 0.288
2 2 column_b.x 0.788
3 1 column_b.y 0.940
4 2 column_b.y 0.0456
5 1 column_b 0.551
6 2 column_b 0.457
Desired results
# A tibble: 6 x 3
column_a model_type column_b
<dbl> <chr> <dbl>
1 1 sample_one 0.288
2 2 sample_one 0.788
3 1 sample_two 0.940
4 2 sample_two 0.0456
5 1 sample_three 0.551
6 2 sample_three 0.457
The model_type column reflects name of the object passed via ....
Attempts
I was trying to capture names of the objects passed within ... but it's not a named object so it doesn't make sense:
left_join_on_column_a <- function(keep_var, ...) {
keep_var <- enquo(keep_var)
dots <- list(...)
table_names <- names(dots)
clean_dfs <- map(dots, select, !!keep_var, "column_a")
reduce(.x = clean_dfs,
.f = left_join,
"column_a",
table_names) %>%
gather(key = "model_type", !!keep_var, -column_a)
}
Maybe rename column_b so that you don't have to worry about suffix
left_join_on_column_a <- function(keep_var, common_var, ...) {
nm = unname(sapply(rlang::enexprs(...), as.character))
keep_var <- as.character(substitute(keep_var))
common_var = as.character(substitute(common_var))
foo = function(x, y) {
x %>% select(!!common_var, !!y := !!keep_var)
}
reduce(.x = Map(foo, list(...), nm),
.f = left_join,
common_var) %>%
gather("model_type", !!keep_var, -!!common_var)
}
left_join_on_column_a(column_b, column_a, sample_one, sample_two, sample_three)
Related
I get different results when using full_join on tibble and on arrow_table. Maybe somebody can give a hand on what is going on?
library(arrow)
library(dplyr)
xa1 <- arrow_table(x = 1L)
xa2 <- arrow_table(x = 2L)
x1 <- tibble(x = 1L)
x2 <- tibble(x = 2L)
full_join(xa1,xa2,on = c("x")) %>% collect() %>% compute()
full_join(x1,x2)
# A tibble: 2 × 1
x
<int>
1 1
2 NA
full_join(x1,x2)
Joining, by = "x"
# A tibble: 2 × 1
x
<int>
1 1
2 2
There is no on argument in dplyr::.*_join. Usage according to ?dplyr::full_join is
full_join(
x,
y,
by = NULL,
copy = FALSE,
suffix = c(".x", ".y"),
...,
keep = NULL
)
on is a data.table join argument. We need by here
library(arrow)
library(dplyr)
full_join(xa1, xa2, by = "x") %>%
collect() %>%
compute()
-output
# A tibble: 2 × 1
x
<int>
1 1
2 2
By looking at the methods and source code
> methods("full_join")
[1] full_join.arrow_dplyr_query* full_join.ArrowTabular* full_join.data.frame* full_join.Dataset* full_join.RecordBatchReader*
> getAnywhere(full_join.ArrowTabular)
function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"),
..., keep = FALSE)
{
query <- do_join(x, y, by, copy, suffix, ..., keep = keep,
join_type = "FULL_OUTER")
if (!keep) {
query$selected_columns <- post_join_projection(names(x),
names(y), handle_join_by(by, x, y), suffix)
}
query
}
by is used in the functions that are called inside
I want to write a function that has as parameters a data set, a variable to be grouped, and another parameter to be filtered. I want to write the function in such a way that I can afterwards apply map() to it and pass the variables to be grouped in to map() as a vector. Nevertheless, I don't know how my custom function rating() accepts the variables to be grouped as a string. This is what i have tried.
data = tibble(a = seq.int(1:10),
g1 = c(rep("blue", 3), rep("green", 3), rep("red", 4)),
g2 = c(rep("pink", 2), rep("hotpink", 6), rep("firebrick", 2)),
na = NA,
stat=c(23,43,53,2,43,18,54,94,43,87))
rating = function(data, by, no){
data %>%
select(a, {{by}}, stat) %>%
group_by({{by}}) %>%
mutate(rank = rank(stat)) %>%
ungroup() %>%
filter(a == no)
}
fn(data = data, by = g2, no = 5) #this works
And this is the way i want to use my function
map(.x = c("g1", "g2"), .f = ~rating(data = data, by = .x, no = 1))
... but i get
Error: Must group by variables found in `.data`.
* Column `.x` is not found.
As we are passing character elements, it would be better to convert to symbol and evaluate (!!)
library(dplyr)
library(purrr)
rating <- function(data, by, no){
by <- rlang::ensym(by)
data %>%
select(a, !! by, stat) %>%
group_by(!!by) %>%
mutate(rank = rank(stat)) %>%
ungroup() %>%
filter(a == no)
}
-testing
> map(.x = c("g1", "g2"), .f = ~rating(data = data, by = !!.x, no = 1))
[[1]]
# A tibble: 1 × 4
a g1 stat rank
<int> <chr> <dbl> <dbl>
1 1 blue 23 1
[[2]]
# A tibble: 1 × 4
a g2 stat rank
<int> <chr> <dbl> <dbl>
1 1 pink 23 1
It also works with unquoted input
> rating(data, by = g2, no = 5)
# A tibble: 1 × 4
a g2 stat rank
<int> <chr> <dbl> <dbl>
1 5 hotpink 43 3
I want to generate dummy indicators for each id for the given categorical variable fruit. I observe the following warning when using summarise_all and self defined function. I also tried to use summarise_all(any) and it gave me warning when coercing double to logical. Is there any efficient or updated way to implement this? Thanks a lot!
fruit = c("apple", "banana", "orange", "pear",
"strawberry", "blueberry", "durian",
"grape", "pineapple")
df_sample = data.frame(id = c(rep("a", 3), rep("b", 5), rep("c", 6)),
fruit = c(sample(fruit, replace = T, size = 3),
sample(fruit, replace = T, size = 5),
sample(fruit, replace = T, size = 6)))
fruit_indicator =
model.matrix(~ -1 + fruit, df_sample) %>%
as.data.frame() %>%
bind_cols(df_sample) %>%
select(-fruit) %>%
group_by(id) %>%
summarise_all(funs(ifelse(any(. > 0), 1, 0)))
# Warning message:
# `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
#
# # Simple named list:
# list(mean = mean, median = median)
#
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
#
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
You can use across which is available in dplyr 1.0.0 or higher.
library(dplyr)
model.matrix(~ -1 + fruit, df_sample) %>%
as.data.frame() %>%
bind_cols(df_sample) %>%
select(-fruit) %>%
group_by(id) %>%
summarise(across(.fns = ~as.integer(any(. > 0))))
# id fruitapple fruitbanana fruitdurian fruitgrape fruitpear
#* <chr> <int> <int> <int> <int> <int>
#1 a 0 1 1 0 1
#2 b 1 0 0 1 0
#3 c 0 1 0 1 1
# … with 1 more variable: fruitpineapple <int>
I am using the output coefficients from a glm regression model and I need to create a lookup value, using key paste ([column name].[Factor Level], and then return the corresponding value from another data table. The column names must be dynamic so that I don't have to explicitly name each column one by one.
The returned values from the lookup are then multiplied by 1 (for factors) or by the actual numeric values and all coef_colnames summed into column Total.
I've done some example in excel but cannot replicate it in R.
var_Factor1 combines the column name and the factor level from each row (using paste) to build a key for the next step lookup
var_Number1 is just the column name as it is numeric and has no factor levels
library(dplyr)
# original data
dt = data.table(
Factor1 = c("A","B","C"),
Number1 = c(10, 20,40),
Factor2 = c("D","H","N"),
Number2 = c(2, 5,3)
)
# Lookup table
model_coef = data.table(
Factor1.A = 10,
Factor1.B = 20,
Factor1.C = 30,
Factor2.D = 40,
Factor2.H = 50,
Factor2.N = 60,
Number1 = 200,
Number2 = 500
)
#initial steps
dt <- dt %>% mutate (
var_Factor1 = paste("Factor1", Factor1, sep =".")
, var_Number1 = "Number1"
, var_Factor2 = paste("Factor2", Factor2, sep =".")
, var_Number2 = "Number2"
) %>% mutate (
coef_Factor1 = model_coef[,var_Factor1]
)
#The final output should produce (as replicated from Excel)
final_output = data.table (
Factor1= c("A", "B", "C"),
Number1= c(10, 20, 40),
Factor2= c("D", "H", "N"),
Number2= c(2, 5, 3),
var_Factor1= c("Factor1.A", "Factor1.B", "Factor1.C"),
var_Number1= c("Number1", "Number1", "Number1"),
var_Factor2= c("Factor2.D", "Factor2.H", "Factor2.N"),
var_Number2= c("Number2", "Number2", "Number2"),
coef_Factor1= c(10, 20, 30),
coef_Number1= c(200, 200, 200),
coef_Factor2= c(40, 50, 60),
coef_Number2= c(500, 500, 500),
calc_Factor1= c(10, 20, 30),
calc_Number1= c(2000, 4000, 8000),
calc_Factor2= c(40, 50, 60),
calc_Number2= c(1000, 2500, 1500),
Total= c(3050, 6570, 9590)
)
It's generally a bad idea to try to generate and manipulate dynamic columns.
It will probably be better to use tidy data conventions and make the data "long". Also, it looks like you're trying to mix data.table and dplyr/tidyverse. In particular, this doesn't work: mutate (coef_Factor1 = model_coef[,var_Factor1]
I've tidied your data and modified your code to use dplyr/tidyverse below:
using tibble instead of data.table
re-built lookup table to tidy-long format so it can be left_joined
properly to your table
used mutate to do the calculations that you describe
Beyond your example, if you have more than 2 "Numbers"/"Factors" (your naming/labeling/numbering is confusing btw), there are ways to generalize further so that the code multiplies coef * number generically, for each "number"/combination. Also, your data implies but it isn't clear that A is related to D, B is related to H, etc.
library(tidyverse)
data <- tibble(Factor1 = c("A","B","C"),Number1 = c(10, 20,40),Factor2 = c("D","H","N"),Number2 = c(2, 5,3))
model_coef <- tibble(Factor1.A = 10,Factor1.B = 20,Factor1.C = 30,Factor2.D = 40,Factor2.H = 50,Factor2.N = 60,Number1 = 200,Number2 = 500)
(model_coef_factor1 <- model_coef %>%
select(Factor1.A:Factor1.C) %>%
pivot_longer(cols = everything(), names_to = c("number", "factor"), names_sep = "[.]", values_to = "coef_factor1") %>%
select(-number))
#> # A tibble: 3 x 2
#> factor coef_factor1
#> <chr> <dbl>
#> 1 A 10
#> 2 B 20
#> 3 C 30
(model_coef_factor2 <- model_coef %>%
select(Factor2.D:Factor2.N) %>%
pivot_longer(cols = everything(), names_to = c("number", "factor"), names_sep = "[.]", values_to = "coef_factor2") %>%
select(-number))
#> # A tibble: 3 x 2
#> factor coef_factor2
#> <chr> <dbl>
#> 1 D 40
#> 2 H 50
#> 3 N 60
(final_output <- data %>%
left_join(model_coef_factor1, by = c("Factor1"="factor")) %>%
left_join(model_coef_factor2, by = c("Factor2"="factor")) %>%
mutate(coef_number1 = model_coef$Number1,
coef_number2 = model_coef$Number2,
calc_factor1 = coef_factor1,
calc_number1 = Number1 * coef_number1,
calc_factor2 = coef_factor2,
calc_number2 = Number2 * coef_number2,
total = calc_factor1 + calc_number1 + calc_factor2 + calc_number2) %>%
select(total, everything()))
#> # A tibble: 3 x 13
#> total Factor1 Number1 Factor2 Number2 coef_factor1 coef_factor2
#> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 3050 A 10 D 2 10 40
#> 2 6570 B 20 H 5 20 50
#> 3 9590 C 40 N 3 30 60
#> # ... with 6 more variables: coef_number1 <dbl>, coef_number2 <dbl>,
#> # calc_factor1 <dbl>, calc_number1 <dbl>, calc_factor2 <dbl>,
#> # calc_number2 <dbl>
Created on 2019-10-23 by the reprex package (v0.3.0)
I would like to overload the [.data.table operator so that grouped joins can be achieved. I will use dplyr to illustrate what I mean by a grouped join. Take for example the function below that wraps around dplyr::inner_join:
inner_join_grp <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"),
...) {
stopifnot(identical(group_vars(x), group_vars(y)))
grp <- group_vars(x)
by <- c(by, grp)
return(inner_join(x, y, by, copy, suffix, ...))
}
All this function does is to check that input data.frames x and y have the same groups, and if they do, to add the grouping variables to the join key specified by by = ....
An application would be like this:
n_grp <- 5
n_grp2 <- 3
set.seed(0)
tmp_df <-
data.frame(grp = rep(letters[1:n_grp2], each = n_grp),
grp2 = rep(1:n_grp, times = n_grp2),
x = runif(n_grp * n_grp2))
tmp_df_2 <-
data.frame(grp = rep(letters[1:n_grp2], each = n_grp),
grp2 = rep(1:n_grp, times = n_grp2),
y = runif(n_grp * n_grp2))
Using my function:
tmp_df %>%
group_by(grp) %>%
inner_join_grp(tmp_df_2 %>%
group_by(grp),
by = "grp2")
Produces the expected fifteen row table:
# A tibble: 15 x 4
# Groups: grp [?]
grp grp2 x y
<fct> <int> <dbl> <dbl>
1 a 1 0.897 0.770
2 a 2 0.266 0.498
3 a 3 0.372 0.718
4 a 4 0.573 0.992
5 a 5 0.908 0.380
...
whereas using dplyr::inner_join produces a 45 row table since the join is only on grp2:
# A tibble: 45 x 5
# Groups: grp.x [?]
grp.x grp2 x grp.y y
<fct> <int> <dbl> <fct> <dbl>
1 a 1 0.897 a 0.770
2 a 1 0.897 b 0.777
3 a 1 0.897 c 0.267
...
My question is whether is possible to achieve something similar using [, when the input data tables are keyed. I know I can overload merge.data.table (see below for a rough example) in the same way but I would prefer to pass an additional argument to [ to achieve a keyed join.
merge_dt_keyed <-
function (x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE,
all.x = all, all.y = all, sort = TRUE,
suffixes = c(".x", ".y"),
allow.cartesian = getOption("datatable.allow.cartesian"),
...){
stopifnot(identical(key(x), key(y)))
new_key <- key(x)
by.x <- c(new_key, by, by.x)
by.y <- c(new_key, by, by.y)
return(merge(x, y, by.x = by.x, by.y = by.y, all = FALSE,
all.x = all, all.y = all, sort = TRUE,
suffixes = c(".x", ".y"),
allow.cartesian = getOption("datatable.allow.cartesian"),
...))
}
Something along these lines should work:
`[.data.table` = function(x, i, ...) {
args = match.call()
if ('on' %in% names(args)) {
args[['on']] = union(args[['on']], intersect(key(x), key(i)))
}
args[[1]] = data.table:::`[.data.table`
eval(args)
}