dplyr::mutate unquote RHS - r

I am wondering how to properly UQ string created variable names on the RHS in dplyr methods like mutate. See the error messages I got in comments in the wilcox.test part of this MWE:
require(dplyr)
dfMain <- data.frame(
base = c(rep('A', 5), rep('B', 5)),
id = letters[1:10],
q0 = rnorm(10)
)
backgs <- list(
A = rnorm(13),
B = rnorm(11)
)
fun <- function(dfMain, i = 0){
pcol <- sprintf('p%i', i)
qcol <- sprintf('q%i', i)
(
dfMain %>%
group_by(id) %>%
mutate(
!!pcol := ifelse(
!is.nan(!!qcol) &
length(backgs[[base]]),
wilcox.test(
# !!(qcol) - backgs[[base]]
# object 'base' not found
# (!!qcol) - backgs[[base]]
# non-numeric argument to binary operator
(!!qcol) - backgs[[base]]
)$p.value,
NaN
)
)
)
}
dfMain <- dfMain %>% fun()
I guess at !!(qcol) ... it is interpreted as I would like to unquote the whole expression not only the variable name that's why it does not find base? I also found out that (!!qcol) returns the string itself so no surprise the - operator is unable to handle it.

Your code should work as you expect by changing the line where you define qcol to:
qcol <- as.symbol(sprintf('q%i', i))
That is, since qcol was a string, you needed to turn it into a symbol before unquoting for it to be evaluated correctly in your mutate. Also I presume the column you wanted to refer to was the q0 column you defined in your data, not a non-existent column named qval0.

Related

Using cur_column() with mutate(across(all_of())) - Recursive Error

My goal is to take a subset of columns (COL_NAMES) within a dataframe (LR_DATA) and apply a function (FUNCTION). The dataframe (LR_DATA) is mostly nested vectors except for one identifying column (var1). However, I cannot seem to correctly pass the override inputs which are nested under the current column name with the additional suffix "_OVERRIDE".
TRENDED_LR_DATA <- LR_DATA %>% mutate(across(all_of(COL_NAMES), ~list(FUNCTION(var1, unlist(var2), unlist(.x), unlist(!!sym(paste0(cur_column(), "_OVERRIDE")))))))
Specifically I get the error:
If I replace cur_column() with a hardcoded string the code works (though obviously not as intended since it would be referencing the same override column for each specified column in COL_NAMES. Any tips this group has would be greatly appreciated - I'm relatively new to R so please bear with me ^_^.
EDIT: Below is code to reproduce the error in full. Sorry for not including this on the original question submission.
library(dplyr)
LR_DATA <- data.frame(STATE = c(1,2,3),
YEAR = c(2000,2001,2002),
DEVT_A = c(2,4,6),
DEVT_B = c(3,6,9),
DEVT_C = c(4,8,12))
LOSS_COLS <- c("DEVT_A", "DEVT_B", "DEVT_C")
DATA_OVERRIDE <- data.frame(STATE = c(1,2,3),
DEVT_A_OVERRIDE = c(NaN,1,1),
DEVT_B_OVERRIDE = c(1,1,1),
DEVT_C_OVERRIDE = c(1.5,1.5,1.5))
LR_DATA <- LR_DATA %>% left_join(DATA_OVERRIDE, by = 'STATE')
TRENDED_LR_DATA <- LR_DATA %>% summarise(across(everything(), list), .groups = "keep") %>%
mutate(across(all_of(LOSS_COLS), ~list(TREND_LOSS(unlist(.x), unlist(YEAR), unlist( !!sym(paste0(cur_column(), "_OVERRIDE"))) ))))
TREND_LOSS <-
function(LOSSES,
YEARS,
OVERRIDES) {
x <- YEARS
y = log(LOSSES)
xy = x * y
x_sq = x * x
sum_x <- sum(x)
sum_y <- sum(y)
sum_xy <- sum(xy)
sum_x_sq <- sum(x_sq)
n <- length(YEARS)
Slope <- (n*sum_xy - sum_x*sum_y) / (n*sum_x_sq - sum_x*sum_x)
OVERRIDES[is.na(OVERRIDES)] <- Slope
TRENDED_LOSSES <- LOSSES*exp(OVERRIDES)
return(TRENDED_LOSSES)
}
}

R sometimes fails to evaluate expressions parsed from strings

I have a massive dataframe where I need to create "lagged" variables and compare them with former time points. As this process needs to be variable, I've chosen to write my own functions which create these lagged variables (not included here).
As I use GLM's, I want to use the stepAIC function and before I start writing tenth of "lag01 + lag02..." I wanted to create another function (modelfiller) which creates these strings according to my parameters and then I use string2lang to make them expressions.
This mostly works but there is one issue which I cannot get my head around.
As you can see in the reprex full.model can be created when I only use y~x+lag01+lag02. If I use modelfiller("y", 2, "x", "lag") at location 1 and 3 it also works. But the moment I put modelfiller("y", 2, "x", "lag") at location 2 in the code (within the stepAIC glm) it creates the following error message:
Error: Problem with `mutate()` input `GLM_AIC`.
x object '.x' not found
i Input `GLM_AIC` is `purrr::map(...)`.
i The error occurred in group 1: group = "a".
I have also tried as.formula with & without eval, but it caused the same issue.
group <- c(rep("a", 10), rep("b", 10), rep("c", 10))
order <- c(seq(1:10), seq(1:10), seq(1:10))
x <- c(runif(30))
y <- c(runif(30))
df <- data.frame(group, order, x, y)
df <- df %>%
dplyr::group_by(group) %>%
dplyr::arrange(group, order) %>%
dplyr::mutate(lag01 = dplyr::lag(x, n=1),
lag02 = dplyr::lag(x, n=2)) %>%
tidyr::drop_na()
modelfiller = function(depPar, maxlag, indepPar, str) {
varnames = list()
for (i in seq(1:maxlag)) {
varnames[i] = paste0(str, stringr::str_pad(i, width = 2, pad = "0"))
}
varnames = paste0(varnames, collapse="+")
varnames = paste(indepPar, varnames, sep = "+")
return(paste(depPar, varnames, sep = "~"))
}
full.model <- df %>%
tidyr::nest(- group) %>%
dplyr::mutate(
# Perform GLM calculation on each group and then a step-wise model selection based on AIC
GLM = purrr::map(
data, ~ lm(data = .x,
# Location 1 - Working
str2lang(modelfiller("y", 2, "x", "lag"))
#y~x+lag01+lag02
)),
GLM_AIC = purrr::map(
data, ~ MASS::stepAIC(glm(data = .x,
# Location 2 - NOT Working
str2lang(modelfiller("y", 2, "x", "lag"))
#y~x+lag01+lag02
)
,direction = "both", trace = FALSE, k = 2,
scope = list(
lower = lm(data = .x,
y ~ 1),
upper = glm(data = .x,
# Location 3 - Working
str2lang(modelfiller("y", 2, "x", "lag"))
#y~x+lag01+lag02
)
)))
)
The issue is that glm stores the name of the variable used to reference the data, and stepAIC then attempts to retrieve this name and evaluate it to access the data, but gets confused about which environment the variable was defined in. To demonstrate, I'm going to simplify your code to
mdl <- str2lang(modelfiller("y", 2, "x", "lag")) # This is your y~x+lag01+lag02
dfn <- df %>% tidyr::nest( data = c(-group) ) # First step of your %>% chain
glms <- purrr::map( dfn$data, ~glm(data = .x, mdl) ) # Construct the models
# Examine glms to observe that
# Call: glm(formula = mdl, data = .x) <--- glm() remembers that the data is in .x
# but stepAIC is not properly aware of where .x
# is defined and behaves effectively as
MASS::stepAIC( glms[[1]] ) # Error: object '.x' not found
Option 1
One workaround is to manually construct the expression that contains the data and then evaluate it:
glm2 <- function(.df, ...) {
eval(rlang::expr(glm(!!rlang::enexpr(.df),!!!list(...)))) }
glms2 <- purrr::map( dfn$data, ~glm2(data = .x, mdl) ) # Same as above, but with glm2
MASS::stepAIC( glms2[[1]] ) # Now works
Changing glm to glm2 in your problematic spot makes your code work too. The down side is that the Call: then remembers the entire data frame, which can be problematic if they are very large.
Option 2
Another alternative is to replace the purrr call with a for loop, which helps maintain the calling frames assumed by stepAIC, thus guiding it to where the data is defined
# This fails with Error: object '.x' not found
purrr::map( dfn$data, ~MASS::stepAIC(glm(data=.x, mdl), direction="both") )
# This works
for( mydata in dfn$data )
MASS::stepAIC(glm(data=mydata, mdl), direction="both")
The advantage here is not needing to store the entire data frame inside the call. The disadvantage is that you effectively lose access to what purrr does to streamline the code.

Why am I receiving "invalid 'right' arguement" when using cut()

I created a function in R that creates deciles (or any n-tile) based on a volume metric as opposed to observation counts.
User_Decile <- function(x,n,Output = " "){
require(dplyr)
df <- data_frame(index = seq_along(x),value = x)
x_sum <- sum(df$value)
x_ranges <- x_sum/n
df <- df %>% arrange(value)
df$cumsum <- cumsum(df$value)
df$bins <- cut(df$cumsum, breaks = floor(seq(0, x_sum, x_ranges)),
right = T,
include.lowest = T,
labels = as.integer(seq(1,n,1)))
if(Output == "Summary"){
df <- df %>% group_by(bins)
return(df %>% summarise(Lower_Bound = min(value),
Upper_Bound = max(value) - 1,
Value_sum = sum(value)))}
else {
df <- df %>% arrange(index)
return(as.numeric(df$bins))}
}
(x is a vector of numbers, n is the number of bins/-tiles to group the data into, Output= specifies if you want a summary of the bounds/data or the actual data itself.)
It previous worked well within a program I created to segment some data, but I just tried to use the function again for the first time in a couple months and I'm getting:
Error in .bincode(x, breaks, right, included.lowest) :
invalid 'right' argument
According to the error, the issue is with the 'right' argument in the cut() function. As far as I know, the right= argument is boolean and only takes T or F values. I've tried both, but neither seems to work.
Does anyone have a workaround for this issue, or can recommend another function in place of cut()?
?TRUE states that:
TRUE and FALSE are reserved words denoting logical constants in the R
language, whereas T and F are global variables whose initial values
set to these.
It appears that T is being interpreted as something else here. You should always use TRUE to be on the safe side.

tidyeval difference between mutate `:=` and mutate `=`

Both these code blocks work even though they use different equal signs, one with := and the other with =. Which is correct and why? I thought tidyeval required := when using dplyr functions, but strange enough = works just fine in my mutate call.
1
library(tidyverse)
set.seed(1)
graph.data <- tibble(cal.date = as.Date(40100:40129, origin = "1899-12-30"),
random_num = rnorm(30, 8, 5))
child_function <- function(df, variable, hor.line = 6) {
variable <- enquo(variable)
df <- mutate(df, mutation := 2 * !! variable, horizontal.line := hor.line)
df
}
child_function(graph.data, variable = random_num, hor.line=8)
2
library(tidyverse)
set.seed(1)
graph.data <- tibble(cal.date = as.Date(40100:40129, origin = "1899-12-30"),
random_num = rnorm(30, 8, 5))
child_function <- function(df, variable, hor.line = 6) {
variable <- enquo(variable)
df <- mutate(df, mutation = 2 * !! variable, horizontal.line = hor.line)
df
}
child_function(graph.data, variable = random_num, hor.line=8)
The := operator's purpose is to allow you to dynamically set the name of variable on the LHS (left hand side) of the equation, which you are not doing here.
In many cases, including this one, we're just concerned with manipulating the RHS. The := would come in handy if you wanted to control the name of the "mutation" variable.
https://dplyr.tidyverse.org/articles/programming.html#setting-variable-names
There is no obligation to put := in that case.
It becomes obligatory when you want to do something like:
child_function <- function(df, variable, hor.line = 6, mt_name = "mutation") {
variable <- enquo(variable)
df <- mutate(df, !! mt_name := 2 * !! variable, horizontal.line = hor.line)
}
A little bit hard to track down, but from ?quasiquotation
Unfortunately R is very strict about the kind of expressions supported
on the LHS of =. This is why we have made the more flexible :=
operator an alias of =. You can use it to supply names, e.g. a := b is
equivalent to a = b. Since its syntax is more flexible you can unquote
on the LHS:

How can I use purrr:: invoke_map(), on a function that uses enquo()?

Sorry, I know the title isn't the best.
So, I'm trying to write a function that checks whether the output of 2 functions is the same. I'm trying to do this with purrr::invoke_map(). However, I'm struggling to get it to work with any function that uses enquo().
I assume the issue is because the arguments are being evaluated before I want them to be evaluated.
library(tidyverse)
check_output <- function(function_list, param_list = NULL){
param_list <- list(param_list)
output <- invoke_map(.f = function_list, .x = param_list)
identical(output[1], output[2])
}
check_output(function_list = list(cumprod, cumsum), # This works
param_list = list(x = iris$Sepal.Length))
check_output(function_list = list(cumsum, cumsum), # This works
param_list = list(x = iris$Sepal.Length))
m <- function(data, col_name){ # Enquo test function
col_name <- enquo(col_name)
data %>%
select(!!col_name)
}
n <- m
n(iris, Species) # Seeing if the functions work
check_output(function_list = list(m, n), # The call doesn't work
param_list = list(data = iris, col_name = Species))
I solved my problem by adding a helper function called add_quo, which turns the appropriate arguments into quosures.
require(purrr)
require(magrittr)
require(rlang) # add_quo uses this package
add_quo <- function(...) {
## Formats arguments for use in invoke_map()
# Create a logical vector for use as an index below
index <- exprs(...) %>% # Captures the arguments
unlist() %>% # Makes them mappable
map_if(negate(is.name),
.f = function(x) {return(TRUE)}) %>% # Replaces unnecessary args with TRUE
map_if(is.name, as_character) %>% # Converts named arguments to characters
map_if(is.character, exists) %>% # Replaces named args that aren't objects as FALSE
unlist() # To make index mappable
# Wraps named args that aren't objects in new_quosure()
exprs(...) %>% # Captures arguments
unlist() %>% # Makes them mappable
map_if(!index, new_quosure, # Wraps non-object named arg in new_quosure()
env = global_env()) %>% # new_quosure() used instead of quo() as env must be global
map_if(index, eval) %>% # Removes expr() formatting
list() # Makes arguments usable in invoke_map()
}
is.output.same<- function(.f_list, ...) {
## Checks to see if functions produce identical output
param_list <- add_quo(...) # Enables function with dplyr syntax to work
invoke_map(.f = .f_list, .x = param_list) %>% # Pass args to functions
reduce(identical) # Sees if all outputs are identical
}
check_output(function_list = list(m, n), data = iris, col_name = Species) #now works

Resources