Pass multiple arguments to ddply - r

I am attempting to create a function which takes a list as input, and returns a summarised data frame. However, after trying multiple ways, I am unable to pass a list to the function for the aggregation.
So far I have the following, but it is failing.
library(dplyr)
random_df <- data.frame(
region = c("A", "B", "C", "C"),
number_of_reports = c(1, 3, 2, 1),
report_MV = c(12, 33, 22, 12)
)
output_graph <- function(input) {
print(input$arguments)
DF <- input$DF
group_by <- input$group_by
args <- input$arguments
flow <- ddply(DF, group_by, summarize, args)
return(flow)
}
graph_functions <- list(
DF = random_df,
group_by = .(region),
arguments = .(Reports = sum(number_of_reports),
MV_Reports = sum(report_MV))
)
output_graph(graph_functions)
Where this works:
library(dplyr)
random_df <- data.frame(
region = c("A", "B", "C", "C"),
number_of_reports = c(1, 3, 2, 1),
report_MV = c(12, 33, 22, 12)
)
output_graph <- function(input) {
print(input$arguments)
DF <- input$DF
group_by <- input$group_by
args <- input$arguments
flow <- ddply(
DF,
group_by,
summarize,
Reports = sum(number_of_reports),
MV_Reports = sum(report_MV)
)
return(flow)
}
graph_functions <- list(
DF = random_df,
group_by = .(region),
arguments = .(Reports = sum(number_of_reports),
MV_Reports = sum(report_MV))
)
output_graph(graph_functions)
Would anyone be aware of a way to pass a list of functions to ddply? Or another way to achieve the same goal of aggregating a dynamic set of variables.

In order to pass arguments into the function for use by dplyr, I recommend reading this regarding non-standard evaluation (NSE). Here is an edited function producing the same output as my original.
library(dplyr)
random_df <- data.frame(
region = c('A','B','C','C'),
number_of_reports = c(1, 3, 2, 1),
report_MV = c(12, 33, 22, 12)
)
output_graph <- function(df, group, args) {
grp_quo <- enquo(group)
df %>%
group_by(!!grp_quo) %>%
summarise(!!!args)
}
args <- list(
Reports = quo(sum(number_of_reports)),
MV_Reports = quo(sum(report_MV))
)
output_graph(random_df, region, args)
# # A tibble: 3 x 3
# region Reports MV_Reports
# <fctr> <dbl> <dbl>
# 1 A 1.00 12.0
# 2 B 3.00 33.0
# 3 C 3.00 34.0

Related

Define a tidyverse-function

I have a data.frame df and I would like to do some checks on the data. If there's an error (e.g. missing values or non plausible values) I would like to make a list containing the id of the case and the type of error.
# Define an empty data.frame
errors <- data.frame(id = numeric(),
message = character())
# Function that stacks all the errors
addErrorMessage(message){
errors <- rbind(errors, ) # <= what to do here?
}
df <- data.frame(id = 1:7,
var1 = c(1, 2, 3, 3, 9, 4, 5),
var2 = c("A", "A", "B", "C", NA, "D", "A"))
########### List of checks ################
# Check 1: var1 should be smaller than 5
df %>% filter(var1 > 5) %>%
addErrorMsg(message = "Value of var1 is 5 or greater")
# Check 2: var2 should not be missing
df %>% filter(is.na(var2)) %>%
addErrorMessage(message = "Value of var2 is missing")
My question is: How can I define a function addErrorMessage() that I can directly use in the tidyverse-workflow? I want to avoid to save the wrong cases to a temporary data.frame for each check and then stack this data.frame on the errors-data.frame using rbind().
Your actual problem can probably be solved using the {pointblank} package which contains a lot of functions that help to conduct this and similar tests.
If you are more interested in writing such validation functions yourself, see a very rough draft below.
df <- data.frame(id = 1:7,
var1 = c(1, 2, 3, 3, 9, 4, 5),
var2 = c("A", "A", "B", "C", NA, "D", "A"))
library(pointblank)
df %>%
col_vals_lt(vars(var1),
value = 5) %>%
col_vals_not_null(vars(var2))
#> Error: Exceedance of failed test units where values in `var1` should have been < `5`.
#> The `col_vals_lt()` validation failed beyond the absolute threshold level (1).
#> * failure level (2) >= failure threshold (1)
Created on 2021-08-17 by the reprex package (v2.0.1)
{pointblank} can also generate data validation reports:
agent <-
create_agent(
tbl = df,
tbl_name = "My data",
label = "Checking column values",
actions = action_levels(stop_at = 1)
) %>%
col_vals_lt(vars(var1),
value = 5) %>%
col_vals_not_null(vars(var2)) %>%
interrogate()
agent
If you are more interested in writing this kind of functions yourself, below is a very rough draft. It uses the attributes of the underyling data.frame which is not a great solution, since depending on the functions you use in between checks the attributes might get lost. In a package we could use a dedicated environment to capture errors, so in this case we wouldn't need the attributes.
library(dplyr)
df <- data.frame(id = 1:7,
var1 = c(10, 2, 3, 3, 9, 4, 5),
var2 = c("A", NA, "B", "C", NA, "D", "A"))
check <- function(data, condition, message){
exp <- rlang::enexpr(condition)
test <- transmute(data, new = eval(exp))$new
if (any(test)) {
err_df <- attr(data, "error_df")
if (is.null(err_df)) {
attr(data, "error_df") <- data.frame(check = 1L,
row_nr = which(test),
message = message)
} else {
attr(data, "error_df") <- rbind(err_df,
data.frame(check = max(err_df$check) + 1L,
row_nr = which(test),
message = message)
)
}
}
data
}
get_errors <- function(data) {
print(attr(data,"error_df"))
invisible(data)
}
df %>%
check(condition = var1 > 5,
message = "Value of var1 is 5 or greater") %>%
check(condition = is.na(var2),
message = "Value of var2 is missing") %>%
get_errors
#> check row_nr message
#> 1 1 1 Value of var1 is 5 or greater
#> 2 1 5 Value of var1 is 5 or greater
#> 3 2 2 Value of var2 is missing
#> 4 2 5 Value of var2 is missing
Created on 2021-08-17 by the reprex package (v2.0.1)

Nested loops in R that create new variable names and lags

I am new to R, but experienced in Stata. To learn R, I am tracking Covid-19 infections. That requires creating seven-day trailing averages, and I do so with the following loop.
for (mylag in c(1:7)) {
data <- data %>% group_by(state) %>% mutate(!!paste0("deathIncrease", "_", mylag) := lag(deathIncrease, mylag)) %>% ungroup()
}
This works, but then I want to run the same code, not just for deaths, but also for cases. So I tried the following.
var_list <- c("deathIncrease", "positiveIncrease")
for (var in var_list) {
for (mylag in c(1:7)) {
var <- enquo(var)
varname <- enquo( paste0(quo_name(var), "_", mylag) )
data <- data %>% group_by(state) %>% mutate(!!varname := lag(!!var, mylag)) %>% ungroup()
}
}
But that leads to the error arg must be a symbol. Any help would be much appreciated. In Stata, loops are simpler. Is there no package that gets R to automatically fill in the looping variables everywhere, like so: {{ var }}?
Edit: here is a minimal working example. The first way to create lags works, but only for var1. The second nested loop does not.
df <- tribble(
~group_var, ~var1, ~var2,
"A", 1, 10,
"A", 2, 11,
"A", 3, 12,
"B", 1, 10,
"B", 2, 11,
"B", 3, 12)
for (mylag in c(1:2)) {
df <- df %>% group_by(group_var) %>% mutate(!!paste0("var1", "_lag", mylag) := lag(var1, mylag)) %>% ungroup()
}
## Another loop
var_list <- c("var1", "var2")
for (myvar in var_list) {
for (mylag in c(1:2)) {
myvar <- enquo(myvar)
varname <- enquo( paste0(quo_name(myvar), "_", mylag) )
data <- data %>% group_by(state) %>% mutate(!!varname := lag(!!myvar, mylag)) %>% ungroup()
}
}
You can use the function get(), like lag(get(myvar), mylag), to point the specific column the string myvar is referred to:
for(mylag in 1:7){
for(myvar in c('deathIncrease', 'positiveIncrease')){
data <- data %>%
group_by(state) %>%
mutate(
!!paste0(myvar, '_', mylag) := lag(get(myvar), mylag)
) %>%
ungroup()
}
}
My first solution contained a function, that did not respect grouped data. I wanted to look at that anyways, so i spend a bit of time to respect grouped data as well.
This is my solution now, it works as expected on grouped data, but it feels a bit hacky tbh.
add_lag <- function(.data, column, days) {
group <- unlist(groups(.data))
if(is.null(group)){
new <- mapply(function(x, y) {
lag(x, y)
}, x = .data[column], y = sort(rep(days, length(column))))
if(is.null(dim(new))){
new <- t(new)
}
new <- as.data.frame(new, stringsAsFactors = F)
names(new) <- paste0(column, "_", sort(rep(days, length(column))))
new <- as_tibble(new) %>%
select(sort(names(new)))
mutate(.data, !!!new)
} else {
tmp <- .data %>%
nest()
tmp$data <- lapply(tmp$data, function(x,y){
x %>%
add_lag(column,y)
}, y = days)
tmp %>% unnest(c(data))
}
}
df <- tribble(
~group_var, ~var1, ~var2,
"A", 1, 10,
"A", 2, 11,
"A", 3, 12,
"B", 1, 10,
"B", 2, 11,
"B", 3, 12)
df %>%
group_by(group_var) %>%
add_lag("var1", 1:2)
# A tibble: 6 x 5
# Groups: group_var [2]
group_var var1 var2 var1_1 var1_2
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 NA NA
2 A 2 11 1 NA
3 A 3 12 2 1
4 B 1 10 NA NA
5 B 2 11 1 NA
6 B 3 12 2 1

Quosure with in a nested function

I am struggling to write a function fun2 that uses fun1... and keep getting errors. I have written a simplified example below. It is the first time I deal with "tidy evaluation" and not sure to understand the in and outs of it.
Example dataframes:
d1 = data.frame(
ID = c("A", "A", "A", "B", "B", "C", "C", "C", "C"),
EXPR = c(2, 8, 3, 5, 7, 20, 1, 5, 4)
)
d2 = data.frame(
ID = c("A", "B", "C"),
NUM = c(22, 50, 31)
)
First function
fun1 <- function(
df1 = "df 1",
df2 = "df 2",
t1 = "threshold 1",
expr_col = "expr column",
id_col = "sample column - must be present in df1 and df2") {
# dataframes
df <- df1
db <- df2
# quosure
enquo_id <- enquo(id_col)
enquo_expr <- enquo(expr_col)
# classify
df <- df %>%
mutate(threshold = t1) %>%
mutate(class = ifelse(!!enquo_expr > t1, "positive", "negative")) %>%
mutate(class = factor(class, levels = c("positive", "negative")))
# calculate sample data
df.sum <- df %>%
group_by(!!enquo_id, class) %>%
summarise(count = n()) %>%
complete(class, fill = list(count = 0)) %>%
mutate(total = sum(count), freq = count/total)
# merge dataframes
df.sum <- left_join(df.sum, db, by = quo_name(enquo_id))
# return
return(df.sum)
}
If I run a test of this, I get a dataframe in return, as expected
test <- fun1(df1 = d1, df2 = d2, t1 = 3, expr_col = EXPR, id_col = ID)
Second funtion
Now with fun2, I am trying to use fun1 in a for loop to iterate from ti to tf of the seq vector:
fun2 <- function(
df1 = "df 1",
df2 = "df 2",
expr_col = "expr column",
id_col = "sample column - must be present in df1 and df2",
ti = "initial value",
tf = "final value",
res = "resolution") {
# define variables for fun1
var1 <- enquo(d1)
var2 <- enquo(d2)
var3 <- enquo(t1)
var4 <- enquo(EXPR)
var5 <- enquo(ID)
# get sequence of values
seq <- seq(from = ti, to = tf, by = res)
# open list
t.list <- list()
# Loop ----
for (i in seq_along(seq)){
t1 <- seq[i]
t.list[[i]] <- fun1(df1 = var1,
df2 = var2,
t1 = var3,
expr_col = var4,
id_col = var5)
}
df.out <- plyr::ldply(t.list, rbind)
### Return ---
return(df.out)
}
But if I run this
test <- fun2(df1 = d1, df2 = d2, expr_col = EXPR, id_col = ID, ti = 1, tf = 10, res = 1)
I get an error message
Error in (function (x) : object 'EXPR' not found
I tried various things... and I am kind of stuck here. I guess I am not using enquo() properly. I can get it to work by not using varX and putting directly the actual appropriate name of each element in the fun1 arguments, but the whole point of doing this, to me, is to make it "generalisable" and therefore specify the arguments only in fun2 which will then be passed to fun1.
Any help would be greatly appreciated.
Many thanks for your answer aosmith. I am now sorted using the following code:
fun2 <- function(
df1 = "df 1",
df2 = "df 2",
expr_col = "expr column",
id_col = "sample column - must be present in df1 and df2",
ti = "initial value",
tf = "final value",
res = "resolution") {
# define variables for fun1
var4 <- enquo(expr_col)
var5 <- enquo(id_col)
# get sequence of values
seq <- seq(from = ti, to = tf, by = res)
# open list
t.list <- list()
### Loop --------------------------------------------------------------
for (i in seq_along(seq)){
t1 <- seq[i]
t.list[[i]] <- fun1(df1 = df1,
df2 = df2,
t1 = t1,
expr_col = !!var4,
id_col = !!var5)
}
df.out <- plyr::ldply(t.list, rbind)
### Return ---
return(df.out)
}
# TEST FUN2
test <- fun2(df1 = d1, df2 = d2, expr_col = EXPR, id_col = ID, ti = 1, tf = 10, res = 1)

bind_rows() error: by reading in a function?

This block runs below, and produces df_all as intended, but when I uncomment the single function at the top (not even apply it here but I do need for other things) and rerun the same block, I get: Error in bind_rows_(x, .id): Argument 1 must be a data frame or a named atomic vector, not a function
library(data.table)
# addxtoy_newy_csv <- function(df) {
# zdf1 <- df %>% filter(Variable == "s44")
# setDT(df)
# setDT(zdf1)
# df[zdf1, Value := Value + i.Value, on=.(tstep, variable, Scenario)]
# setDF(df)
#}
tstep <- rep(c("a", "b", "c", "d", "e"), 5)
Variable <- c(rep(c("v"), 5), rep(c("w"), 5), rep(c("x"), 5), rep(c("y"), 5), rep(c("x"), 5))
Value <- c(1,2,3,4,5,10,11,12,13,14,33,22,44,57,5,3,2,1,2,3,34,24,11,11,7)
Scenario <- c(rep(c("i"), 20), rep(c("j"), 5) )
df1 <- data.frame(tstep, Variable, Value, Scenario)
tstep <- c("a", "b", "c", "d", "e")
Variable <- rep(c("x"), 5)
Value <- c(100, 34, 100,22, 100)
Scenario <- c(rep(c("i"), 5))
df2<- data.frame(tstep, Variable, Value, Scenario)
setDT(df1)
setDT(df2)
df1[df2, Value := Value + i.Value, on=.(tstep, Variable, Scenario)]
setDF(df1)
df_all <- mget(ls(pattern="df*")) %>% bind_rows()
The pattern you use in ls() will match any object with a "d" in its name, so addxtoy_newy_csv gets included in the list of object names. The f* in your pattern means you currently search for "d, followed by zero or more f's". I think a safer pattern to use would be ^df.*, to match objects that start with "df":
df1 = data.frame(x = 1:3)
df2 = data.frame(x = 4:6)
adder = function(x) x + 1
ls(pattern = "df*")
ls(pattern = "^df.*")

sort data into deciles based on a rolling subset

I am trying to replicate the Fama French 1993 paper using R. I need to do the following sorting :
for each month,
calculate ME decile breakpoints on NYSE stocks only
sort all stocks into the deciles created in 2.
Data generation:
set.seed(1234)
n = 120
stocks <- c("A", "B", "C", "D", "E")
exchange <- c("NYSE", "NASDAQ", "AMEX")
df <- as.data.frame(cbind(Month = 1:12,
exchangeCode = exchange[round(runif(n, 1, 3))],
Stock = stocks[round(runif(n, 1, 5))],
ME=floor(100*abs(rnorm(n)))))
Desired Output:
ME_NYSE_vals <- as.numeric(paste(df[df$Month==1 & df$exchangeCode=="NYSE","ME"]))
ME_ALL_vals <- as.numeric(paste(df[df$Month==1,"ME"]))
cut(x = ME_ALL_vals,
breaks = c(-Inf,quantile(ME_NYSE_vals,probs=seq(.1,.9,.1)),+Inf),
labels = 1:10
)
The breaks should be calculated based on ME_NSYE_vals. The cut should be applied to all ME_ALL_vals for each month.
If the intention is to keep the whole data frame but generate deciles only for the NYSE values the code below could do. The point was to generate deciles only for the entries pertaining to the NYSE values but to keep the full data set achieving some form of a partial sorting.
# Libs
Vectorize(require)(package = c("dplyr", "magrittr"),
character.only = TRUE)
# Transformations
df %<>%
mutate(nTileNYSE = ifelse(exchangeCode == "NYSE", ntile(ME, 10), NA))
arrange(nTileNYSE)
The code was applied to the data:
set.seed(1)
df <- as.data.frame(cbind(exchangeCode = c("NYSE", "NASDAQ"),
Stock = c("A", "B", "C", "A"),
Month = 1:12,
ME=rnorm(1200)))
2nd approach
Following the discussion in the comments I would suggest the following approach:
# Libs --------------------------------------------------------------------
Vectorize(require)(package = c( "tidyr", "dplyr", "magrittr", "xts", "Hmisc"),
char = TRUE)
# Data generation ---------------------------------------------------------
set.seed(1234)
n = 120
stocks <- c("A", "B", "C", "D", "E")
exchange <- c("NYSE", "NASDAQ", "AMEX")
df <- as.data.frame(cbind(Month = 1:12,
exchangeCode = exchange[round(runif(n, 1, 3))],
Stock = stocks[round(runif(n, 1, 5))],
ME = floor(100*abs(rnorm(n)))))
# Transformations ---------------------------------------------------------
# For some reason this was needed
df$ME <- as.numeric(as.character(df$ME))
# Generate cuts
dfNtiles <- df %>%
arrange(exchangeCode, Month, ME) %>%
group_by(exchangeCode, Month) %>%
mutate(cutsBsdOnNYSE = cut(x = ME,
breaks = cut2(x = df$ME[df$exchangeCode == "NYSE"],
g = 10, onlycuts = TRUE))) %>%
ungroup() %>%
group_by(cutsBsdOnNYSE) %>%
mutate(grpBsdOnNYSE = n())
It's fairly straightforward
Generating cut brackets reflecting subset of the data.
Applying those brackets to the whole vector (ME)
Numbering the obtained groups so a group identifier is created
and boils down to:

Resources