I am struggling to write a function fun2 that uses fun1, and I keep getting errors. I have written a simplified example below. It is the first time I am dealing with "tidy evaluation" and I am not sure I understand the ins and outs of it.
Example dataframes:
library(dplyr)   # %>%, the verbs used below, and enquo()/quo_name() (re-exported from rlang)
library(tidyr)   # complete()

d1 <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C", "C", "C", "C"),
  EXPR = c(2, 8, 3, 5, 7, 20, 1, 5, 4)
)
d2 <- data.frame(
  ID = c("A", "B", "C"),
  NUM = c(22, 50, 31)
)
First function
fun1 <- function(
  df1 = "df 1",
  df2 = "df 2",
  t1 = "threshold 1",
  expr_col = "expr column",
  id_col = "sample column - must be present in df1 and df2") {
  # dataframes
  df <- df1
  db <- df2
  # quosures
  enquo_id <- enquo(id_col)
  enquo_expr <- enquo(expr_col)
  # classify
  df <- df %>%
    mutate(threshold = t1) %>%
    mutate(class = ifelse(!!enquo_expr > t1, "positive", "negative")) %>%
    mutate(class = factor(class, levels = c("positive", "negative")))
  # calculate sample data
  df.sum <- df %>%
    group_by(!!enquo_id, class) %>%
    summarise(count = n()) %>%
    complete(class, fill = list(count = 0)) %>%
    mutate(total = sum(count), freq = count/total)
  # merge dataframes
  df.sum <- left_join(df.sum, db, by = quo_name(enquo_id))
  # return
  return(df.sum)
}
If I run a test of this, I get a dataframe in return, as expected
test <- fun1(df1 = d1, df2 = d2, t1 = 3, expr_col = EXPR, id_col = ID)
Second function
Now, with fun2, I am trying to use fun1 in a for loop that iterates over the values of the seq vector, from ti to tf:
fun2 <- function(
  df1 = "df 1",
  df2 = "df 2",
  expr_col = "expr column",
  id_col = "sample column - must be present in df1 and df2",
  ti = "initial value",
  tf = "final value",
  res = "resolution") {
  # define variables for fun1
  var1 <- enquo(d1)
  var2 <- enquo(d2)
  var3 <- enquo(t1)
  var4 <- enquo(EXPR)
  var5 <- enquo(ID)
  # get sequence of values
  seq <- seq(from = ti, to = tf, by = res)
  # open list
  t.list <- list()
  # Loop ----
  for (i in seq_along(seq)){
    t1 <- seq[i]
    t.list[[i]] <- fun1(df1 = var1,
                        df2 = var2,
                        t1 = var3,
                        expr_col = var4,
                        id_col = var5)
  }
  df.out <- plyr::ldply(t.list, rbind)
  ### Return ---
  return(df.out)
}
But if I run this
test <- fun2(df1 = d1, df2 = d2, expr_col = EXPR, id_col = ID, ti = 1, tf = 10, res = 1)
I get an error message
Error in (function (x) : object 'EXPR' not found
I have tried various things and I am kind of stuck. I guess I am not using enquo() properly. I can get it to work by dropping the varX variables and putting the actual name of each element directly in the fun1 arguments, but the whole point of doing this, to me, is to make it "generalisable": the arguments should be specified only in fun2 and then passed on to fun1.
Any help would be greatly appreciated.
Many thanks for your answer, aosmith. I am now sorted using the following code:
fun2 <- function(
  df1 = "df 1",
  df2 = "df 2",
  expr_col = "expr column",
  id_col = "sample column - must be present in df1 and df2",
  ti = "initial value",
  tf = "final value",
  res = "resolution") {
  # define variables for fun1
  var4 <- enquo(expr_col)
  var5 <- enquo(id_col)
  # get sequence of values
  seq <- seq(from = ti, to = tf, by = res)
  # open list
  t.list <- list()
  ### Loop --------------------------------------------------------------
  for (i in seq_along(seq)){
    t1 <- seq[i]
    t.list[[i]] <- fun1(df1 = df1,
                        df2 = df2,
                        t1 = t1,
                        expr_col = !!var4,
                        id_col = !!var5)
  }
  df.out <- plyr::ldply(t.list, rbind)
  ### Return ---
  return(df.out)
}
# TEST FUN2
test <- fun2(df1 = d1, df2 = d2, expr_col = EXPR, id_col = ID, ti = 1, tf = 10, res = 1)
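For anyone reading later: with rlang >= 0.4.0 the same argument forwarding can be written a bit more compactly with the curly-curly operator {{ }}, which bundles the enquo()/!! pair into one step. Below is only a minimal sketch of the forwarding idea, not a drop-in replacement: the function names fun1_cc/fun2_cc are made up, and the complete()/left_join() steps from the real fun1 are dropped to keep it short.
library(dplyr)

# inner function: bare column names are embraced with {{ }}
fun1_cc <- function(df, t1, expr_col, id_col) {
  df %>%
    mutate(threshold = t1,
           class = ifelse({{ expr_col }} > t1, "positive", "negative")) %>%
    count({{ id_col }}, threshold, class, name = "count")
}

# outer function: forward the bare column names with {{ }} again
fun2_cc <- function(df, expr_col, id_col, ti, tf, res) {
  thresholds <- seq(from = ti, to = tf, by = res)
  out <- vector("list", length(thresholds))
  for (i in seq_along(thresholds)) {
    out[[i]] <- fun1_cc(df, thresholds[i], {{ expr_col }}, {{ id_col }})
  }
  bind_rows(out)
}

test_cc <- fun2_cc(d1, EXPR, ID, ti = 1, tf = 10, res = 1)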
I am looking to hide values in the output table if the frequency of data in the respective variable is less than 4.
Let's say, if the number of records in column hp, mpg, or qsec is less than 4, then the mean or median should be masked with "--".
I am trying the approach below, but it is not working; it throws an error because of NA values in the data.
library(expss)
data <- data.frame(
  gender = c(1, 2, 1),
  sector = c(3, 3, 1),
  col1 = c(12, 15, 22),
  col2 = c(33, NA, 41),
  col3 = c(1, 1, 0),
  col4 = c(NA, NA, NA),
  col5 = c(1, 2, 1)
)
data$col3 <- factor(data$col3, levels = 1, labels = "Management")
data$col4 <- factor(data$col4, levels = 1, labels = "HR")
lst <- list(data$col4,data$col3)
fun1 <- function(dataset, var_list, banner1){
  perc_25 <- function(x, ...){ unname(quantile(x, .25, na.rm = TRUE)) }
  perc_75 <- function(x, ...){ unname(quantile(x, .75, na.rm = TRUE)) }
  dataset <- dataset[var_list] %>% as.data.frame()
  first_col_param <- head(var_list, 1)
  second_col_param <- tail(var_list, 1)
  var_lab(colnames(dataset)[ncol(dataset)]) <- ""
  mr <- parse(text = paste0("mrset(",
                            first_col_param, " %to% ", second_col_param, ")"))
  fun_replace_valid_n <- function(x, n) {
    dat <- dplyr::cur_data_all() %>% replace(is.na(.), 0)
    func_name <- dat$func_name
    if (x[func_name == "Valid N"] < n) {
      replace(x, func_name %in% c("Mean", "Median"), "--")
    } else x
  }
  t1 <- cross_fun(dataset,
                  eval(mr),
                  col_vars = banner1,
                  fun = combine_functions("Mean" = mean,
                                          "Median" = median,
                                          "Max" = max,
                                          "Min" = min,
                                          "25th Perc" = perc_25,
                                          "75th Perc" = perc_75,
                                          "Valid N" = valid_n
                  ))
  t1 <- as.data.frame(t1)
  t1 <- t1 %>% tidyr::separate(row_labels, into = c('grp', 'func_name'), sep = "\\|")
  t1 <- t1 %>% dplyr::group_by(grp)
  t1 <- t1 %>% dplyr::mutate(dplyr::across(where(is.numeric), fun_replace_valid_n, n = 4)) %>%
    dplyr::ungroup()
  t1 <- t1 %>% tidyr::unite(row_labels, grp, func_name, sep = "|") %>%
    as.etable
  t1
}
debugonce(fun1)
t1 <- fun1(dataset=data,"col1",banner1=lst)
error:
expected output:
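Since the error text and the expected output are not reproduced above, here is only a minimal dplyr sketch of the masking rule described in the question, independent of expss (the summary table and its values are made up for illustration):
library(dplyr)

# toy summary table in the same long shape as t1 above:
# one row per statistic per variable, including a "Valid N" row
summ <- data.frame(
  grp       = rep(c("col1", "col2"), each = 3),
  func_name = rep(c("Mean", "Median", "Valid N"), times = 2),
  value     = c(16.3, 15, 3, 37, 37, 2)
)

# within each variable, mask Mean/Median with "--" when Valid N < 4
summ %>%
  group_by(grp) %>%
  mutate(value = ifelse(func_name %in% c("Mean", "Median") &
                          value[func_name == "Valid N"] < 4,
                        "--", as.character(value))) %>%
  ungroup()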
I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase n to 720000 matrices. So I am looking for a way to optimize only section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1,]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
})
## test <- list_matrices[[1]]
The code in section 1 is used to initialize the matrices, and the function fillMatrices can be found in section 2. In my example, the function is used to fill the matrices for one species. In reality, the function is used for 3 species (i.e., it is applied three times), changing the argument species = c("a_M", "b_M", "c_M") each time. How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
print(i)
list_matrices[[i]] <- ff(-999, dim=c(3650, length(col_mat)), dimnames=list(NULL, col_mat), vmode="double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
print(i)
list_matrices[[i]][1,] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n), died = rep(0, n), ID = (seq(1, n, 1))- 1, a_M = sample(1:10, n, replace = T), b_M = sample(1:10, n, replace = T), c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
mat <- as.matrix(x[,-1])
ifelse(is(x[,1], "data.frame"), rownames(mat) <- pull(x[,1]), rownames(mat) <- x[,1])
## Convert character matrix into numeric matrix
mat <- apply(mat, 2, as.numeric)
return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
## Format data
dt <- as.data.frame(dt_t_1) %>%
reshape::melt(id = c("ID")) %>%
arrange(ID) %>%
dplyr::mutate_all(as.character)
## summary(dt)
## Break out the variable "variable" into different columns, with one row for each individual-day
dt_reshape_filter_1 <- dt %>%
dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
dplyr::select(-variable) %>%
tidyr::spread(col, value) %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::arrange(ID, day)
## summary(dt_reshape_filter_1)
## Apply requested transformations and build the data frame
dt_transform <- dt_reshape_filter_1 %>%
dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
dplyr::mutate(day = day + 1) %>%
dplyr::filter(day < maxDuration + 1) %>%
dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
dplyr::mutate(c = a + b) %>%
dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
dplyr::arrange(ID, day)
## summary(dt_transform)
## Take different columns of the data frame and gather them into a single column
dt_gather <- dt_transform %>%
tidyr::gather(variable, value, species) %>%
dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
dplyr::rename(var2 = ID) %>%
dplyr::mutate_all(as.character)
## summary(dt_gather)
## Add the other columns in the data frame and convert the resulting data frame into a matrix
dt_reshape_filter_2 <- dt %>%
dplyr::rename(var2 = ID) %>%
dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
tidyr::spread(variable, value) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate(year = ind[,c("year")],
day = ind[,c("day")],
time = ind[,c("time")],
ID = ind[,c("ID")],
died = ind[,c("died")]) %>%
tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate_all(as.character)
## summary(dt_reshape_filter_2)
## Build the output matrix
dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
tidyr::spread(var2, value) %>%
dplyr::arrange(match(variable, matrixColumns)) %>%
dplyr::select("variable", as.character(ind[,c("ID")]))
## summary(dt_bind)
dt_mat <- convertDFToMat(dt_bind)
## summary(dt_mat)
return(dt_mat)
}
Making a 3D array instead of a list of 2D matrices gives you more options:
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n), dimnames = list(NULL, col_mat, NULL))
## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n)-1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n), died = rep(0L, n), ID = (seq(1L, n, 1L))- 1L, a_M = sample(1L:10L, n, replace = T), b_M = sample(1L:10L, n, replace = T), c_M = sample(1L:10L, n, replace = T)))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, ,]), species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## reassign
system.time(
arr1[indexTime + 1, ,] <- dt_t
)
user system elapsed
0.05 0.70 0.7
# for comparison
#> system.time(for(i in 1:n){
#+ list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
# user system elapsed
# 4.75 1.08 5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
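A small usage note on the array approach: if you later need one individual at a time, a single 2D matrix (the analogue of one element of list_matrices) can be pulled back out by indexing the third dimension, e.g.:
## matrix for individual i, as an ordinary in-memory matrix
i <- 1
m_i <- arr1[, , i]
dim(m_i)   # 3650 rows (time steps) x length(col_mat) columns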
I am trying to calculate a rolling correlation on a tibble, iterating through column names in a loop. I seem to be struggling with passing variables to a function, though. This works:
tbl <- tibble(date = seq(as.Date("1983-03-31"), by=7, length.out=100),
col1 = 1:100, col2 = sample(100, size = 100, replace=TRUE), col3 = col1 + col2)
tbl %>%
tq_mutate_xy(
x = col1,
y = col3,
mutate_fun = runCor,
n = 10,
use = "pairwise.complete.obs",
col_rename = "col1_col3_corr"
)
But this doesn't:
tbl <- tibble(date = seq(as.Date("1983-03-31"), by=7, length.out=100),
col1 = 1:100, col2 = sample(100, size = 100, replace=TRUE), col3 = col1 + col2)
c1 <- "col1"
c2 <- "col3"
tbl %>%
tq_mutate_xy(
x = !!c1,
y = !!c2,
mutate_fun = runCor,
n = 10,
use = "pairwise.complete.obs",
col_rename = paste0(c1, "_", c2, "_corr")
)
The error is "Error in check_x_y_valid(data, x, y) : x = !(!c1) not a valid name."
What am I doing wrong?
First, I think you want the non-standard evaluation (NSE) version of tq_mutate_xy, that is, tq_mutate_xy_. When you use the NSE versions of these functions, you need to pass character strings -- this means your mutate_fun argument should also be a character string. The following should work:
c1 <- "col1"
c2 <- "col3"
tbl %>%
tq_mutate_xy_(
x = c1,
y = c2,
mutate_fun = "runCor",
n = 10,
use = "pairwise.complete.obs",
col_rename = paste0(c1, "_", c2, "_corr")
)
Be sure to look at example 5 from the help documentation, ?tq_mutate_xy
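And since the original goal was to iterate through column names in a loop, here is a sketch of how that might look with the string-based version (the column pairs below are made up for illustration):
library(tidyquant)   # tq_mutate_xy_()
library(dplyr)       # %>%

pairs <- list(c("col1", "col3"), c("col2", "col3"))

out <- tbl
for (pr in pairs) {
  out <- out %>%
    tq_mutate_xy_(
      x = pr[1],
      y = pr[2],
      mutate_fun = "runCor",
      n = 10,
      use = "pairwise.complete.obs",
      col_rename = paste0(pr[1], "_", pr[2], "_corr")
    )
}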
I am attempting to create a function which takes a list as input, and returns a summarised data frame. However, after trying multiple ways, I am unable to pass a list to the function for the aggregation.
So far I have the following, but it is failing.
library(plyr)    # ddply(), summarize and the .() quoting helper
library(dplyr)

random_df <- data.frame(
  region = c("A", "B", "C", "C"),
  number_of_reports = c(1, 3, 2, 1),
  report_MV = c(12, 33, 22, 12)
)

output_graph <- function(input) {
  print(input$arguments)
  DF <- input$DF
  group_by <- input$group_by
  args <- input$arguments
  flow <- ddply(DF, group_by, summarize, args)
  return(flow)
}

graph_functions <- list(
  DF = random_df,
  group_by = .(region),
  arguments = .(Reports = sum(number_of_reports),
                MV_Reports = sum(report_MV))
)
output_graph(graph_functions)
Whereas this works:
library(plyr)
library(dplyr)

random_df <- data.frame(
  region = c("A", "B", "C", "C"),
  number_of_reports = c(1, 3, 2, 1),
  report_MV = c(12, 33, 22, 12)
)

output_graph <- function(input) {
  print(input$arguments)
  DF <- input$DF
  group_by <- input$group_by
  args <- input$arguments
  flow <- ddply(
    DF,
    group_by,
    summarize,
    Reports = sum(number_of_reports),
    MV_Reports = sum(report_MV)
  )
  return(flow)
}

graph_functions <- list(
  DF = random_df,
  group_by = .(region),
  arguments = .(Reports = sum(number_of_reports),
                MV_Reports = sum(report_MV))
)
output_graph(graph_functions)
Would anyone be aware of a way to pass a list of functions to ddply? Or of another way to achieve the same goal of aggregating a dynamic set of variables?
In order to pass arguments into the function for use by dplyr, I recommend reading up on tidy evaluation and non-standard evaluation (NSE). Here is an edited function producing the same output as my original.
library(dplyr)
random_df <- data.frame(
  region = c('A', 'B', 'C', 'C'),
  number_of_reports = c(1, 3, 2, 1),
  report_MV = c(12, 33, 22, 12)
)

output_graph <- function(df, group, args) {
  grp_quo <- enquo(group)
  df %>%
    group_by(!!grp_quo) %>%
    summarise(!!!args)
}

args <- list(
  Reports = quo(sum(number_of_reports)),
  MV_Reports = quo(sum(report_MV))
)
output_graph(random_df, region, args)
# # A tibble: 3 x 3
# region Reports MV_Reports
# <fctr> <dbl> <dbl>
# 1 A 1.00 12.0
# 2 B 3.00 33.0
# 3 C 3.00 34.0
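A follow-up usage note: because the summaries travel as a plain list of quosures, swapping in a different aggregation just means building a different list, e.g. (names made up for illustration):
args2 <- list(
  Avg_MV = quo(mean(report_MV)),
  N_rows = quo(n())
)
output_graph(random_df, region, args2)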
I have a data frame with I observations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
  randomString <- c(1:n)
  for (i in 1:n) {
    randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
                                    length, replace = TRUE),
                             collapse = "")
  }
  return(randomString)
}

df <- data.frame(id = anon_id(n = I, length = 16),
                 group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is, the probability of being assigned to urn 1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
  df1 <- df %>% filter(group == g)
  size <- nrow(df1)
  df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
                                     size = size,
                                     replace = T,
                                     prob = p))
  return(df1)
}

df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique for a data.frame df that has the grouping column "group", urn assignments 1:J, and assignment probabilities in a vector p of length J.
# get urn assignments for each group (note df$group rather than a bare group)
urnAssignment <- lapply(unique(df$group),
                        function(i) sample(1:J, sum(df$group == i), replace = TRUE, prob = p))
# get a list that collects the row positions of the observations in each group
obsOrder <- lapply(unique(df$group),
                   function(i) which(df$group == i))
# write each group's assignments back to the corresponding rows
df$urnAssignment <- NA
df$urnAssignment[unlist(obsOrder)] <- unlist(urnAssignment)
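A slightly more compact base R variant of the same idea uses ave() to handle the split-and-recombine bookkeeping (just a sketch; it relies on FUN returning a vector of the same length as each group):
df$urn <- ave(rep(0L, nrow(df)),   # dummy vector; only its per-group length matters
              df$group,
              FUN = function(x) sample(1:J, length(x), replace = TRUE, prob = p))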
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
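If you also want the assignment probabilities from p rather than equal shares, block_ra should accept them via its prob_each argument (argument name from memory, so double-check ?block_ra):
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group,
                                            conditions = c("urn_1", "urn_2", "urn_3"),
                                            prob_each = p))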
This does the trick using dplyr:
randomize <- function(data, groups = 2, block_id = NULL, p = NULL, seed = 9782) {
  set.seed(seed)   # use the seed argument so the assignment is reproducible
  if (is.null(p)) p <- rep(1/groups, groups)
  if (is.null(block_id)) {
    df1 <- data %>%
      mutate(Treatment = sample(x = c(1:groups),
                                size = n(),
                                replace = T,
                                prob = p))
    return(df1)
  } else {
    df1 <- data %>%
      group_by_(block_id) %>%
      mutate(Treatment = sample(x = c(1:groups),
                                size = n(),
                                replace = T,
                                prob = p))
    return(df1)
  }
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)
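To sanity-check the result, a quick cross-tab of blocks against assigned urns:
df1 %>%
  ungroup() %>%
  count(group, Treatment)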