Create recipes and pass column names dynamically in R

I have a function that creates a couple of recipe objects. The issue is that, inside the function, I have to rename the columns of the data.frame/tibble that is passed in before I can build the recipes.
I don't want to do this, the main reason being that the renamed columns no longer match the names in the original data.frame, so anything downstream that relies on those names will break.
Simple example:
library(tidyverse)

data_tbl <- tibble(
  visit_date = seq(
    from = as.Date("2021-01-01"),
    to   = as.Date("2021-10-15"),
    by   = 7
  ),
  visits = rnbinom(
    n    = 42,
    size = 100,
    mu   = 66
  )
)
ts_auto_recipe <- function(.data, .date_col, .pred_col){

  # * Tidyeval ----
  date_col_var <- rlang::enquo(.date_col)
  pred_col_var <- rlang::enquo(.pred_col)

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }

  if (rlang::quo_is_missing(date_col_var)) {
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }

  if (rlang::quo_is_missing(pred_col_var)) {
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }

  # * Data ----
  data_tbl <- tibble::as_tibble(.data)

  data_tbl <- data_tbl %>%
    dplyr::select(
      {{ date_col_var }}, {{ pred_col_var }}, dplyr::everything()
    ) %>%
    dplyr::rename(
      date_col  = {{ date_col_var }},
      value_col = {{ pred_col_var }}
    )

  # * Recipe Objects ----
  # ** Base recipe ----
  rec_base_obj <- recipes::recipe(
    formula = date_col ~ .,  # I have to do the above so I can do this, which I don't like
    data    = data_tbl
  )

  # * Add Steps ----
  # ** ts signature and normalize ----
  rec_date_obj <- rec_base_obj %>%
    timetk::step_timeseries_signature(date_col) %>%
    recipes::step_normalize(
      dplyr::contains("index.num"),
      dplyr::contains("date_col_year")
    )

  # * Recipe List ----
  rec_lst <- list(
    rec_base = rec_base_obj,
    rec_date = rec_date_obj
  )

  # * Return ----
  return(rec_lst)
}

rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)
The reason I am doing this is that I cannot use dynamic names inside the recipe() call itself, so something like rlang::sym(names(data_tbl)[[1]]) will not work, nor will something like data_tbl[[1]]. I thought about using step_rename(), but that requires knowing the name ahead of time, and it cannot be a variable inside the recipe step. However, you can pass a variable to something like timetk::step_timeseries_signature().
The only other thing I could think of was to force users to use specific column names, like ds and y in the Facebook Prophet R library.
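For reference, recipes also has a formula-free interface: recipe() accepts vars and roles vectors, which would sidestep both the rename and the formula. A minimal sketch, assuming the date column comes first after the select() (mirroring date_col ~ . above):

# sketch: assign roles positionally instead of building a formula
rec_base_obj <- recipes::recipe(
  x     = data_tbl,
  vars  = names(data_tbl),
  roles = c("outcome", rep("predictor", ncol(data_tbl) - 1))
)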
I also notice I get some funky output in the terminal when I print rec_objs:
> rec_objs
$rec_base
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

$rec_date
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...
Yet when I do:
> rec_objs[[2]]
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")
It does not happen.
Thank you,

I think I have found a solution to this problem; see the following custom function:
ts_auto_recipe_b <- function(.data,
                             .date_col,
                             .pred_col,
                             .step_ts_sig = TRUE,
                             .step_ts_rm_misc = TRUE,
                             .step_ts_dummy = TRUE,
                             .step_ts_fourier = TRUE,
                             .step_ts_fourier_period = 1,
                             .K = 1,
                             .step_ts_yeo = TRUE,
                             .step_ts_nzv = TRUE) {

  # * Tidyeval ----
  date_col_var_expr <- rlang::enquo(.date_col)
  pred_col_var_expr <- rlang::enquo(.pred_col)
  step_ts_sig            <- .step_ts_sig
  step_ts_rm_misc        <- .step_ts_rm_misc
  step_ts_dummy          <- .step_ts_dummy
  step_ts_fourier        <- .step_ts_fourier
  step_ts_fourier_k      <- .K
  step_ts_fourier_period <- .step_ts_fourier_period
  step_ts_yeo            <- .step_ts_yeo
  step_ts_nzv            <- .step_ts_nzv

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }

  if (rlang::quo_is_missing(date_col_var_expr)) {
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }

  if (rlang::quo_is_missing(pred_col_var_expr)) {
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }

  # * Data ----
  data_tbl <- tibble::as_tibble(.data)

  data_tbl <- data_tbl %>%
    dplyr::select(
      {{ date_col_var_expr }},
      {{ pred_col_var_expr }},
      dplyr::everything()
    )
  # The rename from the first version is no longer needed:
  # %>%
  #   dplyr::rename(
  #     date_col  = {{ date_col_var_expr }},
  #     value_col = {{ pred_col_var_expr }}
  #   )

  # * Original column names ----
  ds <- rlang::sym(names(data_tbl)[[1]])
  v  <- rlang::sym(names(data_tbl)[[2]])
  f  <- as.formula(paste(v, " ~ ."))

  # * Recipe Objects ----
  # ** Base recipe ----
  rec_base_obj <- recipes::recipe(
    formula = f,
    data    = data_tbl
  )

  # * Add Steps ----
  # ** ts signature and normalize ----
  if (step_ts_sig) {
    rec_date_obj <- rec_base_obj %>%
      timetk::step_timeseries_signature(ds) %>%
      recipes::step_normalize(
        dplyr::contains("index.num"),
        dplyr::contains("_year")  # name-agnostic matcher for the generated *_year column
      )
  }

  # ** Step rm ----
  if (step_ts_rm_misc) {
    rec_date_obj <- rec_date_obj %>%
      recipes::step_rm(dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
  }

  # ** Step Dummy ----
  if (step_ts_dummy) {
    rec_date_obj <- rec_date_obj %>%
      recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE)
  }

  # ** Step Fourier ----
  if (step_ts_fourier) {
    rec_date_fourier_obj <- rec_date_obj %>%
      timetk::step_fourier(
        ds,
        period = step_ts_fourier_period,
        K      = step_ts_fourier_k
      )
  }

  # ** Step YeoJohnson ----
  if (step_ts_yeo) {
    rec_date_fourier_obj <- rec_date_fourier_obj %>%
      recipes::step_YeoJohnson(!!v, limits = c(0, 1))
  }

  # ** Step NZV ----
  if (step_ts_nzv) {
    rec_date_fourier_nzv_obj <- rec_date_fourier_obj %>%
      recipes::step_nzv(recipes::all_predictors())
  }

  # * Recipe List ----
  rec_lst <- list(
    rec_base             = rec_base_obj,
    rec_date             = rec_date_obj,
    rec_date_fourier     = rec_date_fourier_obj,
    rec_date_fourier_nzv = rec_date_fourier_nzv_obj
  )

  # * Return ----
  return(rec_lst)
}
Then running the following:
> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7"
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7" "visit_date_sin1_K1" "visit_date_cos1_K1"
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_half" "visit_date_quarter" "visit_date_month"
[7] "visit_date_day" "visit_date_mday" "visit_date_qday"
[10] "visit_date_yday" "visit_date_mweek" "visit_date_week"
[13] "visit_date_week2" "visit_date_week3" "visit_date_week4"
[16] "visit_date_mday7" "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"
[28] "visit_date_cos1_K1"
This shows that visit_date and visits were passed through to the functions as desired, by using !!v for the recipes functions, whereas the timetk steps accept a variable holding the symbol directly.
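As an aside, the paste()/as.formula() line can also be built without string-pasting; a minimal sketch using rlang::new_formula():

# sketch: build `visits ~ .` directly from the captured symbol
f <- rlang::new_formula(lhs = v, rhs = quote(.))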

Related

Error in svytotal() : could not find function "svytotal"

I am close to running the script successfully, but I got this error in the final step. I am running it on RStudio Cloud. I wrote the script by referring to the PDF guide "2020 microdata file to compute estimates and standard errors (RSEs)", pages 6-7, linked from the survey webpage.
Here is my complete script:
# Ref: file:///C:/Users/MMatam/OneDrive%20-%20University%20of%20Central%20Florida/Projects/20230123_US_EIA_DataAnalysis/Residential_BatteryPV_ElectricVehicle_MM/ResidentialEnergyConsumptionSurvey_RECS/microdata-guide.pdf
install.packages("survey")
library(survey)

# Ref: https://stackoverflow.com/questions/54621706/error-in-librarydplyr-there-is-no-package-called-dplyr
install.packages('dplyr')
library(dplyr)
library(readr)  # read_csv() below comes from readr, which survey/dplyr do not load

# Import the CSV file from the local machine
# Ref: https://community.rstudio.com/t/how-can-i-upload-csv-or-excel-files-existing-in-computer-to-rstudio-cloud/23621
# To import the csv again into this space, right click on the file name and click import dataset
recs2020 <- read_csv(file = "recs2020_public_v1.csv")

# Flag households whose main space heating fuel is natural gas
recs2020$NG_MAINSPACEHEAT <- ifelse(recs2020$FUELHEAT == 1, 1, 0)

# Replicate weights
repweights <- select(recs2020, NWEIGHT1:NWEIGHT60)

# Replicate-weight survey design
RECS <- svrepdesign(data = recs2020,
                    weight = ~NWEIGHT,
                    repweights = repweights,
                    type = "JK1",
                    combined.weights = TRUE,
                    scale = (ncol(repweights) - 1) / ncol(repweights),
                    mse = TRUE)

# Weighted total
NG_MAINSPACEHEAT <- as.data.frame(svytotal(~NG_MAINSPACEHEAT, RECS))
Present output:
Error in svytotal(~NG_MAINSPACEHEAT, RECS) :
could not find function "svytotal"
An alternative that works: read the SAS version of the microdata and build the replicate-weight design directly:

library(haven)
library(survey)

sas_url <-
  "https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v1.zip"

tf <- tempfile()
download.file(sas_url, tf, mode = 'wb')

recs_tbl <- read_sas(tf)
recs_df  <- data.frame(recs_tbl)
names(recs_df) <- tolower(names(recs_df))

recs_design <-
  svrepdesign(
    data = recs_df,
    weight = ~ nweight,
    repweights = 'nweight[1-9]+',
    type = 'JK1',
    combined.weights = TRUE,
    scale = 59 / 60,
    mse = TRUE
  )

svytotal(~ as.numeric(fuelheat == 1), recs_design)
#                              total     SE
# as.numeric(fuelheat == 1) 56245389 545591
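Since the question mentions RSEs, a small follow-on sketch using the survey package's SE() and coef() accessors on the svytotal result:

# sketch: relative standard error (RSE, %) of the weighted total
est <- svytotal(~ as.numeric(fuelheat == 1), recs_design)
100 * SE(est) / coef(est)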

Why does Rshiny tableOutput create an additional column?

Hi all, I'm relatively new to RShiny and got some unexpected behavior in a simple example of tableOutput.
I'm trying to display a simple table in an RShiny application, but notice there is an additional column created that I did not specify. I'm a little baffled, as reproducing the code in 'normal' R does not show this additional column.
My RShiny code:
library(shiny)

ui <- fluidPage(
  titlePanel("Test"),
  tableOutput("Table1")
)

server <- function(input, output) {
  output$Table1 <- renderTable({
    matrix1 <- matrix(c(2, 3, 5, 8, 13), ncol = 1, nrow = 5)
    rownames(matrix1) <- c('Min', 'value1', 'Max', 'value2', 'Standard deviation')
    colnames(matrix1) <- c('values')
    as.table(matrix1)
  })
}

shinyApp(ui = ui, server = server)
This results in the following app, where the 'Var2' column is the one I did not expect:
[screenshot: RShiny view]
I expected the output to be similar to the 'normal' R one:
[screenshot: R table]
Created by this R code:
matrix1 <- matrix(c(2, 3, 5, 8, 13), ncol = 1, nrow = 5)
rownames(matrix1) <- c('Min', 'value1', 'Max', 'value2', 'Standard deviation')
colnames(matrix1) <- c('values')
table1 <- as.table(matrix1)
Can someone help me understand where the additional column comes from in the RShiny view and how to get rid of it?
(Ps: I'm new to posting, so please let me know in case I'm unclear/missed something)
The culprit is the table object: wrapping the matrix in as.table() is not needed. Pass the matrix itself, or convert it to a data.frame:
server <- function(input, output) {
  output$Table1 <- renderTable({
    matrix1 <- matrix(c(2, 3, 5, 8, 13), ncol = 1, nrow = 5)
    rownames(matrix1) <- c('Min', 'value1', 'Max', 'value2', 'Standard deviation')
    colnames(matrix1) <- c('values')
    matrix1
  }, rownames = TRUE)
}

shinyApp(ui = ui, server = server)
When we add as.table(), the object gets the table class; renderTable() converts it to a data.frame internally, and the table method for that conversion changes the format:
> as.table(matrix1)
                   values
Min                     2
value1                  3
Max                     5
value2                  8
Standard deviation     13

> as.data.frame(as.table(matrix1))
                Var1   Var2 Freq
1                Min values    2
2             value1 values    3
3                Max values    5
4             value2 values    8
5 Standard deviation values   13
> str(as.table(matrix1))
'table' num [1:5, 1] 2 3 5 8 13
- attr(*, "dimnames")=List of 2
..$ : chr [1:5] "Min" "value1" "Max" "value2" ...
..$ : chr "values"
If we check the source code of renderTable(), there is a line that calls as.data.frame(), and this is what changes the format to three columns: as.data.frame() dispatches a different method depending on the class of the object.
> shiny::renderTable
...
classNames <- paste0("table shiny-table", paste0(" table-",
names(format)[format], collapse = ""), paste0(" spacing-",
spacing))
data <- as.data.frame(data)
...
Also, check methods for as.data.frame
> methods(as.data.frame)
[1] as.data.frame.aovproj* as.data.frame.array as.data.frame.AsIs as.data.frame.character
[5] as.data.frame.complex as.data.frame.data.frame as.data.frame.data.table* as.data.frame.Date
[9] as.data.frame.default as.data.frame.descr* as.data.frame.difftime as.data.frame.EventHistory.frame*
[13] as.data.frame.factor as.data.frame.ftable* as.data.frame.function* as.data.frame.grouped_df*
[17] as.data.frame.groupedData* as.data.frame.idf* as.data.frame.integer as.data.frame.ITime*
[21] as.data.frame.list as.data.frame.logical as.data.frame.logLik* as.data.frame.mapped_discrete*
[25] as.data.frame.matrix as.data.frame.model.matrix as.data.frame.noquote as.data.frame.numeric
[29] as.data.frame.numeric_version as.data.frame.ordered as.data.frame.POSIXct as.data.frame.POSIXlt
[33] as.data.frame.raw as.data.frame.resamples* as.data.frame.shingle* as.data.frame.Surv*
[37] as.data.frame.Surv2* as.data.frame.table ##### as.data.frame.tbl_df* as.data.frame.timeDate*
[41] as.data.frame.ts as.data.frame.vctrs_sclr* as.data.frame.vctrs_vctr* as.data.frame.vector
[45] as.data.frame.xyVector*
see '?methods' for accessing help and source code
The source code of as.data.frame.table does create three columns
> as.data.frame.table
function (x, row.names = NULL, ..., responseName = "Freq", stringsAsFactors = TRUE,
sep = "", base = list(LETTERS))
{
ex <- quote(data.frame(do.call("expand.grid", c(dimnames(provideDimnames(x,
sep = sep, base = base)), KEEP.OUT.ATTRS = FALSE, stringsAsFactors = stringsAsFactors)),
Freq = c(x), row.names = row.names))
names(ex)[3L] <- responseName
eval(ex)
}
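For completeness, the data.frame route mentioned at the top of this answer keeps the single column, because a plain matrix dispatches as.data.frame.matrix() rather than as.data.frame.table(). A minimal sketch:

# sketch: as.data.frame.matrix() keeps the one "values" column
df1 <- as.data.frame(matrix1)
str(df1)
#> 'data.frame': 5 obs. of  1 variable:
#>  $ values: num  2 3 5 8 13
# in the server: renderTable(df1, rownames = TRUE)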

as.coded.data function won't take variables in place of numerics in R

I am attempting to use a batch file to run the function SPLOT to bring variables into my response surface code. I have confirmed the variables are passed properly and are numeric; however, when it gets to the as.coded.data function, I get an error message that it cannot locate the variable. Is this a limitation of this function? It worked when I manually entered the values in place of these variables.
SPLOT <- function(workdir, savedir, TREAT1_LABEL, TREAT2_LABEL, TREAT3_LABEL,
                  RESP1_LABEL, RESP2_LABEL,
                  T1LOW, T1HIGH, T1MID, T2LOW, T2HIGH, T2MID,
                  T3LOW, T3HIGH, T3MID) {

  file_list <- list.files(path = workdir, pattern = "*.csv")

  for (x in 1:NROW(file_list)) {
    PROJ_DATA <- read.csv(file = file_list[x])
    i <- 1
    while (i <= NROW(file_list)) {
      # extract the text from the file name in order to name the plot
      name <- regmatches(file_list[x], regexpr("*.*", file_list[x]))
      mytitle  <- paste(name, ".pdf")
      mytitle1 <- paste(name, ".tiff")
      PROJ_DATA.adj <- PROJ_DATA
      results <- "savedir"
      setwd(workdir)

      #### Summarize Data ####
      PROJ_DATA1 <- PROJ_DATA.adj %>%
        group_by(FACTOR) %>%
        summarise(N = length(FACTOR),
                  mean.TREAT1 = mean(TREAT1, na.rm = TRUE),
                  mean.TREAT2 = mean(TREAT2, na.rm = TRUE),
                  mean.TREAT3 = mean(TREAT3, na.rm = TRUE),
                  mean.RESP1  = mean(RESP1, na.rm = TRUE),
                  mean.RESP2  = mean(RESP2, na.rm = TRUE)) %>%
        drop_na()

      PROJ_DATA2 <- na.omit(PROJ_DATA1)

      #### Relativize the Dataset - coding ####
      PROJ_DATA.coded <- as.coded.data(
        PROJ_DATA,
        TREAT1.coded ~ (mean.TREAT1 - T1MID) / (0.5 * (T1HIGH - T1LOW)),
        TREAT2.coded ~ (mean.TREAT2 - T2MID) / (0.5 * (T2HIGH - T2LOW)),
        TREAT3.coded ~ (mean.TREAT3 - T3MID) / (0.5 * (T3HIGH - T3LOW))
      )

      PROJ_DATA2.rsm <- rsm(RESP1 ~ FACTOR + SO(TREAT1.coded, TREAT2.coded, TREAT3.coded),
                            data = PROJ_DATA.coded)
      PROJ_DATA2.rsm$studres <- rstudent(PROJ_DATA2.rsm)
      summary(PROJ_DATA2.rsm)
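A hedged workaround sketch (untested): since the coding formulas appear to be taken as unevaluated expressions, splice the numeric values into each formula before the call so that as.coded.data() only ever sees literals. code_formula is a hypothetical helper, and PROJ_DATA2 (which holds the mean.* columns) is assumed to be the intended input:

# hypothetical helper: splice numeric values into a coding formula
code_formula <- function(coded, raw, mid, low, high) {
  stats::as.formula(
    sprintf("%s ~ (%s - %s)/(0.5*(%s - %s))", coded, raw, mid, high, low)
  )
}

PROJ_DATA.coded <- as.coded.data(
  PROJ_DATA2,
  code_formula("TREAT1.coded", "mean.TREAT1", T1MID, T1LOW, T1HIGH),
  code_formula("TREAT2.coded", "mean.TREAT2", T2MID, T2LOW, T2HIGH),
  code_formula("TREAT3.coded", "mean.TREAT3", T3MID, T3LOW, T3HIGH)
)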

Parallelizing a loop with updating during each iteration

I have some R code that puts together demographic data from the Census for all states in the US into a list object. The block-level code can take a week to run as a sequential loop since there are ~11M blocks, so I am trying to parallelize the loop over states to make it faster. I have accomplished this goal with this:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
plan(multiprocess)
ptm <- proc.time()
CensusObj_block_age_sex = list()
CensusObj_block_age_sex[states] <- future_lapply(states, function(s){
county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
censusObj[[s]] <- list(state = s, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
}
)
However, I need to make it more robust. Sometimes there are problems with the Census API, so I would like CensusObj to be updated at each state iteration so that I don't lose my completed data if something goes wrong. That way I can restart the loop over the remaining states if something does go wrong (like if I spell "WY" as "WU").
Would it be possible to accomplish this somehow? I am open to other methods of parallelization.
The code above runs, but it seems to run into memory issues:
Error: Failed to retrieve the value of MultisessionFuture (future_lapply-3) from cluster RichSOCKnode #3 (PID 80363 on localhost ‘localhost’). The reason reported was ‘vector memory exhausted (limit reached?)’. Post-mortem diagnostic: A process with this PID exists, which suggests that the localhost worker is still alive.
I have R_MAX_VSIZE = 8Gb in my .Renviron, but I am not sure how that gets divided between the 8 cores on my machine. This all suggests that I need to store the results of each iteration rather than trying to keep them all in memory, and then append the objects together at the end.
Here is a solution that uses doParallel (with options for UNIX systems, but you can also use it on Windows, see here) and foreach. It stores the results for every state separately, then afterwards reads the single files back in and combines them into a list.
library(doParallel)
library(foreach)

path_results <- "my_path"
ncpus <- 8L
registerDoParallel(cores = ncpus)

states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")

results <- foreach(state = states) %dopar% {
  county <- census_geo_api(key = "XXX", state = state, geo = "county", age = TRUE, sex = TRUE)
  tract  <- census_geo_api(key = "XXX", state = state, geo = "tract",  age = TRUE, sex = TRUE)
  block  <- census_geo_api(key = "XXX", state = state, geo = "block",  age = TRUE, sex = TRUE)
  results <- list(state = state, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)

  # store the results as rds
  saveRDS(results,
          file = paste0(path_results, "/", state, ".Rds"))

  # remove the results from memory
  rm(county)
  rm(tract)
  rm(block)
  rm(results)
  gc()

  # just return a string
  paste0("done with ", state)
}

library(purrr)

# combine the results into a list
result_files <- list.files(path = path_results)
CensusObj_block_age_sex <- set_names(result_files, states) %>%
  map(~ readRDS(file = paste0(path_results, "/", .x)))
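A small extension gives the restart behaviour the question asks for: before launching foreach, drop any state whose result file already exists (same path_results assumption as above):

# sketch: resume a crashed run by skipping states already saved to disk
done <- sub("\\.Rds$", "", list.files(path_results, pattern = "\\.Rds$"))
todo <- setdiff(states, done)
# then loop only over the remainder: foreach(state = todo) %dopar% { ... }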
You could use a tryCatch inside future_lapply to retry the calculation in case of an API error, up to a maximum of maxtrials attempts.
In the resulting list you get, for each calculation, the number of trials and the final status, OK or Error:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
#> Le chargement a nécessité le package : future
plan(multiprocess)
ptm <- proc.time()
maxtrials <- 3
census_geo_api <-
function(key = "XXX",
state = s,
geo = "county",
age = TRUE,
sex = TRUE) {
paste(state,'-', geo)
}
CensusObj_block_age_sex <- future_lapply(states, function(s) {
ntrials <- 1
while (ntrials <= maxtrials) {
hasError <- tryCatch({
#simulate random error
if (runif(1)>0.3) {error("API failed")}
county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
},
error = function(e)
e)
if (inherits(hasError, "error")) {
ntrials <- ntrials + 1
} else { break}
}
if (ntrials > maxtrials) {
res <- list(state = s, status = 'Error', ntrials = ntrials-1, age = NA, sex = NA, block = NA, tract = NA, county = NA)
} else {
res <- list(state = s, status = 'OK' , ntrials = ntrials, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
}
res
}
)
CensusObj_block_age_sex[[1]]
#> $state
#> [1] "AL"
#>
#> $status
#> [1] "OK"
#>
#> $ntrials
#> [1] 3
#>
#> $age
#> [1] TRUE
#>
#> $sex
#> [1] TRUE
#>
#> $block
#> [1] "AL - block"
#>
#> $tract
#> [1] "AL - tract"
#>
#> $county
#> [1] "AL - county"
<sup>Created on 2020-08-19 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
One possible solution is to log the value of censusObj to a text file, i.e. print censusObj in each iteration. The doSNOW package can be used for the logging, for example:
library(doSNOW)

cl <- makeCluster(1, outfile = "abc.out")
registerDoSNOW(cl)

states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")

foreach(i = 1:length(states), .combine = rbind, .inorder = TRUE) %dopar% {
  county <- "A"
  tract  <- "B"
  block  <- "C"
  censusObj <- data.frame(state = states[i], age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # edit: print json objects to easily extract from the file
  cat(sprintf("%s\n", rjson::toJSON(censusObj)))
}

stopCluster(cl)
This logs the value of censusObj in abc.out; it also logs the error if the program crashes, but you will still have the latest value of censusObj recorded in abc.out.
Here is the output of the last iteration from the log file:
Type: EXEC {"state":"PR","age":true,"sex":true,"block":"C","tract":"B","county":"A"} Type: DONE
Type: EXEC means that the iteration has started, and Type: DONE means execution has completed. The result of cat will be present between these two statements for each iteration. The value of censusObj can then be extracted from the log file as shown below:
Lines <- readLines("abc.out")
results <- list()
for (i in Lines) {
  # skip processing logs created by doSNOW
  if (!startsWith(i, "starting") && !startsWith(i, "Type:")) {
    results <- rlist::list.append(results, jsonlite::fromJSON(i))
  }
}
results will contain all of the values printed to abc.out.
> head(results, 1)
[[1]]
[[1]]$state
[1] "AL"
[[1]]$age
[1] TRUE
[[1]]$sex
[1] TRUE
[[1]]$block
[1] "C"
[[1]]$tract
[1] "B"
[[1]]$county
[1] "A"
It is not a very clean solution but works.

Getting example codes of R functions into knitr using helpExtract function

I want to get the example code of R functions to use in knitr. There might be an easy way, but I tried the following approach using the helpExtract function, which can be obtained from here (written by @AnandaMahto). With my approach I have to check whether a function has an Examples section and include only those functions that do.
This is a very inefficient and naive approach. Now I'm trying to include only those functions that have Examples. I tried the following code, but it is not working as desired. How can I extract Examples code from an R package?
\documentclass{book}
\usepackage[T1]{fontenc}
\begin{document}

<<label=packages, echo=FALSE>>=
library(ggplot2)
library(devtools)
source_gist("https://gist.github.com/mrdwab/7586769")
library(noamtools) # install_github("noamtools", "noamross")
@

\chapter{Linear Model}

<<label = NewTest1, results="asis">>=
tryCatch(
  {helpExtract(lm, section="Examples", type = "s_text");
    cat(
      "\\Sexpr{
        knit_child(
          textConnection(helpExtract(lm, section=\"Examples\", type = \"s_text\"))
          , options = list(tidy = FALSE, eval = TRUE)
        )
      }", "\n"
    )
  }
  , error=function(e) FALSE
)
@

\chapter{Modify properties of an element in a theme object}

<<label = NewTest2, results="asis">>=
tryCatch(
  {helpExtract(add_theme, section="Examples", type = "s_text");
    cat(
      "\\Sexpr{
        knit_child(
          textConnection(helpExtract(add_theme, section=\"Examples\", type = \"s_text\"))
          , options = list(tidy = FALSE, eval = TRUE)
        )
      }", "\n"
    )
  }
  , error=function(e) FALSE
)
@

\end{document}
I've done some quick work modifying the function (which I've included at this Gist). The Gist also includes a sample Rnw file (I haven't had a chance to check an Rmd file yet).
The function now looks like this:
helpExtract <- function(Function, section = "Usage", type = "m_code", sectionHead = NULL) {
  A <- deparse(substitute(Function))
  x <- capture.output(tools:::Rd2txt(utils:::.getHelpFile(utils::help(A)),
                                     options = list(sectionIndent = 0)))
  B <- grep("^_", x)                     ## section start lines
  x <- gsub("_\b", "", x, fixed = TRUE)  ## remove "_\b"
  X <- rep(FALSE, length(x))             ## Create a FALSE vector
  X[B] <- 1                              ## Initialize
  out <- split(x, cumsum(X))             ## Create a list of sections
  sectionID <- vapply(out, function(x)   ## Identify where the section starts
    grepl(section, x[1], fixed = TRUE), logical(1L))

  if (!any(sectionID)) {                 ## If the section is missing...
    ""                                   ## ... just return an empty character
  } else {                               ## Else, get that list item
    out <- out[[which(sectionID)]][-c(1, 2)]

    while (TRUE) {                       ## Remove the extra empty lines
      out <- out[-length(out)]           ## from the end of the file
      if (out[length(out)] != "") { break }
    }

    switch(                              ## Determine the output type
      type,
      m_code = {
        before <- "```r"
        after  <- "```"
        c(sectionHead, before, out, after)
      },
      s_code = {
        before <- "<<eval = FALSE>>="
        after  <- "@"
        c(sectionHead, before, out, after)
      },
      m_text = {
        c(sectionHead, paste(" ", out, collapse = "\n"))
      },
      s_text = {
        before <- "\\begin{verbatim}"
        after  <- "\\end{verbatim}"
        c(sectionHead, before, out, after)
      },
      stop("`type` must be either `m_code`, `s_code`, `m_text`, or `s_text`")
    )
  }
}
What has changed?
A new argument, sectionHead, has been added. It lets you specify the section title in the call to helpExtract.
The function checks whether the relevant section is present in the parsed document. If it is not, it simply returns "" (which doesn't get printed).
Example use would be:

<<echo = FALSE>>=
mySectionHeading <- "\\section{Some cool section title}"
@

\Sexpr{knit_child(textConnection(
  helpExtract(cor, section = "Examples", type = "s_code",
              sectionHead = mySectionHeading)),
  options = list(tidy = FALSE, eval = FALSE))}
Note: Since Sexpr doesn't allow curly brackets to be used ({), we need to specify the title outside of the Sexpr step, which I have done in a hidden code chunk.
This is not a complete answer, so I'm marking it as community wiki. Here are two simple lines to get the examples out of the Rd file for a named function (in this case lm). The code is much simpler than Ananda's gist, in my opinion:
x <- utils:::.getHelpFile(utils::help(lm))
sapply(x[sapply(x, function(z) attr(z, "Rd_tag") == "\\examples")][[1]], `[[`, 1)
The result is a simple vector of all of the text in the Rd "examples" section, which should be easy to parse, evaluate, or include in a knitr doc.
[1] "\n"
[2] "require(graphics)\n"
[3] "\n"
[4] "## Annette Dobson (1990) \"An Introduction to Generalized Linear Models\".\n"
[5] "## Page 9: Plant Weight Data.\n"
[6] "ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)\n"
[7] "trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)\n"
[8] "group <- gl(2, 10, 20, labels = c(\"Ctl\",\"Trt\"))\n"
[9] "weight <- c(ctl, trt)\n"
[10] "lm.D9 <- lm(weight ~ group)\n"
[11] "lm.D90 <- lm(weight ~ group - 1) # omitting intercept\n"
[12] "\n"
[13] "\n"
[14] "opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))\n"
[15] "plot(lm.D9, las = 1) # Residuals, Fitted, ...\n"
[16] "par(opar)\n"
[17] "\n"
[18] "\n"
[19] "### less simple examples in \"See Also\" above\n"
Perhaps the following might be useful.
get.examples <- function(pkg = NULL) {
  suppressWarnings(f <- unique(utils:::index.search(TRUE, find.package(pkg))))
  out <- setNames(sapply(f, function(x) {
    tf <- tempfile("Rex")
    tools::Rd2ex(utils:::.getHelpFile(x), tf)
    if (!file.exists(tf)) return(invisible())
    readLines(tf)
  }), basename(f))
  out[!sapply(out, is.null)]
}
ex.base <- get.examples('base')
This returns the examples for all functions (that have documentation containing examples) within the specified vector of packages. If pkg=NULL, it returns the examples for all functions within loaded packages.
For example:
ex.base['scan']
# $scan
# [1] "### Name: scan"
# [2] "### Title: Read Data Values"
# [3] "### Aliases: scan"
# [4] "### Keywords: file connection"
# [5] ""
# [6] "### ** Examples"
# [7] ""
# [8] "cat(\"TITLE extra line\", \"2 3 5 7\", \"11 13 17\", file = \"ex.data\", sep = \"\\n\")"
# [9] "pp <- scan(\"ex.data\", skip = 1, quiet = TRUE)"
# [10] "scan(\"ex.data\", skip = 1)"
# [11] "scan(\"ex.data\", skip = 1, nlines = 1) # only 1 line after the skipped one"
# [12] "scan(\"ex.data\", what = list(\"\",\"\",\"\")) # flush is F -> read \"7\""
# [13] "scan(\"ex.data\", what = list(\"\",\"\",\"\"), flush = TRUE)"
# [14] "unlink(\"ex.data\") # tidy up"
# [15] ""
# [16] "## \"inline\" usage"
# [17] "scan(text = \"1 2 3\")"
# [18] ""
# [19] ""
# [20] ""
# [21] ""
