Wrap a tidymodels recipe into a function - R

Is it possible to wrap a tidymodels recipe into a function? I've tried the following:
# Data setup
library(tidyverse)
library(tidymodels)
parks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-06-22/parks.csv')
modeling_df <- parks %>%
  select(pct_near_park_data, spend_per_resident_data, med_park_size_data) %>%
  rename(nearness = "pct_near_park_data",
         spending = "spend_per_resident_data",
         acres = "med_park_size_data") %>%
  mutate(nearness = (parse_number(nearness) / 100)) %>%
  mutate(spending = parse_number(spending))
# Start building models
set.seed(123)
park_split <- initial_split(modeling_df)
park_train <- training(park_split)
park_test <- testing(park_split)
This works well without a function:
tree_rec <- recipe(nearness ~., data = park_train)
The problem comes when I wrap the recipe into a function:
custom_rec <- function(dat, var){
  tree_rec <- recipe(nearness ~ {{var}}, data = dat)
}
custom_rec(park_train, spending)
Error:
Error during wrapup: No in-line functions should be used here; use steps to define baking actions.
Error: no more error handlers available (recursive errors?); invoking 'abort' restart

The R formula is an extremely useful but weird, weird thing, so I don't recommend trying to mess around with it in a situation like the one you have here.
Instead, try using the update_role() interface for recipes:
library(tidymodels)
library(modeldata)
data(biomass)
# split data
biomass_tr <- biomass[biomass$dataset == "Training",]
my_rec <- function(dat, preds) {
  recipe(dat) %>%
    update_role({{preds}}, new_role = "predictor") %>%
    update_role(HHV, new_role = "outcome") %>%
    update_role(sample, new_role = "id variable") %>%
    update_role(dataset, new_role = "splitting indicator")
}
my_rec(biomass_tr, carbon) %>% prep() %>% summary()
#> # A tibble: 8 × 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 sample nominal id variable original
#> 2 dataset nominal splitting indicator original
#> 3 carbon numeric predictor original
#> 4 hydrogen numeric <NA> original
#> 5 oxygen numeric <NA> original
#> 6 nitrogen numeric <NA> original
#> 7 sulfur numeric <NA> original
#> 8 HHV numeric outcome original
my_rec(biomass_tr, c(carbon, hydrogen, oxygen, nitrogen)) %>% prep() %>% summary()
#> # A tibble: 8 × 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 sample nominal id variable original
#> 2 dataset nominal splitting indicator original
#> 3 carbon numeric predictor original
#> 4 hydrogen numeric predictor original
#> 5 oxygen numeric predictor original
#> 6 nitrogen numeric predictor original
#> 7 sulfur numeric <NA> original
#> 8 HHV numeric outcome original
Created on 2021-09-21 by the reprex package (v2.0.1)
If you are set on the formula interface, maybe try using rlang::new_formula().
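For example, here is a minimal sketch (my own, not part of the answer above) that builds the formula from plain symbols with rlang::new_formula() and rlang::ensym(); as far as I can tell, the original error comes from the quosure that {{ }} embeds in the formula, and building the formula this way avoids it:
library(tidymodels)
library(rlang)
custom_rec_formula <- function(dat, var) {
  # build `nearness ~ <var>` from plain symbols, then hand it to recipe()
  f <- new_formula(sym("nearness"), ensym(var))
  recipe(f, data = dat)
}
# custom_rec_formula(park_train, spending) %>% summary()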

Related

In R tidymodels how can I specify contrasts for specific variables?

I would like to specify "sum to zero" contrasts for two predictors in a linear model using a tidymodels recipe. Is it possible? Looking at the recipes documentation before version 1.3, it seems there were attempts to build variable-specific options, but the strategy shifted to a global option.
I am trying to convert this base R code into tidymodels:
Bikeshare <- ISLR2::Bikeshare # start with original data
contrasts(Bikeshare$hr) <- contr.sum(24)
contrasts(Bikeshare$mnth) <- contr.sum(12)
mod.lm2 <-
  lm(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare
  )
summary(mod.lm2)
I got this far:
library(tidymodels)
Bikeshare <- ISLR2::Bikeshare # start with original data
contrasts(Bikeshare$hr) <- contr.sum(24)
contrasts(Bikeshare$mnth) <- contr.sum(12)
lm_spec <- linear_reg() %>%
  set_engine("lm")
the_rec <-
  recipe(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare
  ) %>%
  step_dummy(c(mnth, hr), one_hot = TRUE)
the_workflow <- workflow() %>%
  add_recipe(the_rec) %>%
  add_model(lm_spec)
the_workflow_fit_lm_fit <-
  fit(the_workflow, data = Bikeshare) %>%
  extract_fit_parsnip()
summary(the_workflow_fit_lm_fit$fit)
Does anybody know how to get the same results out of a tidymodels workflow?
I don't think I can use contr.sum as a global option. This gives me the betas I would like for two of the variables but it changes the contrasts on others.
Bikeshare <- ISLR2::Bikeshare # be sure to work with original data
old_opt <- options()$contrast
options(contrasts = c('contr.sum', 'contr.poly'))
The docs for step_dummy() say:
To change the type of contrast being used, change the global contrast option via options.
so there is no way, outside of global options, to change it.
We should probably have an example though :-/
Note that, for new samples, the contrasts are read from the global option again. Make sure it is set the same way at prediction time:
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
tidymodels_prefer()
data("penguins")
penguins <-
  penguins %>%
  distinct(species)
# R's defaults
old_opt <- options()$contrast
old_opt
#> unordered ordered
#> "contr.treatment" "contr.poly"
# default contrast
default <-
  recipe(~ species, data = penguins) %>%
  step_dummy(species) %>%
  prep()
default %>% bake(new_data = NULL)
#> # A tibble: 3 × 2
#> species_Chinstrap species_Gentoo
#> <dbl> <dbl>
#> 1 0 0
#> 2 0 1
#> 3 1 0
# Now set the contrasts option to something else:
options(contrasts = c('contr.sum', 'contr.poly'))
with_opt <-
  recipe(~ species, data = penguins) %>%
  step_dummy(species) %>%
  prep()
with_opt %>% bake(new_data = NULL)
#> # A tibble: 3 × 2
#> species_X1 species_X2
#> <dbl> <dbl>
#> 1 1 0
#> 2 -1 -1
#> 3 0 1
# reset options:
options(contrasts = old_opt)
with_opt %>% bake(new_data = penguins)
#> # A tibble: 3 × 2
#> species_Chinstrap species_Gentoo
#> <dbl> <dbl>
#> 1 0 0
#> 2 0 1
#> 3 1 0
Created on 2021-11-16 by the reprex package (v2.0.0)
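As a sketch of how that plays out with the original Bikeshare workflow (my own adaptation, not checked against mod.lm2; also, I believe one_hot = TRUE produces a full set of indicator columns rather than contrast-coded ones, so it is dropped here):
library(tidymodels)
old_opt <- getOption("contrasts")
options(contrasts = c("contr.sum", "contr.poly"))
Bikeshare <- ISLR2::Bikeshare
the_rec <- recipe(
  bikers ~ mnth + hr + workingday + temp + weathersit,
  data = Bikeshare
) %>%
  step_dummy(mnth, hr)  # picks up the sum-to-zero coding from the global option
the_workflow_fit <- workflow() %>%
  add_recipe(the_rec) %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  fit(data = Bikeshare)
# predict()/bake() on new data read the global option again, so keep it set
# (or set it again) at prediction time; restore the default when finished:
options(contrasts = old_opt)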

R tidymodels recipes near zero variance filter for numeric attributes

I am having trouble using step_nzv in R tidymodels recipes to filter out numeric attributes with small variances but continuous values. It seems to me that the step only applies to nominal values, as it calculates the number of unique values and the ratio of the most common to the second most common value. However, I have an attribute that is almost everywhere close to zero but never exactly zero. Do I have to bin first (discretizing with same-sized bins would change everything)?
The code below is a minimal example. I expect both columns low_variance_num and low_variance_nom to be filtered out, which doesn't happen:
library(tidymodels)
data <- tibble(num = seq(1000), rand = runif(1000)) %>%
  mutate(low_variance_num = ifelse(num == 1, 1, rand / 10000),
         low_variance_nom = ifelse(num == 1, 1, 0))
data
var(data$low_variance_num)
var(data$low_variance_nom)
recipe <- recipe(formula = num ~ ., data = data) %>%
  update_role("num", new_role = "label") %>%
  step_nzv(all_predictors(), freq_cut = 995/5, unique_cut = 10) %>% # 5 min up to here
  prep()
summary(recipe)
P.S.: Is there a way to use recipes without providing a formula? In this case the formula doesn't really make sense.
For starters, yes, there is a way to use recipes without providing a formula. To do that you call recipe() with only the data as an argument and then manually update the roles via update_role(). This is the recommended approach when the number of variables is very high, as the formula method is memory-inefficient with many variables.
Next, I want to clarify what we mean in tidymodels by "nominal":
Nominal variables include both character and factor.
A numeric variable of all 1s and 0s would not be a nominal variable in tidymodels (would not be selected by all_nominal(), etc).
Next, I want to point out that I don't think step_nzv() is going to do what you are hoping here because you are using the term "variance" in a different sense. If you check out the docs, it describes what we mean here by near-zero-variance:
For example, a near-zero variance predictor is one that, for 1000 samples, has two distinct values and 999 of them are a single value.
To be flagged, first, the frequency of the most prevalent value over the second most frequent value (called the "frequency ratio") must be above freq_cut. Secondly, the "percent of unique values," the number of unique values divided by the total number of samples (times 100), must also be below unique_cut.
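To make that concrete with the default cutoffs (freq_cut = 95/5 = 19 and unique_cut = 10): a column with one 1 and 999 zeros has a frequency ratio of 999/1 = 999 (above 19) and a percent-unique of 2/1000 * 100 = 0.2 (below 10), so it is flagged; a column with roughly 1000 distinct values has a percent-unique near 100 and is never flagged, no matter how small its variance is.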
The example low_variance_num variable you made is not particularly low-variance by the definition used in this step; it has lots of unique values.
For reference, here is a demo of how to build a recipe without the formula:
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
df <- tibble(num = seq(1000), rand = runif(1000)) %>%
  mutate(pred1 = ifelse(num == 1, 1, rand / 10000),
         pred2 = ifelse(num == 1, 1, 0))
rec <- recipe(df) %>%
  update_role(num, new_role = "label") %>%
  update_role(rand, pred1, pred2, new_role = "predictor") %>%
  step_nzv(all_predictors())
rec %>% prep() %>% bake(new_data = NULL)
#> # A tibble: 1,000 x 3
#> num rand pred1
#> <int> <dbl> <dbl>
#> 1 1 0.842 1
#> 2 2 0.942 0.0000942
#> 3 3 0.977 0.0000977
#> 4 4 0.595 0.0000595
#> 5 5 0.259 0.0000259
#> 6 6 0.454 0.0000454
#> 7 7 0.550 0.0000550
#> 8 8 0.388 0.0000388
#> 9 9 0.702 0.0000702
#> 10 10 0.481 0.0000481
#> # … with 990 more rows
Created on 2021-01-07 by the reprex package (v0.3.0)
The predictor pred2 was removed because it has so few unique values and they are almost all 0. The predictor pred1 was not removed because it has many unique values. I think if I wanted to do the kind of filtering you are describing, I would do it in data cleaning/preparation, not within a feature engineering recipe in a model pipeline.
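For example, a rough sketch of that kind of filtering at the data-preparation stage (my own; the 0.01 variance threshold is arbitrary and just chosen so it catches pred1 and pred2 here):
library(dplyr)
low_var_threshold <- 0.01  # arbitrary cutoff for this illustration
df_trimmed <- df %>%
  select(!where(~ is.numeric(.x) && var(.x, na.rm = TRUE) < low_var_threshold))
# should keep num and rand, dropping pred1 and pred2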

How do I use tidyeval in R to create a small prediction dataset that has all levels of factors?

I am trying to obtain a prediction grid using the modelr::data_grid() function that has all levels of one factor in a model and only the "typical" level of the other factor in the model. The wrinkle is that calling levels() on the factor held constant in the prediction grid should return all levels present in the original data. Otherwise, some prediction functions can fail and return the infamous
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
Here is a demonstration of what I can get out of modelr::data_grid(), as well as what I would desire.
library(tidyverse)
co2_mod <- slice(CO2, -1, -43)
fit <- lm(uptake ~ Type + Treatment, data = co2_mod)
grid_df <- modelr::data_grid(co2_mod, Type, .model = fit) %>%
  mutate(Treatment = factor(Treatment))
grid_df
#> # A tibble: 2 x 2
#> Type Treatment
#> <fct> <fct>
#> 1 Quebec chilled
#> 2 Mississippi chilled
levels(grid_df$Treatment)
#> [1] "chilled"
I would like to have
levels(grid_df$Treatment)
[1] "nonchilled" "chilled"
The factor variable with only 1 level in the prediction dataset should "know" all of the possible levels in the original dataset. For example, the rstanarm vignettes show the need to have both Male and Female as levels, even though predictions are made holding gender equal to Female.
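(For a single known column the fix is simple enough; a minimal sketch is to re-factor it with the levels from the original data:
grid_df <- grid_df %>%
  mutate(Treatment = factor(Treatment, levels = levels(co2_mod$Treatment)))
levels(grid_df$Treatment)
#> [1] "nonchilled" "chilled"
but I want this to happen generically for whatever factors the model uses.)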
I have tried to write a couple of functions to accomplish my goal. There is an extra step of repeating the dataset 1000 times, since I am using this grid as newdata for posterior predictive distributions from an rstanarm model fit.
Auxiliary function (this works):
get_all_levels <- function(v, dataset){
  v <- enquo(v)
  a <- select(dataset, !!v) %>% as_vector()
  l <- levels(a)
  return(l)
}
Main function, which calls auxiliary function (this doesn't work):
grid_all_levels <- function(model, variable, df){
  variable <- enquo(variable)
  grid <- expand_grid(
    rep = seq(1, 1000, 1),
    modelr::data_grid(df, !!variable, .model = model)
  ) %>%
    mutate(across(where(is.character), ~ factor(., levels = get_all_levels(., df))))
  return(grid)
}
Here is a small reprex that you can run these functions on.
library(tidyverse)
get_all_levels <- function(v, dataset){
  v <- enquo(v)
  d <- as_tibble(dataset)
  a <- select(d, !!v) %>% as_vector()
  l <- levels(a)
  return(l)
}
grid_all_levels <- function(model, variable, df){
  variable <- enquo(variable)
  grid <- expand_grid(
    rep = seq(1, 1000, 1),
    modelr::data_grid(df, !!variable, .model = model)
  ) %>%
    mutate(across(where(is.character), ~ factor(., levels = get_all_levels(., df))))
  return(grid)
}
co2_mod <- slice(CO2, -1, -43)
fit <- lm(conc ~ Type + Treatment, data = co2_mod)
get_all_levels(Type, co2_mod)
#> [1] "Quebec" "Mississippi"
grid_all_levels(fit, Type, co2_mod)
#> Note: Using an external vector in selections is ambiguous.
#> ℹ Use `all_of(.)` instead of `.` to silence this message.
#> ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Error: Problem with `mutate()` input `..1`.
#> x Can't subset columns that don't exist.
#> x Columns `chilled`, `chilled`, `chilled`, `chilled`, `chilled`, etc. don't exist.
#> ℹ Input `..1` is `across(...)`.
I'm pretty sure that the issue is how I am using the . in the get_all_levels call in the grid_all_levels function.
How can I get grid_all_levels() to work?
Update
I modified the grid_all_levels() function per answers below to look like this:
grid_all_levels <- function(model, variable, df){
  variable <- enquo(variable)
  grid <- expand_grid(
    rep = seq(1, 1000, 1),
    modelr::data_grid(df, !!variable, .model = model)
  ) %>%
    mutate(across(where(is.character), ~ factor(., levels = get_all_levels(cur_column(), df))))
  return(grid)
}
which returns
#> # A tibble: 2,000 x 3
#> rep Type Treatment
#> <dbl> <fct> <fct>
#> 1 1 Quebec chilled
#> 2 1 Mississippi chilled
#> 3 2 Quebec chilled
#> 4 2 Mississippi chilled
#> 5 3 Quebec chilled
#> 6 3 Mississippi chilled
#> 7 4 Quebec chilled
#> 8 4 Mississippi chilled
#> 9 5 Quebec chilled
#> 10 5 Mississippi chilled
#> # … with 1,990 more rows
levels(a$Treatment)
#> [1] "nonchilled" "chilled"
Which was exactly what I wanted!
The problem is that the . inside across() is the actual column, not the name of the column. Sadly, there is no way to get the names inside an across() call; however, using purrr::reduce() we can make a workaround:
library(tidyverse)
levelize <- function(.x, .y){
  mutate(.x, !!paste0(.y) := factor(!!sym(.y), levels = get_all_levels(!!sym(.y), df)))
}
estimate_prevalence_differences_polr <- function(model, variable, df){
  variable <- enquo(variable)
  # put the names of the factor variables inside a vector
  vars <- sapply(df, function(x) is.factor(x))
  vars <- names(vars)[vars]
  expand_grid(
    rep = 1:1000,
    modelr::data_grid(df, !!variable, .model = model)
  ) %>%
    ### loop through the vars that are factors:
    ## the .x argument is the one set by .init, i.e. the grid
    ## the .y argument is the element of the vars vector in the current iteration
    ## in the first iteration we have .y = "Type" and .x = .
    ## inside the levelize function we basically mutate .x and change the
    ## variable named .y (Type) into a factor with the levels from the original df
    reduce(intersect(vars, colnames(.)), levelize, .init = .)
}
After calling the function, the levels are now set correctly; we can check this by printing the unique values of each factor column:
estimate_prevalence_differences_polr(fit, Type, df) %>% select(where(is.factor)) %>% map(unique)
$Type
[1] Quebec Mississippi
Levels: Quebec Mississippi
$Plant
[1] Qc2
Levels: Qn1 Qn2 Qn3 Qc1 Qc3 Qc2 Mn3 Mn2 Mn1 Mc2 Mc3 Mc1
$Treatment
[1] chilled nonchilled
Levels: nonchilled chilled
EDIT: a much easier solution is to use cur_column(); I didn't know about this option when I first wrote the answer. So basically, instead of passing . to get_all_levels(), pass cur_column(), i.e.:
expand_grid(
  rep = seq(1, 1000, 1),
  modelr::data_grid(df, !!variable, .model = model)
) %>%
  mutate(across(where(is.character), ~ factor(., levels = get_all_levels(cur_column(), df))))
I think you are making this more complicated than it needs to be. If I remove all the tidyeval stuff, we get this, which is much simpler:
mutate(across(where(is.character), ~ factor(., levels = levels(.))))
You can extract the ~ lambda into a reusable named function.
This implementation immediately brings issues to mind though. Since levels() returns NULL with character vectors, this only works with factors. And with factors this is a no-op. I'm not sure what you're trying to achieve.
Also, the get-all-levels implementation has a problem: it takes a selection, which can return multiple vectors, e.g. with starts_with(). In this case you will get a confusing error.
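A name-based variant of get_all_levels() (a sketch of my own) side-steps that selection issue, since cur_column() always supplies a single column name as a string:
get_all_levels <- function(col_name, dataset) {
  # col_name is a single column name string, e.g. from cur_column()
  levels(dataset[[col_name]])
}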

How can I use map* and mutate to convert a list into a set of additional columns?

I have tried probably hundreds of permutations of this code for literally days to try to get a function that will do what I want, and I have finally given up. It feels like it should definitely be doable and I am so close!
I have tried to get back to the nub of things here with my reprex below.
Basically I have a single-row dataframe, with a column containing a list of strings ("concepts"). I want to create an additional column for each of those strings, using mutate, ideally with each column taking its name from the string, and then to populate the column with the results of a function call (it doesn't matter which function for now; I just need the infrastructure of the function to work).
I feel, as usual, like I must be missing something obvious... maybe just a syntax error.
I also wonder if I really need purrr::map here; maybe a simpler vectorised mapping would work fine.
I feel like the fact that new columns are named ..1 rather than the concept name is a bit of a clue as to what is wrong.
I can create the data frame I want by calling each concept manually (see end of reprex) but since the list of concepts is different for different data frames, I want to functionalise this using pipes and tidyverse techniques rather than do it manually.
I've read the following questions to find help:
How to use map from purrr with dplyr::mutate to create multiple new columns based on column pairs
How to mutate multiple columns with dynamic variable using purrr:map function?
(R) Cleaner way to use map() with list-columns
Add multiple output variables using purrr and a predefined function
Creating new variables with purrr (how does one go about that?)
How to compute multiple new columns in a R dataframe with dynamic names
but none of those has quite helped me crack the problem I'm experiencing. [edit: added in last q to that list which may be the technique I need].
# load packages -----------------------------------------------------------
library(rlang)
library(dplyr)
library(tidyr)
library(magrittr)
library(purrr)
library(nomisr)
# set up initial list of tibbles ------------------------------------------
df <- list(
  district_population = tibble(
    dataset_title = "Population estimates - local authority based by single year",
    dataset_id = "NM_2002_1"
  ),
  jsa_claimants = tibble(
    dataset_title = "Jobseeker's Allowance with rates and proportions",
    dataset_id = "NM_1_1"
  )
)
# just use the first tibble for now, for testing --------------------------
# ideally I want to map across dfs through a list -------------------------
df <- df[[1]]
# nitty gritty functions --------------------------------------------------
get_concept_list <- function(df) {
  dataset_id <- pluck(df, "dataset_id")
  nomis_overview(id = dataset_id,
                 select = c("dimensions", "codes")) %>%
    pluck("value", 1, "dimension") %>%
    filter(!concept == "geography") %>%
    pull("concept")
}
# get_concept_list() returns the strings I need:
get_concept_list(df)
#> [1] "time" "gender" "c_age" "measures"
# Here is a list of examples of types of map* that do various things,
# none of which is what I need it to do
# I'm using toupper() here for simplicity - ultimately I will use
# get_concept_info() to populate the new columns
# this creates four new tibbles
get_concept_list(df) %>%
map(~ mutate(df, {{.x}} := toupper(.x)))
#> [[1]]
#> # A tibble: 1 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 TIME
#>
#> [[2]]
#> # A tibble: 1 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 GENDER
#>
#> [[3]]
#> # A tibble: 1 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 C_AGE
#>
#> [[4]]
#> # A tibble: 1 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 MEASUR~
# this throws an error
get_concept_list(df) %>%
map_chr(~ mutate(df, {{.x}} := toupper(.x)))
#> Error: Result 1 must be a single string, not a vector of class `tbl_df/tbl/data.frame` and of length 3
# this creates three extra rows in the tibble
get_concept_list(df) %>%
map_df(~ mutate(df, {{.x}} := toupper(.x)))
#> # A tibble: 4 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 TIME
#> 2 Population estimates - local authority based by single year NM_2002_1 GENDER
#> 3 Population estimates - local authority based by single year NM_2002_1 C_AGE
#> 4 Population estimates - local authority based by single year NM_2002_1 MEASUR~
# this does the same as map_df
get_concept_list(df) %>%
map_dfr(~ mutate(df, {{.x}} := toupper(.x)))
#> # A tibble: 4 x 3
#> dataset_title dataset_id ..1
#> <chr> <chr> <chr>
#> 1 Population estimates - local authority based by single year NM_2002_1 TIME
#> 2 Population estimates - local authority based by single year NM_2002_1 GENDER
#> 3 Population estimates - local authority based by single year NM_2002_1 C_AGE
#> 4 Population estimates - local authority based by single year NM_2002_1 MEASUR~
# this creates a single tibble 12 columns wide
get_concept_list(df) %>%
map_dfc(~ mutate(df, {{.x}} := toupper(.x)))
#> # A tibble: 1 x 12
#> dataset_title dataset_id ..1 dataset_title1 dataset_id1 ..11 dataset_title2
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Population e~ NM_2002_1 TIME Population es~ NM_2002_1 GEND~ Population es~
#> # ... with 5 more variables: dataset_id2 <chr>, ..12 <chr>,
#> # dataset_title3 <chr>, dataset_id3 <chr>, ..13 <chr>
# function to get info on each concept (except geography) -----------------
# this is the function I want to use eventually to populate my new columns
get_concept_info <- function(df, concept_name) {
  dataset_id <- pluck(df, "dataset_id")
  nomis_overview(id = dataset_id) %>%
    filter(name == "dimensions") %>%
    pluck("value", 1, "dimension") %>%
    filter(concept == concept_name) %>%
    pluck("codes.code", 1) %>%
    select(name, value) %>%
    nest(data = everything()) %>%
    as.list() %>%
    pluck("data")
}
# individual mutate works, for comparison ---------------------------------
# I can create the kind of table I want manually using a line like the one below
# df %>% map(~ mutate(., measures = get_concept_info(., concept_name = "measures")))
df %>% mutate(., measures = get_concept_info(df, "measures"))
#> # A tibble: 1 x 3
#> dataset_title dataset_id measures
#> <chr> <chr> <list>
#> 1 Population estimates - local authority based by sin~ NM_2002_1 <tibble [2 x ~
Created on 2020-02-10 by the reprex package (v0.3.0)
Using !! and := lets you dynamically name columns. Then we can reduce the list output of map() with reduce(), which left_join()s all the dataframes in the list using the dataset title and id columns.
df_2 <-
  map(get_concept_list(df),
      ~ mutate(df, !!.x := get_concept_info(df, .x))) %>%
  reduce(left_join, by = c("dataset_title", "dataset_id"))
df_2
# A tibble: 1 x 6
dataset_title dataset_id time gender c_age measures
<chr> <chr> <list<df[,2]>> <list<df[,2]>> <list<df[,2]>> <list<df[,2]>>
1 Population estimates - local authority based by single year NM_2002_1 [28 x 2] [3 x 2] [121 x 2] [2 x 2]
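For a self-contained version that runs without the nomisr API, here is the same pattern with toupper() standing in for get_concept_info(), as in the question (my own sketch):
library(dplyr)
library(purrr)
library(tibble)
df <- tibble(
  dataset_title = "Population estimates - local authority based by single year",
  dataset_id = "NM_2002_1"
)
concepts <- c("time", "gender", "c_age", "measures")
map(concepts, ~ mutate(df, !!.x := toupper(.x))) %>%
  reduce(left_join, by = c("dataset_title", "dataset_id"))
# a 1-row tibble with new columns time, gender, c_age and measures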

Appending columns/variables from a data frame into a new variable

I've been searching for how to do this but cannot seem to find an example for my question. I'm pretty new to R but am very familiar with SAS, so I wanted to ask how to do the equivalent of this SAS code in R.
I have one dataset (cohort) and two variables (last_pre_cv_prob, first_post_cv_prob), and I want to make a new dataset with two variables: the first (cv_prob) is the two original variables stacked underneath each other, and the second (time) indicates which variable each value came from. So in SAS I would simply do this:
data post_cv;
  set cohort(keep=last_pre_cv_prob rename=(last_pre_cv_prob=cv_prob) in=a)
      cohort(keep=first_post_cv_prob rename=(first_post_cv_prob=cv_prob) in=b);
  if b then time='post';
  if a then time='pre';
run;
How would I do this in R?
Thanks!
edit:
post_cv2 %>% gather(column, prob, last_pre_cv_prob, first_post_cv_prob)
Error in eval(expr, envir, enclos) : object 'last_pre_cv_prob' not found
Then I tried:
post_cv2 %>% gather(column, prob, liver_cv$last_pre_cv_prob,
liver_cv$first_post_cv_prob)
Error: All select() inputs must resolve to integer column positions.
The following do not:
* liver_cv$last_pre_cv_prob
* liver_cv$first_post_cv_prob
edit:
Second issue resolved; I had to add the little quote marks (backticks):
post_cv2 <- post_cv %>%
  gather(time, cv_prob, `liver_cv$last_pre_cv_prob`,
         `liver_cv$first_post_cv_prob`)
edit:
Solved!
library(tidyverse)
library(stringr)
post_cv <- data_frame(pre = liver_cv$last_pre_cv_prob, post = liver_cv$first_post_cv_prob)
post_cv2 <- post_cv %>%
gather(time, cv_prob, pre, post)
You can simply gather the 2 columns and extract the time information:
library(tidyverse)
cohort <- data_frame(last_pre_cv_prob = runif(5),
                     first_post_cv_prob = runif(5))
cohort_2 <- cohort %>%
  gather(time, cv_prob, last_pre_cv_prob, first_post_cv_prob) %>%
  mutate(time = str_extract(time, "post|pre"))
cohort_2
#> # A tibble: 10 × 2
#> time cv_prob
#> <chr> <dbl>
#> 1 pre 0.64527372
#> 2 pre 0.55086818
#> 3 pre 0.05882369
#> 4 pre 0.19626147
#> 5 pre 0.05933594
#> 6 post 0.25564350
#> 7 post 0.01908338
#> 8 post 0.84901506
#> 9 post 0.07761842
#> 10 post 0.29019190
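gather() still works but has been superseded; a pivot_longer() equivalent of the same idea (a sketch) would be:
library(tidyverse)
library(stringr)
cohort_2 <- cohort %>%
  pivot_longer(c(last_pre_cv_prob, first_post_cv_prob),
               names_to = "time", values_to = "cv_prob") %>%
  mutate(time = str_extract(time, "post|pre"))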
