I have a data frame like this:
df <- tibble(
  i = rep(1:10, times = 5),
  t = rep(1:5, each = 10)
) %>%
  mutate(y = rnorm(50))
I want to apply a function that takes the data frame for each t as its argument:
f <- function(df){
  return(lm(y ~ +1, data = df))
}
When I apply purrr::map to the nested data frame with the pipe operator, I get an error.
# does not work
df_nested <- df %>%
  nest(data = c(t, y)) %>%
  rename(data_col = data)

df_nested %>%
  purrr::map(.x = .$data_col, .f = f)
On the other hand, when I do not use the pipe operator, I get the desired result.
# Ok
purrr::map(.x = df_nested$data_col, .f = f)
To my understanding, both versions should return the same result. What is wrong with the code that uses the pipe operator?
The pipe already passes the previous value (df_nested) as the first argument to map(). You can wrap the call in {} to stop that from happening.
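To make that concrete, the piped call above is roughly equivalent to the sketch below (not code you would actually run), which is why f() ends up being called with an extra argument and errors:

purrr::map(df_nested, .x = df_nested$data_col, .f = f)

With the braces, the dot is only substituted where you write it: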
library(tidyverse)
df_nested %>%
  {purrr::map(.x = .$data_col, .f = f)}
Another way would be to create the model column directly with mutate():
df %>%
  nest(data_col = c(t, y)) %>%
  mutate(model = map(data_col, f))
# i data_col model
# <int> <list> <list>
# 1 1 <tibble [5 × 2]> <lm>
# 2 2 <tibble [5 × 2]> <lm>
# 3 3 <tibble [5 × 2]> <lm>
# 4 4 <tibble [5 × 2]> <lm>
# 5 5 <tibble [5 × 2]> <lm>
# 6 6 <tibble [5 × 2]> <lm>
# 7 7 <tibble [5 × 2]> <lm>
# 8 8 <tibble [5 × 2]> <lm>
# 9 9 <tibble [5 × 2]> <lm>
#10 10 <tibble [5 × 2]> <lm>
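If you then want to pull a value out of each fitted model, for example the intercept of the y ~ +1 fit, one option (my own follow-up sketch, not part of the original answer, with the tidyverse loaded as above) is:

df %>%
  nest(data_col = c(t, y)) %>%
  mutate(model = map(data_col, f),
         # coef() on an lm returns a named vector; grab the intercept per group
         intercept = map_dbl(model, ~ coef(.x)[["(Intercept)"]]))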
I have the following code, where I do a grid search over different mtry and min_n values. I know how to extract the parameters that give the highest accuracy (see the second code box). How can I extract the importance of each feature in the training dataset? The guides I found online only show how to do it on the test dataset using last_fit(), e.g. this guide: https://www.tidymodels.org/start/case-study/#data-split
set.seed(seed_number)
data_split <- initial_split(node_strength, prop = 0.8, strata = Group)
train <- training(data_split)
test <- testing(data_split)
train_folds <- vfold_cv(train, v = 10)

rfc <- rand_forest(mode = "classification", mtry = tune(),
                   min_n = tune(), trees = 1500) %>%
  set_engine("ranger", num.threads = 48, importance = "impurity")

rfc_recipe <- recipe(data = train, Group ~ .)

rfc_workflow <- workflow() %>%
  add_model(rfc) %>%
  add_recipe(rfc_recipe)

rfc_result <- rfc_workflow %>%
  tune_grid(train_folds, grid = 40, control = control_grid(save_pred = TRUE),
            metrics = metric_set(accuracy))
best <-
  rfc_result %>%
  select_best(metric = "accuracy")
To do this, you will want to create a custom extract function, as outlined in this documentation.
For random forest variable importance, your function will look something like this:
get_rf_imp <- function(x) {
  x %>%
    extract_fit_parsnip() %>%
    vip::vi()
}
And then you can apply it to your resamples like so (notice that you get a new .extracts column):
library(tidymodels)

data(cells, package = "modeldata")

set.seed(123)
cell_split <- cells %>%
  select(-case) %>%
  initial_split(strata = class)
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
folds <- vfold_cv(cell_train)

rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

ctrl_imp <- control_grid(extract = get_rf_imp)

cells_res <-
  workflow(class ~ ., rf_spec) %>%
  fit_resamples(folds, control = ctrl_imp)

cells_res
#> # Resampling results
#> # 10-fold cross-validation
#> # A tibble: 10 × 5
#> splits id .metrics .notes .extracts
#> <list> <chr> <list> <list> <list>
#> 1 <split [1362/152]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 2 <split [1362/152]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 3 <split [1362/152]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 4 <split [1362/152]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 5 <split [1363/151]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 6 <split [1363/151]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 7 <split [1363/151]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 8 <split [1363/151]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 9 <split [1363/151]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
#> 10 <split [1363/151]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble [1 × 2]>
Created on 2022-06-19 by the reprex package (v2.0.1)
Once you have those variable importance score extracts, you can unnest() them (right now, you have to do this twice because the result is deeply nested) and then summarize and visualize as you prefer:
cells_res %>%
  select(id, .extracts) %>%
  unnest(.extracts) %>%
  unnest(.extracts) %>%
  group_by(Variable) %>%
  summarise(Mean = mean(Importance),
            Variance = sd(Importance)) %>%
  slice_max(Mean, n = 15) %>%
  ggplot(aes(Mean, reorder(Variable, Mean))) +
  geom_crossbar(aes(xmin = Mean - Variance, xmax = Mean + Variance)) +
  labs(x = "Variable importance", y = NULL)
Created on 2022-06-19 by the reprex package (v2.0.1)
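If you only want a single set of importance scores computed on the training data (rather than one set per resample), another option, offered here as my own addition rather than part of the original answer, is to finalize your workflow with the best parameters, fit it once on the training set, and reuse the same extraction idea:

final_wf <- finalize_workflow(rfc_workflow, best)
final_fit <- fit(final_wf, data = train)

# same pattern as get_rf_imp(), applied to the single fitted workflow
final_fit %>%
  extract_fit_parsnip() %>%
  vip::vi()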
I am trying to create a new nested column using the data from the min and max values of another nested column.
If I nest the iris data by Species and want to create a new nested data frame from the min and max of Petal.Length for each Species, how would I do it?
My code so far creates a function that builds a new data.frame (or expand.grid) and then applies it using mutate(... map(...).
Code/Data:
func = function(input){
  data.frame(
    min_to_max = seq(
      from = min(.x$Petal.Length),
      to = max(.x$Petal.Length),
      by = 1
    )
  )
}
iris %>%
  group_by(Species) %>%
  nest() %>%
  mutate(
    expandDF = map(data, ~ func(.x))
  )
The function should use the argument name it declares, i.e. input and not .x:
func <- function(input){
  data.frame(
    min_to_max = seq(
      from = min(input$Petal.Length),
      to = max(input$Petal.Length),
      by = 1
    )
  )
}
Testing:
iris %>%
  group_by(Species) %>%
  nest() %>%
  mutate(
    expandDF = map(data, ~ func(.x))
  ) %>%
  ungroup()
Output:
# A tibble: 3 × 3
Species data expandDF
<fct> <list> <list>
1 setosa <tibble [50 × 4]> <df [1 × 1]>
2 versicolor <tibble [50 × 4]> <df [3 × 1]>
3 virginica <tibble [50 × 4]> <df [3 × 1]>
We could also do this without using map, i.e. with nest_by():
iris %>%
  nest_by(Species) %>%
  mutate(expandDF = list(data.frame(min_to_max =
    seq(from = min(data$Petal.Length), to = max(data$Petal.Length))))) %>%
  ungroup()
# A tibble: 3 × 3
Species data expandDF
<fct> <list<tibble[,4]>> <list>
1 setosa [50 × 4] <df [1 × 1]>
2 versicolor [50 × 4] <df [3 × 1]>
3 virginica [50 × 4] <df [3 × 1]>
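If you want to see the generated sequences as regular rows rather than as list entries, you can unnest the new column afterwards; a short follow-up sketch of my own, assuming the result above has been saved as res (a hypothetical name):

res %>%
  select(Species, expandDF) %>%
  unnest(expandDF)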
R novice, so apologies in advance. I want to write a for loop that does sequential operations on a series of dataframes and then binds them (by sequence number).
Ideally, I'd think it would be something like this (where sc2 is the base dataframe I'm working from and week3 is the selection variable used). The dataframes I'm trying to create would be t1, t2, t3, ... and w1, w2, w3, ... etc. In other words, the 'i' in the dataframe name would be read from the for statement.
for(i in 1:16) {
  ti = tail((subset(sc2, sc2$week3 < i)), n = 200)
  wi = subset(sc2, sc2$week3 == i)
  mi = rbind(ti, wi)
}
Which, as I'm sure you know, doesn't work. I've gotten this far:
for(i in 1:16) {
  txi = tail((subset(sc2, sc2$week3 < i)), n = 200)
  assign(paste0("trst", i), txi, envir = .GlobalEnv)
  wxi = subset(sc2, sc2$week3 == i)
  assign(paste0("w", i), wxi, envir = .GlobalEnv)
}
Which creates dummy dataframes (*xi) that are then assigned for each i to the global environment. But how do I rbind them now? Is there a more elegant way to do all of this, or am I missing something about how to refer to the dataframes dynamically?
Don't do it in a loop!
This can be done much more easily by holding a data frame inside a data frame, or rather I should say a tibble inside a tibble. See the example below.
library(tidyverse)

sc2 = tibble(
  week3 = sample(1:20, 100, replace = TRUE),
  x = rnorm(100)
)

ftxi = function(i) sc2 %>% filter(week3 < i)
fwxi = function(i) sc2 %>% filter(week3 == i)

df = tibble(id = 1:16) %>%
  group_by(id) %>%
  mutate(txi = map(id, ~ ftxi(.x)),
         wxi = map(id, ~ fwxi(.x)))
Let's see what df looks like.
# A tibble: 16 x 3
# Groups: id [16]
id txi wxi
<int> <list> <list>
1 1 <tibble [0 x 2]> <tibble [4 x 2]>
2 2 <tibble [4 x 2]> <tibble [6 x 2]>
3 3 <tibble [10 x 2]> <tibble [6 x 2]>
4 4 <tibble [16 x 2]> <tibble [6 x 2]>
5 5 <tibble [22 x 2]> <tibble [4 x 2]>
6 6 <tibble [26 x 2]> <tibble [4 x 2]>
7 7 <tibble [30 x 2]> <tibble [6 x 2]>
8 8 <tibble [36 x 2]> <tibble [4 x 2]>
9 9 <tibble [40 x 2]> <tibble [3 x 2]>
10 10 <tibble [43 x 2]> <tibble [6 x 2]>
11 11 <tibble [49 x 2]> <tibble [3 x 2]>
12 12 <tibble [52 x 2]> <tibble [4 x 2]>
13 13 <tibble [56 x 2]> <tibble [6 x 2]>
14 14 <tibble [62 x 2]> <tibble [5 x 2]>
15 15 <tibble [67 x 2]> <tibble [5 x 2]>
16 16 <tibble [72 x 2]> <tibble [7 x 2]>
As you can see, it is a tibble that contains other tibbles.
So let's check that everything is correct by taking a look at the second row.
First, let's look at the txi variable: df$txi[[2]]
# A tibble: 4 x 2
week3 x
<int> <dbl>
1 1 -0.0829
2 1 -2.15
3 1 -0.949
4 1 -0.0583
Now let's look at the wxi variable: df$wxi[[2]]
# A tibble: 6 x 2
week3 x
<int> <dbl>
1 2 -0.0643
2 2 -0.228
3 2 -0.620
4 2 -1.21
5 2 0.186
6 2 1.19
Bingo, you get what you expected!
It is also a very fast method. You can see my other answer on this forum:
What is faster/better: Loop over each row..
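To get the combined data frame that the original loop was building with mi <- rbind(ti, wi), you can also row-bind the two list columns per id. A sketch of one way to do it (the mxi column name is my own choice, and I have left out the tail(..., n = 200) step for brevity):

df_bound <- df %>%
  mutate(mxi = map2(txi, wxi, bind_rows))

# inspect the combined data for id == 2
df_bound$mxi[[2]]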
Sample Data
ex_list <- list(a = tibble(x = 1:4, y = 5:8),
                b = mtcars)
How do I convert this list of tibbles/dataframes into a nested tibble as shown below:
# A tibble: 2 x 2
data_name data
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
Tidy solutions appreciated!
We may use enframe():
library(tibble)
enframe(ex_list)
# A tibble: 2 x 2
name value
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
If we need to change the column names, use the name and value arguments:
> enframe(ex_list, name = 'data_name', value = 'data')
# A tibble: 2 x 2
data_name data
<chr> <list>
1 a <tibble [4 × 2]>
2 b <df [32 × 11]>
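Note that the second row above is stored as a plain data.frame (mtcars). If you would rather have every element stored as a tibble, one option (my own addition, not part of the original answer) is to convert the elements first:

library(purrr)

ex_list %>%
  map(as_tibble) %>%   # note: as_tibble(mtcars) drops its row names
  enframe(name = 'data_name', value = 'data')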
Is this what you want?
library(tidyverse)

lapply(ex_list, nest) %>%
  dplyr::bind_rows(., .id = "data_name")
# # A tibble: 2 x 2
# data_name data
# <chr> <list>
# 1 a <tibble [4 x 2]>
# 2 b <tibble [32 x 11]>
#OR map
#map(ex_list, nest) %>%
# bind_rows(., .id = "data_name")
I'm computing the model outputs for a linear regression for a dependent variable with 45 different id values. How can I use tidy (dplyr, apply, etc.) code to accomplish this?
I have a dataset with three variables, data = c(id, distance, actPct), such that id == 1:45, -10 <= distance <= 10, and 0 <= actPct <= 1.
I need to run a regression, model0n, on each value of id, such that the model0n output goes into a new tibble/df. I have completed it for a single regression:
model01 <- data %>%
  filter(id == 1) %>%
  filter(distance < 1) %>%
  filter(distance > -4)

model01 <- lm(data = model01, actPct ~ distance)
Example Data
set.seed(42)
id <- as.tibble(sample(1:45,100,replace = T))
distance <- as.tibble(sample(-4:4,100,replace = T))
actPct <- as.tibble(runif(100, min=0, max=1))
data01 <- bind_cols(id=id, distance=distance, actPct=actPct)
attr(data01, "col.names") <- c("id", "distance", "actPct")
I expect a new tibble or dataframe that has model01:model45 so I can put all of the regression outputs into a single table.
You can use group_by, nest and mutate with map from the tidyverse to accomplish this:
data01 %>%
  group_by(id) %>%
  nest() %>%
  mutate(models = map(data, ~ lm(actPct ~ distance, data = .x)))
# A tibble: 41 x 3
# id data models
# <int> <list> <list>
# 1 42 <tibble [3 x 2]> <S3: lm>
# 2 43 <tibble [4 x 2]> <S3: lm>
# 3 13 <tibble [2 x 2]> <S3: lm>
# 4 38 <tibble [4 x 2]> <S3: lm>
# 5 29 <tibble [2 x 2]> <S3: lm>
# 6 24 <tibble [5 x 2]> <S3: lm>
# 7 34 <tibble [5 x 2]> <S3: lm>
# 8 7 <tibble [3 x 2]> <S3: lm>
# 9 30 <tibble [2 x 2]> <S3: lm>
# 10 32 <tibble [2 x 2]> <S3: lm>
# ... with 31 more rows
See also the chapter in R for Data Science about many models: https://r4ds.had.co.nz/many-models.html
Data
set.seed(42)
id <- sample(1:45, 100, replace = T)
distance <- sample(-4:4, 100, replace = T)
actPct <- runif(100, min = 0, max = 1)
data01 <- tibble(id = id, distance = distance, actPct = actPct)
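To get all of the regression outputs into a single table, as the question asks, one option (a sketch of my own using broom, which the original answer does not mention) is to tidy each model and unnest:

library(tidyverse)
library(broom)

data01 %>%
  group_by(id) %>%
  nest() %>%
  mutate(models = map(data, ~ lm(actPct ~ distance, data = .x)),
         tidied = map(models, tidy)) %>%   # one coefficient table per id
  unnest(tidied) %>%
  select(id, term, estimate, std.error, p.value)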