broom::augment returns matrix can't unnest - r

I'm trying to run broom on augment on lm(y ~poly(x, 3), data = dat).
With that formaula broom::augment returns a matrix in a nested column. When I try to unnest, this fails whit an error message similar to "can't cast poly...
I'v found a similar question, but no answer
Trying to unnest broom::augment data, but R "can't cast"
library(rmarkdown)
library(tidyverse)
library(fs)
structure(list(a = c("2019-11-25", "2019-11-25",
"2019-11-25", "2019-11-25", "2019-11-25"),
b = c("laktat-felttest", "laktat-felttest",
"laktat-felttest", "laktat-felttest",
"laktat-felttest"),
c = c("kai", "kai", "kai", "kai", "kai"),
maaling = c(1, 2, 3, 4, 5),
load = c(800, 850, 900, 1000, 1100),
time_mm = c(5, 5, 4, 4, 4),
time_ss = c(9, 0, 55, 35, 45),
hr = c(125, 140, 140, 160, 172),
rpe = c(2, 4, 4, 8, 9),
laktat = c(2.7, 2.1, 2, 4.8, 10.2),
time = c(309, 300, 295, 275, 285),
x = c(2.58899676375405, 2.83333333333333,
3.05084745762712, 3.63636363636364, 3.85964912280702),
la_x = c(1.042875, 0.741176470588235,
0.655555555555556, 1.32, 2.64272727272727)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -5L)) ->
dat
dat %>%
group_by(a,b,c) %>%
nest() %>%
mutate(model = data %>% map( ., ~lm( data = ., laktat ~ poly(x, 3), na.action = "na.exclude")),
tidied = model %>% map( ., broom::tidy ),
glance = model %>% map(., broom::glance),
augment = map( model, broom::augment),
augment = map( augment, janitor::clean_names, "snake" )) ->
model_tbl
##model_tbl %>%
## unnest(augment)
It seems to me, that the nested column, where augment is, there is a matrix. I don't know how to change this to listcolumns.
Greetings from Denmark
Dan Olesen

The poly() function is the problem. It causes weird column names. You can replace it by x + I(x^2) + I(x^3) and the you get better column names.
dat %>%
group_by(a,b,c) %>%
nest() %>%
mutate(model = data %>% map( ., ~lm( data = ., laktat ~ x + I(x^2) + I(x^3), na.action = "na.exclude")),
tidied = model %>% map( ., broom::tidy ),
glance = model %>% map(., broom::glance),
augment = map( model, broom::augment),
augment = map( augment, janitor::clean_names, "snake" )) ->
model_tbl
Update:
I just realized an issue. Using x + I(x^2) + I(x^3) is not exactly the same as poly(x, 3). If you use the latter the coefficients are not correlated, but if you use the first they are. Don't know how to solve this at the moment.

Related

neural network in R: tfdatasets::feature_spec() on 3D Array

Feeding several multivariate time series in a neural network in R seems to be a hard problem for me.
As far as I can tell there are three way to do so and they all do have their pitfalls:
A nested tibble would be the most intuitive solution for R, but it does not seem to work at all.
With a 3D array you have to be careful about the ordering.
Converting a 3D Array in a dataset and then creating a tfdatasets::feature_spec(.) results in an error.
Here is a small example, which does not work at the moment:
I get the data as a nested tibble so I start from there.
library(magrittr)
reticulate::use_python("/usr/local/bin/python")
data <- tibble::tibble(
Timeseires = list(
A = tibble::tibble(
Period = rep(0:4, each = 10),
MeasuremntA = runif(50, 20, 30),
MeasuremntB = runif(50, 0.001, 0.01)
),
C = tibble::tibble(
Period = rep(0:4, each = 10),
MeasuremntA = runif(50, 20, 30),
MeasuremntB = runif(50, 0.001, 0.01)
),
B = tibble::tibble(
Period = rep(0:4, each = 10),
MeasuremntA = runif(50, 20, 30),
MeasuremntB = runif(50, 0.001, 0.01)
),
D = tibble::tibble(
Period = rep(0:4, each = 10),
MeasuremntA = runif(50, 20, 30),
MeasuremntB = runif(50, 0.001, 0.01)
)
)
)
measurement.data <- data %>%
dplyr::mutate(Timeseires = purrr::map(
.x = Timeseires,
.f = simplify2array
)) %>%
.$Timeseires %>%
simplify2array() %>%
tfdatasets::tensor_slices_dataset()
feature.spec.measurment <- measurement.data %>%
tfdatasets::feature_spec(.) %>%
tfdatasets::step_numeric_column(
tfdatasets::all_numeric(),
normalizer_fn = tfdatasets::scaler_standard()
) %>%
tfdatasets::fit()
It is crashing with the following error:
Error: Unable to resolve features for dataset that does not have named outputs
I've tried adding some names to it, bot without success.
Do you have any idea how to insert several multivariate time series in a neural network in R with keras and thus get the example to work?
The remainder of the example looks like this:
##### Data Prep for meta.data #####
meta.data <- data %>%
dplyr::select(-Timeseires) %>%
tfdatasets::tensor_slices_dataset()
feature.spec.meta <- meta.data %>%
tfdatasets::feature_spec(Result ~ .) %>%
tfdatasets::step_numeric_column(
tfdatasets::all_numeric(),
normalizer_fn = tfdatasets::scaler_standard()
) %>%
tfdatasets::step_categorical_column_with_vocabulary_list(
tfdatasets::all_nominal()
) %>%
tfdatasets::step_indicator_column(
tfdatasets::all_nominal()
) %>%
tfdatasets::fit()
input.measurement <- keras::layer_input(shape = c(3, 4))
lstm.out <- input.measurement %>%
keras::layer_lstm(units = 32)
#####
input.meta <- data %>%
dplyr::select(-Timeseires, -Result) %>%
tfdatasets::layer_input_from_dataset()
dense.out <- input.meta %>%
keras::layer_dense_features(feature_columns = tfdatasets::dense_features(feature.spec.meta))
#####
output <- keras::layer_concatenate(c(dense.out, lstm.out)) %>%
keras::layer_dense(units = 64, activation = "relu") %>%
keras::layer_dense(units = 64, activation = "relu") %>%
keras::layer_dense(units = 64, activation = "relu") %>%
keras::layer_dense(units = 1, activation = "sigmoid")
output.auxiliary <- input.measurement %>%
keras::layer_dense(units = 1, activation = "sigmoid", name = "aux_output")
model <- keras::keras_model(
inputs = c(input.meta, input.measurement),
outputs = c(output, output.auxiliary)
)
model %>% keras::compile(
loss = keras::loss_binary_crossentropy,
optimizer = "adam",
metrics = "binary_accuracy"
)
summary(model)
history <- model %>%
keras::fit(
x = list(
# tfdatasets::dataset_use_spec(meta.data, spec = feature.spec.meta),
# tfdatasets::dataset_use_spec(measurement.data ,feature.spec.measurment)
data %>% dplyr::select(-Timeseires, -Result),
data %>%
dplyr::mutate(Timeseires = purrr::map(
.x = Timeseires,
.f = simplify2array
)) %>% .$Timeseires
%>% simplify2array()
),
y = list(
data$Result,
data$Result
),
epochs = 10,
validation_split = 0.3
)
At the moment I've chosen this Input shape because it gets the example to work at least, but it is wrong since dim(measurement.data) = 50 3 4 and I should actually use keras::layer_input(shape = c(50, 3)) since the number of time series (matrices) will vary within the evaluation and be the same as in training.

How to plot sjPlots from a nested tibble?

I create some models like this using a nested tidyr dataframe:
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(purrr)
fits <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0, sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1, sample(10, replace = T), sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data=-group) %>%
mutate(fit= map(data, ~glm(formula = colA ~ colB + colC, data = .x, family="binomial"))) %>%
dplyr::select(group, fit) %>%
tibble::column_to_rownames("group")
I would like to use this data to create some quick marginal effects plots with sjPlot::plot_models like this
plot_models(as.list(fits), type = "pred", terms = c("colB", "colA", "colC"))
Unfortunately, I get the error
Error in if (fam.info$is_linear) tf <- NULL else tf <- "exp" :
argument is of length zero
In addition: Warning message:
Could not access model information.
I've played around a bit with the nesting of the data but I've been unable to get it into a format that sjPlot::plot_models will accept.
What I was expecting to get is a "Forest plot of multiple regression models" as described in the help file. Ultimately, the goal is to plot the marginal effects of regression models by group, which I was hoping the plot_models will do (please correct me if I'm wrong).
It think there are some issues with the original code as well as with the data. There are arguments from plot_model in the function call which are not supported in plot_models. I first show an example that shows how plot_models can be called and used with a nested tibble using {ggplot2}'s diamonds data set. Then I apply this approach to the OP's sample data, which doesn't yield useable results*. Finally, I create some new toy data to show how the approach could be applied to a binominal model.
(* In the original toy data the dependent variable is either always 0 or always 1 in each model so this is unlikely to yield useable results).
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(ggplot2)
# general example
fits <- tibble(id = c("x", "y", "z")) %>%
rowwise() %>%
mutate(fit = list(glm(reformulate(
termlabels = c("cut", "color", "depth", "table", "price", id),
response = "carat"),
data = diamonds)))
plot_models(fits$fit)
# OP's example data
fits2 <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0,
sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1,
sample(10, replace = T),
sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data = -group) %>%
rowwise() %>%
mutate(fit = list(glm(formula = colA ~ colB + colC, data = data, family="binomial")))
plot_models(fits2$fit)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 4 rows containing missing values (geom_point).
# new data for binominal model
n <- 500
g <- round(runif(n, 0L, 1L), 0)
x1 <- runif(n,0,100)
x2 <- runif(n,0,100)
y <- (x2 - x1 + rnorm(n,sd=20)) < 0
fits3 <- tibble(g, y, x1, x2) %>%
nest_by(g) %>%
mutate(fit = list(glm(formula = y ~ x1 + x2, data = data, family="binomial")))
plot_models(fits3$fit)
Created on 2021-01-23 by the reprex package (v0.3.0)

Rolling window slider::slide() with grouped data

In the following example I try to compute the first coefficient from a linear model for time t = 1 until t. It's an expanding rolling window.
It works well with ungrouped data, but when grouped by case, I get the error Error: Columncoef1must be length 10 (the group size) or one, not 30.
How can I handle grouped data?
library(dplyr)
library(slider)
get_coef1 <- function(data) {
coef1 <- lm(data = data, r1 ~ r2 + r3) %>%
coef() %>%
.["r2"] %>%
unname()
return(coef1)
}
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
data %>%
# ungroup() %>%
group_by(case) %>%
mutate(coef1 = slider::slide_dbl(., ~get_coef1(.x),
.before = Inf, .complete = T))
You have to first tidyr::nest the cases. Within the nested tibbles (accessed via purrr::map) you can then apply slide (same technique as with purrr::map). The important point is that you do not want to slide across cases, but only within cases.
library(dplyr)
library(tidyr)
library(purrr)
library(slider)
get_coef1 <- function(data) {
coef1 <- lm(data = data, r1 ~ r2 + r3) %>%
coef() %>%
.["r2"] %>%
unname()
return(coef1)
}
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
data %>%
# ungroup() %>%
group_by(case) %>% nest() %>%
mutate(rollreg = map(data, ~ .x %>% mutate(coef1 = slider::slide_dbl(., ~get_coef1(.x), .before = Inf, .complete = TRUE)))) %>%
select(-data) %>% unnest(rollreg)
I have been trying for a while to use the new dplyr::nest_by() from dplyr 1.0.0 trying to use summarise in combination with the rowwise cases but couldn't get that to work.
I realize this is an old post, but for the sake of completeness, I offer another solution. Is this what you're looking for? Two subtle changes to the arguments to slide_dbl. The code runs.
data %>%
# ungroup() %>%
group_by(case) %>%
mutate(coef1 = slider::slide_dbl(.x = cur_data(), # use cur_data() instead of .; arg .x
.f = ~get_coef1(.x), # arg .f
.before = Inf, .complete = T))
See the slider() documentation for underlying reasons.

Mapping a keras NN over data lists in R

I am trying to figure out the correct method of going about applying a keras model to each of my lists. I have used the iris dataset and created 4 lists and the goal is to correctly predict versicolor or virginica (I omit setosa because I want a binary classification model).
data(iris)
iris <- iris %>%
mutate(
splt = sample(4, size = nrow(.), replace = TRUE),
binary = case_when(
Species == "versicolor" ~ 0,
Species == "virginica" ~ 1
)
) %>%
filter(Species != "setosa") %>%
split(., .$splt)
iris_x_train <- iris %>%
map(., ~select(., Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
as.matrix())
iris_y_train <- iris %>%
map(., ~select(., binary) %>%
to_categorical(2))
NN_model <- keras_model_sequential() %>%
layer_dense(units = 4, activation = 'relu', input_shape = 4) %>%
layer_dense(units = 2, activation = 'softmax')
NN_model %>%
summary
NN_model %>%
compile(
loss = 'binary_crossentropy',
optimizer_sgd(lr = 0.01, momentum = 0.9),
metrics = c('accuracy')
)
My problem occurs here. When I apply the below code:
NN_model %>%
future_map(., ~future_map2(
.x = iris_x_train,
.y = iris_y_train,
~fit(
x = .x,
y = .y,
epochs = 5,
batch_size = 20,
validation_split = 0
)
)
)
I get this error:
Error in py_get_item_impl(x, key, FALSE) : TypeError: 'Sequential'
object does not support indexing
When I apply this code:
NN_model %>%
future_map2(
.x = iris_x_train,
.y = iris_y_train,
~fit(
x = .x,
y = .y,
epochs = 5,
batch_size = 20,
validation_split = 0
)
)
I get this error:
~fit(x = .x, y = .y, epochs = 5, batch_size = 20, validation_split =
0) Error in py_call_impl(callable,
dots$args, dots$keywords) : Evaluation error: Unable to convert R
object to Python type.
How can I map a keras model to each of the 4 datasets?
library(keras)
library(tensorflow)
library(furrr)
library(purrr)
The following works for the first list:
NN_model %>%
fit(
x = iris_x_train[[1]],
y = iris_y_train[[1]],
epochs = 50,
batch_size = 20,
validation_split = 0
)
EDIT: I seem to have solved it.
Putting the NN_model inside the fit() function appears to work.
future_map2(
.x = iris_x_train,
.y = iris_y_train,
~fit(NN_model,
.x,
.y,
epochs = 5,
batch_size = 20,
validation_split = 0
)
)

Find predictions for linear model that is grouped_by

I would like to get predicted values based on a model I fit to a training set of data. I have done this before, but now I have a grouping factor and it is throwing me off. I want to predict biomass based on population for each environment.
library(tidyverse)
fit_mods<-df %>%
group_by(environ) %>%
do(model = lm(biomass ~ poly(population, 2), data = .))
Ultimately, I will want to find at which population biomass is the greatest. Usually I would do this by creating a grid and running the model on my new values and finding the max value, but I'm blanking on how to do this with the grouping. Usual way:
min_pop <- min(df$population)
max_pop <- max(df$population)
grid_pop <- expand.grid(new = (seq(from = min_pop,
to = max_pop,
length.out = 1000)),
environ = c("A", "B"))
#This is what I did with ungrouped data, but doesn't work now.
pred_pop <- predict(object = fit_mods,
newdata = grid_pop,
interval = "predict")
Here is some dummy data:
df <- as.data.frame(list(environ = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"),
population = c(2, 3, 4, 5, 6, 3, 4, 5, 6, 7),
biomass = c(1, 2.2, 3.5, 4.1, 3.8, 2.5, 3.6, 4.3, 5.2, 5.1)), class = "data.frame")
In a tidyverse many models approach you could do it the following way:
library(tidyverse)
fit_mods <- df %>%
nest(-environ) %>%
mutate(models = map(data, ~ lm(biomass ~ poly(population, 2), data = .x)),
min_pop = map_dbl(data, ~ pull(.x, population) %>% min),
max_pop = map_dbl(data, ~ pull(.x, population) %>% max),
new = map2(min_pop, max_pop, ~ tibble(population = seq(from = .x,
to = .y,
length.out = 1000))),
pred = map2(models,
new,
~ predict(object = .x,
newdata = select(.y,population),
interval = "predict")))

Resources