I am trying to implement parallel processing to make the STEPAIC function running on several models more efficient.
I've came into some errors, like vectors differing in length.
This is my code:
library(MASS)
library(furrr)
library(tidyverse)
library(prediction)
nb_thesis_inter <- function (df,mdl){
target_col <- "mpg"
interactions<- predictors(mdl)
target_formula <- as.formula(sprintf("%s ~ (%s)^2",
target_col,
paste(interactions, collapse = " + ")))
model <- MASS::glm.nb(target_formula, data = df)
model <- (MASS::stepAIC(model))
return (model)}
m<- mtcars %>% group_by(cyl) %>% nest()
m<- m %>% mutate(model= map(.x = data, .f = ~lm(mpg ~ .)))
m<- m %>%
mutate(model= future_map2(.x= data, .y=model, .f = nb_thesis_inter))
Related
I have a list of linear regression fit objects. Let's create it in this example by:
c('hp', 'wt', 'disp') %>%
paste('mpg ~', .) %>%
map(as.formula) %>%
map(lm, data = mtcars)
What I would like is to keep just the residuals and fitted.values from each of the regression fit objects, within this same pipeline. I was trying to use the keep function, but it doesn't work:
c('hp', 'wt', 'disp') %>%
paste('mpg ~', .) %>%
map(as.formula) %>%
map(lm, data = mtcars) %>%
map(keep, names(.) %in% c("residuals", "fitted.values"))
Error:
Error in probe(.x, .p, ...) : length(.p) == length(.x) is not TRUE
How can I perform this action?
If a data frame is wanted as output then use the code below or if a list is wanted omit the bind_rows line.
library(dplyr)
library(purrr)
nms <- c('hp', 'wt', 'disp')
out <- nms %>%
set_names(x = map(paste('mpg ~', .), as.formula)) %>%
map(lm, data = mtcars) %>%
map(~ data.frame(fit = fitted(.), resid = resid(.))) %>%
bind_rows(.id = "id")
We can simplify this slightly using sapply as it will add names and use reformulate to generate the formula.
out <- nms %>%
sapply(reformulate, response = "mpg") %>%
map(lm, data = mtcars) %>%
map(~ data.frame(fit = fitted(.), resid = resid(.))) %>%
bind_rows(.id = "id")
I have a nested GBM, and am looking to extract the partial depndence, tryingto use the following query:
library(rsample) # data splitting
library(gbm) # basic implementation
library(xgboost) # a faster implementation of gbm
library(caret) # an aggregator package for performing many machine learning models
library(h2o) # a java-based platform
library(pdp) # model visualization
basic_gbm <- function(data) {
mymodel <- gbm(formula = mpg ~ . ,
distribution = "gaussian",
data = data ,
n.minobsinnode = 1,
bag.fraction = 1
)
return(mymodel)
}
blah_model <- mtcars %>%
group_by() %>%
nest() %>%
mutate(model = map(data, basic_gbm))
blah_summary <- mtcars %>%
group_by() %>%
nest() %>%
mutate(model = map(data, basic_gbm)) %>%
mutate(summary = map(model, summary)) %>%
mutate(all_data = pmap(list(data, summary), .f =left_join, by = character())) %>%
select(cols=c(all_data)) %>%
unnest(cols = c(cols)) %>%
ungroup()
blah_model %>%
left_join(blah_summary, by = character()) %>%
mutate(pred = map(model, partial, pred.var = var, n.trees = model$n.trees, train = data)) -- this does not work
This does work and is what I would want as a nested df for each var:
coeffs <- blah_model$model[[1]] %>%
partial(pred.var = 'disp', n.trees = blah_model$model[[1]]$n.trees, train = blah_model$data[[1]])
However, it is saying it is not finding the variables in the training data - the data I am passing through is the training data. The var in the map is from the summary functions - these are prediction variables.
I gave a better example
I am performing simple linear regressions on multiple groups from my data set. However, I want extract the summaries from each of these regressions and put them into a master table by organized by group. I can run it like this (and it works):
fit_basic <- rs2_anova %>% #Run multiple simple linear regressions
group_by(quant_method) %>%
nest() %>%
mutate(model = map(data, ~lm(recoveries ~ treatment, data = .)))
fit_basic_A <- fit_basic[[1,"model"]] #Remove the model from fit_basic
fit_basic_B <- fit_basic[[1,"model"]] #Remove the model from fit_basic
fit_basic_table_A <- get_regression_table(fit_basic_A) %>%
select("term", "estimate") %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
mutate(quant_method = "A")
fit_basic_table_B <- get_regression_table(fit_basic_A) %>%
select("term", "estimate") %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
mutate(quant_method = "B")
fit_basic_table <- rbind(fit_basic_table_A, fit_basic_table_B)
To save myself some lines of code (because I have many more groups than presented here) I thought I could use the map function, but I keep getting stuck at mapping the summary table, which throws an error:
fit_basic <- rs2_anova %>%
group_by(quant_method) %>%
nest() %>%
mutate(model = map(data, ~lm(recoveries ~ treatment, data = .))) %>%
mutate(summaries = map(data, get_regression_table(.$model)))
Error in input_checks(model, digits, print) :
Only simple linear regression models are supported. Try again using only `lm()` models as appropriate.
I also tried something along this line:
fit_basic_table <- map(fit_basic$model,
function(x) {
p <- get_regression_table(x)
cbind(par=rownames(p), p)
})
But I get a list of dataframes that I can't breakdown into a single dataframe and I have lost my group designations. I have tried:
fit_basic_table <- map(fit_basic$model,
function(x) {
p <- get_regression_table(x)
cbind(par=rownames(p), p)
}) %>%
map_df(as_tibble, .id = "id")
and
fit_basic_table <- map(fit_basic$model,
function(x) {
p <- get_regression_table(x)
cbind(par=rownames(p), p)
}) %>%
unnest(cols = "id")
Any thoughts on how to automate this?
*Random test dataframe:
quant_method <- c("A", "A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B")
treatment <- c("x","x","x","x","x","y","y","y","y","y","x","x","x","x","x","y","y","y","y","y")
recoveries <-c("88","86","87","82","85","76","65","55","72","71","98","96","97","92","99","66",
"55","55","62","61")
rs2_anova <- data.frame(quant_method, treatment, recoveries)
Here is one solution using tidyverse and broom packages. It is slightly different from the purrr method you were attempting, but I think the result shows the terms that you were interested in extracting from the lm object (i.e., term and estimate).
library(tidyverse)
library(broom)
#Added the stringsAsFactors argument = F to avoid an error in the lm model
rs2_anova <- data.frame(quant_method,
treatment,
recoveries,
stringsAsFactors = F)
fit_basic <- rs2_anova %>%
#Group by quant_method column
group_by(quant_method) %>%
#do the linear models by grouping var
do(model = lm(recoveries ~ treatment, data = .)) %>%
#tidy lm object and order it as tibble
tidy(model)
I found the answer here: unnest a list column after modeling after group_by in r
and modified it to:
fit_cec <- rs2_anova %>%
group_by(quant_method) %>%
nest %>%
mutate(data = map(data, ~ .x %>%
summarise(model = list(broom::tidy(lm(recoveries ~ treatment)))))) %>%
unnest(data) %>%
unnest(model)
or to get all estimates, predictions and summaries (https://drsimonj.svbtle.com/running-a-model-on-separate-groups). This also works nicely:
fit_cec <- rs2_anova %>%
group_by(quant_method) %>%
nest %>%
mutate(fit = map(data, ~ lm(loss_abs_BC_2 ~ cec, data = .)),
parameters = map(fit, tidy), #provides estimate table for slope and y-intercept with std.error and estimate p-values
summary = map(fit, broom::glance), #provides R2, adj.R2, sigma, and model p-value
predictions = map(fit, augment)) %>% #provides fitted values with residuals and errors
unnest(parameters) %>%
pivot_wider(names_from = term, values_from = c(estimate, std.error, statistic, p.value)) %>%
unnest(summary) %>%
unnest(predictions)
In the following code, I want to replace map_dfr from purrr with one of the SparkR apply functions to parallelize the Shapley calculations on the azure databricks:
#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml"); install.packages(SparkR)
library(tidyverse); library(iml); library(randomForest); library(SparkR)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs), id = row_number())
x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")
rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapelyresults <- map_dfr(1:nrow(mtcars), ~(Shapley$new(predictor, x.interest = mtcars1[.x,]) %>%
.$results %>%
as_tibble() %>%
arrange(desc(phi)) %>%
slice(1:5) %>%
select(feature.value, phi) %>%
mutate(id = .x)))
I could not leverage the answer on the following link: How to apply a function to each row in SparkR?
Consider the following example longitudinal/repeated measures dataset
library(tidyverse)
library(broom)
library(nlme)
data <- read.csv("https://stats.idre.ucla.edu/stat/data/study2.csv")
data <- data %>% mutate(dbp =rnorm(120, 30:150), sbp = rnorm(120, 50:200),bmi
= rnorm(120,15:40), chol = rnorm(120,50:350), insulin = rnorm(120,2:40), educ = rnorm(120,5:10))
I can use group_by %>% do(tidy(*)) to run several unadjusted and adjusted single-level regression models (looping through list of outcomes & exposures) and extract model results to the data frame as follows
out <-c("pulse","insulin","chol")
exp <- c("factor(exertype)","sbp","dbp")
conf <- c("bmi","factor(diet)")
#Unadjusted models - single level regression (lm)
#################################################
Unadjusted <- expand.grid(out, exp) %>%
group_by(Var1) %>% rowwise() %>%
summarise(frm = paste0(Var1, "~", Var2)) %>%
group_by(model_id = row_number(),frm) %>%
do(tidy(lm(.$frm, data = data))) %>%
mutate(lci = estimate-(1.96*std.error)) %>%
mutate(uci = estimate+(1.96*std.error))
#Adjusted models - single level regression (lm)
###############################################
Adjusted <- expand.grid(out, exp, conf) %>%
group_by(Var1, Var2) %>%
summarise(Var3 = paste0(Var3, collapse = "+")) %>%
rowwise() %>%
summarise(frm = paste0(Var1, "~", Var2, "+", Var3)) %>%
group_by(model_id = row_number(), frm) %>%
do(tidy(lm(.$frm, data = data))) %>%
mutate(lci = estimate-(1.96*std.error)) %>%
mutate(uci = estimate+(1.96*std.error))
I would like to use the same process to fit multilevel models to account for the repeated data. using the example code:
lme(sbp ~ pulse+factor(diet)+time, data=data, random= ~time|id, method ="ML")
HOWEVER, when I try to do this, e.g. using:
#Unadjusted models - multi-level regression (lme)
#################################################
Unadjusted <- expand.grid(out, exp) %>%
group_by(Var1) %>% rowwise() %>%
summarise(frm = paste0(Var1, "~", Var2)) %>%
group_by(model_id = row_number(),frm) %>%
do(tidy(lme(.$frm, data = data, random= ~time|id, method = "ML"))) %>%
mutate(lci = estimate-(1.96*std.error)) %>%
mutate(uci = estimate+(1.96*std.error))
I get the following error message:
Error in UseMethod("lme") : no applicable method for 'lme' applied to an object of class "character"
Any ideas on how to get this working for lme type models?
Unlike lm, lme will not accept a formula as a character value. You need to explicitly cast it to a formula. Try just adding as.formula()
do(tidy(lme(as.formula(.$frm), ...))) %>%