How to plot sjPlots from a nested tibble? - r

I create some models like this using a nested tidyr dataframe:
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(purrr)
fits <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0, sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1, sample(10, replace = T), sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data=-group) %>%
mutate(fit= map(data, ~glm(formula = colA ~ colB + colC, data = .x, family="binomial"))) %>%
dplyr::select(group, fit) %>%
tibble::column_to_rownames("group")
I would like to use this data to create some quick marginal effects plots with sjPlot::plot_models like this
plot_models(as.list(fits), type = "pred", terms = c("colB", "colA", "colC"))
Unfortunately, I get the error
Error in if (fam.info$is_linear) tf <- NULL else tf <- "exp" :
argument is of length zero
In addition: Warning message:
Could not access model information.
I've played around a bit with the nesting of the data but I've been unable to get it into a format that sjPlot::plot_models will accept.
What I was expecting to get is a "Forest plot of multiple regression models" as described in the help file. Ultimately, the goal is to plot the marginal effects of regression models by group, which I was hoping the plot_models will do (please correct me if I'm wrong).

It think there are some issues with the original code as well as with the data. There are arguments from plot_model in the function call which are not supported in plot_models. I first show an example that shows how plot_models can be called and used with a nested tibble using {ggplot2}'s diamonds data set. Then I apply this approach to the OP's sample data, which doesn't yield useable results*. Finally, I create some new toy data to show how the approach could be applied to a binominal model.
(* In the original toy data the dependent variable is either always 0 or always 1 in each model so this is unlikely to yield useable results).
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(ggplot2)
# general example
fits <- tibble(id = c("x", "y", "z")) %>%
rowwise() %>%
mutate(fit = list(glm(reformulate(
termlabels = c("cut", "color", "depth", "table", "price", id),
response = "carat"),
data = diamonds)))
plot_models(fits$fit)
# OP's example data
fits2 <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0,
sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1,
sample(10, replace = T),
sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data = -group) %>%
rowwise() %>%
mutate(fit = list(glm(formula = colA ~ colB + colC, data = data, family="binomial")))
plot_models(fits2$fit)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 4 rows containing missing values (geom_point).
# new data for binominal model
n <- 500
g <- round(runif(n, 0L, 1L), 0)
x1 <- runif(n,0,100)
x2 <- runif(n,0,100)
y <- (x2 - x1 + rnorm(n,sd=20)) < 0
fits3 <- tibble(g, y, x1, x2) %>%
nest_by(g) %>%
mutate(fit = list(glm(formula = y ~ x1 + x2, data = data, family="binomial")))
plot_models(fits3$fit)
Created on 2021-01-23 by the reprex package (v0.3.0)

Related

Using map to fit model to multiple datasets in a list

I am trying to sample from a dataset, select columns, and fit a model to each sample.
library(bartMachine)
library(tidyverse)
#dataset
dat <- data.frame( x=rnorm(20, mean=5, sd=1), z=rnorm(20, mean=2, sd=0.5), y=rnorm(20, mean=6, sd=1), weight = rnorm(20, mean=1, sd=0.2) )
#variable names
var1 <- dat %>% select(x, z) %>% names()
var2 <- dat %>% select(y) %>% names()
out <- tibble(run = 1:20) %>%
mutate(
data = map(run, ~ sample_n(dat, size = 5, weight = weight, replace = TRUE)),
mdl = map(data[[run]], ~bartMachine(X=as.data.frame(.x) %>% select(all_of(var1)),
y=as.data.frame(.x) %>% select(all_of(var2)) %>% unlist(), seed = 1)),
smry = map(mdl, ~ summary(.x))
)
Credit to #r2evans of helping with the above code
I get this error:
Error in `mutate()`:
! Problem while computing `mdl = map(...)`.
Caused by error in `data[[run]]`:
! recursive indexing failed at level 3
Run `rlang::last_error()` to see where the error occurred.
Since out$data[[1]] gives me the first dataset in the list, why is data[[run]] not working?

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of true and false values, in the above data frame 5 TRUEs and 5 FALSEs in each of the 3 categories). How to do this?
You can sample a vector of "TRUE" and "FALSE" values without replacement so you have a randomized and balanced column in your data-frame.
sample(rep(c("TRUE","FALSE"),each=5),10,replace=FALSE)
Based on Yacine Hajji answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c("TRUE","FALSE"), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where parametar p determines the proportion of TRUE.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to saple and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but still assumes (approximately) even distribution of TRUE and FALSE within a group. But it can accomodate variable group sizes, and even groups with odd numbers of records (but will favor FALSE for odd numbers of records)
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.

Apply different data to a function in R

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.
Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.

How to get formulas of multiple regressions by vectorizing

Suppose I have the following code that makes multiple regressions and stores the lm and lm with stepwise selection models in tibbles:
library(dplyr)
library(tibble)
library(MASS)
set.seed(1)
df <- data.frame(A = sample(3, 10, replace = T),
B = sample(100, 10, replace = T),
C = sample(100, 10, replace = T))
df <- df %>% arrange(A)
formula_df <- as.tibble(NA)
aic_df <- as.tibble(NA)
for (i in unique(df$A)){
temp <- df %>% filter(A == i)
formula_df[i, 1] <- temp %>%
do(model = lm(B ~ C, data = .))
aic_df[i, 1] <- temp %>%
do(model = stepAIC(formula_df[[1,1]], direction = "both", trace = F))
}
Is it possible to vectorize to make it faster, for example using the *pply functions? The loop becomes extremely slow when the data gets larger. Thank you in advance.
You could try something like:
model <- df %>% group_by(A) %>%
summarise(formula_model = list(lm(B ~ C))) %>%
mutate(aic_model = list(stepAIC(.[[1,2]], direction = "both", trace = F)))

Fastest way to get residuals from linear, poisson, and negative binomial regressions across many responses

I have a numeric, a count, and an over-dispersed count large matrices:
set.seed(1)
numeric.mat <- matrix(rnorm(10000*6000),10000,6000)
count.mat <- matrix(rpois(10000*6000,10),10000,6000)
dispersed.count.mat <- matrix(rnegbin(10000*6000,10,2),10000,6000)
And one corresponding factors data.frame (can be a matrix too):
factors.df <- data.frame(f1 = sample(LETTERS[1:3], 10000, replace = T),
f2 = sample(LETTERS[4:5], 10000, replace = T))
The number of factors is pretty small (in this case only 2 but won't be more than 5 for real data), and the number of levels in each (they're all categorical) is also small (also up to 5).
I'd like to obtain the residuals for fitting a linear, poisson, and negative binomial regression models to each of the columns in each of the matrices, respectively.
So for a single column:
data.df <- factors.df %>%
dplyr::mutate(numeric.y = numeric.mat[,1],
count.y = count.mat[,1],
dispersed.count.y = dispersed.count.mat[,1])
I'd use:
lm(numeric.y ~ f1+f2, data = data.df)$residuals
residuals(object = glm(count.y ~ f1+f2, data = data.df, family = "poisson"), type = 'pearson')
residuals(object = glm.nb(formula = model.formula, data = regression.df), type = 'pearson')
For the three regression models.
Is there a faster way of obtaining these residuals other than, for example, using do.call, for each. E.g.:
do.call(cbind,
lapply(1:ncol(numeric.mat),
function(i)
lm(numeric.y ~ f1+f2,
data = dplyr::mutate(factors.df,
numeric.y = numeric.mat[,i])
)$residuals
))
I'd slightly readjust how the workflow runs and allow it to be easily run in parallel.
# Use variables to adjust models, makes it easier to change sizes
iter <- 60
iter_samps <- 1000
factors_df <- data.frame(f1 = sample(LETTERS[1:3], iter_samps, replace = T),
f2 = sample(LETTERS[4:5], iter_samps, replace = T))
# using a data.frame in a longer format to hold the data, allows easier splitting
data_df <- rep(list(factors_df), iter) %>%
bind_rows(.id = "id") %>%
mutate(numeric_y = rnorm(iter_samps * iter),
count_y = rpois(iter_samps * iter, 10),
dispersed_count_y = MASS::rnegbin(iter_samps * iter, 10, 2))
# creating function that determines residuals
model_residuals <- function(data) {
data$lm_resid <- lm(numeric_y ~ f1+f2, data = data)$residuals
data$glm_resid <- residuals(object = glm(count_y ~ f1+f2, data = data, family = "poisson"), type = 'pearson')
return(data)
}
# How to run the models not in parallel
data_df %>%
split(.$id) %>%
map(model_residuals) %>%
bind_rows()
To run the models in parallel you can use multidplyr to do all the annoying work
library("multidplyr")
test = data_df %>%
partition(id) %>%
cluster_library("tidyverse") %>%
cluster_library("MASS") %>%
cluster_assign_value("model_residuals", model_residuals) %>%
do(results = model_residuals(.)) %>%
collect() %>%
.$results %>%
bind_rows()

Resources