Find predictions for linear model that is grouped_by - r

I would like to get predicted values based on a model I fit to a training set of data. I have done this before, but now I have a grouping factor and it is throwing me off. I want to predict biomass based on population for each environment.
library(tidyverse)
fit_mods<-df %>%
group_by(environ) %>%
do(model = lm(biomass ~ poly(population, 2), data = .))
Ultimately, I will want to find at which population biomass is the greatest. Usually I would do this by creating a grid and running the model on my new values and finding the max value, but I'm blanking on how to do this with the grouping. Usual way:
min_pop <- min(df$population)
max_pop <- max(df$population)
grid_pop <- expand.grid(new = (seq(from = min_pop,
to = max_pop,
length.out = 1000)),
environ = c("A", "B"))
#This is what I did with ungrouped data, but doesn't work now.
pred_pop <- predict(object = fit_mods,
newdata = grid_pop,
interval = "predict")
Here is some dummy data:
df <- as.data.frame(list(environ = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"),
population = c(2, 3, 4, 5, 6, 3, 4, 5, 6, 7),
biomass = c(1, 2.2, 3.5, 4.1, 3.8, 2.5, 3.6, 4.3, 5.2, 5.1)), class = "data.frame")

In a tidyverse many models approach you could do it the following way:
library(tidyverse)
fit_mods <- df %>%
nest(-environ) %>%
mutate(models = map(data, ~ lm(biomass ~ poly(population, 2), data = .x)),
min_pop = map_dbl(data, ~ pull(.x, population) %>% min),
max_pop = map_dbl(data, ~ pull(.x, population) %>% max),
new = map2(min_pop, max_pop, ~ tibble(population = seq(from = .x,
to = .y,
length.out = 1000))),
pred = map2(models,
new,
~ predict(object = .x,
newdata = select(.y,population),
interval = "predict")))

Related

Plot TukeyHSD as Heatmap

Can I present the result of TukeyHSD as a heatmap? And how would the code look like concerning the example below?
#Daten erstellen
set.seed (0)
data <- data.frame(group = rep(c("A", "B", "C"), each = 30),
values = c(runif(30, 0, 3),
runif(30, 0, 5),
runif(30, 1, 7)))
#Die ersten sechs Zeilen anzeigen
head(data)
#einfaktorielles ANOVA-Modell anpassen
model <- aov(values~group, data=data)
#Sehen Sie sich die Modellausgabe an
summary(model)
#Tukey Test durchführen
TukeyHSD(model, conf.level=.95)
#Konfidenzintervalle plotten
plot(TukeyHSD(model, conf.level=.95), las = 2)
Thank you so much!!
I actually get the right results, but can't display them as a heatmap.
Here a way to do it with a tidyverse approach
library(dplyr)
library(lubridate)
library(tidyr)
data <- data.frame(group = rep(c("A", "B", "C"), each = 30),
values = c(runif(30, 0, 3),
runif(30, 0, 5),
runif(30, 1, 7)))
#einfaktorielles ANOVA-Modell anpassen
model <- aov(values~group, data=data)
#Tukey Test durchführen
test_list <- TukeyHSD(model, conf.level=.95)
test_data <- test_list$group
test_data %>%
as_tibble() %>%
bind_cols(data.frame(rw = rownames(test_data))) %>%
separate(rw,into = c("var1","var2")) %>%
ggplot(aes(x = var1,y = var2, fill = `p adj`))+
geom_tile()+
geom_text(aes(label = p.adjust(`p adj`)), color = "white")

Using summary_row() values to calculate group percentage with {gt} package?

I am trying to calculate the percentage for an entire group while using the summary_rows() function from the {gt} package. The problem I am encountering is how to create a function that uses summary_rows() values to calculate a percentage rowwise.
Sample df:
df <- tibble(
division = c("Science", "Science", "Science"),
department = c("Biology", "Biology", "Biology"),
course_num = c('101', '201', "301"),
widthraws = c(1, 2, 3),
unsucessful = c(0, 0 , 0),
successfull = c(1, 3, 4),
total_enrolled = c(2, 5, 7),
percent_successful = c(.50, .60, .57)
)
Sample of gt table:
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = 4:7,
missing_text = " ",
fns = list(
total = ~sum(.)),
)
What I would want is the total row of the percent_successful column to be .57. Open to other ideas that would help me achieve this too.
Compute the percentage for total outside and add a layer
library(gt)
library(dplyr)
total_success_perc <- with(df, round(sum(successfull)/sum(total_enrolled), 2))
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = 4:7,
missing_text = " ",
fns = list(
total = ~sum(.)),
) %>%
summary_rows(groups = TRUE, columns = 8, missing_text = " ",
fns = list(total = ~ c(total_success_perc)))
-output
library(tidyverse)
library(gt)
df <- tibble(
division = c("Science", "Science", "Science"),
department = c("Biology", "Biology", "Biology"),
course_num = c('101', '201', "301"),
widthraws = c(1, 2, 3),
unsucessful = c(0, 0 , 0),
successfull = c(1, 3, 4),
total_enrolled = c(2, 5, 7),
percent_successful = c(.50, .60, .57)
)
df %>%
group_by(division, department) %>%
gt() %>%
summary_rows(
groups = TRUE,
columns = c(widthraws:percent_successful),
missing_text = " ",
fns = list(
total = ~sum(.),
max = ~max(.),
min = ~min(.),
medain = ~median(.))
)

How to plot sjPlots from a nested tibble?

I create some models like this using a nested tidyr dataframe:
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(purrr)
fits <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0, sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1, sample(10, replace = T), sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data=-group) %>%
mutate(fit= map(data, ~glm(formula = colA ~ colB + colC, data = .x, family="binomial"))) %>%
dplyr::select(group, fit) %>%
tibble::column_to_rownames("group")
I would like to use this data to create some quick marginal effects plots with sjPlot::plot_models like this
plot_models(as.list(fits), type = "pred", terms = c("colB", "colA", "colC"))
Unfortunately, I get the error
Error in if (fam.info$is_linear) tf <- NULL else tf <- "exp" :
argument is of length zero
In addition: Warning message:
Could not access model information.
I've played around a bit with the nesting of the data but I've been unable to get it into a format that sjPlot::plot_models will accept.
What I was expecting to get is a "Forest plot of multiple regression models" as described in the help file. Ultimately, the goal is to plot the marginal effects of regression models by group, which I was hoping the plot_models will do (please correct me if I'm wrong).
It think there are some issues with the original code as well as with the data. There are arguments from plot_model in the function call which are not supported in plot_models. I first show an example that shows how plot_models can be called and used with a nested tibble using {ggplot2}'s diamonds data set. Then I apply this approach to the OP's sample data, which doesn't yield useable results*. Finally, I create some new toy data to show how the approach could be applied to a binominal model.
(* In the original toy data the dependent variable is either always 0 or always 1 in each model so this is unlikely to yield useable results).
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(ggplot2)
# general example
fits <- tibble(id = c("x", "y", "z")) %>%
rowwise() %>%
mutate(fit = list(glm(reformulate(
termlabels = c("cut", "color", "depth", "table", "price", id),
response = "carat"),
data = diamonds)))
plot_models(fits$fit)
# OP's example data
fits2 <- tribble(~group, ~colA, ~colB, ~colC,
sample(c("group1", "group2"), 10, replace = T), 0,
sample(10, replace = T), sample(10, replace = T),
sample(c("group1", "group2"), 10, replace = T), 1,
sample(10, replace = T),
sample(10, replace = T)) %>%
unnest(cols = c(colB, colC)) %>%
nest(data = -group) %>%
rowwise() %>%
mutate(fit = list(glm(formula = colA ~ colB + colC, data = data, family="binomial")))
plot_models(fits2$fit)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 4 rows containing missing values (geom_point).
# new data for binominal model
n <- 500
g <- round(runif(n, 0L, 1L), 0)
x1 <- runif(n,0,100)
x2 <- runif(n,0,100)
y <- (x2 - x1 + rnorm(n,sd=20)) < 0
fits3 <- tibble(g, y, x1, x2) %>%
nest_by(g) %>%
mutate(fit = list(glm(formula = y ~ x1 + x2, data = data, family="binomial")))
plot_models(fits3$fit)
Created on 2021-01-23 by the reprex package (v0.3.0)

Assign name of data frame to ggplot title in purrr loop

I have two data sets from which I would like to generate histograms showing how the data overlap by name (A, B, C). I have written a custom function so I can use ggplot with map2.
I would like the graphs to be titled according to the name of each data set, so "A", "B", "C." Does anyone know of a way to do this?
# load packages
library(ggplot2)
library(dplyr)
library(purrr)
## load and format data 1
df1_raw <- data.frame(name = c("A", "B", "C", "A", "C", "B"),
start = c(1, 3, 4, 5, 2, 1),
end = c(6, 5, 7, 8, 6, 7))
df1 <- split(x = df1_raw, f = df1_raw$name) # split data by name
df1 <- lapply(df1, function(x) Map(seq.int, x$start, x$end)) # generate sequence intervals
df1 <- map(df1, unlist) # unlist sequences
df1 <- lapply(df1, data.frame) # convert to df
## load and format data 2
df2_raw <- data.frame(name = c("C", "B", "C", "A", "A", "B"),
start = c(5, 4, 3, 4, 4, 5),
end = c(7, 8, 7, 6, 9, 6))
df2 <- split(x = df2_raw, f = df2_raw$name) # split data by name
df2 <- lapply(df2, function(x) Map(seq.int, x$start, x$end)) # generate sequence intervals
df2 <- map(df2, unlist) # unlist sequences
df2 <- lapply(df2, data.frame) # convert to df
## write custom ggplot function and generate graphs
gplot <- function(data1, data2) {
ggplot() +
geom_histogram(data = data1, aes(x = X..i..), binwidth = 1, color = "grey", fill = "grey") +
geom_histogram(data = data2, aes(x = X..i..), binwidth = 1, fill = "pink", alpha = 0.7) +
labs(
title = ls(data1))
}
hist <- map2(df1, df2, gplot)
I also tried the following in the title field in my function:
deparse(substitute(data1))
Another similar option to what #GregorThomas mentioned in the comments, you could add a name variable to your data.frames and pull from that in your gplot() function. I've also shown how you might combine a few of your data manipulation steps:
# load packages
library(ggplot2)
library(dplyr)
library(purrr)
## load and format data 1
df1_raw <- data.frame(name = c("A", "B", "C", "A", "C", "B"),
start = c(1, 3, 4, 5, 2, 1),
end = c(6, 5, 7, 8, 6, 7))
df1 <- df1_raw %>%
split(.$name) %>% # split data by name
imap(function(x, x_name) {
data.frame(value = Map(seq.int, x$start, x$end) %>% unlist,
name = x_name)
})
## load and format data 2
df2_raw <- data.frame(name = c("C", "B", "C", "A", "A", "B"),
start = c(5, 4, 3, 4, 4, 5),
end = c(7, 8, 7, 6, 9, 6))
df2 <- df2_raw %>%
split(.$name) %>% # split data by name
imap(function(x, x_name) {
data.frame(value = Map(seq.int, x$start, x$end) %>% unlist,
name = x_name)
})
## change the title component of your previous function
gplot <- function(data1, data2) {
ggplot() +
geom_histogram(data = data1, aes(x = value), binwidth = 1, color = "grey", fill = "grey") +
geom_histogram(data = data2, aes(x = value), binwidth = 1, fill = "pink", alpha = 0.7) +
ggtitle(data1$name[1])
}
## plot it
map2(df1, df2, gplot)

broom::augment returns matrix can't unnest

I'm trying to run broom on augment on lm(y ~poly(x, 3), data = dat).
With that formaula broom::augment returns a matrix in a nested column. When I try to unnest, this fails whit an error message similar to "can't cast poly...
I'v found a similar question, but no answer
Trying to unnest broom::augment data, but R "can't cast"
library(rmarkdown)
library(tidyverse)
library(fs)
structure(list(a = c("2019-11-25", "2019-11-25",
"2019-11-25", "2019-11-25", "2019-11-25"),
b = c("laktat-felttest", "laktat-felttest",
"laktat-felttest", "laktat-felttest",
"laktat-felttest"),
c = c("kai", "kai", "kai", "kai", "kai"),
maaling = c(1, 2, 3, 4, 5),
load = c(800, 850, 900, 1000, 1100),
time_mm = c(5, 5, 4, 4, 4),
time_ss = c(9, 0, 55, 35, 45),
hr = c(125, 140, 140, 160, 172),
rpe = c(2, 4, 4, 8, 9),
laktat = c(2.7, 2.1, 2, 4.8, 10.2),
time = c(309, 300, 295, 275, 285),
x = c(2.58899676375405, 2.83333333333333,
3.05084745762712, 3.63636363636364, 3.85964912280702),
la_x = c(1.042875, 0.741176470588235,
0.655555555555556, 1.32, 2.64272727272727)),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -5L)) ->
dat
dat %>%
group_by(a,b,c) %>%
nest() %>%
mutate(model = data %>% map( ., ~lm( data = ., laktat ~ poly(x, 3), na.action = "na.exclude")),
tidied = model %>% map( ., broom::tidy ),
glance = model %>% map(., broom::glance),
augment = map( model, broom::augment),
augment = map( augment, janitor::clean_names, "snake" )) ->
model_tbl
##model_tbl %>%
## unnest(augment)
It seems to me, that the nested column, where augment is, there is a matrix. I don't know how to change this to listcolumns.
Greetings from Denmark
Dan Olesen
The poly() function is the problem. It causes weird column names. You can replace it by x + I(x^2) + I(x^3) and the you get better column names.
dat %>%
group_by(a,b,c) %>%
nest() %>%
mutate(model = data %>% map( ., ~lm( data = ., laktat ~ x + I(x^2) + I(x^3), na.action = "na.exclude")),
tidied = model %>% map( ., broom::tidy ),
glance = model %>% map(., broom::glance),
augment = map( model, broom::augment),
augment = map( augment, janitor::clean_names, "snake" )) ->
model_tbl
Update:
I just realized an issue. Using x + I(x^2) + I(x^3) is not exactly the same as poly(x, 3). If you use the latter the coefficients are not correlated, but if you use the first they are. Don't know how to solve this at the moment.

Resources