Regression in R with grouped variables - r

The dependent variable Value of the data frame DF is predicted using the independent variables Mean, X, Y in the following way:
DF <- DF %>%
group_by(Country, Sex) %>%
do({
mod = lm(Value ~ Mean + X + Y, data = .)
A <- predict(mod, .)
data.frame(., A)
})
Data are grouped by Country and Sex. So, the fitting formula can be expressed as:
Value(Country, Sex) = a0(Country, Sex) + a1(Country, Sex) Mean + a2(Country, Sex) X + a3(Country, Sex) Y
However, I want to use this formula:
Value(Country, Sex) = a0(Country, Sex) + a1(Country, Sex) Mean + a2(Country) X + a3(Country) Y
Where a2 and a3 are independent of Sex. How can I do it?

I don't think you can when grouping by Country and Sex. You could just group by Country and add interactions with Sex:
DF <- DF %>%
group_by(Country) %>%
do({
mod = lm(Value ~ Sex + Mean*Sex + X + Y, data = .)
A <- predict(mod, .)
data.frame(., A)
})
or estimate your model in one go adding interactions with Sex and Country:
mod <- lm(Value ~ Sex*Country*Mean + Country*X + Country*Y
A <- predict(mod)

Related

Fable TSLM different models yield identical predictions

I'm trying to fit and forecast TSLM models with different time-t predictors added alongside the trend... for reasons I don't understand, several of the models yield identical predictions even though the the input test data appears different, and the coefficients in the models themselves appear different. Almost certainly this an error on my part.. let me know what's going wrong!
Reprex data
suppressPackageStartupMessages({
library(tidyverse)
library(tsibble)
library(fable)
library(feasts)
})
proj_tract <- read_csv("path_to_reprexdata")
proj_tract <- as_tsibble(proj_tract, key = tractid, index = year)
train <- proj_tract %>%
filter(year < 2019)
test <- proj_tract %>%
filter(year >= 2019)
fit <- train %>%
model(
trend_only = TSLM(log(chh) ~ trend()),
trend_w_dar = TSLM(log(chh) ~ trend() + log(ig_count_imptd)),
trend_w_da1 = TSLM(log(chh) ~ trend() + log(prd_1)),
trend_w_da2 = TSLM(log(chh) ~ trend() + log(prd_2)),
trend_w_da3 = TSLM(log(chh) ~ trend() + log(prd_3)),
trend_w_da4 = TSLM(log(chh) ~ trend() + log(prd_4)),
trend_w_da5 = TSLM(log(chh) ~ trend() + log(prd_glmnet))
)
fc <- forecast(
fit,
new_data = test
) %>%
hilo(.95)
res <- fc %>%
as_tibble() %>%
rename("proj" = ".mean", "model" = ".model") %>%
select(model, proj, lchh) %>%
pivot_wider(names_from = model, values_from = proj)
head(res)
A subset of these models yield identical predictions -- help me understand why!

Extract slope and r squared from grouped linear models using broom

I have a dataframe that I want to run linear models on by group, then use the broom package to extract the slope and r squared for each model. So far I am trying this:
library(tidyverse)
library(broom)
#read in the dataset
data(mtcars)
#add a group variable
mtcars <- mtcars %>% as_tibble() %>% mutate(LC = 1)
#create a second group
mtcars2 <- mtcars
mtcars2 <- mtcars2 %>% mutate(LC = 2)
#bind together
mtcars <- rbind(mtcars, mtcars2)
#groupby and run regressions
all_regress <- mtcars %>% group_by(LC) %>%
do(mod1 = lm(mpg ~ disp, data = .),
mod2 = lm(mpg ~ wt, data = .))
#use broom the extract the slope and rsq per group
glance <-all_regress %>% mutate(tidy = map(mod1, broom::tidy),
glance = map(mod1, broom::glance),
augment = map(mod1, broom::augment),
rsq = glance %>% map_dbl('r.squared'),
slope = tidy %>% map_dbl(function(x) x$estimate[2]))
but this fails with:
Error: Problem with `mutate()` input `tidy`.
x No tidy method for objects of class qr
ℹ Input `tidy` is `map(mod1, broom::tidy)`.
ℹ The error occurred in row 1.
If I do this without groups such as:
#read in the dataset
data(mtcars)
mtcars <- mtcars %>% as_tibble()
#run regressions
all_regress <- mtcars %>%
do(mod1 = lm(mpg ~ disp, data = .),
mod2 = lm(mpg ~ wt, data = .))
#use broom the extract the slope and rsq per group
glance <- all_regress %>% mutate(tidy = map(mod1, broom::tidy),
glance = map(mod1, broom::glance),
augment = map(mod1, broom::augment),
rsq = glance %>% map_dbl('r.squared'),
slope = tidy %>% map_dbl(function(x) x$estimate[2]))
there is no error.
I think simply adding ungroup() achieves what you need:
all_regress <- mtcars %>% group_by(LC) %>%
do(mod1 = lm(mpg ~ disp, data = .),
mod2 = lm(mpg ~ wt, data = .)) %>% ungroup()
#use broom the extract the slope and rsq per group
glance <-all_regress %>% mutate(tidy = map(mod1, broom::tidy),
glance = map(mod1, broom::glance),
augment = map(mod1, broom::augment),
rsq = glance %>% map_dbl('r.squared'),
slope = tidy %>% map_dbl(function(x) x$estimate[2]))
I used this approach, its longer but i think theres more control in the individual steps. Finally i created a tibble with lists columns containing each model.
library(tidyverse)
library(broom)
#read in the dataset
data(mtcars)
#add a group variable
mtcars <- mtcars %>% as_tibble() %>% dplyr::select(-c(vs, am, gear, carb, cyl)) %>% mutate(LC = 1)
#create a second group
mtcars2 <- mtcars
mtcars2 <- mtcars2 %>% mutate(LC = 2)
#bind together
mtcars <- bind_rows(mtcars2, mtcars)
#group_split and run regressions
all_regress <- mtcars %>% group_split(LC) %>%
map(~ list(mod1 = lm(mpg ~ disp, data = .),
mod2 = lm(mpg ~ wt, data = .)))
# example <- all_regress[[2]][[1]] %>% glance()
#the list has 2 levels with 2 models each
data <- all_regress %>%
map(~
map(.x, function(model){
#column lists are needed because each function output different objects
tibble(mod = list(model),
tidy = list(broom::tidy(model)),
glance = list(broom::glance(model)),
augment = list(broom::augment(model))) %>%
mutate(
rsq = list(glance[[1]]$r.squared),
slope = list(tidy[[1]]$estimate[2]))
} ))
data_final <-
data %>% map2(unique(mtcars$LC), ~
map2(.x, .y, function(each_model, lc){
mutate(each_model, LC = lc)
}))
final_format <- #because of the list structure i need to bind the two datasets in each level and then bind them again.
map(data_final, ~reduce(.x, rbind)) %>% reduce(rbind)
#acces the data
final_format[1, 1][[1]]

grouped regresion in R

I am trying to run a multiple linear regression but i am getting the same coefficients for all my grouped variables
names<- rep(LETTERS[1:25], each = 20)
daysp<- runif(1:500,1,500)
startdate <-sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 500)
enddate<- sample(seq(as.Date('2010/01/01'), as.Date('2020/01/01'), by="day"), 500)
class <- rep(LETTERS[1:4], each = 125)
amt<- runif(1:500,10000,500000)
2ndclass <- rep(LETTERS[5:8], each = 125)
df<-data.frame(names,daysp,startdate,enddate,class,amt,2ndclass)
Changed to factor class and 2ndclass
fitted_models = df %>% group_by(names) %>% do(model = lm(daysp ~ startdate + enddate
+ class + 2ndclass + amt, data=df))
fitted_models$models
How can i run the regressions and get different coefficients for each group?
data = df explicitly uses the entire data frame df, ignoring any grouping. Use . to refer to the data that is piped in, which will let do use the groups. See the example at the bottom of ?do for reference:
## From ?do
by_cyl <- mtcars %>% group_by(cyl)
models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
Though, versions of dplyr > 1.0 will prefer using nest_by (also demonstrated on the ?do help page):
models <- mtcars %>%
nest_by(cyl) %>%
mutate(mod = list(lm(mpg ~ disp, data = data)))
models %>% summarise(broom::tidy(mod))

How to Graph regression coefficients (or other estimates for model parameters) from (nested) regression models by condition?

I'm trying to plot regression coefficients from a (nested) dataframe (by condition) for which i ran four regression models for the four condtitions (with multiple predictors) on the nested data within each condition. Plotting the R-Squared values per model per condition (see example) works, but now I'd like to plot the regression coefficients first for x1 by condition (b's for x1 in descending order) and then same for x2 (or even facetted by predictor number), can someone help me out with the code?
Example of plotting R - Squared values for multiple models:
# creating data example
library(modelr)
library(tidyverse)
set.seed(123)
data <- tibble(
condition = replicate(40, paste(sample(c("A", "B", "C", "D"), 1, replace=TRUE))),
x1 = rnorm(n = 40, mean = 10, sd = 2),
x2 = rnorm(n = 40, mean = 5, sd = 1.5),
y = x1*rnorm(n = 40, mean = 2, sd = 1) + x2*rnorm(n = 40, mean = 3, sd = 2))
by_condition <- data %>%
group_by(condition) %>%
nest()
# looking at data from first condition
by_condition$data[[1]]
# regression model function
reg.model <- function(df) {
lm(y ~ x1 + x2,
data = df)
}
# creating column with models per condition
by_condition <- by_condition %>%
mutate(model = map(data, reg.model))
# looking at reg. model for first group
by_condition$model[[1]]
summary(by_condition$model[[1]])
# graphing R-squared (ascending) per model by condition
glance <- by_condition %>%
mutate(glance = map(model, broom::glance)) %>%
unnest(glance)
glance %>%
ggplot(aes(x = reorder(condition, desc(r.squared)), y = r.squared)) +
geom_point() +
coord_flip() +
xlab("Condition") +
ggtitle("R Square of reg. model per Condition")
So this example works, but i don't know how to extract the coefficients seperately and plot those in descending order by condition in similar graphs. Thanks
I found the answer to plotting coefficients of (nested) regression models within different conditions (tidying kicks ass):
by_condition %>%
mutate(regressions = map(model, broom::tidy)) %>%
unnest(regressions)
by_condition
regression_output <- by_condition %>%
mutate(regressions = map(model, broom::tidy))
regression_coefficients <- regression_output %>%
unnest(regressions)
regression_coefficients %>%
ggplot(aes(x = term, y = estimate )) +
geom_point() +
coord_flip() +
facet_wrap(~ condition) +
xlab("predictor") +
ggtitle("Coefficients of reg. model per Condition")

dplyr, do(), extracting parameters from model without losing grouping variable

A slightly changed example from the R help for do():
by_cyl <- group_by(mtcars, cyl)
models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
coefficients<-models %>% do(data.frame(coef = coef(.$mod)[[1]]))
In the dataframe coefficients, there is the first coefficient of the linear model for each cyl group. My question is how can I produce a dataframe that contains not only a column with the coefficients, but also a column with the grouping variable.
===== Edit: I extend the example to try to make more clear my problem
Let's suppose that I want to extract the coefficients of the model and some prediction. I can do this:
by_cyl <- group_by(mtcars, cyl)
getpars <- function(df){
fit <- lm(mpg ~ disp, data = df)
data.frame(intercept=coef(fit)[1],slope=coef(fit)[2])
}
getprediction <- function(df){
fit <- lm(mpg ~ disp, data = df)
x <- df$disp
y <- predict(fit, data.frame(disp= x), type = "response")
data.frame(x,y)
}
pars <- by_cyl %>% do(getpars(.))
prediction <- by_cyl %>% do(getprediction(.))
The problem is that the code is redundant because I am fitting the model two times. My idea was to build a function that returns a list with all the information:
getAll <- function(df){
results<-list()
fit <- lm(mpg ~ disp, data = df)
x <- df$disp
y <- predict(fit, data.frame(disp= x), type = "response")
results$pars <- data.frame(intercept=coef(fit)[1],slope=coef(fit)[2])
results$prediction <- data.frame(x,y)
results
}
The problem is that I don't know how to use do() with the function getAll to obtain for example just a dataframe with the parameters (like the dataframe pars).
Like this?
coefficients <-models %>% do(data.frame(coef = coef(.$mod)[[1]], group = .[[1]]))
yielding
coef group
1 40.87196 4
2 19.08199 6
3 22.03280 8
Using the approach of Hadley Wickham in this video:
library(dplyr)
library(purrr)
library(broom)
fitmodel <- function(d) lm(mpg ~ disp, data = d)
by_cyl <- mtcars %>%
group_by(cyl) %>%
nest() %>%
mutate(mod = map(data, fitmodel),
pars = map(mod, tidy),
pred = map(mod, augment))
pars <- by_cyl %>% unnest(pars)
prediction <- by_cyl %>% unnest(pred)

Resources