Related
I want to create a gls regression that includes the value R squared and observations where the values "log likelihood" etc. are. The p values should be below the coefficients in the table. Here is an example of a code:
`
# import the necessary packages
library(nlme)
library(dplyr)
library(stargazer)
# create a new subset that only includes observations with a value in the "Price.Book.Value" column
dotcom_subset_MBV <- dotcom_subset %>% filter(!is.na(Price.Book.Value))
financial_subset_MBV <- financial_subset %>% filter(!is.na(Price.Book.Value))
covid_subset_MBV <- covid_subset %>% filter(!is.na(Price.Book.Value))
# Hypothesis 2: Fit GLS models
dotcom_model_MBV <- gls(X1.Month.Equity.Premium ~ crisis*Price.Book.Value, data = dotcom_subset_MBV, method = "ML")
financial_model_MBV <- gls(X1.Month.Equity.Premium ~ crisis*Price.Book.Value, data = financial_subset_MBV, method = "ML")
covid_model_MBV <- gls(X1.Month.Equity.Premium ~ crisis*Price.Book.Value, data = covid_subset_MBV, method = "ML")
library(stargazer)
stargazer(dotcom_model_MBV, financial_model_MBV, covid_model_MBV, type = "text",column.labels = c("Dotcom","Financial","Covid"),report=('vc*p'))
The only problem with the code above is that it shows the Log Likelihood, Akaike Inf. Crit. and Bayesian Inf. Crit. instead of the R squared values. The rest would be okay.
I tried the following:
omit.stat = c("ll", "AIC", "BIC")
and it works. However, it still doesn't show me the R squared. Then I tried:
add.lines = list(c(paste0("R-squared = ", round(r2_dotcom, 2)
and it includes a line that is called "R Squared" but without any values.
Here's a working example using the mtcars data:
data(mtcars)
library(nlme)
library(stargazer)
#>
#> Please cite as:
#> Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
#> R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Fit two GLS models on mtcars, one per dependent variable
m1 <- gls(qsec ~ cyl + wt, data = mtcars)
m2 <- gls(mpg ~ cyl + wt, data = mtcars)
# R-squared computed as the squared correlation between fitted and observed values
r2 <- c(
  cor(fitted(m1), mtcars$qsec)^2,
  cor(fitted(m2), mtcars$mpg)^2
)
# add.lines takes a list of character vectors: the row label first,
# then one formatted value per model column
stargazer(
  m1, m2,
  type = "text",
  omit.stat = c("ll", "AIC", "BIC"),
  add.lines = list(c("R-squared", sprintf("%.2f", r2)))
)
#>
#> =========================================
#> Dependent variable:
#> ----------------------------
#> qsec mpg
#> (1) (2)
#> -----------------------------------------
#> cyl -1.173*** -1.508***
#> (0.197) (0.415)
#>
#> wt 1.356*** -3.191***
#> (0.360) (0.757)
#>
#> Constant 20.743*** 39.686***
#> (0.815) (1.715)
#>
#> -----------------------------------------
#> R-squared 0.56 0.83
#> Observations 32 32
#> =========================================
#> Note: *p<0.1; **p<0.05; ***p<0.01
Created on 2023-02-01 by the reprex package (v2.0.1)
The add.lines option expects a list and each vector in the list will be appended to the table with a different element in each column. So you would need the vector of values to be "R-squared" and then each of the r-squared values printed in a string (that's what sprintf() does).
Also, note, that I've calculated the r-squared as the squared correlation between the observed and fitted values, but make no claim that this is statistically sound (though it is one way of calculating R-squared in the OLS model).
There are two methods available to estimate confidence intervals for a gls model in R: the function confint and the function intervals. The results are not the same, and I want to know what causes the differences and which one is preferred for gls (and also lme) models.
I will use the cats data set for this example. I will use four different approaches to estimate the mean difference (MD) of Hwt between sex:
t-test (heterogeneous variance)
Linear model, using lm (homogeneous variance)
Linear model, using gls (homogeneous variance)
Heteroscedastic linear model, using gls (heterogeneous variance)
for the gls approaches confint and intervals are available for calculating confidence intervals.
Here is the code:
library(pacman)
p_load(tidyverse)
p_load(MASS)
p_load(nlme)
set.seed(150)
cats%>%ggplot(aes(x=Sex,y=Hwt))+
geom_boxplot()+theme_bw()
###different approaches for the same mean difference estimation
cats_ttest<-t.test(Hwt~Sex,data=cats)
cats$Sex<-relevel(cats$Sex,ref="M")
cats_lm<-lm(Hwt~Sex,data=cats)
cats_gls_hom<-gls(Hwt~Sex,data=cats)
cats_gls_het<-gls(Hwt~Sex,weights=varIdent(form=~1|Sex),data=cats)
###store estimations and CI's from different approaches
a<-rbind(confint(cats_lm),confint(cats_gls_hom),confint(cats_gls_het),
intervals(cats_gls_hom,which = "coef")$coef[,c(1,3)],
intervals(cats_gls_het,which = "coef")$coef[,c(1,3)]) %>% data.frame%>% {cbind(par=rownames(.),.)}
a$par<-a$par %>% str_remove_all("X.|.1|.2|.3|.4")
a$par<-factor(a$par,levels =c("Intercept.","SexF"),
labels =c("Intercept.","SexF") )
a$est<-c(rep(cats_lm %>% coef,3),
cats_gls_hom %>% coef,cats_gls_het %>% coef
)
a$mod<-c(rep("cats_lm_ci",2),rep("cats_gls_hom_ci",2),rep("cats_gls_het_ci",2),
rep("cats_gls_hom_int",2),rep("cats_gls_het_int",2)
)
colnames(a)[2:3]<-c("LCI","UCI")
a<-rbind(data.frame(par="SexF",LCI=cats_ttest$conf.int[1],
UCI=cats_ttest$conf.int[2],est=cats_ttest$estimate[1]-cats_ttest$estimate[2],
mod="ttest"),a)
a$mod<-factor(a$mod,levels =c("ttest","cats_lm_ci","cats_gls_hom_ci","cats_gls_het_ci","cats_gls_hom_int","cats_gls_het_int"))
a$diff<-a$UCI-a$LCI
rownames(a)<-NULL
###results
a[order(a$par,a$diff),]
#> par LCI UCI est mod diff
#> 4 Intercept. 10.879181 11.766179 11.322680 cats_gls_hom_ci 0.8869980
#> 2 Intercept. 10.875369 11.769992 11.322680 cats_lm_ci 0.8946223
#> 8 Intercept. 10.875369 11.769992 11.322680 cats_gls_hom_int 0.8946223
#> 6 Intercept. 10.816754 11.828606 11.322680 cats_gls_het_ci 1.0118521
#> 10 Intercept. 10.812406 11.832955 11.322680 cats_gls_het_int 1.0205495
#> 7 SexF -2.758218 -1.482888 -2.120553 cats_gls_het_ci 1.2753295
#> 11 SexF -2.763699 -1.477407 -2.120553 cats_gls_het_int 1.2862917
#> 1 SexF -2.763753 -1.477352 -2.120553 ttest 1.2864011
#> 5 SexF -2.896844 -1.344261 -2.120553 cats_gls_hom_ci 1.5525835
#> 3 SexF -2.903517 -1.337588 -2.120553 cats_lm_ci 1.5659288
#> 9 SexF -2.903517 -1.337588 -2.120553 cats_gls_hom_int 1.5659288
a %>% ggplot(aes(x=par,y=est,color=mod,group=mod))+geom_point(position=position_dodge(0.5))+
geom_errorbar(aes(ymin=LCI, ymax=UCI), width=.2,
position=position_dodge(0.5))+theme_bw()
Created on 2022-09-11 by the reprex package (v2.0.1)
As you can see, there are mild differences in CI widths between the different methods, and, as expected, the methods which account for differences in variances produced the narrowest CIs for the mean difference (parameter SexF in data frame a).
So, why are two methods available to estimate confidence intervals for gls models, what are the differences between them and which one is the preferred one for this kind of models?
tl;dr use intervals(), it gives you CIs based on a Student-t rather than a Normal sampling distribution.
If you look at methods(class = "gls") you'll see that confint() is not listed. That means that when you call confint(gls_fit), R falls back to the default confint method. If we look at the code for stats::confint.default you'll see fac <- qnorm(a); ...; ci[] <- cf[parm] + ses %o% fac. In other words, confint.default is constructing CIs based on a Normal distribution.
In contrast, nlme:::intervals.gls uses
len <- -qt((1 - level)/2, dims$N - dims$p) * sqrt(diag(object$varBeta))
— i.e., an interval based on a t-distribution.
It makes very little difference in this case (CI interval width of 1.55 vs 1.56).
For what it's worth, you can streamline this kind of comparison a little bit using broom/broom.mixed (although this does not include the confint.default option for gls!)
library(broom)
library(broom.mixed)
options(pillar.sigfig = 7)
(tibble::lst(cats_ttest, cats_lm, cats_gls_hom, cats_gls_het)
|> map_dfr(tidy, .id = "model", conf.int = TRUE)
## t-test doesn't have a "term" element
|> mutate(across(term, ~ifelse(is.na(.), "SexF", term)))
|> select(model, term, estimate, lwr = conf.low, upr = conf.high)
|> mutate(width = upr - lwr)
|> arrange(term)
)
As a general rule, you should use the most specific method available — this usually happens automatically, it's sort of an accident that confint() works for gls objects (partly because the nlme package predates R itself, so doesn't follow all of its conventions ...)
I have a fixed effects model with only few observations and would therefore like to bootstrap in order to obtain more accurate standard errors. At the same time, I assume SE to be clustered thus I would also like to correct for clustering, i.e. do a cluster bootstrap.
I found a function for lm models (vcovBS), however could not find anything for plm models. Does anybody know an analogous function to obtain cluster bootstrapped SE for fixed effects models?
The clusterSEs package has an implementation of the wild cluster bootstrap for plm models: https://www.rdocumentation.org/packages/clusterSEs/versions/2.6.2/topics/cluster.wild.plml.
An alternative package is fwildclusterboot. It does not work with plm but with two other fixed effects regression packages, lfe and fixest, and should be significantly faster than clusterSEs.
With the fixest package, its syntax would look like this:
library(fixest)
library(fwildclusterboot)
# load data set voters included in fwildclusterboot
data(voters)
# estimate the regression model via feols
feols_fit <- feols(proposition_vote ~ treatment + ideology1 + log_income + Q1_immigration , data = voters)
# bootstrap inference
boot_feols <- boottest(feols_fit, clustid = "group_id1", param = "treatment", B = 9999)
summary(boot_feols)
#> boottest.fixest(object = lm_fit, clustid = "group_id1", param = "treatment",
#> B = 9999)
#>
#> Observations: 300
#> Bootstr. Iter: 9999
#> Bootstr. Type: rademacher
#> Clustering: 1-way
#> Confidence Sets: 95%
#> Number of Clusters: 40
#>
#> term estimate statistic p.value conf.low conf.high
#> 1 treatment 0.073 3.786 0.001 0.033 0.114
I want to run a multinomial logit in R and have used two libraries, nnet and mlogit, which produce different results and report different types of statistics. My questions are:
What is the source of the discrepancy between the coefficients and standard errors reported by nnet and those reported by mlogit?
I would like to report my results to a Latex file using stargazer. When doing so, there is a problematic tradeoff:
If I use the results from mlogit then I get the statistics I wish, such as pseudo R squared, however, the output is in long format (see example below).
If I use the results from nnet then the format is as expected, but it reports statistics that I am not interested in such as AIC, but does not include, for example, pseudo R squared.
I would like to have the statistics reported by mlogit in the formatting of nnet when I use stargazer.
Here is a reproducible example, with three choice alternatives:
library(mlogit)
df = data.frame(c(0,1,1,2,0,1,0), c(1,6,7,4,2,2,1), c(683,276,756,487,776,100,982))
colnames(df) <- c('y', 'col1', 'col2')
mydata = df
mldata <- mlogit.data(mydata, choice="y", shape="wide")
mlogit.model1 <- mlogit(y ~ 1| col1+col2, data=mldata)
The tex output when compiled is of what I refer to as "long format" which I deem undesired:
Now, using nnet:
library(nnet)
mlogit.model2 = multinom(y ~ 1 + col1+col2, data=mydata)
stargazer(mlogit.model2)
Gives the tex output:
which is of the "wide" format which I desire. Note the different coefficient and standard errors.
To my knowledge, there are three R packages that allow the estimation of the multinomial logistic regression model: mlogit, nnet and globaltest (from Bioconductor). I do not consider here the mnlogit package, a faster and more efficient implementation of mlogit.
All the above packages use different algorithms that, for small samples, give different results. These differences vanish for moderate sample sizes (try with n <- 100).
Consider the following data generating process taken from the James Keirstead's blog:
# Simulate a three-category outcome y (coded 1, 2, 3) from two uniform predictors
n <- 40
set.seed(4321)
df1 <- data.frame(x1 = runif(n, 0, 100), x2 = runif(n, 0, 100))
df1$y <- with(df1, {
  # The two noise vectors are drawn in this order so the random stream
  # matches the nested ifelse() formulation
  below_first <- 100 - x1 - x2 + rnorm(n, sd = 10) < 0
  below_second <- 100 - 2 * x2 + rnorm(n, sd = 10) < 0
  1 + ifelse(below_first, 0, ifelse(below_second, 1, 2))
})
str(df1)
'data.frame': 40 obs. of 3 variables:
$ x1: num 33.48 90.91 41.15 4.38 76.35 ...
$ x2: num 68.6 42.6 49.9 36.1 49.6 ...
$ y : num 1 1 3 3 1 1 1 1 3 3 ...
table(df1$y)
1 2 3
19 8 13
The model parameters estimated by the three packages are respectively:
library(mlogit)
df2 <- mlogit.data(df1, choice="y", shape="wide")
mlogit.mod <- mlogit(y ~ 1 | x1+x2, data=df2)
(mlogit.cf <- coef(mlogit.mod))
2:(intercept) 3:(intercept) 2:x1 3:x1 2:x2 3:x2
42.7874653 80.9453734 -0.5158189 -0.6412020 -0.3972774 -1.0666809
#######
library(nnet)
nnet.mod <- multinom(y ~ x1 + x2, df1)
(nnet.cf <- coef(nnet.mod))
(Intercept) x1 x2
2 41.51697 -0.5005992 -0.3854199
3 77.57715 -0.6144179 -1.0213375
#######
library(globaltest)
glbtest.mod <- globaltest::mlogit(y ~ x1+x2, data=df1)
# Slot access uses `@`; a `#` here would start a comment and leave the
# parenthesis unclosed
(cf <- glbtest.mod@coefficients)
1 2 3
(Intercept) -41.2442934 1.5431814 39.7011119
x1 0.3856738 -0.1301452 -0.2555285
x2 0.4879862 0.0907088 -0.5786950
The mlogit command of globaltest fits the model without using a reference outcome category, hence the usual parameters can be calculated as follows:
(glbtest.cf <- rbind(cf[,2]-cf[,1],cf[,3]-cf[,1]))
(Intercept) x1 x2
[1,] 42.78747 -0.5158190 -0.3972774
[2,] 80.94541 -0.6412023 -1.0666813
Concerning the estimation of the parameters in the three packages, the method used in mlogit::mlogit is explained in detail here.
In nnet::multinom the model is a neural network with no hidden layers, no bias nodes and a softmax output layer; in our case there are 3 input units and 3 output units:
nnet:::summary.nnet(nnet.mod)
a 3-0-3 network with 12 weights
options were - skip-layer connections softmax modelling
b->o1 i1->o1 i2->o1 i3->o1
0.00 0.00 0.00 0.00
b->o2 i1->o2 i2->o2 i3->o2
0.00 41.52 -0.50 -0.39
b->o3 i1->o3 i2->o3 i3->o3
0.00 77.58 -0.61 -1.02
Maximum conditional likelihood is the method used in multinom for model fitting.
The parameters of multinomial logit models are estimated in globaltest::mlogit using maximum likelihood and working with an equivalent log-linear model and the Poisson likelihood. The method is described here.
For models estimated by multinom the McFadden's pseudo R-squared can be easily calculated as follows:
# McFadden's pseudo R-squared: 1 - logLik(fitted model) / logLik(intercept-only model).
# logLik() is called through the generic rather than reaching into the
# nnet namespace with `:::` for the unexported method.
nnet.mod.loglik <- logLik(nnet.mod)
nnet.mod0 <- multinom(y ~ 1, df1)
nnet.mod0.loglik <- logLik(nnet.mod0)
(nnet.mod.mfr2 <- as.numeric(1 - nnet.mod.loglik/nnet.mod0.loglik))
[1] 0.8483931
At this point, using stargazer, I generate a report for the model estimated by mlogit::mlogit which is as similar as possible to the report of multinom. The basic idea is to substitute the estimated coefficients and probabilities in the object created by multinom with the corresponding estimates of mlogit.
# Substitution of coefficients
nnet.mod2 <- nnet.mod
cf <- matrix(nnet.mod2$wts, nrow=4)
cf[2:nrow(cf), 2:ncol(cf)] <- t(matrix(mlogit.cf,nrow=2))
# Substitution of probabilities
nnet.mod2$wts <- c(cf)
nnet.mod2$fitted.values <- mlogit.mod$probabilities
Here is the result:
library(stargazer)
stargazer(nnet.mod2, type="text")
==============================================
Dependent variable:
----------------------------
2 3
(1) (2)
----------------------------------------------
x1 -0.516** -0.641**
(0.212) (0.305)
x2 -0.397** -1.067**
(0.176) (0.519)
Constant 42.787** 80.945**
(18.282) (38.161)
----------------------------------------------
Akaike Inf. Crit. 24.623 24.623
==============================================
Note: *p<0.1; **p<0.05; ***p<0.01
Now I am working on the last issue: how to visualize loglik, pseudo R2 and other information in the above stargazer output.
If you are using stargazer you can use omit to remove unwanted rows or references. Here is a quick example; hopefully, it will point you in the right direction.
nb. My assumption is you are using Rstudio and rmarkdown with knitr.
```{r, echo=FALSE}
library(mlogit)
df = data.frame(c(0,1,1,2,0,1,0), c(1,6,7,4,2,2,1), c(683,276,756,487,776,100,982))
colnames(df) <- c('y', 'col1', 'col2')
mydata = df
mldata <- mlogit.data(mydata, choice = "y", shape="wide")
mlogit.model1 <- mlogit(y ~ 1| col1+col2, data=mldata)
mlogit.col1 <- mlogit(y ~ 1 | col1, data = mldata)
mlogit.col2 <- mlogit(y ~ 1 | col2, data = mldata)
```
# MLOGIT
```{r echo = FALSE, message = TRUE, error = TRUE, warning = FALSE, results = 'asis'}
library(stargazer)
stargazer(mlogit.model1, type = "html")
stargazer(mlogit.col1,
mlogit.col2,
type = "html",
omit=c("1:col1","2:col1","1:col2","2:col2"))
```
Result:
Note that the second image omits 1:col1, 2:col1, 1:col2 and 2:col2
I have a glm model for which I use coeftest from the lmtest package to estimate robust standard errors. When I use stargazer to produce regression tables I get the correct results but without the number of observations and other relevant statistics like the null deviance and the model deviance.
Here's an example:
library(lmtest)
library(stargazer)
m1 <- glm(am ~ mpg + cyl + disp, mtcars, family = binomial)
# Simple binomial regression
# For whatever reason, let's say I want to use coeftest to estimate something
m <- coeftest(m1)
stargazer(m, type = "text", single.row = T) # This is fine, but I want to also include the number of observations
# the null deviance and the model deviance.
I'm specifically interested in the number of observations, the null deviance and the residual deviance.
I thought that If I replaced the old coefficient matrix with the new one, I'd get the correct estimates with the correct statistics and stargazer would recognize the model and print it correctly. For that, I've tried substituting the coefficients, SE's, z statistic and p values from the coeftest model in the m1 model but some of these statistics are computed with summary.glm and are not included in the m1 output. I could easily substitute these coefficients in the summary output but stargazer doesn't recognize summary type class. I've tried adding attributes to the m object with the specific statistics but they don't show up in the output and stargazer doesn't recognize it.
Note: I know stargazer can compute robust SE's but I'm also doing other computations, so the example needs to include the coeftest output.
Any help is appreciated.
It may be easiest to pass the original models into stargazer, and then use coeftest to pass in custom values for standard errors (se = ), confidence intervals (ci.custom = ) and/or p values (p = ). See below for how to easily handle a list containing multiple models.
suppressPackageStartupMessages(library(lmtest))
suppressPackageStartupMessages(library(stargazer))
mdls <- list(
m1 = glm(am ~ mpg, mtcars, family = poisson),
m2 = glm(am ~ mpg + cyl + disp, mtcars, family = poisson)
)
# Robust standard errors for a fitted model: run coeftest() with the
# sandwich covariance estimator and keep only the "Std. Error" column
se_robust <- function(x) {
  robust_tests <- coeftest(x, vcov. = sandwich::sandwich)
  robust_tests[, "Std. Error"]
}
# Original SE
stargazer(mdls, type = "text", single.row = TRUE, report = "vcsp")
#>
#> ===============================================
#> Dependent variable:
#> -----------------------------
#> am
#> (1) (2)
#> -----------------------------------------------
#> mpg 0.106 (0.042) 0.028 (0.083)
#> p = 0.012 p = 0.742
#> cyl 0.435 (0.496)
#> p = 0.381
#> disp -0.014 (0.009)
#> p = 0.151
#> Constant -3.247 (1.064) -1.488 (3.411)
#> p = 0.003 p = 0.663
#> -----------------------------------------------
#> Observations 32 32
#> Log Likelihood -21.647 -20.299
#> Akaike Inf. Crit. 47.293 48.598
#> ===============================================
#> Note: *p<0.1; **p<0.05; ***p<0.01
# With robust SE
stargazer(
mdls, type = "text", single.row = TRUE, report = "vcsp",
se = lapply(mdls, se_robust))
#>
#> ===============================================
#> Dependent variable:
#> -----------------------------
#> am
#> (1) (2)
#> -----------------------------------------------
#> mpg 0.106 (0.025) 0.028 (0.047)
#> p = 0.00002 p = 0.560
#> cyl 0.435 (0.292)
#> p = 0.137
#> disp -0.014 (0.007)
#> p = 0.042
#> Constant -3.247 (0.737) -1.488 (2.162)
#> p = 0.00002 p = 0.492
#> -----------------------------------------------
#> Observations 32 32
#> Log Likelihood -21.647 -20.299
#> Akaike Inf. Crit. 47.293 48.598
#> ===============================================
#> Note: *p<0.1; **p<0.05; ***p<0.01
Created on 2020-11-09 by the reprex package (v0.3.0)
If I get you right, you could try the following:
First, assign your stargazer analysis to an object like this
stargazers.values <- stargazer(m, type = "text", single.row = TRUE)
then check the code of the stargazer command with body(stargazer).
Hopefully you can find objects for values that stargazer uses but does not report. You can then address them like this (if there is, for example, an object named "null.deviance"):
stargazers.values$null.deviance
Or, if it is part of another data frame, say df, it could go like this
stargazers.values$df$null.deviance
maybe a code like this could be helpful
print(null.deviance <- stargazers.values$null.deviance)
Hope this helps!