Plotting Estimates (Fixed Effects) of Regression Models - r

I'm using the lme4 package [the lmer() function] to estimate several average models, whose estimated coefficients I want to plot. I found the document "Plotting Estimates (Fixed Effects) of Regression Models" by Daniel Lüdecke, which explains how to plot estimates. It works with average models, but it uses the conditional-average values instead of the full-average values.
Example script:
library(lme4)
options(na.action = "na.omit")
PA_model_clima1_Om_ST <- lmer(O.matt ~ mes_N + Temperatura_Ar_PM_ST + RH_PM_ST +
                                Vento_V_PM_ST + Evapotranspiracao_PM_ST +
                                Preci_total_PM_ST + (1 | ID), data = Abund)
library(MuMIn)
options(na.action = "na.fail")  # required by dredge()
PA_clima1_Om_ST <- dredge(PA_model_clima1_Om_ST)
sort.PA_clima1_Om_ST <- PA_clima1_Om_ST[order(PA_clima1_Om_ST$AICc), ]
top.models_PA_clima1_Om_ST <- get.models(sort.PA_clima1_Om_ST, subset = delta < 2)
model.sel(top.models_PA_clima1_Om_ST)
Avg_PA_clima1_Om_ST <- model.avg(top.models_PA_clima1_Om_ST, fit = TRUE)
summary(Avg_PA_clima1_Om_ST)
Results of this script:
Term codes:
Evapotranspiracao_PM_ST  1
Preci_total_PM_ST        2
RH_PM_ST                 3
Temperatura_Ar_PM_ST     4
Vento_V_PM_ST            5
Model-averaged coefficients:
(full average)
                        Estimate Std. Error Adjusted SE z value Pr(>|z|)
(Intercept)               5.4199     1.4094      1.4124   3.837 0.000124 ***
Preci_total_PM_ST        -0.8679     1.0300      1.0313   0.842 0.400045
RH_PM_ST                  0.6116     0.8184      0.8193   0.746 0.455397
Temperatura_Ar_PM_ST     -1.9635     0.7710      0.7725   2.542 0.011026 *
Vento_V_PM_ST            -0.6214     0.7043      0.7052   0.881 0.378289
Evapotranspiracao_PM_ST  -0.1202     0.5174      0.5183   0.232 0.816654

(conditional average)
                        Estimate Std. Error Adjusted SE z value Pr(>|z|)
(Intercept)               5.4199     1.4094      1.4124   3.837 0.000124 ***
Preci_total_PM_ST        -1.2200     1.0304      1.0322   1.182 0.237249
RH_PM_ST                  1.0067     0.8396      0.8410   1.197 0.231317
Temperatura_Ar_PM_ST     -1.9635     0.7710      0.7725   2.542 0.011026 *
Vento_V_PM_ST            -0.8607     0.6936      0.6949   1.238 0.215546
Evapotranspiracao_PM_ST  -0.3053     0.7897      0.7912   0.386 0.699619
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Plot script:
library(sjPlot)
library(sjlabelled)
library(sjmisc)
library(ggplot2)
data(efc)  # example data from the sjPlot vignette; not actually used below
theme_set(theme_sjplot())
plot_model(Avg_PA_clima1_Om_ST, type = "est", vline.color = "black",
           sort.est = TRUE, show.values = TRUE, value.offset = .3,
           title = "O. mattogrossae")
Plot: (figure omitted; it shows the estimates from the conditional average)
As you can see, the plot uses the conditional-average values instead of the full-average values.
How can I plot the estimates of average models using the full-average values?

I think plot_model() takes the conditional average, so unless you hack the function, or contact the author to request such an option, one way is to plot the coefficients yourself:
library(lme4)
library(MuMIn)
options(na.action = "na.fail")
set.seed(888)
dat <- data.frame(y = rnorm(100),
                  var1 = rnorm(100), var2 = rnorm(100),
                  var3 = rnorm(100), rvar = sample(1:2, 100, replace = TRUE))
lme_mod <- lmer(y ~ var1 + var2 + var3 + (1 | rvar), dat)
dre_mod <- dredge(lme_mod)
avg_mod <- model.avg(dre_mod, fit = TRUE)
summary(avg_mod)
Model-averaged coefficients:
(full average)
            Estimate Std. Error Adjusted SE z value Pr(>|z|)
(Intercept) -0.02988    0.18699     0.18936   0.158    0.875
var2        -0.03791    0.08817     0.08858   0.428    0.669
var1        -0.02999    0.07740     0.07778   0.386    0.700
var3         0.01521    0.05371     0.05404   0.281    0.778

(conditional average)
            Estimate Std. Error Adjusted SE z value Pr(>|z|)
(Intercept) -0.02988    0.18699     0.18936   0.158    0.875
var2        -0.16862    0.11197     0.11339   1.487    0.137
var1        -0.15293    0.10841     0.10978   1.393    0.164
var3         0.11227    0.10200     0.10327   1.087    0.277
The full-average coefficient matrix is stored in:
summary(avg_mod)$coefmat.full
               Estimate Std. Error Adjusted SE   z value  Pr(>|z|)
(Intercept) -0.02988418 0.18698720  0.18935677 0.1578194 0.8745991
var2        -0.03791016 0.08816936  0.08857788 0.4279867 0.6686608
var1        -0.02998709 0.07740247  0.07778028 0.3855360 0.6998404
var3         0.01520633 0.05371407  0.05404100 0.2813850 0.7784151
We extract it, reshape it into a data frame, and plot:
library(ggplot2)
df <- data.frame(summary(avg_mod)$coefmat.full)
df$variable <- rownames(df)
colnames(df)[2] <- "std_error"
df <- df[df$variable != "(Intercept)", ]
df$type <- ifelse(df$Estimate > 0, "pos", "neg")
ggplot(df, aes(x = variable, y = Estimate)) +
  geom_point(aes(col = type), size = 3) +
  geom_errorbar(aes(col = type,
                    ymin = Estimate - 1.96 * std_error,
                    ymax = Estimate + 1.96 * std_error),
                width = 0, size = 1) +
  geom_text(aes(label = round(Estimate, digits = 2)), nudge_x = 0.1) +
  geom_hline(yintercept = 0, col = "black") +
  theme_bw() + coord_flip() +
  scale_color_manual(values = c("#c70039", "#111d5e")) +
  theme(legend.position = "none")
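If you want the error bars to use MuMIn's "Adjusted SE" column rather than the plain standard error, a small extension of the sketch above (same df and avg_mod) would be:
# pull the adjusted SEs from the same full-average matrix
cm <- summary(avg_mod)$coefmat.full
df$adj_se <- cm[df$variable, "Adjusted SE"]
# then use adj_se in place of std_error in the geom_errorbar() call above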

You can also use parameters::model_parameters(), which is used internally by sjPlot::plot_model(). model_parameters() has a component argument to decide which component to return. However, plot_model() does not yet pass additional arguments down to model_parameters(). I'm going to address this in sjPlot. Meanwhile, model_parameters() at least offers a quick plot() method.
library(lme4)
library(MuMIn)
options(na.action = "na.fail")
set.seed(888)
dat <- data.frame(y = rnorm(100),
                  var1 = rnorm(100), var2 = rnorm(100),
                  var3 = rnorm(100), rvar = sample(1:2, 100, replace = TRUE))
lme_mod <- lmer(y ~ var1 + var2 + var3 + (1 | rvar), dat)
dre_mod <- dredge(lme_mod)
avg_mod <- model.avg(dre_mod, fit = TRUE)
library(parameters)
model_parameters(avg_mod)
#> Parameter | Coefficient | SE | 95% CI | z | df | p
#> --------------------------------------------------------------------
#> (Intercept) | -0.03 | 0.19 | [-0.40, 0.34] | 0.16 | 96 | 0.875
#> var2 | -0.17 | 0.11 | [-0.39, 0.05] | 1.49 | 96 | 0.137
#> var1 | -0.15 | 0.11 | [-0.37, 0.06] | 1.39 | 96 | 0.164
#> var3 | 0.11 | 0.10 | [-0.09, 0.31] | 1.09 | 96 | 0.277
model_parameters(avg_mod, component = "full")
#> Parameter | Coefficient | SE | 95% CI | z | df | p
#> --------------------------------------------------------------------
#> (Intercept) | -0.03 | 0.19 | [-0.40, 0.34] | 0.16 | 96 | 0.875
#> var2 | -0.04 | 0.09 | [-0.21, 0.14] | 0.43 | 96 | 0.669
#> var1 | -0.03 | 0.08 | [-0.18, 0.12] | 0.39 | 96 | 0.700
#> var3 | 0.02 | 0.05 | [-0.09, 0.12] | 0.28 | 96 | 0.778
plot(model_parameters(avg_mod, component = "full"))
You can do some minor modifications to the plot:
library(ggplot2)
plot(model_parameters(avg_mod, component = "full")) +
  geom_text(aes(label = round(Coefficient, 2)), nudge_x = .2)
Created on 2020-06-27 by the reprex package (v0.3.0)

Related

Computing marginal effects: Why do ggeffect and ggemmeans give different answers?

Example
library(glmmTMB)
library(ggeffects)
## Zero-inflated negative binomial model
(m <- glmmTMB(count ~ spp + mined + (1 | site),
              ziformula = ~ spp + mined,
              family = nbinom2,
              data = Salamanders,
              na.action = "na.fail"))
summary(m)
ggemmeans(m, terms = "spp")
spp | Predicted | 95% CI
--------------------------------
GP | 1.11 | [0.66, 1.86]
PR | 0.42 | [0.11, 1.59]
DM | 1.32 | [0.81, 2.13]
EC-A | 0.75 | [0.37, 1.53]
EC-L | 1.81 | [1.09, 3.00]
DES-L | 2.00 | [1.25, 3.21]
DF | 0.99 | [0.61, 1.62]
ggeffects::ggeffect(m, terms="spp")
spp | Predicted | 95% CI
--------------------------------
GP | 1.14 | [0.69, 1.90]
PR | 0.44 | [0.12, 1.63]
DM | 1.36 | [0.85, 2.18]
EC-A | 0.78 | [0.39, 1.57]
EC-L | 1.87 | [1.13, 3.07]
DES-L | 2.06 | [1.30, 3.28]
DF | 1.02 | [0.63, 1.65]
Questions
Why are ggeffect and ggemmeans giving different results for the marginal effects? Is it simply something internal with how the packages emmeans and effects are computing them? Also, does anyone know of some resources on how to compute marginal effects from scratch for a model like that in the example?
You fit a complex model: a zero-inflated negative binomial model with random effects.
What you observe has little to do with the model specification. Let's show this by fitting a much simpler model: Poisson, with fixed effects only.
library("glmmTMB")
library("ggeffects")
m <- glmmTMB(
count ~ spp + mined,
family = poisson,
data = Salamanders
)
ggemmeans(m, terms = "spp")
#> # Predicted counts of count
#>
#> spp | Predicted | 95% CI
#> --------------------------------
#> GP | 0.73 | [0.59, 0.89]
#> PR | 0.18 | [0.12, 0.27]
#> DM | 0.91 | [0.76, 1.10]
#> EC-A | 0.34 | [0.25, 0.45]
#> EC-L | 1.35 | [1.15, 1.59]
#> DES-L | 1.43 | [1.22, 1.68]
#> DF | 0.79 | [0.64, 0.96]
ggeffect(m, terms = "spp")
#> # Predicted counts of count
#>
#> spp | Predicted | 95% CI
#> --------------------------------
#> GP | 0.76 | [0.62, 0.93]
#> PR | 0.19 | [0.13, 0.28]
#> DM | 0.96 | [0.79, 1.15]
#> EC-A | 0.35 | [0.26, 0.47]
#> EC-L | 1.41 | [1.20, 1.66]
#> DES-L | 1.50 | [1.28, 1.75]
#> DF | 0.82 | [0.67, 1.00]
The documentation explains that internally ggemmeans() calls emmeans::emmeans(), while ggeffect() calls effects::Effect().
Both emmeans and effects compute marginal effects, but they make different (default) choices about how to marginalize out (i.e., average over) mined in order to get the effect of spp.
mined is a categorical variable with two levels, "yes" and "no". The crucial bit is that the two levels are not balanced: there are slightly more "no"s than "yes"s.
xtabs(~ mined + spp, data = Salamanders)
#> spp
#> mined GP PR DM EC-A EC-L DES-L DF
#> yes 44 44 44 44 44 44 44
#> no 48 48 48 48 48 48 48
Intuitively, this means that the weighted average over mined [think of (44 × yes + 48 × no) / 92] is not the same as the simple average over mined [think of (yes + no) / 2].
Let's check the intuition by specifying how to marginalize out mined when we call emmeans::emmeans() directly.
# mean (default)
emmeans::emmeans(m, "spp", type = "response", weights = "equal")
#> spp rate SE df lower.CL upper.CL
#> GP 0.726 0.0767 636 0.590 0.893
#> PR 0.181 0.0358 636 0.123 0.267
#> DM 0.914 0.0879 636 0.757 1.104
#> EC-A 0.336 0.0497 636 0.251 0.449
#> EC-L 1.351 0.1120 636 1.148 1.590
#> DES-L 1.432 0.1163 636 1.221 1.679
#> DF 0.786 0.0804 636 0.643 0.961
#>
#> Results are averaged over the levels of: mined
#> Confidence level used: 0.95
#> Intervals are back-transformed from the log scale
# weighted mean
emmeans::emmeans(m, "spp", type = "response", weights = "proportional")
#> spp rate SE df lower.CL upper.CL
#> GP 0.759 0.0794 636 0.618 0.932
#> PR 0.190 0.0373 636 0.129 0.279
#> DM 0.955 0.0909 636 0.793 1.152
#> EC-A 0.351 0.0517 636 0.263 0.469
#> EC-L 1.412 0.1153 636 1.203 1.658
#> DES-L 1.496 0.1196 636 1.279 1.751
#> DF 0.822 0.0832 636 0.674 1.003
#>
#> Results are averaged over the levels of: mined
#> Confidence level used: 0.95
#> Intervals are back-transformed from the log scale
The second option returns the marginal effects computed with ggeffects::ggeffect.
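To address the "from scratch" part of the question, here is a minimal sketch of doing the averaging by hand for the simpler Poisson model m above. The key point (my assumption of how to mirror emmeans) is that the averaging happens on the link scale and is back-transformed afterwards:
# predictions on the link (log) scale for every spp x mined combination
nd <- expand.grid(spp = levels(Salamanders$spp),
                  mined = levels(Salamanders$mined))
nd$eta <- predict(m, newdata = nd)  # glmmTMB predicts on the link scale by default
# simple average over mined, then back-transform:
# should match ggemmeans() / emmeans(..., weights = "equal")
exp(tapply(nd$eta, nd$spp, mean))
# frequency-weighted average over mined:
# should match ggeffect() / emmeans(..., weights = "proportional")
w <- prop.table(table(Salamanders$mined))
exp(sapply(split(nd, nd$spp),
           function(d) sum(d$eta * w[as.character(d$mined)])))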
Update
@Daniel points out that ggeffects accepts the weights argument and passes it on to emmeans. This way you can keep using ggeffects and still control how predictions are averaged to compute marginal effects.
Try it out for yourself with:
ggemmeans(m, terms="spp", weights = "proportional")
ggemmeans(m, terms="spp", weights = "equal")

Extracting final p-value statistic from an lm lapply loop with multiple models

I have the following code that automatically runs lm() between my variable of interest (Kpl) and all my other variables (Y1, Y2, ..., Yi):
linear_summary <- lapply(testdata[,-1], function(x) summary(lm(Kpl ~ x)))
The output is:
$Y1
Call:
lm(formula = Kpl ~ x)
Residuals:
Min 1Q Median 3Q Max
-1.37567 -0.52392 0.04236 0.67444 0.81316
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.7282 0.3456 5.001 0.000402 ***
x -0.1550 0.2712 -0.571 0.579196
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.772 on 11 degrees of freedom
Multiple R-squared: 0.02883, Adjusted R-squared: -0.05946
F-statistic: 0.3265 on 1 and 11 DF, p-value: 0.5792
$Y2
Call:
lm(formula = Kpl ~ x)
Residuals:
Min 1Q Median 3Q Max
-1.2472 -0.4236 -0.2057 0.7140 1.0348
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.6900 0.9010 0.766 0.460
x 0.8832 0.8767 1.007 0.335
Residual standard error: 0.7495 on 11 degrees of freedom
Multiple R-squared: 0.08447, Adjusted R-squared: 0.001238
F-statistic: 1.015 on 1 and 11 DF, p-value: 0.3354
And so on (I have truncated the output to the first two models).
I want to extract the final p-value for the whole model in each instance (0.5792 and 0.3354 in these two cases), ideally in some sort of table with the associated variable, i.e. Y1 = 0.5792, Y2 = 0.3354.
Most of the information I can find either seems to work only for a single model (as opposed to an lapply over multiple models), or I cannot get it to work, which could be a problem with my original code.
Any suggestions for a person just starting with R on how to solve this?
Edit: The data looks something like this
| X        | Y1          | Y2          | Y3          | Y4          |
|----------|-------------|-------------|-------------|-------------|
| 0.33767 | 2.33063062 | 1.013212308 | 1.277996888 | 1.373238355 |
| 0.33767 | 0.095967324 | 0.508830529 | 0.789257027 | 0.815877121 |
| 1.010474 | 2.344657045 | 0.842490752 | 1.240582283 | 1.262360905 |
| 1.010474 | 0.08135992 | 0.912535398 | 0.384427466 | 0.409817599 |
| 1.183276 | 0.135626937 | 0.967877981 | 0.505801442 | 0.576288093 |
| 1.536974 | 1.507146148 | 1.428839993 | 1.316569449 | 1.392022619 |
| 1.536974 | 1.255210981 | 1.191822955 | 1.395769591 | 1.41903939 |
| 2.017965 | 1.410299711 | 1.121560244 | 1.369835675 | 1.385143026 |
| 2.017965 | 1.032587109 | 1.372235121 | 1.390878783 | 1.42741762 |
| 2.3436 | 1.275999998 | 0.930400789 | 1.19877482 | 1.217540034 |
| 2.3436 | 1.250513383 | 1.063880146 | 1.206719195 | 1.23325973 |
| 2.387598 | 0.182866909 | 0.89588293 | 0.416923749 | 0.45364797 |
| 2.387598 | 0.097133916 | 0.750430855 | 0.506463633 | 0.03434754 |
These are the actual values that I used to fit the models above.
I think the model p-value is not stored; you need to calculate it from the fstatistic component, maybe something like this:
set.seed(111)
testdata <- data.frame(Kpl = rnorm(100), Y1 = rnorm(100),
                       Y2 = rnorm(100), Y3 = rnorm(100))
IV <- colnames(testdata)[-1]
DV <- "Kpl"
linear_summary <- lapply(IV, function(x) {
  summary(lm(reformulate(response = DV, termlabels = x), data = testdata))
})
names(linear_summary) <- IV
tab <- lapply(IV, function(x) {
  p <- with(
    linear_summary[[x]],
    pf(fstatistic[1], fstatistic[2], fstatistic[3], lower.tail = FALSE)
  )
  data.frame(IV = x, p = p)
})
do.call(rbind, tab)
       IV         p
value  Y1 0.5757187
value1 Y2 0.4922582
value2 Y3 0.4009439
Check, for example, the first summary:
linear_summary[[1]]
Call:
lm(formula = reformulate(response = DV, termlabels = x), data = testdata)
Residuals:
     Min       1Q   Median       3Q      Max
-2.94515 -0.73325  0.05448  0.57901  2.76026
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.01382    0.10747  -0.129    0.898
Y1          -0.05950    0.10597  -0.562    0.576
Residual standard error: 1.075 on 98 degrees of freedom
Multiple R-squared: 0.003207, Adjusted R-squared: -0.006964
F-statistic: 0.3153 on 1 and 98 DF, p-value: 0.5757
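A more compact equivalent, reusing the linear_summary list from above, is to sapply() over it:
# one model p-value per entry, straight from the stored F statistics
sapply(linear_summary, function(s)
  pf(s$fstatistic[1], s$fstatistic[2], s$fstatistic[3], lower.tail = FALSE))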
OK, I edited my code in the following way:
library(purrr)
library(dplyr)
library(broom)
library(tidyr)
df %>% # Solution 1
  pivot_longer(-X) %>%
  group_split(name) %>%
  set_names(nm = map(., ~ first(.x$name))) %>%
  map(~ tidy(lm(X ~ value, data = .))) %>%
  bind_rows(.id = "var") %>%
  filter(term == "value")
# A tibble: 4 x 6
var term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Y1 value -0.155 0.271 -0.571 0.579
2 Y2 value 0.883 0.877 1.01 0.335
3 Y3 value 0.0341 0.552 0.0618 0.952
4 Y4 value -0.158 0.469 -0.337 0.743
Or you can use this:
df %>% # Solution 2
  pivot_longer(Y1:Y4) %>%
  group_by(name) %>%
  arrange(.by_group = TRUE) %>%
  nest() %>%
  mutate(models = map(data, ~ lm(X ~ value, data = .)),
         glance = map(models, glance)) %>%
  unnest(glance)
# A tibble: 4 x 15
# Groups: name [4]
name data models r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<chr> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Y1 <tibbl~ <lm> 0.0288 -0.0595 0.772 0.327 0.579 1 -14.0 34.0 35.7
2 Y2 <tibbl~ <lm> 0.0845 0.00124 0.750 1.01 0.335 1 -13.6 33.2 34.9
3 Y3 <tibbl~ <lm> 0.000348 -0.0905 0.783 0.00382 0.952 1 -14.2 34.4 36.1
4 Y4 <tibbl~ <lm> 0.0102 -0.0798 0.779 0.113 0.743 1 -14.1 34.2 35.9
# ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
I know you already got your answer, but here I present two other solutions; I thought it might be useful to learn alternative ways of dealing with the problem. Thank you for your question, it was a very good one.

Regression without intercept in R and Stata

Recently, I stumbled upon the fact that Stata and R handle regressions without an intercept differently. I'm not a statistician, so please be kind if my vocabulary is not ideal.
I have tried to make the example somewhat reproducible. This is my example in R:
> set.seed(20210211)
> df <- data.frame(y = runif(50), x = runif(50))
> df$d <- df$x > 0.5
>
> (tmp <- tempfile("data", fileext = ".csv"))
[1] "C:\\Users\\s1504gl\\AppData\\Local\\Temp\\1\\RtmpYtS6uk\\data1b2c1c4a96.csv"
> write.csv(df, tmp, row.names = FALSE)
>
> summary(lm(y ~ x + d, data = df))
Call:
lm(formula = y ~ x + d, data = df)
Residuals:
     Min       1Q   Median       3Q      Max
-0.48651 -0.27449  0.03828  0.22119  0.53347
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.4375     0.1038   4.214 0.000113 ***
x            -0.1026     0.3168  -0.324 0.747521
dTRUE         0.1513     0.1787   0.847 0.401353
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2997 on 47 degrees of freedom
Multiple R-squared: 0.03103, Adjusted R-squared: -0.0102
F-statistic: 0.7526 on 2 and 47 DF, p-value: 0.4767
> summary(lm(y ~ x + d + 0, data = df))
Call:
lm(formula = y ~ x + d + 0, data = df)
Residuals:
     Min       1Q   Median       3Q      Max
-0.48651 -0.27449  0.03828  0.22119  0.53347
Coefficients:
       Estimate Std. Error t value Pr(>|t|)
x       -0.1026     0.3168  -0.324 0.747521
dFALSE   0.4375     0.1038   4.214 0.000113 ***
dTRUE    0.5888     0.2482   2.372 0.021813 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2997 on 47 degrees of freedom
Multiple R-squared: 0.7196, Adjusted R-squared: 0.7017
F-statistic: 40.21 on 3 and 47 DF, p-value: 4.996e-13
And here is what I have in Stata (please note that I have copied the filename from R to Stata):
. import delimited "C:\Users\s1504gl\AppData\Local\Temp\1\RtmpYtS6uk\data1b2c1c4a96.csv"
(3 vars, 50 obs)
. encode d, generate(d_enc)
.
. regress y x i.d_enc
Source | SS df MS Number of obs = 50
-------------+---------------------------------- F(2, 47) = 0.75
Model | .135181652 2 .067590826 Prob > F = 0.4767
Residual | 4.22088995 47 .089806169 R-squared = 0.0310
-------------+---------------------------------- Adj R-squared = -0.0102
Total | 4.3560716 49 .08889942 Root MSE = .29968
------------------------------------------------------------------------------
y | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
x | -.1025954 .3168411 -0.32 0.748 -.7399975 .5348067
|
d_enc |
TRUE | .1512977 .1786527 0.85 0.401 -.2081052 .5107007
_cons | .4375371 .103837 4.21 0.000 .2286441 .6464301
------------------------------------------------------------------------------
. regress y x i.d_enc, noconstant
Source | SS df MS Number of obs = 50
-------------+---------------------------------- F(2, 48) = 38.13
Model | 9.23913703 2 4.61956852 Prob > F = 0.0000
Residual | 5.81541777 48 .121154537 R-squared = 0.6137
-------------+---------------------------------- Adj R-squared = 0.5976
Total | 15.0545548 50 .301091096 Root MSE = .34807
------------------------------------------------------------------------------
y | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
x | .976214 .2167973 4.50 0.000 .5403139 1.412114
|
d_enc |
TRUE | -.2322011 .1785587 -1.30 0.200 -.5912174 .1268151
------------------------------------------------------------------------------
As you can see, the results of the regression with an intercept are identical. But if I omit the intercept (+ 0 in R, , noconstant in Stata), the results differ. In R, the intercept is now captured in dFALSE, which is reasonable from what I understand, but I don't understand what Stata is doing here. Also, the degrees of freedom differ.
My questions:
Can anyone explain to me how Stata is handling this?
How can I replicate Stata's behavior in R?
I believe bas pointed in the right direction, but I am still unsure why both results differ.
I am not attempting to answer the question, but to provide a deeper understanding of what Stata is doing, by digging into what R's lm() does internally. In the following lines I replicate what lm() does, skipping sanity checks and options such as weights, contrasts, etc.
(I cannot yet fully understand why, in the second regression with no constant, the dFALSE coefficient captures the effect of the intercept from the default regression with a constant.)
set.seed(20210211)
df <- data.frame(y = runif(50), x = runif(50))
df$d <- df$x > 0.5
lm() With Constant
form_default <- as.formula(y ~ x + d)
mod_frame_def <- model.frame(form_default, df)
mod_matrix_def <- model.matrix(object = attr(mod_frame_def, "terms"), mod_frame_def)
head(mod_matrix_def)
#> (Intercept) x dTRUE
#> 1 1 0.7861162 1
#> 2 1 0.2059603 0
#> 3 1 0.9793946 1
#> 4 1 0.8569093 1
#> 5 1 0.8124811 1
#> 6 1 0.7769280 1
stats:::lm.fit(
y = model.response(mod_frame_def),
x = mod_matrix_def
)$coefficients
#> (Intercept) x dTRUE
#> 0.4375371 -0.1025954 0.1512977
lm() No Constant
form_nocon <- as.formula(y ~ x + d + 0)
mod_frame_nocon <- model.frame(form_nocon, df)
mod_matrix_nocon <- model.matrix(object = attr(mod_frame_nocon, "terms"), mod_frame_nocon)
head(mod_matrix_nocon)
#> x dFALSE dTRUE
#> 1 0.7861162 0 1
#> 2 0.2059603 1 0
#> 3 0.9793946 0 1
#> 4 0.8569093 0 1
#> 5 0.8124811 0 1
#> 6 0.7769280 0 1
stats:::lm.fit(
y = model.response(mod_frame_nocon),
x = mod_matrix_nocon
)$coefficients
#> x dFALSE dTRUE
#> -0.1025954 0.4375371 0.5888348
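One numeric relationship worth noting here: because dFALSE + dTRUE = 1, the two design matrices span the same column space, so dFALSE plays the role of the intercept and dTRUE absorbs the intercept plus the original dTRUE effect:
0.4375371 + 0.1512977
#> [1] 0.5888348  (exactly the dTRUE coefficient above)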
lm() with as.numeric()
[as indicated in the comments by bas]
form_asnum <- as.formula(y ~ x + as.numeric(d) + 0)
mod_frame_asnum <- model.frame(form_asnum, df)
mod_matrix_asnum <- model.matrix(object = attr(mod_frame_asnum, "terms"), mod_frame_asnum)
head(mod_matrix_asnum)
#> x as.numeric(d)
#> 1 0.7861162 1
#> 2 0.2059603 0
#> 3 0.9793946 1
#> 4 0.8569093 1
#> 5 0.8124811 1
#> 6 0.7769280 1
stats:::lm.fit(
y = model.response(mod_frame_asnum),
x = mod_matrix_asnum
)$coefficients
#> x as.numeric(d)
#> 0.9762140 -0.2322012
Created on 2021-03-18 by the reprex package (v1.0.0)
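As a follow-up, a minimal way to reproduce Stata's noconstant fit directly with lm() (same df as above) is to code the dummy numerically, so that R keeps a single 0/1 column instead of expanding both factor levels:
# matches the Stata output above: x = .976214, d = -.2322011
summary(lm(y ~ x + as.numeric(d) + 0, data = df))$coefficients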

Reproducing a result from R in Stata - Telling R or Stata to remove the same variables causing perfect collinearity/singularities

I am trying to reproduce a result from R in Stata (please note that the data below are fictitious and serve just as an example). For some reason, however, Stata appears to deal with certain issues differently than R: it chooses different dummy variables to drop in case of multicollinearity.
I have posted a related question dealing with the statistical implications of these country-year dummies being removed here.
In the example below, R drops 3 terms, while Stata drops 2, leading to different results. Check, for example, the coefficients and p-values for vote and votewon.
In essence, all I want to know is how to tell either R or Stata which variables to drop, so that they both do the same.
Data
The data looks as follows:
library(data.table)
library(dplyr)
library(foreign)
library(censReg)
library(wooldridge)
data('mroz')
year <- c(2005, 2010)
country <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
n <- 2
DT <- data.table(country = rep(sample(country, length(mroz), replace = TRUE), each = n),
                 year = c(replicate(length(mroz), sample(year, n))))
x <- DT
DT <- rbind(DT, DT); DT <- rbind(DT, DT); DT <- rbind(DT, DT); DT <- rbind(DT, DT); DT <- rbind(DT, x)
mroz <- mroz[-c(749:753), ]
DT <- cbind(mroz, DT)
DT <- DT %>%
  group_by(country) %>%
  mutate(base_rate = as.integer(runif(1, 12.5, 37.5))) %>%
  group_by(country, year) %>%
  mutate(taxrate = base_rate + as.integer(runif(1, -2.5, +2.5)))
DT <- DT %>%
  group_by(country, year) %>%
  mutate(vote = sample(c(0, 1), 1),
         votewon = ifelse(vote == 1, sample(c(0, 1), 1), 0))
rm(mroz, x, country, year)
The lm regression in R
summary(lm(educ ~ exper + I(exper^2) + vote + votewon + country:as.factor(year), data=DT))
Call:
lm(formula = educ ~ exper + I(exper^2) + vote + votewon + country:as.factor(year),
data = DT)
Residuals:
Min 1Q Median 3Q Max
-7.450 -0.805 -0.268 0.954 5.332
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.170064 0.418578 26.69 < 0.0000000000000002 ***
exper 0.103880 0.029912 3.47 0.00055 ***
I(exper^2) -0.002965 0.000966 -3.07 0.00222 **
vote 0.576865 0.504540 1.14 0.25327
votewon 0.622522 0.636241 0.98 0.32818
countryA:as.factor(year)2005 -0.196348 0.503245 -0.39 0.69653
countryB:as.factor(year)2005 -0.530681 0.616653 -0.86 0.38975
countryC:as.factor(year)2005 0.650166 0.552019 1.18 0.23926
countryD:as.factor(year)2005 -0.515195 0.638060 -0.81 0.41968
countryE:as.factor(year)2005 0.731681 0.502807 1.46 0.14605
countryG:as.factor(year)2005 0.213345 0.674642 0.32 0.75192
countryH:as.factor(year)2005 -0.811374 0.637254 -1.27 0.20334
countryI:as.factor(year)2005 0.584787 0.503606 1.16 0.24594
countryJ:as.factor(year)2005 0.554397 0.674789 0.82 0.41158
countryA:as.factor(year)2010 0.388603 0.503358 0.77 0.44035
countryB:as.factor(year)2010 -0.727834 0.617210 -1.18 0.23869
countryC:as.factor(year)2010 -0.308601 0.504041 -0.61 0.54056
countryD:as.factor(year)2010 0.785603 0.503165 1.56 0.11888
countryE:as.factor(year)2010 0.280305 0.452293 0.62 0.53562
countryG:as.factor(year)2010 0.672074 0.674721 1.00 0.31954
countryH:as.factor(year)2010 NA NA NA NA
countryI:as.factor(year)2010 NA NA NA NA
countryJ:as.factor(year)2010 NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.3 on 728 degrees of freedom
Multiple R-squared: 0.037, Adjusted R-squared: 0.0119
F-statistic: 1.47 on 19 and 728 DF, p-value: 0.0882
Same regression in Stata
write.dta(DT, "C:/Users/.../mroz_adapted.dta")
encode country, gen(n_country)
reg educ c.exper c.exper#c.exper vote votewon n_country#i.year
note: 9.n_country#2010.year omitted because of collinearity
note: 10.n_country#2010.year omitted because of collinearity
Source | SS df MS Number of obs = 748
-------------+---------------------------------- F(21, 726) = 1.80
Model | 192.989406 21 9.18997171 Prob > F = 0.0154
Residual | 3705.47583 726 5.1039612 R-squared = 0.0495
-------------+---------------------------------- Adj R-squared = 0.0220
Total | 3898.46524 747 5.21882897 Root MSE = 2.2592
---------------------------------------------------------------------------------
educ | Coef. Std. Err. t P>|t| [95% Conf. Interval]
----------------+----------------------------------------------------------------
exper | .1109858 .0297829 3.73 0.000 .052515 .1694567
|
c.exper#c.exper | -.0031891 .000963 -3.31 0.001 -.0050796 -.0012986
|
vote | .0697273 .4477115 0.16 0.876 -.8092365 .9486911
votewon | -.0147825 .6329659 -0.02 0.981 -1.257445 1.227879
|
n_country#year |
A#2010 | .0858634 .4475956 0.19 0.848 -.7928728 .9645997
B#2005 | -.4950677 .5003744 -0.99 0.323 -1.477421 .4872858
B#2010 | .0951657 .5010335 0.19 0.849 -.8884818 1.078813
C#2005 | -.5162827 .447755 -1.15 0.249 -1.395332 .3627664
C#2010 | -.0151834 .4478624 -0.03 0.973 -.8944434 .8640767
D#2005 | .3664596 .5008503 0.73 0.465 -.6168283 1.349747
D#2010 | .5119858 .500727 1.02 0.307 -.4710599 1.495031
E#2005 | .5837942 .6717616 0.87 0.385 -.7350329 1.902621
E#2010 | .185601 .5010855 0.37 0.711 -.7981486 1.169351
F#2005 | .5987978 .6333009 0.95 0.345 -.6445219 1.842117
F#2010 | .4853639 .7763936 0.63 0.532 -1.038881 2.009608
G#2005 | -.3341302 .6328998 -0.53 0.598 -1.576663 .9084021
G#2010 | .2873193 .6334566 0.45 0.650 -.956306 1.530945
H#2005 | -.4365233 .4195984 -1.04 0.299 -1.260294 .3872479
H#2010 | -.1683725 .6134262 -0.27 0.784 -1.372673 1.035928
I#2005 | -.39264 .7755549 -0.51 0.613 -1.915238 1.129958
I#2010 | 0 (omitted)
J#2005 | 1.036108 .4476018 2.31 0.021 .1573591 1.914856
J#2010 | 0 (omitted)
|
_cons | 11.58369 .350721 33.03 0.000 10.89514 12.27224
---------------------------------------------------------------------------------
Just to address your question about which "variables to kick out": I guess you mean which combination of interaction terms serves as the reference group for calculating the regression coefficients.
By default, Stata uses the combination of the lowest values of the two variables as the reference, while R uses the highest values. I use Stata's auto data to demonstrate this:
# In R
webuse::webuse("auto")
auto$foreign = as.factor(auto$foreign)
auto$rep78 = as.factor(auto$rep78)
# Model
r_model <- lm(mpg ~ rep78:foreign, data=auto)
broom::tidy(r_model)
# A tibble: 11 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 26.3 1.65 15.9 2.09e-23
2 rep781:foreign0 -5.33 3.88 -1.38 1.74e- 1
3 rep782:foreign0 -7.21 2.41 -2.99 4.01e- 3
4 rep783:foreign0 -7.33 1.91 -3.84 2.94e- 4
5 rep784:foreign0 -7.89 2.34 -3.37 1.29e- 3
6 rep785:foreign0 5.67 3.88 1.46 1.49e- 1
7 rep781:foreign1 NA NA NA NA
8 rep782:foreign1 NA NA NA NA
9 rep783:foreign1 -3.00 3.31 -0.907 3.68e- 1
10 rep784:foreign1 -1.44 2.34 -0.618 5.39e- 1
11 rep785:foreign1 NA NA NA NA
In Stata:
. reg mpg i.foreign#i.rep78
note: 1.foreign#1b.rep78 identifies no observations in the sample
note: 1.foreign#2.rep78 identifies no observations in the sample
Source | SS df MS Number of obs = 69
-------------+---------------------------------- F(7, 61) = 4.88
Model | 839.550121 7 119.935732 Prob > F = 0.0002
Residual | 1500.65278 61 24.6008652 R-squared = 0.3588
-------------+---------------------------------- Adj R-squared = 0.2852
Total | 2340.2029 68 34.4147485 Root MSE = 4.9599
-------------------------------------------------------------------------------
mpg | Coef. Std. Err. t P>|t| [95% Conf. Interval]
--------------+----------------------------------------------------------------
foreign#rep78 |
Domestic#2 | -1.875 3.921166 -0.48 0.634 -9.715855 5.965855
Domestic#3 | -2 3.634773 -0.55 0.584 -9.268178 5.268178
Domestic#4 | -2.555556 3.877352 -0.66 0.512 -10.3088 5.19769
Domestic#5 | 11 4.959926 2.22 0.030 1.082015 20.91798
Foreign#1 | 0 (empty)
Foreign#2 | 0 (empty)
Foreign#3 | 2.333333 4.527772 0.52 0.608 -6.720507 11.38717
Foreign#4 | 3.888889 3.877352 1.00 0.320 -3.864357 11.64213
Foreign#5 | 5.333333 3.877352 1.38 0.174 -2.419912 13.08658
|
_cons | 21 3.507197 5.99 0.000 13.98693 28.01307
-------------------------------------------------------------------------------
To reproduce the previous R results in Stata, we can recode the two variables foreign and rep78:
. reg mpg i.foreign2#i.rep2
note: 0b.foreign2#1.rep2 identifies no observations in the sample
note: 0b.foreign2#2.rep2 identifies no observations in the sample
Source | SS df MS Number of obs = 69
-------------+---------------------------------- F(7, 61) = 4.88
Model | 839.550121 7 119.935732 Prob > F = 0.0002
Residual | 1500.65278 61 24.6008652 R-squared = 0.3588
-------------+---------------------------------- Adj R-squared = 0.2852
Total | 2340.2029 68 34.4147485 Root MSE = 4.9599
-------------------------------------------------------------------------------
mpg | Coef. Std. Err. t P>|t| [95% Conf. Interval]
--------------+----------------------------------------------------------------
foreign2#rep2 |
0 1 | 0 (empty)
0 2 | 0 (empty)
0 3 | -3 3.306617 -0.91 0.368 -9.61199 3.61199
0 4 | -1.444444 2.338132 -0.62 0.539 -6.119827 3.230938
1 0 | 5.666667 3.877352 1.46 0.149 -2.086579 13.41991
1 1 | -5.333333 3.877352 -1.38 0.174 -13.08658 2.419912
1 2 | -7.208333 2.410091 -2.99 0.004 -12.02761 -2.389059
1 3 | -7.333333 1.909076 -3.84 0.000 -11.15077 -3.515899
1 4 | -7.888889 2.338132 -3.37 0.001 -12.56427 -3.213506
|
_cons | 26.33333 1.653309 15.93 0.000 23.02734 29.63933
-------------------------------------------------------------------------------
The same approach applies to reproducing the Stata results in R: just redefine the levels of those two factor variables.
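For example, a sketch of that level redefinition in R (assuming, per the explanation above, that R treats the highest levels as the reference, so reversing the level order should make R drop the same cells as Stata; foreign2 and rep2 mirror the recoded Stata names):
auto$foreign2 <- factor(auto$foreign, levels = rev(levels(auto$foreign)))
auto$rep2 <- factor(auto$rep78, levels = rev(levels(auto$rep78)))
broom::tidy(lm(mpg ~ rep2:foreign2, data = auto))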

Translating Stata regression into R

I am currently trying to translate a Stata regression into R. Here is the original code:
char ethnicity[omit]8
char cid[omit]3
xi: reg nationalism i.cid ib(8).ethnicity male age religious education income rural_now rural_prev killed [pw=stdwt] if warcountry ==1, cl(cid)
and here is what I have so far in terms of translating it into R:
lm(nationalism ~ cid + ethnicity + male + age + religious + education + income + rural_now + rural_prev + killed, data = tab5data)
My question is how to do the first portion of the Stata code (char ethnicity[omit]8). I know it sets the reference group, but I am unsure how to do that in R. Do I need to remove those groups from the original dataset, or run them in a separate regression altogether? Also, what exactly does ib(8) mean?
You can use relevel() in R. Both char ethnicity[omit]8 (the old xi syntax) and ib(8).ethnicity do the same thing in Stata: they set level 8 of ethnicity as the base (reference) level, which is exactly what relevel() does. The code below uses the user-written command rsource to run R from within Stata to show the equivalence:
. sysuse auto, clear
(1978 Automobile Data)
. saveold auto, version(12) replace
(saving in Stata 12 format, which can be read by Stata 11 or 12)
file auto.dta saved
.
. rsource, terminator(XXX)
Assumed R program path: "/usr/local/bin/R"
Beginning of R output
> library("foreign")
> mydata<-read.dta("~/Desktop/auto.dta")
> mydata$rep78 <- relevel(as.factor(mydata$rep78), ref = 4)
> m1<-lm(price ~ rep78,data = mydata)
> summary(m1)
Call:
lm(formula = price ~ rep78, data = mydata)
Residuals:
    Min      1Q  Median      3Q     Max
-3138.2 -1925.2 -1181.5   369.5  9476.8
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   6071.5      702.4   8.643 2.38e-12 ***
rep781       -1507.0     2221.3  -0.678    0.500
rep782        -103.9     1266.4  -0.082    0.935
rep783         357.7      888.5   0.403    0.689
rep785        -158.5     1140.6  -0.139    0.890
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2980 on 64 degrees of freedom
(5 observations deleted due to missingness)
Multiple R-squared: 0.01449, Adjusted R-squared: -0.0471
F-statistic: 0.2353 on 4 and 64 DF, p-value: 0.9174
>
End of R output
.
. /* Old Way */
. char rep78[omit]4
. xi: reg price i.rep78
i.rep78 _Irep78_1-5 (naturally coded; _Irep78_4 omitted)
Source | SS df MS Number of obs = 69
-------------+---------------------------------- F(4, 64) = 0.24
Model | 8360542.63 4 2090135.66 Prob > F = 0.9174
Residual | 568436416 64 8881819 R-squared = 0.0145
-------------+---------------------------------- Adj R-squared = -0.0471
Total | 576796959 68 8482308.22 Root MSE = 2980.2
------------------------------------------------------------------------------
price | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
_Irep78_1 | -1507 2221.338 -0.68 0.500 -5944.633 2930.633
_Irep78_2 | -103.875 1266.358 -0.08 0.935 -2633.715 2425.965
_Irep78_3 | 357.7333 888.5353 0.40 0.689 -1417.32 2132.787
_Irep78_5 | -158.5 1140.558 -0.14 0.890 -2437.026 2120.026
_cons | 6071.5 702.4489 8.64 0.000 4668.197 7474.803
------------------------------------------------------------------------------
.
. /* Post-Stata 11 Way */
. reg price ib4.rep78
Source | SS df MS Number of obs = 69
-------------+---------------------------------- F(4, 64) = 0.24
Model | 8360542.63 4 2090135.66 Prob > F = 0.9174
Residual | 568436416 64 8881819 R-squared = 0.0145
-------------+---------------------------------- Adj R-squared = -0.0471
Total | 576796959 68 8482308.22 Root MSE = 2980.2
------------------------------------------------------------------------------
price | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
rep78 |
1 | -1507 2221.338 -0.68 0.500 -5944.633 2930.633
2 | -103.875 1266.358 -0.08 0.935 -2633.715 2425.965
3 | 357.7333 888.5353 0.40 0.689 -1417.32 2132.787
5 | -158.5 1140.558 -0.14 0.890 -2437.026 2120.026
|
_cons | 6071.5 702.4489 8.64 0.000 4668.197 7474.803
------------------------------------------------------------------------------
. fvset base 4 rep78
. reg price i.rep78
Source | SS df MS Number of obs = 69
-------------+---------------------------------- F(4, 64) = 0.24
Model | 8360542.63 4 2090135.66 Prob > F = 0.9174
Residual | 568436416 64 8881819 R-squared = 0.0145
-------------+---------------------------------- Adj R-squared = -0.0471
Total | 576796959 68 8482308.22 Root MSE = 2980.2
------------------------------------------------------------------------------
price | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
rep78 |
1 | -1507 2221.338 -0.68 0.500 -5944.633 2930.633
2 | -103.875 1266.358 -0.08 0.935 -2633.715 2425.965
3 | 357.7333 888.5353 0.40 0.689 -1417.32 2132.787
5 | -158.5 1140.558 -0.14 0.890 -2437.026 2120.026
|
_cons | 6071.5 702.4489 8.64 0.000 4668.197 7474.803
------------------------------------------------------------------------------
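Putting the pieces together for the original regression, here is a hedged sketch of a fuller R translation (tab5data and the variable names come from the question; mapping Stata's [pw=stdwt] to lm() weights and cl(cid) to cluster-robust standard errors via the sandwich and lmtest packages is my reading of the intent, not something shown in the answer above):
library(sandwich)
library(lmtest)
# char ethnicity[omit]8 / ib(8).ethnicity: level 8 as the reference
tab5data$ethnicity <- relevel(as.factor(tab5data$ethnicity), ref = "8")
# char cid[omit]3: level 3 as the reference
tab5data$cid <- relevel(as.factor(tab5data$cid), ref = "3")
# [pw=stdwt] -> weights; if warcountry == 1 -> subset
fit <- lm(nationalism ~ cid + ethnicity + male + age + religious + education +
            income + rural_now + rural_prev + killed,
          data = tab5data, subset = warcountry == 1, weights = stdwt)
# cl(cid) -> cluster-robust standard errors
coeftest(fit, vcov = vcovCL(fit, cluster = ~cid))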
