Higher level cluster standard errors for panel data - panel-data
I want to estimate the cluster SE of a panel model (first differences) in R, with 100 groups, 6,156 individuals and 15 years. Some of
the individuals are repeated (4,201 unique) because they are part of a
matched sample obtained with a one-to-one, with replacement, matching
method.
I have been using plm to estimate the model coefficients, after
transforming my matched sample into a pdata.frame by using individuals
and years as indexes.
I have also been able to estimate the cluster
standard errors at the individual level by using the vcovHC function.
However, these individuals are clustered within the groups, and
therefore I want to cluster at this higher level of aggregation rather
than at the individual level.
Unfortunately, it is not clear to me how
to proceed. Of course if I replace the individuals with groups in the
index I get repeated row.names and then I can't estimate the panel
model with plm. I get the following error message:
Error in row.names<-.data.frame (*tmp*, value = c("1-1", "1-1",
"1-1", : duplicate 'row.names' are not allowed
For simplicity, I make my case using the following example (copied
from: http://www.richard-bluhm.com/clustered-ses-in-r-and-stata-2/):
# require packages
require(plm)
#> Loading required package: plm
require(lmtest)
#> Loading required package: lmtest
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
# get data and load as pdata.frame
url <- "http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt"
p.df <- read.table(url)
names(p.df) <- c("firmid", "year", "x", "y")
# Introduce group (State) Id
p.df$State <- rep(1:100, each = 50)
p.df2 <- pdata.frame(p.df, index = c("State", "year"), drop.index = F, row.names = T)
#> Warning in pdata.frame(p.df, index = c("State", "year"), drop.index = F, : duplicate couples (id-time) in resulting pdata.frame
#> to find out which, use e.g. table(index(your_pdataframe), useNA = "ifany")
# fit model with plm
pm1 <- plm(y ~ x, data = p.df2, model = "within") # this is where the error occurs.
#> Warning: non-unique values when setting 'row.names': '1-1', '1-10', '1-2',
#> '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9', '10-1', '10-10', '10-2',
#> '10-3', '10-4', '10-5', '10-6', '10-7', '10-8', '10-9', '100-1', '100-10',
#> '100-2', '100-3', '100-4', '100-5', '100-6', '100-7', '100-8', '100-9', '11-1',
#> '11-10', '11-2', '11-3', '11-4', '11-5', '11-6', '11-7', '11-8', '11-9', '12-1',
#> '12-10', '12-2', '12-3', '12-4', '12-5', '12-6', '12-7', '12-8', '12-9', '13-1',
#> '13-10', '13-2', '13-3', '13-4', '13-5', '13-6', '13-7', '13-8', '13-9', '14-1',
#> '14-10', '14-2', '14-3', '14-4', '14-5', '14-6', '14-7', '14-8', '14-9', '15-1',
#> '15-10', '15-2', '15-3', '15-4', '15-5', '15-6', '15-7', '15-8', '15-9', '16-1',
#> '16-10', '16-2', '16-3', '16-4', '16-5', '16-6', '16-7', '16-8', '16-9', '17-1',
#> '17-10', '17-2', '17-3', '17-4', '17-5', '17-6', '17-7', '17-8', '17-9', '18-1',
#> '18-10', '18-2', '18-3', '18-4', '18-5', '18-6', '18-7', '18-8', '18-9', '19-1',
#> '19-10', '19-2', '19-3', '19-4', '19-5', '19-6', '19-7', '19-8', '19-9', '2-1',
#> '2-10', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '20-1', '20-10',
#> '20-2', '20-3', '20-4', '20-5', '20-6', '20-7', '20-8', '20-9', '21-1', '21-10',
#> '21-2', '21-3', '21-4', '21-5', '21-6', '21-7', '21-8', '21-9', '22-1', '22-10',
#> '22-2', '22-3', '22-4', '22-5', '22-6', '22-7', '22-8', '22-9', '23-1', '23-10',
#> '23-2', '23-3', '23-4', '23-5', '23-6', '23-7', '23-8', '23-9', '24-1', '24-10',
#> '24-2', '24-3', '24-4', '24-5', '24-6', '24-7', '24-8', '24-9', '25-1', '25-10',
#> '25-2', '25-3', '25-4', '25-5', '25-6', '25-7', '25-8', '25-9', '26-1', '26-10',
#> '26-2', '26-3', '26-4', '26-5', '26-6', '26-7', '26-8', '26-9', '27-1', '27-10',
#> '27-2', '27-3', '27-4', '27-5', '27-6', '27-7', '27-8', '27-9', '28-1', '28-10',
#> '28-2', '28-3', '28-4', '28-5', '28-6', '28-7', '28-8', '28-9', '29-1', '29-10',
#> '29-2', '29-3', '29-4', '29-5', '29-6', '29-7', '29-8', '29-9', '3-1', '3-10',
#> '3-2', '3-3', '3-4', '3-5', '3-6', '3-7', '3-8', '3-9', '30-1', '30-10', '30-2',
#> '30-3', '30-4', '30-5', '30-6', '30-7', '30-8', '30-9', '31-1', '31-10', '31-2',
#> '31-3', '31-4', '31-5', '31-6', '31-7', '31-8', '31-9', '32-1', '32-10', '32-2',
#> '32-3', '32-4', '32-5', '32-6', '32-7', '32-8', '32-9', '33-1', '33-10', '33-2',
#> '33-3', '33-4', '33-5', '33-6', '33-7', '33-8', '33-9', '34-1', '34-10', '34-2',
#> '34-3', '34-4', '34-5', '34-6', '34-7', '34-8', '34-9', '35-1', '35-10', '35-2',
#> '35-3', '35-4', '35-5', '35-6', '35-7', '35-8', '35-9', '36-1', '36-10', '36-2',
#> '36-3', '36-4', '36-5', '36-6', '36-7', '36-8', '36-9', '37-1', '37-10', '37-2',
#> '37-3', '37-4', '37-5', '37-6', '37-7', '37-8', '37-9', '38-1', '38-10', '38-2',
#> '38-3', '38-4', '38-5', '38-6', '38-7', '38-8', '38-9', '39-1', '39-10', '39-2',
#> '39-3', '39-4', '39-5', '39-6', '39-7', '39-8', '39-9', '4-1', '4-10', '4-2',
#> '4-3', '4-4', '4-5', '4-6', '4-7', '4-8', '4-9', '40-1', '40-10', '40-2',
#> '40-3', '40-4', '40-5', '40-6', '40-7', '40-8', '40-9', '41-1', '41-10', '41-2',
#> '41-3', '41-4', '41-5', '41-6', '41-7', '41-8', '41-9', '42-1', '42-10', '42-2',
#> '42-3', '42-4', '42-5', '42-6', '42-7', '42-8', '42-9', '43-1', '43-10', '43-2',
#> '43-3', '43-4', '43-5', '43-6', '43-7', '43-8', '43-9', '44-1', '44-10', '44-2',
#> '44-3', '44-4', '44-5', '44-6', '44-7', '44-8', '44-9', '45-1', '45-10', '45-2',
#> '45-3', '45-4', '45-5', '45-6', '45-7', '45-8', '45-9', '46-1', '46-10', '46-2',
#> '46-3', '46-4', '46-5', '46-6', '46-7', '46-8', '46-9', '47-1', '47-10', '47-2',
#> '47-3', '47-4', '47-5', '47-6', '47-7', '47-8', '47-9', '48-1', '48-10', '48-2',
#> '48-3', '48-4', '48-5', '48-6', '48-7', '48-8', '48-9', '49-1', '49-10', '49-2',
#> '49-3', '49-4', '49-5', '49-6', '49-7', '49-8', '49-9', '5-1', '5-10', '5-2',
#> '5-3', '5-4', '5-5', '5-6', '5-7', '5-8', '5-9', '50-1', '50-10', '50-2',
#> '50-3', '50-4', '50-5', '50-6', '50-7', '50-8', '50-9', '51-1', '51-10', '51-2',
#> '51-3', '51-4', '51-5', '51-6', '51-7', '51-8', '51-9', '52-1', '52-10', '52-2',
#> '52-3', '52-4', '52-5', '52-6', '52-7', '52-8', '52-9', '53-1', '53-10', '53-2',
#> '53-3', '53-4', '53-5', '53-6', '53-7', '53-8', '53-9', '54-1', '54-10', '54-2',
#> '54-3', '54-4', '54-5', '54-6', '54-7', '54-8', '54-9', '55-1', '55-10', '55-2',
#> '55-3', '55-4', '55-5', '55-6', '55-7', '55-8', '55-9', '56-1', '56-10', '56-2',
#> '56-3', '56-4', '56-5', '56-6', '56-7', '56-8', '56-9', '57-1', '57-10', '57-2',
#> '57-3', '57-4', '57-5', '57-6', '57-7', '57-8', '57-9', '58-1', '58-10', '58-2',
#> '58-3', '58-4', '58-5', '58-6', '58-7', '58-8', '58-9', '59-1', '59-10', '59-2',
#> '59-3', '59-4', '59-5', '59-6', '59-7', '59-8', '59-9', '6-1', '6-10', '6-2',
#> '6-3', '6-4', '6-5', '6-6', '6-7', '6-8', '6-9', '60-1', '60-10', '60-2',
#> '60-3', '60-4', '60-5', '60-6', '60-7', '60-8', '60-9', '61-1', '61-10', '61-2',
#> '61-3', '61-4', '61-5', '61-6', '61-7', '61-8', '61-9', '62-1', '62-10', '62-2',
#> '62-3', '62-4', '62-5', '62-6', '62-7', '62-8', '62-9', '63-1', '63-10', '63-2',
#> '63-3', '63-4', '63-5', '63-6', '63-7', '63-8', '63-9', '64-1', '64-10', '64-2',
#> '64-3', '64-4', '64-5', '64-6', '64-7', '64-8', '64-9', '65-1', '65-10', '65-2',
#> '65-3', '65-4', '65-5', '65-6', '65-7', '65-8', '65-9', '66-1', '66-10', '66-2',
#> '66-3', '66-4', '66-5', '66-6', '66-7', '66-8', '66-9', '67-1', '67-10', '67-2',
#> '67-3', '67-4', '67-5', '67-6', '67-7', '67-8', '67-9', '68-1', '68-10', '68-2',
#> '68-3', '68-4', '68-5', '68-6', '68-7', '68-8', '68-9', '69-1', '69-10', '69-2',
#> '69-3', '69-4', '69-5', '69-6', '69-7', '69-8', '69-9', '7-1', '7-10', '7-2',
#> '7-3', '7-4', '7-5', '7-6', '7-7', '7-8', '7-9', '70-1', '70-10', '70-2',
#> '70-3', '70-4', '70-5', '70-6', '70-7', '70-8', '70-9', '71-1', '71-10', '71-2',
#> '71-3', '71-4', '71-5', '71-6', '71-7', '71-8', '71-9', '72-1', '72-10', '72-2',
#> '72-3', '72-4', '72-5', '72-6', '72-7', '72-8', '72-9', '73-1', '73-10', '73-2',
#> '73-3', '73-4', '73-5', '73-6', '73-7', '73-8', '73-9', '74-1', '74-10', '74-2',
#> '74-3', '74-4', '74-5', '74-6', '74-7', '74-8', '74-9', '75-1', '75-10', '75-2',
#> '75-3', '75-4', '75-5', '75-6', '75-7', '75-8', '75-9', '76-1', '76-10', '76-2',
#> '76-3', '76-4', '76-5', '76-6', '76-7', '76-8', '76-9', '77-1', '77-10', '77-2',
#> '77-3', '77-4', '77-5', '77-6', '77-7', '77-8', '77-9', '78-1', '78-10', '78-2',
#> '78-3', '78-4', '78-5', '78-6', '78-7', '78-8', '78-9', '79-1', '79-10', '79-2',
#> '79-3', '79-4', '79-5', '79-6', '79-7', '79-8', '79-9', '8-1', '8-10', '8-2',
#> '8-3', '8-4', '8-5', '8-6', '8-7', '8-8', '8-9', '80-1', '80-10', '80-2',
#> '80-3', '80-4', '80-5', '80-6', '80-7', '80-8', '80-9', '81-1', '81-10', '81-2',
#> '81-3', '81-4', '81-5', '81-6', '81-7', '81-8', '81-9', '82-1', '82-10', '82-2',
#> '82-3', '82-4', '82-5', '82-6', '82-7', '82-8', '82-9', '83-1', '83-10', '83-2',
#> '83-3', '83-4', '83-5', '83-6', '83-7', '83-8', '83-9', '84-1', '84-10', '84-2',
#> '84-3', '84-4', '84-5', '84-6', '84-7', '84-8', '84-9', '85-1', '85-10', '85-2',
#> '85-3', '85-4', '85-5', '85-6', '85-7', '85-8', '85-9', '86-1', '86-10', '86-2',
#> '86-3', '86-4', '86-5', '86-6', '86-7', '86-8', '86-9', '87-1', '87-10', '87-2',
#> '87-3', '87-4', '87-5', '87-6', '87-7', '87-8', '87-9', '88-1', '88-10', '88-2',
#> '88-3', '88-4', '88-5', '88-6', '88-7', '88-8', '88-9', '89-1', '89-10', '89-2',
#> '89-3', '89-4', '89-5', '89-6', '89-7', '89-8', '89-9', '9-1', '9-10', '9-2',
#> '9-3', '9-4', '9-5', '9-6', '9-7', '9-8', '9-9', '90-1', '90-10', '90-2',
#> '90-3', '90-4', '90-5', '90-6', '90-7', '90-8', '90-9', '91-1', '91-10', '91-2',
#> '91-3', '91-4', '91-5', '91-6', '91-7', '91-8', '91-9', '92-1', '92-10', '92-2',
#> '92-3', '92-4', '92-5', '92-6', '92-7', '92-8', '92-9', '93-1', '93-10', '93-2',
#> '93-3', '93-4', '93-5', '93-6', '93-7', '93-8', '93-9', '94-1', '94-10', '94-2',
#> '94-3', '94-4', '94-5', '94-6', '94-7', '94-8', '94-9', '95-1', '95-10', '95-2',
#> '95-3', '95-4', '95-5', '95-6', '95-7', '95-8', '95-9', '96-1', '96-10', '96-2',
#> '96-3', '96-4', '96-5', '96-6', '96-7', '96-8', '96-9', '97-1', '97-10', '97-2',
#> '97-3', '97-4', '97-5', '97-6', '97-7', '97-8', '97-9', '98-1', '98-10', '98-2',
#> '98-3', '98-4', '98-5', '98-6', '98-7', '98-8', '98-9', '99-1', '99-10', '99-2',
#> '99-3', '99-4', '99-5', '99-6', '99-7', '99-8', '99-9'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
Created on 2020-05-29 by the reprex package (v0.3.0)
I wrote a package (clubTamal) when I encountered the same issue. clubTamal transforms a plm object (by re-estimation) into an lm object in order to be able to cluster standard errors using the multiwayvcov package. You find an RPubs example and documentation here: https://rpubs.com/eliascis/clubTamal.
The package works for plm estimations with Fixed-Effects (model='within') or First-Difference models (model='fd').
To obtain a clustered covariance matrix use the vcovTamal command.
The package is still under development but can be installed directly from github:
library(devtools)
install_github("eliascis/clubTamal")
Unfortunately the link to your example data does not work, but clubTamal further installs spd4testing, which constructs a simulated Small Panel Data set for testing purposes.
## packages
library(foreign)
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
library(plm)
library(multiwayvcov)
library(spd4testing)
library(clubTamal)
## simulated data
d <- spd4testing()
## formula
f <- formula(y ~ x + factor(year))
## standard estimation
e <- plm(formula = f, data = d, model = "fd")
summary(e)
#> Oneway (individual) effect First-Difference Model
#>
#> Call:
#> plm(formula = f, data = d, model = "fd")
#>
#> Unbalanced Panel: n = 6, T = 3-5, N = 26
#> Observations used in estimation: 20
#>
#> Residuals:
#> Min. 1st Qu. Median 3rd Qu. Max.
#> -250.333 -115.219 12.651 108.390 228.110
#>
#> Coefficients:
#> Estimate Std. Error t-value Pr(>|t|)
#> (Intercept) -80.937 199.405 -0.4059 0.69096
#> x 71.858 25.974 2.7666 0.01514 *
#> factor(year)2002 194.842 216.449 0.9002 0.38325
#> factor(year)2003 109.118 414.298 0.2634 0.79609
#> factor(year)2004 446.147 583.234 0.7650 0.45700
#> factor(year)2005 451.514 752.479 0.6000 0.55807
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Total Sum of Squares: 1078300
#> Residual Sum of Squares: 394270
#> R-Squared: 0.63435
#> Adj. R-Squared: 0.50376
#> F-statistic: 4.85757 on 5 and 14 DF, p-value: 0.0087377
e <- plm(formula = f, data = d, model = "within")
summary(e)
#> Oneway (individual) effect Within Model
#>
#> Call:
#> plm(formula = f, data = d, model = "within")
#>
#> Unbalanced Panel: n = 6, T = 3-5, N = 26
#>
#> Residuals:
#> Min. 1st Qu. Median 3rd Qu. Max.
#> -167.4294 -59.3741 -6.9404 73.7132 146.4199
#>
#> Coefficients:
#> Estimate Std. Error t-value Pr(>|t|)
#> x 72.362 23.434 3.0879 0.007501 **
#> factor(year)2002 113.786 77.276 1.4725 0.161569
#> factor(year)2003 -67.413 75.012 -0.8987 0.383013
#> factor(year)2004 200.420 83.649 2.3960 0.030062 *
#> factor(year)2005 127.170 81.030 1.5694 0.137400
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Total Sum of Squares: 441370
#> Residual Sum of Squares: 190660
#> R-Squared: 0.56803
#> Adj. R-Squared: 0.28005
#> F-statistic: 3.94491 on 5 and 15 DF, p-value: 0.017501
## clustering
# no clustering
v <- e$vcov
coeftest(e)
#>
#> t test of coefficients:
#>
#> Estimate Std. Error t value Pr(>|t|)
#> x 72.362 23.434 3.0879 0.007501 **
#> factor(year)2002 113.786 77.276 1.4725 0.161569
#> factor(year)2003 -67.413 75.012 -0.8987 0.383013
#> factor(year)2004 200.420 83.649 2.3960 0.030062 *
#> factor(year)2005 127.170 81.030 1.5694 0.137400
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# clustering at id level with plm-package
v <- vcovHC(e, type = "HC1", cluster = "group", tol = 1 * 10^-20)
coeftest(e, v)
#>
#> t test of coefficients:
#>
#> Estimate Std. Error t value Pr(>|t|)
#> x 72.362 24.586 2.9433 0.010070 *
#> factor(year)2002 113.786 76.548 1.4865 0.157870
#> factor(year)2003 -67.413 63.962 -1.0540 0.308585
#> factor(year)2004 200.420 61.464 3.2608 0.005266 **
#> factor(year)2005 127.170 67.310 1.8893 0.078338 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## clustering at group level with clubTamal
v <- vcovTamal(estimate = e, data = d, groupvar = "gid")
#> Error in vcovTamal(estimate = e, data = d, groupvar = "gid"): better use the very fast and powerful lfe::felm
coeftest(e, v)
#>
#> t test of coefficients:
#>
#> Estimate Std. Error t value Pr(>|t|)
#> x 72.362 24.586 2.9433 0.010070 *
#> factor(year)2002 113.786 76.548 1.4865 0.157870
#> factor(year)2003 -67.413 63.962 -1.0540 0.308585
#> factor(year)2004 200.420 61.464 3.2608 0.005266 **
#> factor(year)2005 127.170 67.310 1.8893 0.078338 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2020-05-29 by the reprex package (v0.3.0)
Related
bootstrap within groups in R
I have a data frame with response ratios for multiple locations and each location is assigned to a group (region). I want to generate a regression for each group (region) that uses Response Ratio (RR) as the response, location as the unit of replication, and each soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type but I am not sure how to generate this. #sample data df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), predictor = rep(c('soil1','soil2','soil3','soil4'), 25), RR=rnorm(200) ) Adding script from #Rui below. I actually have a multiple regression and so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature. library(boot) bootfun <- function(data, i) { d <- data[i,] fit <- lm(RR ~ soil_type + temperature, data = d) coef(fit) } set.seed(2022) set.seed(123) df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), soil_type = rep(c('soil1','soil2','soil3','soil4'), 25), temperature = abs(rnorm(100, 2,1.75)), RR=rnorm(200), stringsAsFactors = TRUE ) R <- 1000 b_list <- by(df, df$group, \(X) { boot(X, bootfun, R, strata = X$subgroup) }) b_list$region1
Function boot is base package boot has an argument strata. Split by group and apply a boot function with, for instance, by stratifying by location. library(boot) bootfun <- function(data, i) { d <- data[i,] fit <- lm(RR ~ predictor, data = d) coef(fit) } set.seed(2022) df <- data.frame( group=rep(c('region1','region2'), 100), subgroup=rep(c('location1','location2', 'location2', 'location1'), 25), predictor = rep(c('soil1','soil2','soil3','soil4'), 25), RR=rnorm(200), stringsAsFactors = TRUE ) R <- 1000 b_list <- by(df, df$group, \(X) { boot(X, bootfun, R, strata = X$subgroup) }) b_list$region1 #> #> STRATIFIED BOOTSTRAP #> #> #> Call: #> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup) #> #> #> Bootstrap Statistics : #> original bias std. error #> t1* -0.2608885 0.000469295 0.1541482 #> t2* 0.3502007 -0.004239248 0.2083503 b_list$region2 #> #> STRATIFIED BOOTSTRAP #> #> #> Call: #> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup) #> #> #> Bootstrap Statistics : #> original bias std. 
error #> t1* -0.03727332 -0.0001557172 0.1422502 #> t2* 0.11987005 0.0016393125 0.1952310 lapply(b_list, boot.ci) #> Warning in sqrt(tv[, 2L]): NaNs produced #> Warning in sqrt(tv[, 2L]): NaNs produced #> $region1 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> FUN(boot.out = X[[i]]) #> #> Intervals : #> Level Normal Basic Studentized #> 95% (-0.5635, 0.0408 ) (-0.5611, 0.0545 ) (-0.8227, -0.0225 ) #> #> Level Percentile BCa #> 95% (-0.5762, 0.0393 ) (-0.5733, 0.0446 ) #> Calculations and Intervals on Original Scale #> #> $region2 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> FUN(boot.out = X[[i]]) #> #> Intervals : #> Level Normal Basic Studentized #> 95% (-0.3159, 0.2417 ) (-0.3260, 0.2460 ) (-0.3493, 0.1757 ) #> #> Level Percentile BCa #> 95% (-0.3206, 0.2514 ) (-0.3321, 0.2352 ) #> Calculations and Intervals on Original Scale Created on 2022-10-25 with reprex v2.0.2 Edit To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, according to the original data partition. The inner loop is on index, meaning, on the matrix t returned by boot, see help("boot"), section Value. The index are the column numbers in any of b_list$region1$t b_list$region2$t each of them with 3 columns. 
library(boot) npars <- ncol(b_list$region1$t) ci_list <- lapply(b_list, \(region) { ci <- lapply(seq.int(npars), \(index) { boot.ci(region, index = index, type = c("norm","basic", "perc", "bca")) }) setNames(ci, c("Intercept", "soil", "temperature")) }) ci_list$region1$Intercept #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = region, type = c("norm", "basic", "perc", #> "bca"), index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.2517, 0.6059 ) (-0.2423, 0.6043 ) #> #> Level Percentile BCa #> 95% (-0.2410, 0.6056 ) (-0.2414, 0.6048 ) #> Calculations and Intervals on Original Scale ci_list$region2$temperature #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = region, type = c("norm", "basic", "perc", #> "bca"), index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.2317, 0.0420 ) (-0.2416, 0.0404 ) #> #> Level Percentile BCa #> 95% (-0.2278, 0.0542 ) (-0.2265, 0.0570 ) #> Calculations and Intervals on Original Scale Created on 2022-10-25 with reprex v2.0.2 Edit 2 Like I say in a comment below, in the new data the soil type uniquely identifies pairs of region and location, unique(df[1:3]) shows it. And it becomes useless to split by group and stratify within groups. 
bootfun2 <- function(data, i) { d <- data[i,] fit <- lm(RR ~ temperature + soil_type, data = d) coef(fit) } unique(df[1:3]) # soil type uniquely identifies region/location #> group subgroup soil_type #> 1 region1 location1 soil1 #> 2 region2 location2 soil2 #> 3 region1 location2 soil3 #> 4 region2 location1 soil4 fit <- lm(RR ~ temperature + soil_type, data = df) coef(fit) #> (Intercept) temperature soil_typesoil2 soil_typesoil3 soil_typesoil4 #> 0.25928498 -0.06352205 -0.17739104 -0.05243836 -0.20408527 set.seed(2022) R <- 1000 b_3 <- boot(df, bootfun2, R) b_3 #> #> ORDINARY NONPARAMETRIC BOOTSTRAP #> #> #> Call: #> boot(data = df, statistic = bootfun2, R = R) #> #> #> Bootstrap Statistics : #> original bias std. error #> t1* 0.25928498 0.005724634 0.18033509 #> t2* -0.06352205 -0.002910677 0.05161868 #> t3* -0.17739104 0.004932486 0.18665594 #> t4* -0.05243836 0.005796168 0.19602658 #> t5* -0.20408527 0.004914674 0.20355549 btype <- c("norm","basic", "perc", "bca") ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) { boot.ci(b_3, type = btype, index = index) }) names(ci_list3) <- names(coef(fit)) ci_list3 #> $`(Intercept)` #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.0999, 0.6070 ) (-0.0868, 0.6172 ) #> #> Level Percentile BCa #> 95% (-0.0986, 0.6054 ) (-0.0992, 0.6034 ) #> Calculations and Intervals on Original Scale #> #> $temperature #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.1618, 0.0406 ) (-0.1631, 0.0401 ) #> #> Level Percentile BCa #> 95% (-0.1672, 0.0360 ) (-0.1552, 0.0503 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil2 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> 
boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.5482, 0.1835 ) (-0.5541, 0.1955 ) #> #> Level Percentile BCa #> 95% (-0.5503, 0.1994 ) (-0.5542, 0.1927 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil3 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.4424, 0.3260 ) (-0.4399, 0.3068 ) #> #> Level Percentile BCa #> 95% (-0.4117, 0.3350 ) (-0.4116, 0.3350 ) #> Calculations and Intervals on Original Scale #> #> $soil_typesoil4 #> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS #> Based on 1000 bootstrap replicates #> #> CALL : #> boot.ci(boot.out = b_3, type = btype, index = index) #> #> Intervals : #> Level Normal Basic #> 95% (-0.6080, 0.1900 ) (-0.6116, 0.2127 ) #> #> Level Percentile BCa #> 95% (-0.6208, 0.2035 ) (-0.6284, 0.1801 ) #> Calculations and Intervals on Original Scale Created on 2022-10-25 with reprex v2.0.2
How to fix error in step function (stepcAIC): rep(no, length.out = len)
I got the following error running the stepcAIC function in a linear mixed effect model (lmer): Fehler in rep(no, length.out = len) : attempt to replicate an object of type 'language' I don't understand what the error is saying. All the factors are declared as "factors" and all the other variables as "numeric". The storage.mode is "integer" for the factors and "double" for the other variables. This is my model and the step function: biom_FULLO<-lmer((above_bio)~MUM_germ_time+MUM_num_seed+MUM_av_seed_mass+ MUM_num_inf+MUM_above_bio+MUM_total_bio+MUM_inf_size+MUM_root_bio+ MUM_CV_seed_mass+MUM_CV_SEM_per_inflor+MUM_CV_inflor_size+ MUM_seed_weight+germ_date_year+germ_time+flow_start_date+ height_3m+height_flow+num_inf+seed_gen+Early+ Late+seed_gen:Early+seed_gen:Late+ Early:Late+seed_gen:Early:Late+(1|ID_year), DT_gen_biom) biom_step<-stepcAIC(biom_FULLO, direction = "backward", trace = FALSE, data = DT_gen_biom) Any idea anyone? P.D.: traceback() 6: ifelse(wasGam, formula(modelInit$gam)[[2]], formula(modelInit)[[2]]) 5: makeFormula(x, modelInit) 4: FUN(X[[i]], ...) 3: lapply(newSetup, function(x) makeFormula(x, modelInit)) 2: calculateAllCAICs(newSetup = newSetup, modelInit = object, numCores = numCores, data = data, calcNonOptimMod = calcNonOptimMod, nrmods = numberOfSavedModels, ...) 1: stepcAIC(biom_FULLO, direction = "backward", trace = FALSE, data = DT_gen_biom)
Just remove the parenthesis of your response variable in the LHS of your formula: library(cAIC4) #> Loading required package: lme4 #> Loading required package: Matrix #> Loading required package: stats4 #> Loading required package: nlme #> #> Attaching package: 'nlme' #> The following object is masked from 'package:lme4': #> #> lmList library(lme4) m1 <- lmer((Sepal.Length) ~ Sepal.Width + (1|Species), data = iris) stepcAIC(m1) #> Warning in nobars(formula(modelInit)) == formula(modelInit)[[2]]: longer object #> length is not a multiple of shorter object length #> Error in rep(no, length.out = len): attempt to replicate an object of type 'language' m2 <- lmer(Sepal.Length ~ Sepal.Width + (1|Species), data = iris) stepcAIC(m2) #> $finalModel #> Linear mixed model fit by REML ['lmerMod'] #> Formula: Sepal.Length ~ Sepal.Width + (1 | Species) #> Data: iris #> REML criterion at convergence: 194.6361 #> Random effects: #> Groups Name Std.Dev. #> Species (Intercept) 1.010 #> Residual 0.438 #> Number of obs: 150, groups: Species, 3 #> Fixed Effects: #> (Intercept) Sepal.Width #> 3.4062 0.7972 #> #> $additionalModels #> NULL #> #> $bestCAIC #> [1] 184.0044 Created on 2022-02-11 by the reprex package (v2.0.0)
Calculating AIC for Fixed Effect logit from bife package
I would like to ask how to calculace inf. criteria such as AIC, etc... for Fixed effect logit model from bife package. Basic summmary output does NOT include AIC, how ever when looking at: Goodness-of-fit for fixed effect logit model using 'bife' package The AIC criterium was computed. how ever I do no have it in my summary output nor log-likelihood. dta = bife::psid mod_logit <- bife(LFP ~ AGE + I(INCH / 1000) + KID1 + KID2 + KID3 | ID, data = dta, bias_corr = "ana") summary(mod_logit)
If you check bife code, AIC was computed in earlier versions at least in version 0.5. You might be using the current version 0.6 in which AIC is no longer included. If you do not mind using the older version, try the following: remove the current version from your library. download version 0.5 from CRAN website: https://cran.r-project.org/src/contrib/Archive/bife/ install to your computer: install.packages("D:\\bife_0.5.tar.gz", repos = NULL, type="source"). Assuming it is stored on D: drive. Or: require(devtools) install_version("bife", version = "0.5", repos = "http://cran.us.r-project.org") If successfully installed, run the following with AIC included: library(bife) dta = bife::psid mod_logit <- bife(LFP ~ AGE + I(INCH / 1000) + KID1 + KID2 + KID3 | ID, data = dta, bias_corr = "ana") summary(mod_logit) #> --------------------------------------------------------------- #> Fixed effects logit model #> with analytical bias-correction #> #> Estimated model: #> LFP ~ AGE + I(INCH/1000) + KID1 + KID2 + KID3 | ID #> #> Log-Likelihood= -3045.505 #> n= 13149, number of events= 9516 #> Demeaning converged after 5 iteration(s) #> Offset converged after 3 iteration(s) #> #> Corrected structural parameter(s): #> #> Estimate Std. error t-value Pr(> t) #> AGE 0.033945 0.012990 2.613 0.00898 ** #> I(INCH/1000) -0.007630 0.001993 -3.829 0.00013 *** #> KID1 -1.052985 0.096186 -10.947 < 2e-16 *** #> KID2 -0.509178 0.084510 -6.025 1.74e-09 *** #> KID3 -0.010562 0.060413 -0.175 0.86121 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> AIC= 9023.011 , BIC= 19994.7 #> #> #> Average individual fixed effects= 0.0122 #> --------------------------------------------------------------- Created on 2020-01-09 by the reprex package (v0.3.0)
Reorder x axis using plot_model() from sjPlot
I have run a binomial logistic regression model in R using the lme4 package. Now, I want to plot the estimated marginal means for the model, so I have installed the sjPlot package and I have used the plot_model() function. My x axis includes three variables corresponding to three different groups: "L1", "HS", and "L2". I want to have the three variables in that precise order. However, when I plot the model, I get "HS" before "L1", because the labels appear in alphabetical order. I would like to change the order of those two labels and I know how to do that in a dataframe, but not when plotting a model with that function. Any ideas on how to reorder my x axis using the sjPlot package?
You can change the order of the coefficients using the order.terms-argument. Note that the numbers for this argument correspond to the position of the summary. Example: library(sjPlot) library(sjlabelled) data(efc) efc <- as_factor(efc, c161sex, e42dep, c172code) m <- lm(neg_c_7 ~ pos_v_4 + c12hour + e42dep + c172code, data = efc) plot_model(m, auto.label = F) summary(m) #> #> Call: #> lm(formula = neg_c_7 ~ pos_v_4 + c12hour + e42dep + c172code, #> data = efc) #> #> Residuals: #> Min 1Q Median 3Q Max #> -6.5411 -2.0797 -0.5183 1.3256 19.1412 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) #> (Intercept) 17.65938 0.82864 21.311 < 2e-16 *** #> pos_v_4 -0.66552 0.05163 -12.890 < 2e-16 *** #> c12hour 0.01134 0.00270 4.201 2.95e-05 *** #> e42dep2 0.84189 0.47605 1.768 0.077355 . #> e42dep3 1.73616 0.47118 3.685 0.000244 *** #> e42dep4 3.10107 0.50470 6.144 1.26e-09 *** #> c172code2 0.12894 0.28832 0.447 0.654844 #> c172code3 0.69876 0.36649 1.907 0.056922 . #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> #> Residual standard error: 3.27 on 810 degrees of freedom #> (90 observations deleted due to missingness) #> Multiple R-squared: 0.2981, Adjusted R-squared: 0.292 #> F-statistic: 49.15 on 7 and 810 DF, p-value: < 2.2e-16 # according to summary, order of coefficients: # 1=pos_v_4, 2=c12hour, 3=e42dep2, 4=e42dep3, ... plot_model(m, auto.label = F, order.terms = c(1,2,4,5,3,6,7)) Created on 2019-05-08 by the reprex package (v0.2.1)
Log-rank test with time-dependent variable [duplicate]
Background: at half-year follow up times for 4y, patients may switch to a different medication group. To account for this, I've converted survival data into counting process form. I want to compare survival curves for medication groups A, B, and C. I am using an extended Cox model but want to do pairwise comparisons of each hazard function or do stratified log-rank tests. pairwise_survdiff throws an error because of the form of my data, I think. Example data: x<-data.frame(tstart=rep(seq(0,18,6),3),tstop=rep(seq(6,24,6),3), rx = rep(c("A","B","C"),4), death=c(rep(0,11),1)) x Problem: When using survdiff in the survival package, survdiff(Surv(tstart,tstop,death) ~ rx, data = x) I get the error: Error in survdiff(Surv(tstart, tstop, death) ~ rx, data = x) : Right censored data only I think this stems from the counting process form, since I can't find an example online that compares survival curves for time-varying covariates. Question: is there a quick fix to this problem? Or, is there an alternative package/function with the same versatility to compare survival curves, namely using different methods? How can I implement stratified log-rank tests using survidff on counting process form data? NOTE: this was marked as a known issue in the survminer package, see github issue here, but updating survminer did not solve my issue, and using one time interval, tstop-tstart wouldn't be correct, since that would leave, e.g., multiple entries at 6 months rather than out to the actual interval of risk.
So, here is an example of fitting the model and making the multiple comparisons using multcomp package. Note that this implicitly assumes that administration of treatments A-C is random. Depending on the assumptions about the process, it might be better to fit a multistate model with transitions between treatments and outcome. library(purrr) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union library(survival) library(multcomp) #> Loading required package: mvtnorm #> Loading required package: TH.data #> Loading required package: MASS #> #> Attaching package: 'MASS' #> The following object is masked from 'package:dplyr': #> #> select #> #> Attaching package: 'TH.data' #> The following object is masked from 'package:MASS': #> #> geyser # simulate survival data set.seed(123) n <- 200 df <- data.frame( id = rep(1:n, each = 8), start = rep(seq(0, 42, by = 6), times = 8), stop = rep(seq(6, 48, by = 6), times = 8), rx = sample(LETTERS[1:3], n * 8, replace = T)) df$hazard <- exp(-3.5 -1 * (df$rx == "A") + .5 * (df$rx == "B") + .5 * (df$rx == "C")) df_surv <- data.frame(id = 1:n) df_surv$time <- split(df, f = df$id) %>% map_dbl(~msm::rpexp(n = 1, rate = .x$hazard, t = .x$start)) df <- df %>% left_join(df_surv) #> Joining, by = "id" df <- df %>% mutate(status = 1L * (time <= stop)) %>% filter(start <= time) df %>% head() #> id start stop rx hazard time status #> 1 1 0 6 A 0.01110900 13.78217 0 #> 2 1 6 12 C 0.04978707 13.78217 0 #> 3 1 12 18 B 0.04978707 13.78217 1 #> 4 2 0 6 B 0.04978707 22.37251 0 #> 5 2 6 12 B 0.04978707 22.37251 0 #> 6 2 12 18 C 0.04978707 22.37251 0 # fit the model model <- coxph(Surv(start, stop, status)~rx, data = df) # define pairwise comparison glht_rx <- multcomp::glht(model, linfct=multcomp::mcp(rx="Tukey")) glht_rx #> #> General Linear Hypotheses #> #> Multiple Comparisons of 
Means: Tukey Contrasts #> #> #> Linear Hypotheses: #> Estimate #> B - A == 0 1.68722 #> C - A == 0 1.60902 #> C - B == 0 -0.07819 # perform multiple comparisons # (adjusts for multiple comparisons + takes into account correlation of coefficients -> more power than e.g. bonferroni) smry_rx <- summary(glht_rx) smry_rx # -> B and C different to A, but not from each other #> #> Simultaneous Tests for General Linear Hypotheses #> #> Multiple Comparisons of Means: Tukey Contrasts #> #> #> Fit: coxph(formula = Surv(start, stop, status) ~ rx, data = df) #> #> Linear Hypotheses: #> Estimate Std. Error z value Pr(>|z|) #> B - A == 0 1.68722 0.28315 5.959 <1e-05 *** #> C - A == 0 1.60902 0.28405 5.665 <1e-05 *** #> C - B == 0 -0.07819 0.16509 -0.474 0.88 #> --- #> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #> (Adjusted p values reported -- single-step method) # confidence intervals plot(smry_rx) Created on 2019-04-01 by the reprex package (v0.2.1)