I am using the 'car' package function Anova for some statistical testing.
Here is my code and the output it gives:
Y = cbind(curdata$V1, curdata$V2, curdata$V3)
mymdl = lm(Y ~ curdata$V4 + curdata$V5)
myanova = Anova(mymdl)
Type II MANOVA Tests: Pillai test statistic
Df test stat approx F num Df den Df Pr(>F)
curdata$V4 1 0.27941 2.9728 3 23 0.05280 .
curdata$V5 1 0.33570 3.8743 3 23 0.02228 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
I would like to extract the values in the 'Pr(>F)' column, so I can place these p-values in another matrix for later correction of multiple comparisons.
I have tried using unlist, but it does not give me the p-values from that column.
Any help with this would be greatly appreciated.
If we have multiple response variables, it is a MANOVA. We could capture the printed output and extract the p-values with a regex:
as.numeric(sub(".*\\s*(\\d+\\.[0-9e-]+)\\s*[*.]*", "\\1", capture.output(out)[4:5]))
#[1] 8.836e-06 2.200e-16
data
mymdl <- lm(cbind(Sepal.Length, Sepal.Width) ~ Petal.Width +
Petal.Length, data = iris)
out <- Anova(mymdl)
Maybe not the most practical way, but you can split the captured output into columns using separate() from tidyr:
library(car)
library(dplyr)
library(tidyr)
#Code
v1 <- data.frame(capture.output(myanova))
v1 <- v1[3:5,,drop=F]
names(v1)<-'v1'
v2 <- separate(v1,v1,c(paste0('v',1:21)),sep = '\\s')
v2 <- v2[-1,]
Output:
as.numeric(v2$v21)
[1] 8.836e-06 2.200e-16
Warning: you may need to change 1:21 if more columns are present in the captured output.
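If counting columns gets awkward, a rougher sketch is to split each captured row on whitespace and take the seventh field, which holds Pr(>F) in this layout (rows 4:5 here, as in the regex answer above; adjust the indices and stripping of "<" to your output):
rows <- capture.output(myanova)[4:5]   # adjust indices to your table rows
rows <- gsub("<", "", rows)            # "< 2.2e-16" would otherwise split into two fields
as.numeric(vapply(strsplit(trimws(rows), "\\s+"), `[`, character(1), 7))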
TLDR:
# define helper:
get_summary_for_print <- car:::print.Anova.mlm
body(get_summary_for_print) <- local({tmp <- body(get_summary_for_print);tmp[-(length(tmp)-(0:1))]})
#use it:
get_summary_for_print(Anova(mymdl))$`Pr(>F)`
Unfortunately there is no designated way. But you can look at the source of car:::print.Anova.mlm (by typing this in the R console) to learn how it gets the values you want:
function (x, ...)
{
    if ((!is.null(x$singular)) && x$singular)
        stop("singular error SSP matrix; multivariate tests unavailable\ntry summary(object, multivariate=FALSE)")
    test <- x$test
    repeated <- x$repeated
    ntests <- length(x$terms)
    tests <- matrix(NA, ntests, 4)
    if (!repeated)
        SSPE.qr <- qr(x$SSPE)
    for (term in 1:ntests) {
        eigs <- Re(eigen(qr.coef(if (repeated) qr(x$SSPE[[term]]) else SSPE.qr,
            x$SSP[[term]]), symmetric = FALSE)$values)
        tests[term, 1:4] <- switch(test, Pillai = Pillai(eigs,
            x$df[term], x$error.df), Wilks = Wilks(eigs, x$df[term],
            x$error.df), `Hotelling-Lawley` = HL(eigs, x$df[term],
            x$error.df), Roy = Roy(eigs, x$df[term], x$error.df))
    }
    ok <- tests[, 2] >= 0 & tests[, 3] > 0 & tests[, 4] > 0
    ok <- !is.na(ok) & ok
    tests <- cbind(x$df, tests, pf(tests[ok, 2], tests[ok, 3],
        tests[ok, 4], lower.tail = FALSE))
    rownames(tests) <- x$terms
    colnames(tests) <- c("Df", "test stat", "approx F", "num Df",
        "den Df", "Pr(>F)")
    tests <- structure(as.data.frame(tests), heading = paste("\nType ",
        x$type, if (repeated)
            " Repeated Measures", " MANOVA Tests: ", test, " test statistic",
        sep = ""), class = c("anova", "data.frame"))
    print(tests, ...)
    invisible(x)
}
<bytecode: 0x56032ea80990>
<environment: namespace:car>
In this case, there are quite a few lines of code involved in computing the p-values. However, we can easily create a modified version of the print function that returns the table (tests) instead of only printing it (print(tests, ...)) and returning the original object (invisible(x)):
get_summary_for_print <- car:::print.Anova.mlm  # copy the original print function (including its environment)
body(get_summary_for_print) <-                  # replace the code of our copy
  local({                                       # local() avoids polluting the environment with tmp
    tmp <- body(get_summary_for_print)          # to avoid code duplication
    tmp[-(length(tmp) - (0:1))]                 # remove the last two code lines of the function
  })
And use it for example like this:
library(car)
#> Loading required package: carData
res <- Anova(lm(cbind(Sepal.Width, Sepal.Length, Petal.Width) ~ Species + Petal.Length, iris))
res
#>
#> Type II MANOVA Tests: Pillai test statistic
#> Df test stat approx F num Df den Df Pr(>F)
#> Species 2 0.70215 26.149 6 290 < 2.2e-16 ***
#> Petal.Length 1 0.63487 83.461 3 144 < 2.2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
str(get_summary_for_print(res))
#> Classes 'anova' and 'data.frame': 2 obs. of 6 variables:
#> $ Df : num 2 1
#> $ test stat: num 0.702 0.635
#> $ approx F : num 26.1 83.5
#> $ num Df : num 6 3
#> $ den Df : num 290 144
#> $ Pr(>F) : num 7.96e-25 2.41e-31
#> - attr(*, "heading")= chr "\nType II MANOVA Tests: Pillai test statistic"
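With the p-values available as a plain numeric vector, the multiple-comparison correction mentioned in the question is then a one-liner with p.adjust() (pick whichever method you need):
pvals <- get_summary_for_print(res)$`Pr(>F)`
p.adjust(pvals, method = "BH")  # or "holm", "bonferroni", ...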
I use the "vegan" package to perform a PERMANOVA (adonis2()), and I also want to calculate the effect size (ω²). For this, I tried to use omega_squared() from the "effectsize" package, but I failed. I think it does not understand the output table, specifically the part with the mean squares. Is it possible to fix this or do I have to calculate manually?
library(vegan)
#> Loading required package: permute
#> Loading required package: lattice
#> This is vegan 2.6-4
library(effectsize)
data(dune)
data(dune.env)
ado <- adonis2(dune ~ Management, data = dune.env, permutations = 100)
ado
#> Permutation test for adonis under reduced model
#> Terms added sequentially (first to last)
#> Permutation: free
#> Number of permutations: 100
#>
#> adonis2(formula = dune ~ Management, data = dune.env, permutations = 100)
#> Df SumOfSqs R2 F Pr(>F)
#> Management 3 1.4686 0.34161 2.7672 0.009901 **
#> Residual 16 2.8304 0.65839
#> Total 19 4.2990 1.00000
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
w2 <- omega_squared(ado)
#> Error in `[[<-.data.frame`(`*tmp*`, "Mean_Square", value = numeric(0)): replacement has 0 rows, data has 3
interpret_omega_squared(w2)
#> Error in interpret(es, rules): object 'w2' not found
Created on 2022-11-15 with reprex v2.0.2
EDIT
I tried to do it manually:
library(vegan, quietly = T, warn.conflicts = F)
#> This is vegan 2.6-4
library(effectsize)
library(dplyr, quietly = T, warn.conflicts = F)
library(tibble)
library(purrr)
data(dune)
data(dune.env)
ado <- adonis2(dune ~ Management, data = dune.env, permutations = 100)
w2 <- omega_squared(ado) # Does not work
#> Error in `[[<-.data.frame`(`*tmp*`, "Mean_Square", value = numeric(0)): replacement has 0 rows, data has 3
interpret_omega_squared(w2) # Does not work
#> Error in interpret(es, rules): object 'w2' not found
ado_tidy <- tibble( # manually create Adonis test result table
parameter = c("Management", "Residual", "Total"),
df = ado %>% pull("Df"), # Degree of freedom
ss = ado %>% pull("SumOfSqs"), # sum of squares
meansqs = ss / df, # mean squares
p_r2 = ado %>% pull("R2"), # partial R²
f = ado %>% pull("F"), # F value
p = ado %>% pull("Pr(>F)") # p value
)
ado_tidy
#> # A tibble: 3 x 7
#> parameter df ss meansqs p_r2 f p
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Management 3 1.47 0.490 0.342 2.77 0.00990
#> 2 Residual 16 2.83 0.177 0.658 NA NA
#> 3 Total 19 4.30 0.226 1 NA NA
# Formula:
# W2 = (DFm * (F - 1)) / ((DFm * (F - 1)) + (DFm + 1))
W2 <- abs(
(ado_tidy %>% pull(df) %>% chuck(3) * (ado_tidy %>% pull(f) %>% chuck(1) - 1)) /
((ado_tidy %>% pull(df) %>% chuck(3) * (ado_tidy %>% pull(f) %>% chuck(1) - 1) +
ado_tidy %>% pull(df) %>% chuck(3) + 1)
)
)
W2
#> [1] 0.6267099
interpret_omega_squared(W2, rules = "field2013")
#> [1] "large"
#> (Rules: field2013)
Created on 2022-11-15 with reprex v2.0.2
Hopefully, the equation is correct...
Here is the MicEco::adonis_OmegaSq function edited so that it works both with the current vegan::adonis2 and deprecated vegan::adonis:
#' Calculate (partial) Omega-squared (effect-size calculation) for PERMANOVA and add it to the input object
#'
#' @param adonisOutput An adonis object
#' @param partial Should partial omega-squared be calculated (sample size adjusted). Default TRUE
#' @return Original adonis object with the (partial) Omega-squared values added
#' @import vegan
#' @export
adonis_OmegaSq <- function(adonisOutput, partial = TRUE){
    if(!(is(adonisOutput, "adonis") || is(adonisOutput, "anova.cca")))
        stop("Input should be an adonis object")
    if (is(adonisOutput, "anova.cca")) {
        aov_tab <- adonisOutput
        aov_tab$MeanSqs <- aov_tab$SumOfSqs / aov_tab$Df
        aov_tab$MeanSqs[length(aov_tab$Df)] <- NA
    } else {
        aov_tab <- adonisOutput$aov.tab
    }
    heading <- attr(aov_tab, "heading")
    MS_res <- aov_tab[pmatch("Residual", rownames(aov_tab)), "MeanSqs"]
    SS_tot <- aov_tab[rownames(aov_tab) == "Total", "SumsOfSqs"]
    N <- aov_tab[rownames(aov_tab) == "Total", "Df"] + 1
    if(partial){
        omega <- apply(aov_tab, 1, function(x) (x["Df"]*(x["MeanSqs"]-MS_res))/(x["Df"]*x["MeanSqs"]+(N-x["Df"])*MS_res))
        aov_tab$parOmegaSq <- c(omega[1:(length(omega)-2)], NA, NA)
    } else {
        omega <- apply(aov_tab, 1, function(x) (x["SumsOfSqs"]-x["Df"]*MS_res)/(SS_tot+MS_res))
        aov_tab$OmegaSq <- c(omega[1:(length(omega)-2)], NA, NA)
    }
    if (is(adonisOutput, "adonis"))
        cn_order <- c("Df", "SumsOfSqs", "MeanSqs", "F.Model", "R2",
                      if (partial) "parOmegaSq" else "OmegaSq", "Pr(>F)")
    else
        cn_order <- c("Df", "SumOfSqs", "F", if (partial) "parOmegaSq" else "OmegaSq",
                      "Pr(>F)")
    aov_tab <- aov_tab[, cn_order]
    attr(aov_tab, "names") <- cn_order
    attr(aov_tab, "heading") <- heading
    if (is(adonisOutput, "adonis"))
        adonisOutput$aov.tab <- aov_tab
    else
        adonisOutput <- aov_tab
    return(adonisOutput)
}
source() this function and it should work. In my test it gave the same results for both adonis2 and adonis.
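For example, with the dune model from the question (a quick sketch; output not shown):
library(vegan)
data(dune)
data(dune.env)
ado <- adonis2(dune ~ Management, data = dune.env, permutations = 100)
adonis_OmegaSq(ado)  # the adonis2 table with a parOmegaSq column added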
I was wondering if there might be a way to turn the following part of the OUTPUT of the res and res2 objects into a data.frame?
Note: answer below works with res but not res2.
A general, functional answer is appreciated, as the data below is just a toy example.
library(metafor)
dat <- dat.konstantopoulos2011
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
#== OUTPUT (CAN WE TURN ONLY BELOW PART INTO A data.frame?):
#Variance Components:
# estim sqrt nlvls fixed factor
#sigma^2.1 0.0651 0.2551 11 no district
#sigma^2.2 0.0327 0.1809 56 no district/school
#Test for Heterogeneity:
#Q(df = 55) = 578.8640, p-val < .0001
# AND
res2 <- rma.mv(yi, vi, random = ~ factor(school) | district, data=dat)
#== OUTPUT (CAN WE TURN ONLY BELOW PART INTO A data.frame?):
#Variance Components:
#outer factor: district (nlvls = 11)
#inner factor: factor(school) (nlvls = 11)
# estim sqrt fixed
#tau^2 0.0978 0.3127 no
#rho 0.6653 no
#Test for Heterogeneity:
#Q(df = 55) = 578.8640, p-val < .0001
If there is no default/standard way to extract the data, then you can manipulate the printed output using capture.output.
return_data <- function(res) {
  tmp <- capture.output(res)
  # the table header sits two lines below "Variance Components:" (a blank line in between)
  start <- which(tmp == "Variance Components:") + 2
  index <- which(tmp == "")
  # the table ends just before the next empty line
  end <- index[which.max(index > start)] - 1
  data <- read.table(text = paste0(tmp[start:end], collapse = '\n'), header = TRUE)
  heterogeneity_index <- which(tmp == "Test for Heterogeneity:") + 1
  list(data = data, heterogeneity = tmp[heterogeneity_index])
}
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
return_data(res)
#$data
# estim sqrt nlvls fixed factor
#sigma^2.1 0.0651 0.2551 11 no district
#sigma^2.2 0.0327 0.1809 56 no district/school
#$heterogeneity
#[1] "Q(df = 55) = 578.8640, p-val < .0001"
Would this suit your purposes? The 'Test for Heterogeneity' doesn't really fit in the data frame, so I added it as a separate column and it gets duplicated as a result. I'm not sure how else you could do it.
library(tidyverse)
#install.packages("metafor")
library(metafor)
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Loading the 'metafor' package (version 3.0-2). For an
#> introduction to the package please type: help(metafor)
dat <- dat.konstantopoulos2011
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
res
#>
#> Multivariate Meta-Analysis Model (k = 56; method: REML)
#>
#> Variance Components:
#>
#> estim sqrt nlvls fixed factor
#> sigma^2.1 0.0651 0.2551 11 no district
#> sigma^2.2 0.0327 0.1809 56 no district/school
#>
#> Test for Heterogeneity:
#> Q(df = 55) = 578.8640, p-val < .0001
#>
#> Model Results:
#>
#> estimate se zval pval ci.lb ci.ub
#> 0.1847 0.0846 2.1845 0.0289 0.0190 0.3504 *
#>
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vc <- cbind(estim = res$sigma2,
sqrt = res$sigma,
nlvls = res$s.nlevels,
fixed = ifelse(res$vc.fix$sigma2, "yes", "no"),
factor = res$s.names,
R = ifelse(res$Rfix, "yes", "no"),
Test_for_heterogeneity = paste0("Q(df = ", res$k - res$p, ") = ", metafor:::.fcf(res$QE, res$digits[["test"]]), ", p-val ", metafor:::.pval(res$QEp,
res$digits[["pval"]], showeq = TRUE, sep = " "))
)
rownames(vc) <- c("sigma^2.1", "sigma^2.2")
result <- as.data.frame(vc)
result
#> estim nlvls fixed factor R Test_for_heterogeneity
#> sigma^2.1 "0.0650619442753117" "11" "no" "district" "no" "Q(df = 55) = 578.8640, p-val < .0001"
#> sigma^2.2 "0.0327365170279351" "56" "no" "district/school" "no" "Q(df = 55) = 578.8640, p-val < .0001"
Created on 2021-10-06 by the reprex package (v2.0.1)
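For the res2 parameterization (~ factor(school) | district), the variance components sit in different slots of the fitted object; a minimal sketch along the same lines, assuming the tau2 and rho elements (worth double-checking with str(res2)):
res2 <- rma.mv(yi, vi, random = ~ factor(school) | district, data = dat)
vc2 <- data.frame(estim = c(res2$tau2, res2$rho),
                  sqrt = c(sqrt(res2$tau2), NA),
                  row.names = c("tau^2", "rho"))
vc2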
I am trying to perform a pairwise manova analysis where I loop through all the possible pairs of my columns. I think this is best communicated with an example:
varList <- colnames(iris)
m1 <- manova(cbind(varList[1], varList[2]) ~ Species, data = iris)
# Error in model.frame.default(formula = cbind(varList[1], varList[2]) ~ :
# variable lengths differ (found for 'Species')
m2 <- manova(cbind(noquote(varList[1]), noquote(varList[2])) ~ Species,
data = iris)
# Error in model.frame.default(formula = cbind(noquote(varList[1]), noquote(varList[2])) ~ :
# variable lengths differ (found for 'Species')
m3 <- manova(cbind(Sepal.Length, Petal.Length) ~ Species, data = iris)
m4 <- manova(cbind(iris[ ,1], iris[ ,3]) ~ Species, data = iris)
summary(m3)
# Df Pillai approx F num Df den Df Pr(>F)
# Species 2 0.9885 71.829 4 294 < 2.2e-16 ***
# Residuals 147
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R.version.string
# [1] "R version 3.4.2 (2017-09-28)"
RStudio.Version()$version
# [1] ‘1.1.383’
I think this is more related to referring to column names from a vector inside my cbind() call. I saw something about using parentheses in this question here, but I can't get that to work for my case. I can call the columns by their number (see m4), but I'd prefer to use column names if possible.
You need to wrap each of the entries from the vector that you are calling in eval(as.symbol()).
So:
m1 <- manova(cbind(eval(as.symbol(varList[1])), eval(as.symbol(varList[2]))) ~ Species, data = iris)
should work.
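Building on that, the pairwise loop you describe can also be written by pasting each pair of column names into a formula string; a sketch (assuming the four numeric iris columns as responses):
varList <- colnames(iris)[1:4]
pairs <- combn(varList, 2)
results <- apply(pairs, 2, function(v) {
  f <- as.formula(paste0("cbind(", v[1], ", ", v[2], ") ~ Species"))
  summary(manova(f, data = iris))
})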
So far my code looks like this:
Points = readOGR(dsn = "./Data/filename.shp", layer = "layername", stringsAsFactors = FALSE)
LDI = raster("./Data/filename2.tif")
Points$LDI = extract(LDI, Points)
PointsDF = Points@data
for(i in PointsDF) {
  Mod1 = lm(LDI ~ i, data = PointsDF)
  Mod2 = lm(LDI ~ 1, data = PointsDF)
  anova(Mod1, Mod2)
}
This last part is where I know I'm doing everything wrong. I want to run the anova on every numerical field in the data frame.
You're close. A natural way is to loop over the field names. Although there are many ways to do this, lapply is perhaps the most idiomatic because (a) it uses the field names (rather than field indexes, which can be dangerous) and (b) does not require pre-allocating any structures for the output. The trick is to convert field names into formulas. Again, there are many ways to do this, but a direct way is to assemble the formula as a string.
Here is working code as an example. It produces a list of anova objects.
#
# Create some random data.
#
n <- 20
set.seed(17)
X <- data.frame(Y=rnorm(n), X1=runif(n), X2=1:n, X3=rexp(n))
#
# Loop over the regressors.
# (The base model can be precomputed.)
#
mod.0 <- lm(Y ~ 1, X)
models <- lapply(setdiff(names(X), "Y"), function(s) {
mod.1 <- lm(as.formula(paste("Y ~", s)), X)
anova(mod.0, mod.1)
})
print(models)
Here's the output, displaying this list of three anova results.
[[1]]
Analysis of Variance Table
Model 1: Y ~ 1
Model 2: Y ~ X1
Res.Df RSS Df Sum of Sq F Pr(>F)
1 19 10.1157
2 18 9.6719 1 0.44385 0.826 0.3754
[[2]]
Analysis of Variance Table
Model 1: Y ~ 1
Model 2: Y ~ X2
Res.Df RSS Df Sum of Sq F Pr(>F)
1 19 10.1157
2 18 8.1768 1 1.939 4.2684 0.05353 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
[[3]]
Analysis of Variance Table
Model 1: Y ~ 1
Model 2: Y ~ X3
Res.Df RSS Df Sum of Sq F Pr(>F)
1 19 10.116
2 18 10.081 1 0.034925 0.0624 0.8056
As another example of working with what you have produced, here is sapply being used to print out their p-values:
sapply(models, function(m) m[["Pr(>F)"]][2])
[1] 0.37542968 0.05352883 0.80562894
The issue is that you are not telling the loop what to iterate over, not defining a formula object in the model call, and not creating an object to store the results.
In this example the "ij" counter indexes the list that stores the ANOVA results, and "y" is a variable holding the name of the left-hand side of the model. The list object "anova.results" stores each comparison. The index in the loop definition uses which() to find the column containing "y" and drops it from the iterator, and an intercept-only model (your Mod2) serves as the baseline for each comparison. I am using the built-in iris dataset for the example.
data(iris)
iris <- iris[,-5]
y = "Sepal.Length"
# intercept-only (null) model to compare against, as in your Mod2
Mod0 <- lm(stats::as.formula(paste(y, "1", sep = "~")), data = iris)
anova.results <- list()
ij = 0
for(i in names(iris)[-which(names(iris) == y)]) {
  ij = ij + 1
  Mod = lm(stats::as.formula(paste(y, i, sep = "~")), data = iris)
  anova.results[[ij]] <- anova(Mod0, Mod)
}
anova.results
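As in the lapply answer above, the stored tables can then be collapsed to their p-values (a quick sketch):
sapply(anova.results, function(a) a[["Pr(>F)"]][2])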
My problem is this: I get NA where I should get values when computing robust standard errors.
I am trying to do a fixed effect panel regression with cluster-robust standard errors. For this, I follow Arai (2011), who on p. 3 follows Stock and Watson (2006) (later published in Econometrica, for those who have access). I would like to correct the degrees of freedom by (M/(M-1))*((N-1)/(N-K)) against downward bias, as my number of clusters is finite and I have unbalanced data.
Similar problems have been posted before [1, 2] on StackOverflow and related problems [3] on CrossValidated.
Arai (and the answer in the first link) uses the following functions (I describe my data below with some further comments):
gcenter <- function(df1, group) {
  variables <- paste(
    rep("C", ncol(df1)), colnames(df1), sep=".")
  copydf <- df1
  for (i in 1:ncol(df1)) {
    copydf[,i] <- df1[,i] - ave(df1[,i], group, FUN=mean)}
  colnames(copydf) <- variables
  return(cbind(df1,copydf))}

# 1-way adjusting for clusters
clx <- function(fm, dfcw, cluster){
  # R-codes (www.r-project.org) for computing
  # clustered-standard errors. Mahmood Arai, Jan 26, 2008.
  # The arguments of the function are:
  # fitted model, cluster1 and cluster2
  # You need to install libraries `sandwich' and `lmtest'
  # reweighting the var-cov matrix for the within model
  library(sandwich); library(lmtest)
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj <- apply(estfun(fm), 2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)*dfcw
  coeftest(fm, vcovCL) }
where gcenter() computes deviations from the group mean (the fixed effect). I then continue and run the regression with DS_CODE being my cluster variable (I have named my data 'data'):
centerdata <- gcenter(data, data$DS_CODE)
datalm <- lm(C.L1.retE1M ~ C.MCAP_SEC + C.Impact_change + C.Mom + C.BM + C.PD + C.CashGen + C.NITA + C.PE + C.PEdummy + factor(DS_CODE), data=centerdata)
M <- length(unique(data$DS_CODE))
dfcw <- datalm$df / (datalm$df - (M-1))
and want to calculate
clx(datalm, dfcw, data$DS_CODE)
However, when I compute uj (see the clx function above) for the variance, I get values only for the first few regressors and then lots of zeros. If this uj is used for the variance, only NAs result.
My data
Since my data may have a special structure and I can't figure out the problem, I post the entire thing as a link from Hotmail. The reason is that with other data (taken from Arai (2011)) my problem does not occur. Sorry in advance for the mess, but I'd be very grateful if you could have a look at it nevertheless.
The file is a 5 MB .txt file containing only data.
After some time playing around, it works for me and gives me:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.5099e-16 5.2381e-16 0.8610 0.389254
C.MCAP_SEC -5.9769e-07 1.2677e-07 -4.7149 2.425e-06 ***
C.Impact_change -5.3908e-04 7.5601e-05 -7.1306 1.014e-12 ***
C.Mom 3.7560e-04 3.3378e-03 0.1125 0.910406
C.BM -1.6438e-04 1.7368e-05 -9.4645 < 2.2e-16 ***
C.PD 6.2153e-02 3.8766e-02 1.6033 0.108885
C.CashGen -2.7876e-04 1.4031e-02 -0.0199 0.984149
C.NITA -8.1792e-02 3.2153e-02 -2.5438 0.010969 *
C.PE -6.6170e-06 4.0138e-06 -1.6485 0.099248 .
C.PEdummy 1.3143e-02 4.8864e-03 2.6897 0.007154 **
factor(DS_CODE)130324 -5.2497e-16 5.2683e-16 -0.9965 0.319028
factor(DS_CODE)130409 -4.0276e-16 5.2384e-16 -0.7689 0.441986
factor(DS_CODE)130775 -4.4113e-16 5.2424e-16 -0.8415 0.400089
...
This leaves us with the question of why it doesn't work for you. I guess it has something to do with the format of your data. Is everything numeric? I converted the column classes, and this is how it looks for me:
str(dat)
'data.frame': 48251 obs. of 12 variables:
$ DS_CODE : chr "902172" "902172" "902172" "902172" ...
$ DNEW : num 2e+05 2e+05 2e+05 2e+05 2e+05 ...
$ MCAP_SEC : num 78122 71421 81907 80010 82462 ...
$ NITA : num 0.135 0.135 0.135 0.135 0.135 ...
$ CashGen : num 0.198 0.198 0.198 0.198 0.198 ...
$ BM : num 0.1074 0.1108 0.097 0.0968 0.0899 ...
$ PE : num 57 55.3 63.1 63.2 68 ...
$ PEdummy : num 0 0 0 0 0 0 0 0 0 0 ...
$ L1.retE1M : num -0.72492 0.13177 0.00122 0.07214 -0.07332 ...
$ Mom : num 0 0 0 0 0 ...
$ PD : num 5.41e-54 1.51e-66 3.16e-80 2.87e-79 4.39e-89 ...
$ Impact_change: num 0 -10.59 -10.43 0.7 -6.97 ...
What does str(data) return for you?
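In case it helps, roughly the kind of conversion I mean (a sketch, assuming DS_CODE is the only column that should stay character):
num_cols <- setdiff(names(dat), "DS_CODE")
dat[num_cols] <- lapply(dat[num_cols], function(x) as.numeric(as.character(x)))
str(dat)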
The plm package can estimate clustered SEs for panel regressions. The original data is no longer available, so here's an example using dummy data.
require(foreign)
require(plm)
require(lmtest)
test <- read.dta("http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.dta")
fpm <- plm(y ~ x, test, model='pooling', index=c('firmid', 'year'))
##Arellano clustered by *group* SEs
> coeftest(fpm, vcov=function(x) vcovHC(x, cluster="group", type="HC0"))
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.029680 0.066939 0.4434 0.6575
x 1.034833 0.050540 20.4755 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
If you're using lm models (instead of plm), then the multiwayvcov package may help.
library("lmtest")
library("multiwayvcov")
data(petersen)
m1 <- lm(y ~ x, data = petersen)
> coeftest(m1, vcov=function(x) cluster.vcov(x, petersen[ , c("firmid")],
df_correction=FALSE))
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.029680 0.066939 0.4434 0.6575
x 1.034833 0.050540 20.4755 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
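Newer versions of the sandwich package also provide vcovCL() for cluster-robust covariances, so a very similar result should be obtainable without multiwayvcov (a sketch, not checked against the output above; small differences in finite-sample corrections are possible):
library("sandwich")
coeftest(m1, vcov = vcovCL(m1, cluster = petersen$firmid))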
For more details see:
Fama-MacBeth and Cluster-Robust (by Firm and Time) Standard Errors in R.
See also:
Double clustered standard errors for panel data