I had a weird problem in plm() function. Below is the code:
library(data.table)
library(tidyverse)
library(plm)
#Data Generation
n <- 500
set.seed(75080)
z <- rnorm(n)
w <- rnorm(n)
x <- 5*z + 50
y <- -100*z+ 1100 + 50*w
y <- 10*round(y/10)
y <- ifelse(y<200,200,y)
y <- ifelse(y>1600,1600,y)
dt1 <- data.table('id'=1:500,'sat'=y,'income'=x,'group'=rep(1,n))
z <- rnorm(n)
w <- rnorm(n)
x <- 5*z + 80
y <- -80*z+ 1200 + 50*w
y <- 10*round(y/10)
y <- ifelse(y<200,200,y)
y <- ifelse(y>1600,1600,y)
dt2 <- data.table('id'=501:1000,'sat'=y,'income'=x,'group'=rep(2,n))
z <- rnorm(n)
w <- rnorm(n)
x <- 5*z + 30
y <- -120*z+ 1000 + 50*w
y <- 10*round(y/10)
y <- ifelse(y<200,200,y)
y <- ifelse(y>1600,1600,y)
dt3 <- data.table('id'=1001:1500,'sat'=y,'income'=x,'group'=rep(3,n))
dtable <- merge(dt1 ,dt2, all=TRUE)
dtable <- merge(dtable ,dt3, all=TRUE)
# Model
dtable_p <- pdata.frame(dtable, index = "group")
mod_1 <- plm(sat ~ income, data = dtable_p,model = "pooling")
Error in [.data.frame(x, , which) : undefined columns selected
I checked all possibilities but I can not figure out why it gives me an error. the columns'names are correct, why R said undefined columns??? Thank you!
Follow up: I add another data set test as the #StupidWolf use to prove
data("Produc", package = "plm")
form <- log(gsp) ~ log(pc)
Produc$group <- Produc$region
pProduc <- pdata.frame(Produc, index = "group")
Produc$group <- rep(1:48, each = 17)
summary(plm(form, data = pProduc, model = "pooling"))
>Error in `[.data.frame`(x, , which) : undefined columns selected
This is extremely weird, the answer is index must not be named "group".
I suspect somewhere in the plm function, it must be adding a "group" to your data.frame.
We can use the example dataset
data("Produc", package = "plm")
form <- log(gsp) ~ log(pc)
Produc$group = Produc$region
pProduc <- pdata.frame(Produc, index = c("group"))
summary(plm(form, data = pProduc, model = "random"))
Error in `[.data.frame`(x, , which) : undefined columns selected
Using the "region" column from which I copied, it works:
pProduc <- pdata.frame(Produc, index = c("region"))
summary(plm(form, data = pProduc, model = "random"))
Oneway (individual) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = form, data = pProduc, model = "random")
Unbalanced Panel: n = 9, T = 51-136, N = 816
Effects:
var std.dev share
idiosyncratic 0.03691 0.19213 0.402
individual 0.05502 0.23457 0.598
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8861 0.9012 0.9192 0.9157 0.9299 0.9299
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.68180 -0.11014 0.00977 -0.00039 0.13815 0.45491
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) -1.099088 0.138395 -7.9417 1.994e-15 ***
log(pc) 1.100102 0.010623 103.5627 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 459.71
Residual Sum of Squares: 30.029
R-Squared: 0.93468
Adj. R-Squared: 0.9346
Chisq: 11647.6 on 1 DF, p-value: < 2.22e-16
For your example, just rename the column "group" and also set it as a factor to avoid the other errors. (For "pooling" it should be treated a categorical not numeric).
dtable <- merge(dt1 ,dt2, all=TRUE)
dtable <- merge(dtable ,dt3, all=TRUE)
dtable$group = factor(dtable$group)
colnames(dtable)[4] = "GROUP"
dtable_p <- pdata.frame(dtable, index = "GROUP")
summary(plm(sat ~ income, data = dtable_p,method="pooling"))
Related
I get a warning message from the plm package in R when I perform ´summary()´ of a model:
1: In Ops.pseries(y, bX) : indexes of pseries have same length but
not same content: result was assigned first operand's index
2: In Ops.pseries(y, bX) : indexes of pseries have same length but not
same content: result was assigned first operand's index
I used the following code:
library(dplyr)
library(lubridate)
library(plm)
data <- data.frame(ID = rep(c("123456", "234567", "345678", "456789", "567890", "678901", "789012", "890123", "901234","9012345"), each = 24),
month = rep(seq(dmy("01.01.2019"), dmy("01.12.2020"), by = "1 months"),10), group = rep(c(rep(T, 12), rep(F, 12)), 10),
temperature = runif(24*10, 0, 1)) %>%
group_by(ID, group) %>% mutate(consumption = ifelse(group, runif(12, 1,2), runif(12,2,3)))
pdata <- pdata.frame(x = data, index = c("ID", "month"))
model <- plm(formula = consumption ~ group + temperature, data = pdata, effect = "individual", model = "within")
summary(model)
## Warnmeldungen:
## 1: In Ops.pseries(y, bX) :
## indexes of pseries have same length but not same content: result was assigned first operand's index
## 2: In Ops.pseries(y, bX) :
## indexes of pseries have same length but not same content: result was assigned first operand's index
My thought was that it could be one of the two indices. However, I get the same warning message when I either use "ID" or "month" as index.
An excerpt of the data feed in look like this:
It seems like plm or pdata.frame does not like some modifications injected into the data frame by some transformation you perform on the data prior to estimation.
Make sure to feed a clean data frame to pdata.frame like this and the code runs fine:
fdata <- data.frame(data)
pdata <- pdata.frame(x = fdata, index = c("ID", "month"))
model <- plm(formula = consumption ~ group + temperature, data = pdata, effect = "individual", model = "within")
summary(model)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = consumption ~ group + temperature, data = pdata,
## effect = "individual", model = "within")
##
## Balanced Panel: n = 10, T = 24, N = 240
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.581113 -0.237459 0.031184 0.252256 0.541147
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## groupTRUE -1.020820 0.038559 -26.4743 <2e-16 ***
## temperature -0.029801 0.064738 -0.4603 0.6457
## ---
## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
##
## Total Sum of Squares: 82.792
## Residual Sum of Squares: 20.318
## R-Squared: 0.75459
## Adj. R-Squared: 0.74275
## F-statistic: 350.521 on 2 and 228 DF, p-value: < 2.22e-16
Another way of solving the problem is to add ungroup().
So the following
group_by(ID, group) %>% mutate(consumption = ifelse(group, runif(12, 1,2), runif(12,2,3)))
should become
group_by(ID, group) %>% mutate(consumption = ifelse(group, runif(12, 1,2), runif(12,2,3))) %>% ungroup()
I am making spatial panel models from the splm library but I cannot make a fixed or random effects model because it comes out that it is an unbalanced panel.
Data
data and shp files
Reproducible example
library(dplyr)
library(Hmisc)
library(sf)
library(plm)
library(splm)
library(spdep)
data = read.csv("data.csv", colClasses=c(id="factor"))
map = st_read("nxparroquias.shp")
colnames(map)[1] = "id"
balanced = data %>%
group_by(id) %>%
#drop_na() %>%
filter(all(c(2013:2018) %in% year)) %>%
ungroup() %>%
filter(id %in% unique(map$id))
map = map %>%
filter(id %in% unique(balanced$id))
pt = spdep::poly2nb(map)
ptw = spdep::nb2listw(pt, zero.policy = TRUE)
describe(balanced$id)
#balanced$id
#n missing distinct
#5832 0 972
describe(map$id)
#map$id
#n missing distinct
#972 0 972
#checks if the data are balanced
plm::is.pbalanced(balanced$id,balanced$year)
#[1] TRUE
p_balanced = pdata.frame(balanced, index = c("id", "year")) #same issues
Pooling
md1 = splm::spml(data = balanced,
y ~ x1+x2+x3,
index = c("id", "year"),
zero.policy = TRUE,
model = "pooling",
listw = ptw)
summary(md1)
#ML panel with , spatial error correlation
#
#Call:
# spreml(formula = formula, data = data, index = index, w = listw2mat(listw),
# w2 = listw2mat(listw2), lag = lag, errors = errors, cl = cl,
# zero.policy = TRUE)
#
#Residuals:
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-14.860 -3.213 0.539 0.003 3.194 13.181
#
#Error variance parameters:
# Estimate Std. Error t-value Pr(>|t|)
#rho 0.087841 0.019622 4.4767 7.58e-06 ***
#
# Coefficients:
# Estimate Std. Error t-value Pr(>|t|)
#(Intercept) 5.1373431 0.0972298 52.8371 < 2.2e-16 ***
# x1 0.0423977 0.0098276 4.3141 1.602e-05 ***
# x2 0.1109780 0.0083800 13.2432 < 2.2e-16 ***
# x3 -0.0444174 0.0096522 -4.6018 4.189e-06 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
FE
md2 = splm::spml(data = balanced,
y ~ x1+x2+x3,
index = c("id", "year"),
zero.policy = TRUE,
model = "within",
listw = ptw)
With reproducible example Fixed effects
Error in lag.listw(listw, u, zero.policy = zero.policy) :
object lengths differ
With my original data
Error in spfeml(formula = formula, data = data, index = index, listw = listw, :
Estimation method unavailable for unbalanced panels
RE
with random effects
Error in .C64("aplsb1", SIGNATURE = c(SS$signature, SS$signature, "double", :
NAs in argument 7 and 'NAOK = FALSE' (dotCall64)
I am working with an interaction model similar to this one below:
set.seed(1993)
moderating <- sample(c("Yes", "No"),100, replace = T)
x <- sample(c("Yes", "No"), 100, replace = T)
y <- sample(1:100, 100, replace = T)
df <- data.frame(y, x, moderating)
Results <- lm(y ~ x*moderating)
summary(Results)
Call:
lm(formula = y ~ x * moderating)
Residuals:
Min 1Q Median 3Q Max
-57.857 -29.067 3.043 22.960 59.043
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 52.4000 6.1639 8.501 2.44e-13 ***
xYes 8.4571 9.1227 0.927 0.356
moderatingYes -11.4435 8.9045 -1.285 0.202
xYes:moderatingYes -0.1233 12.4563 -0.010 0.992
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 30.82 on 96 degrees of freedom
Multiple R-squared: 0.04685, Adjusted R-squared: 0.01707
F-statistic: 1.573 on 3 and 96 DF, p-value: 0.2009
I'm learning how to calculate the fitted value of a interaction from a regression table. In the example, the base category (or omitted category) is x= No and moderating = No.
Thus far, I know the following fitted values:
#Calulate Fitted Value From a Regression Interaction by hand
#Omitted Variable = X_no.M_no
X_no.M_no <- 52.4000
X_yes.M_no <- 52.4000 + 8.4571
X_no.M_yes <- 52.4000 + -11.4435
X_yes.M_yes #<- ?
I do not understand how the final category, X_yes.M_yes, is calculated. My initial thoughts were X_yes.M_yes <- 52.4000 + -0.1233, (the intercept plus the interaction term) but that is incorrect. I know its incorrect because, using the predict function, the fitted value of X_yes.M_yes = 49.29032, not 52.4000 + -0.1233 = 52.2767.
How do I calculate, by hand, the predicted value of the X_yes.M_yes category?
Here are the predicted values as generated from the predict function in R
#Validated Here Using the Predict Function:
newdat <- NULL
for(m in na.omit(unique(df$moderating))){
for(i in na.omit(unique(df$x))){
moderating <- m
x <- i
newdat<- rbind(newdat, data.frame(x, moderating))
}
}
Prediction.1 <- cbind(newdat, predict(Results, newdat, se.fit = TRUE))
Prediction.1
Your regression looks like this in math:
hat_y = a + b x + c m + d m x
Where x = 1 when "yes" and 0 when "no" and m is similarly defined by moderating.
Then X_yes.M_yes implies x = 1 and m = 1, so your prediction is a + b + c + d.
or in your notation X_yes.M_yes = 52.4000 + 8.4571 - 11.4435 - 0.1233
In order to correct heteroskedasticity in error terms, I am running the following weighted least squares regression in R :
#Call:
#lm(formula = a ~ q + q2 + b + c, data = mydata, weights = weighting)
#Weighted Residuals:
# Min 1Q Median 3Q Max
#-1.83779 -0.33226 0.02011 0.25135 1.48516
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) -3.939440 0.609991 -6.458 1.62e-09 ***
#q 0.175019 0.070101 2.497 0.013696 *
#q2 0.048790 0.005613 8.693 8.49e-15 ***
#b 0.473891 0.134918 3.512 0.000598 ***
#c 0.119551 0.125430 0.953 0.342167
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.5096 on 140 degrees of freedom
#Multiple R-squared: 0.9639, Adjusted R-squared: 0.9628
#F-statistic: 933.6 on 4 and 140 DF, p-value: < 2.2e-16
Where "weighting" is a variable (function of the variable q) used for weighting the observations. q2 is simply q^2.
Now, to double-check my results, I manually weight my variables by creating new weighted variables :
mydata$a.wls <- mydata$a * mydata$weighting
mydata$q.wls <- mydata$q * mydata$weighting
mydata$q2.wls <- mydata$q2 * mydata$weighting
mydata$b.wls <- mydata$b * mydata$weighting
mydata$c.wls <- mydata$c * mydata$weighting
And run the following regression, without the weights option, and without a constant - since the constant is weighted, the column of 1 in the original predictor matrix should now equal the variable weighting:
Call:
lm(formula = a.wls ~ 0 + weighting + q.wls + q2.wls + b.wls + c.wls,
data = mydata)
#Residuals:
# Min 1Q Median 3Q Max
#-2.38404 -0.55784 0.01922 0.49838 2.62911
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#weighting -4.125559 0.579093 -7.124 5.05e-11 ***
#q.wls 0.217722 0.081851 2.660 0.008726 **
#q2.wls 0.045664 0.006229 7.330 1.67e-11 ***
#b.wls 0.466207 0.121429 3.839 0.000186 ***
#c.wls 0.133522 0.112641 1.185 0.237876
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.915 on 140 degrees of freedom
#Multiple R-squared: 0.9823, Adjusted R-squared: 0.9817
#F-statistic: 1556 on 5 and 140 DF, p-value: < 2.2e-16
As you can see, the results are similar but not identical. Am I doing something wrong while manually weighting the variables, or does the option "weights" do something more than simply multiplying the variables by the weighting vector?
Provided you do manual weighting correctly, you won't see discrepancy.
So the correct way to go is:
X <- model.matrix(~ q + q2 + b + c, mydata) ## non-weighted model matrix (with intercept)
w <- mydata$weighting ## weights
rw <- sqrt(w) ## root weights
y <- mydata$a ## non-weighted response
X_tilde <- rw * X ## weighted model matrix (with intercept)
y_tilde <- rw * y ## weighted response
## remember to drop intercept when using formula
fit_by_wls <- lm(y ~ X - 1, weights = w)
fit_by_ols <- lm(y_tilde ~ X_tilde - 1)
Although it is generally recommended to use lm.fit and lm.wfit when passing in matrix directly:
matfit_by_wls <- lm.wfit(X, y, w)
matfit_by_ols <- lm.fit(X_tilde, y_tilde)
But when using these internal subroutines lm.fit and lm.wfit, it is required that all input are complete cases without NA, otherwise the underlying C routine stats:::C_Cdqrls will complain.
If you still want to use the formula interface rather than matrix, you can do the following:
## weight by square root of weights, not weights
mydata$root.weighting <- sqrt(mydata$weighting)
mydata$a.wls <- mydata$a * mydata$root.weighting
mydata$q.wls <- mydata$q * mydata$root.weighting
mydata$q2.wls <- mydata$q2 * mydata$root.weighting
mydata$b.wls <- mydata$b * mydata$root.weighting
mydata$c.wls <- mydata$c * mydata$root.weighting
fit_by_wls <- lm(formula = a ~ q + q2 + b + c, data = mydata, weights = weighting)
fit_by_ols <- lm(formula = a.wls ~ 0 + root.weighting + q.wls + q2.wls + b.wls + c.wls,
data = mydata)
Reproducible Example
Let's use R's built-in data set trees. Use head(trees) to inspect this dataset. There is no NA in this dataset. We aim to fit a model:
Height ~ Girth + Volume
with some random weights between 1 and 2:
set.seed(0); w <- runif(nrow(trees), 1, 2)
We fit this model via weighted regression, either by passing weights to lm, or manually transforming data and calling lm with no weigths:
X <- model.matrix(~ Girth + Volume, trees) ## non-weighted model matrix (with intercept)
rw <- sqrt(w) ## root weights
y <- trees$Height ## non-weighted response
X_tilde <- rw * X ## weighted model matrix (with intercept)
y_tilde <- rw * y ## weighted response
fit_by_wls <- lm(y ~ X - 1, weights = w)
#Call:
#lm(formula = y ~ X - 1, weights = w)
#Coefficients:
#X(Intercept) XGirth XVolume
# 83.2127 -1.8639 0.5843
fit_by_ols <- lm(y_tilde ~ X_tilde - 1)
#Call:
#lm(formula = y_tilde ~ X_tilde - 1)
#Coefficients:
#X_tilde(Intercept) X_tildeGirth X_tildeVolume
# 83.2127 -1.8639 0.5843
So indeed, we see identical results.
Alternatively, we can use lm.fit and lm.wfit:
matfit_by_wls <- lm.wfit(X, y, w)
matfit_by_ols <- lm.fit(X_tilde, y_tilde)
We can check coefficients by:
matfit_by_wls$coefficients
#(Intercept) Girth Volume
# 83.2127455 -1.8639351 0.5843191
matfit_by_ols$coefficients
#(Intercept) Girth Volume
# 83.2127455 -1.8639351 0.5843191
Again, results are the same.
I have a table with Ancylostoma's infection, vs sex (2 factor), location (2 factor), year, management (2 factor), ancestry (4 factor) and viremia like categorical variable, and the I have HL an age like numeric variable.**
I made a glmm:
glm_toxo<-glmer((Ancylostoma) ~ as.factor(Sexo)+(Edad)+as.factor(año)+as.factor(Manejo)+as.factor(Localizacion)+as.factor(Viremia.FeLV) +(Ancestria) +(HL)+as.factor(1|Nombre), family="binomial", data= data_silv)
dd_toxo <- dredge (glm_toxo)
a<- get.models(dd_toxo, subset = delta < 2)
b<-(model.avg(a))
And I got this result
Model-averaged coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.0222 0.8911 2.269 0.0233 *
as.factor(Localizacion)PORT -15.2935 2163.9182 0.007 0.9944
as.factor(Localizacion)SMO -3.0012 0.7606 3.946 7.95e-05 ***
as.factor(Manejo)SILV 1.8125 0.7799 2.324 0.0201 *
Edad -0.1965 0.1032 1.904 0.0569 .
as.factor(Sexo)M 0.5015 0.4681 1.071 0.2840
HL -0.9381 1.4244 0.659 0.5102
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
I would like represent the probability of infection (y) vs age (x), but using the estimate of my model.avg**
I tried with this script:
nseq <- function(x, len = length(x)) seq(min(x, na.rm = TRUE),max(x, na.rm=TRUE), length = len)
####
newdata <- as.data.frame(lapply(lapply(data_silv[2:4], mean), rep, 213))
newdata$Edad <- nseq(data_silv$Edad, nrow(newdata))
(año <- sample(as.factor(data_silv$año),size=213,rep=T))
(Manejo <- sample(as.factor(data_silv$Manejo),size=213,rep=T))
(Sexo <- sample(as.factor(data_silv$Sexo),size=213,rep=T))
newdata <- as.data.frame(cbind(mean(data_silv$HL), año,Manejo,Sexo,
data_silv$Localizacion, nseq(data_silv$Edad, nrow(newdata)),
data_silv$Ancylostoma))
names(newdata) <- c("HL","año","Manejo","Sexo","Localizacion","Edad",
"Ancylostoma")
newdata$pred <- data.frame(
model = sapply(a, predict, newdata = newdata),
averaged.subset = predict(b, newdata, full = FALSE),
averaged.full = predict(b, newdata, full = TRUE)
)
library(ggplot2)
ggplot(newdata,aes(x="Edad",y="pred",color="Localizacion")) + geom_line()
#####
But I haven't got graph...or I have error
Someone know any form to represent my model.avg with categorical and variable numeric?, But taking into account that I only want represent probability of infection vs age, with two line: localizacion1 and localizacion2...(localization had 2 factors).**
my original date would be this table:
#
año <- sample(as.factor(2005:2009),size=213,rep=T)
riqueza <- sample((0:3),size=213,rep=T)
HL <- rnorm(213, mean=0.54, sd=0.13)
Ancylostoma <- sample(as.factor(0:1),size=213,rep=T)
Edad <- sample(as.factor(0:21),size=213,rep=T)
Manejo<- sample(c("CCC", "SILV"), 213, replace = TRUE)
Sexo<- sample(c("M", "H"), 213, replace = TRUE)
Localizacion<- sample(c("SMO", "DON", "PORT"), 213, replace = TRUE)
Ancestria<- sample(c("DON", "SMO", "F1", "F2"), 213, replace = TRUE)
newdata <- as.data.frame(cbind(HL,año,Manejo,Sexo,
Localizacion, Edad,Ancylostoma))
names(newdata) <- c("HL","año","Manejo","Sexo","Localizacion","Edad",
"Ancylostoma")
#
And with that date I make my model's estimates. Then I would like do prediction
Thank you, I don't sure if I am explaining well
I so sorry for my english