Unobserved Components Model Predictions: predict.UCM not working in rucm package - r

I built a model using ucm function. However when i try to forecast for future, it's not letting me pass the independent variables.
library("rucm")
library("lubridate")
#Create Simulated Data
Date<- as.Date(seq(from=as.Date('2012-01-01'),to=as.Date('2014-03-31'),by=1),"%Y-%m-%d")
Actual <- sample(27:65,length(Date),replace = TRUE)
DOW <- wday(Date)
Month <- month(Date)
DOM <- mday(Date)
Week <- week(Date)
Ya <- year(Date)
Ya <- ifelse(Ya=="2014",2,1)
a <- data.frame(Date,Actual,DOW,Month,DOM,Week,Ya)
a$Date<-as.Date(a$Date,"%Y-%m-%d")
abc <- cbind(Weekday=model.matrix(~as.factor(a$DOW)),
Mont=model.matrix(~as.factor(a$Month)),
Day=model.matrix(~as.factor(a$DOM)),a[,7,drop=FALSE],
Weekofyear=model.matrix(~as.factor(a$Week)))
abc<-data.frame(abc)
abc<-data.frame(abc[,c(-1,-8,-20,-52)])
abc2 <- subset(abc,abc$Ya==1)
abc2 <- abc2[,-48]
abc3 <- subset(abc,abc$Ya==2)
abc3 <- abc3[,-48]
#train and insample MAPE
a1<-subset(a,a$Ya==1)
a2<-subset(a,a$Ya==2)
#build model
dat <- as.data.frame(cbind(a1[,2,drop=FALSE], abc2))
fo <- as.formula(paste("Actual ~ ", paste(names(dat)[2:42], collapse= "+")))
fit_train_ucm <- ucm(fo, data = dat, cycle = TRUE, cycle.period = 365)
#predict for future
predict(fit_train_ucm,n.ahead = 90,newdata = abc3)
i am getting this error
Error in is.SSModel(newdata, na.check = TRUE, return.logical = FALSE) :
Object is not of class 'SSModel'
Update:
Based on suggestion by package author from GitHub, i used below code(sorry to say this is not very self explanatory, i did however try to edit code)
SSModel(rep(NA,nrow(abc3)) ~ x + SSMtrend(2, Q = list(fit_train_ucm$est.var.level, fit_train_ucm$est.var.slope)) + SSMseasonal(12, Q = fit_train_ucm$est.var.season), H = fit_train_ucm$irr.var, data=abc3)
Now the error message is
Error in eval(expr, envir, enclos) : object 'x' not found
Any help on this?

This bug of prediction with newdata in ucm is yet to corrected.
This is how you can get predictions for an out-of-sample period.
indep <- paste(names(dat)[2:42], collapse= "+")
newdata1 <- SSModel(as.formula(paste0("rep(NA,nrow(abc3)) ~ ", indep, "+ SSMtrend(1, Q = list(fit_train_ucm$est.var.level))",
"+ SSMcycle(365, Q = fit_train_ucm$est.var.cycle)")), H = fit_train_ucm$irr.var, data=abc3)
pred<-predict(fit_train_ucm$model, newdata=newdata1)
So basically here I am using the predict function from KFAS package. To use the predict function, I have to define the data to be an object of class SSModel.
You will take all the parameters that you used in the model as independent variables and run a State space model with NA's as independent variable.
P.S.: I will keep the comments in mind to write a better answer next time.

Just so someone lands at this page again in future - there has been some update on the git repo maintained by the author to approach this problem in simplistic way. Please check this commit
Alternatively, pasting the function which one may use from that commit:
predict.ucm <- function(object, n.ahead, newdata,...){
#### Predict in sample ####
if (missing(newdata)) {
return(predict(object = object$model, n.ahead = n.ahead))
}
#### Predict out of sample ####
# Regression variables.
model_variables <- paste0(names(object$est), collapse = " + ")
# Trend
# Case 1 no trend
if (is.null(object$est.var.level) & is.null(object$est.var.slope)) {
model_trend <- ""
}
# Case 2 level and trend
if (!is.null(object$est.var.level) & !is.null(object$est.var.slope)) {
model_trend <- "+ SSMtrend(degree = 2, Q = list(object$est.var.level, object$est.var.slope))"
}
# Case 3 level only / trend only is not allowed in R
if (!is.null(object$est.var.level) & is.null(object$est.var.slope)) {
model_trend <- "+ SSMtrend(degree = 1, Q = list(object$est.var.level))"
}
# Seasonality
if (!is.null(object$est.var.season)) {
model_season <- sprintf("+ SSMseasonal(period = %s, Q = object$est.var.season)",
object$call['season.length'] %>% as.character())
} else {
model_season <- ""
}
# Cycle
if (!is.null(object$est.var.cycle)) {
model_cycle <- sprintf("+ SSMcycle(period = %s, Q = object$est.var.cycle)",
object$call['cycle.period'] %>% as.character())
} else {
model_cycle <- ""
}
# Combine all components into a formula
model_formula <- as.formula(sprintf("rep(NA,nrow(newdata)) ~ %s %s %s %s",
model_variables,
model_trend,
model_season,
model_cycle
))
# Build a SSM object for the prediction
oos_data <- KFAS::SSModel(formula = model_formula, H = object$irr.var, data = newdata)
# Return the predictions
predict(object$model, newdata = oos_data)
}
EDIT
In case you are interested in predicting on a new data which is considered as for the same time period as the original model was built using, then following edit can be made in the above function. That new data should ideally include the same dependent variable values which was used to build the first model.
# Using same dependent which was used initially to fit the estimates, instead of NA
model_formula <- as.formula(paste0(
object$model$terms[[2]],
sprintf(
" ~ %s %s %s %s",
model_variables,
model_trend,
model_season,
model_cycle
)
))
# Build a SSM object for the prediction
oos_data <- KFAS::SSModel(formula = model_formula, H = object$irr.var, data = newdata)
# Return the predictions for in sample only - hence removing newdata arg
# predict(object$model, newdata = oos_data)
predict(oos_data)

Related

Apply logistic regression in a function in R

I want to run logistic regression for multiple parameters and store the different metrics i.e AUC.
I wrote the function below but I get an error when I call it: Error in eval(predvars, data, env) : object 'X0' not found even if the variable exists in both my training and testing dataset. Any idea?
new.function <- function(a) {
model = glm(extry~a,family=binomial("logit"),data = train_df)
pred.prob <- predict(model,test_df, type='response')
predictFull <- prediction(pred.prob, test_df$extry)
auc_ROCR <- performance(predictFull, measure = "auc")
my_list <- list("AUC" = auc_ROCR)
return(my_list)
}
# Call the function new.function supplying 6 as an argument.
les <- new.function(X0)
The main reason why your function didn't work is that you are trying to call an object into a formula. You can fix it with paste formula function, but that is ultimately quite limiting.
I suggest instead that you consider using update. This allow you more flexibility to change with multiple variable combination, or change a training dataset, without breaking the function.
model = glm(extry~a,family=binomial("logit"),data = train_df)
new.model = update(model, .~X0)
new.function <- function(model){
pred.prob <- predict(model, test_df, type='response')
predictFull <- prediction(pred.prob, test_df$extry)
auc_ROCR <- performance(predictFull, measure = "auc")
my_list <- list("AUC" = auc_ROCR)
return(my_list)
}
les <- new.function(new.model)
The function can be further improved by calling the test_df as a separate argument, so that you can fit it with an alternative testing data.
To run the function in the way you intended, you would need to use non-standard evaluation to capture the symbol and insert it in a formula. This can be done using match.call and as.formula. Here's a fully reproducible example using dummy data:
new.function <- function(a) {
# Convert symbol to character
a <- as.character(match.call()$a)
# Build formula from character strings
form <- as.formula(paste("extry", a, sep = "~"))
model <- glm(form, family = binomial("logit"), data = train_df)
pred.prob <- predict(model, test_df, type = 'response')
predictFull <- ROCR::prediction(pred.prob, test_df$extry)
auc_ROCR <- ROCR::performance(predictFull, "auc")
list("AUC" = auc_ROCR)
}
Now we can call the function in the way you intended:
new.function(X0)
#> $AUC
#> A performance instance
#> 'Area under the ROC curve'
new.function(X1)
#> $AUC
#> A performance instance
#> 'Area under the ROC curve'
If you want to see the actual area under the curve you would need to do:
new.function(X0)$AUC#y.values[[1]]
#> [1] 0.6599759
So you may wish to modify your function so that the list contains auc_ROCR#y.values[[1]] rather than auc_ROCR
Data used
set.seed(1)
train_df <- data.frame(X0 = sample(100), X1 = sample(100))
train_df$extry <- rbinom(100, 1, (train_df$X0 + train_df$X1)/200)
test_df <- data.frame(X0 = sample(100), X1 = sample(100))
test_df$extry <- rbinom(100, 1, (test_df$X0 + test_df$X1)/200)
Created on 2022-06-29 by the reprex package (v2.0.1)

Dummies not included in summary

I want to create a function which will perform panel regression with 3-level dummies included.
Let's consider within model with time effects :
library(plm)
fit_panel_lr <- function(y, x) {
x[, length(x) + 1] <- y
#adding dummies
mtx <- matrix(0, nrow = nrow(x), ncol = 3)
mtx[cbind(seq_len(nrow(mtx)), 1 + (as.integer(unlist(x[, 2])) - min(as.integer(unlist(x[, 2])))) %% 3)] <- 1
colnames(mtx) <- paste0("dummy_", 1:3)
#converting to pdataframe and adding dummy variables
x <- pdata.frame(x)
x <- cbind(x, mtx)
#performing panel regression
varnames <- names(x)[3:(length(x))]
varnames <- varnames[!(varnames == names(y))]
form <- paste0(varnames, collapse = "+")
x_copy <- data.frame(x)
form <- as.formula(paste0(names(y), "~", form,'-1'))
params <- list(
formula = form, data = x_copy, model = "within",
effect = "time"
)
pglm_env <- list2env(params, envir = new.env())
model_plm <- do.call("plm", params, envir = pglm_env)
model_plm
}
However, if I use data :
data("EmplUK", package="plm")
dep_var<-EmplUK['capital']
df1<-EmplUK[-6]
In output I will get :
> fit_panel_lr(dep_var, df1)
Model Formula: capital ~ sector + emp + wage + output + dummy_1 + dummy_2 +
dummy_3 - 1
<environment: 0x000001ff7d92a3c8>
Coefficients:
sector emp wage output
-0.055179 0.328922 0.102250 -0.002912
How come that in formula dummies are considered and in coefficients are not ? Is there any rational explanation or I did something wrong ?
One point why you do not see the dummies on the output is because they are linear dependent to the other data after the fixed-effect time transformation. They are dropped so what is estimable is estimated and output.
Find below some (not readily executable) code picking up your example from above:
dat <- cbind(EmplUK, mtx) # mtx being the dummy matrix constructed in your question's code for this data set
pdat <- pdata.frame(dat)
rhs <- paste(c("emp", "wage", "output", "dummy_1", "dummy_2", "dummy_3"), collapse = "+")
form <- paste("capital ~" , rhs)
form <- formula(form)
mod <- plm(form, data = pdat, model = "within", effect = "time")
detect.lindep(mod$model) # before FE time transformation (original data) -> nothing offending
detect.lindep(model.matrix(mod)) # after FE time transformation -> dummies are offending
The help page for detect.lindep (?detect.lindep is included in package plm) has some more nice examples on linear dependence before and after FE transformation.
A suggestion:
As for constructing dummy variables, I suggest to use R's factor with three levels and not have the dummy matrix constructed yourself. Using a factor is typically more convinient and less error prone. It is converted to the binary dummies (treatment style) by your typical estimation function using the model.frame/model.matrix framework.

R: Clustered robust standard errors using miceadds lm.cluster - error with subset and weights

I am trying to use the lm.cluster function in the package miceadds to get robust clustered standard errors for a multiply imputed dataset.
I am able to get the standard version of it to run but I get the following error when I try to add a subset or weights:
Error in eval(substitute(subset), data, env) :
..1 used in an incorrect context, no ... to look in
Example that works without subset or weights:
require("mice")
require("miceadds")
data(data.ma01)
# imputation of the dataset: use six imputations
dat <- data.ma01[ , - c(1:2) ]
imp <- mice::mice( dat , maxit=3 , m=6 )
datlist <- miceadds::mids2datlist( imp )
# linear regression with cluster robust standard errors
mod <- lapply(datlist, FUN = function(data){miceadds::lm.cluster( data=data ,
formula=read ~ paredu+ female , cluster = data.ma01$idschool )} )
# extract parameters and covariance matrix
betas <- lapply( mod , FUN = function(rr){ coef(rr) } )
vars <- lapply( mod , FUN = function(rr){ vcov(rr) } )
# conduct statistical inference
summary(pool_mi( qhat = betas, u = vars ))
Example that breaks with subset:
mod <- lapply(datlist, FUN = function(data){miceadds::lm.cluster( data=data ,
formula=read ~ paredu+ female , cluster = data.ma01$idschool, subset=
(data.ma01$urban==1))} )
Error during wrapup: ..1 used in an incorrect context, no ... to look in
Example that breaks with weights:
mod <- lapply(datlist, FUN = function(data){miceadds::lm.cluster( data=data ,
formula=read ~ paredu+ female , cluster = data.ma01$idschool,
weights=data.ma01$studwgt)} )
Error during wrapup: ..1 used in an incorrect context, no ... to look in
From searching, I think I am encountering similar issues as others when passing these commands through an lm or glm wrapper (such as: Passing Argument to lm in R within Function or R : Pass argument to glm inside an R function or Passing the weights argument to a regression function inside an R function)
However, I am not sure how to address the issue with the imputed datasets & existing lm.cluster command.
Thanks
This works fine with the estimatr package which is on CRAN and the estimatr::lm_robust() function. Two notes: (1) you can change the type of standard errors using se_type = and (2) I keep idschool in the data because we like the clusters to be in the same data.frame as we fit the model on.
library(mice)
library(miceadds)
library(estimatr)
# imputation of the dataset: use six imputations
data(data.ma01)
dat <- data.ma01[, -c(1)] # note I keep idschool in data
imp <- mice::mice( dat , maxit = 3, m = 6)
datlist <- miceadds::mids2datlist(imp)
# linear regression with cluster robust standard errors
mod <- lapply(
datlist,
function (dat) {
estimatr::lm_robust(read ~ paredu + female, dat, clusters = idschool)
}
)
# subset
mod <- lapply(
datlist,
function (dat) {
estimatr::lm_robust(read ~ paredu + female, dat, clusters = idschool, subset = urban == 1)
}
)
# weights
mod <- lapply(
datlist,
function (dat) {
estimatr::lm_robust(read ~ paredu + female, dat, clusters = idschool, weights = studwgt)
}
)
# note that you can use the `se_type` argument of lm_robust()
# to change the vcov estimation
# extract parameters and covariance matrix
betas <- lapply(mod, coef)
vars <- lapply(mod, vcov)
# conduct statistical inference
summary(pool_mi( qhat = betas, u = vars ))
I'm no expert, but there is an issue with the passing of the weights to lm(). I know this is not an ideal situation, but I managed to get it to work by modifying the lm.cluster() function to hard code the weights pass and then just used my own.
lm.cluster <- function (data, formula, cluster, wgts=NULL, ...)
{
TAM::require_namespace_msg("multiwayvcov")
if(is.null(wgts)) {
mod <- stats::lm(data = data, formula = formula)
} else {
data$.weights <- wgts
mod <- stats::lm(data = data, formula = formula, weights=data$.weights)
}
if (length(cluster) > 1) {
v1 <- cluster
}
else {
v1 <- data[, cluster]
}
dfr <- data.frame(cluster = v1)
vcov2 <- multiwayvcov::cluster.vcov(model = mod, cluster = dfr)
res <- list(lm_res = mod, vcov = vcov2)
class(res) <- "lm.cluster"
return(res)
}

Function lipsitz.test {generalhoslem} is not working for object clm{ordinal}

I am now tring to test the goodness of fit of an ordianl model using lipsitz.test {generalhoslem}. According to the document, the function can deal with both polr and clm. However, when I try to use clm in the lipsitz.testfunction, an error occurs. Here is an example
library("ordinal")
library(generalhoslem)
data("wine")
fm1 <- clm(rating ~ temp * contact, data = wine)
lipsitz.test(fm1)
Error in names(LRstat) <- "LR statistic" :
'names' attribute [1] must be the same length as the vector [0]
In addition: Warning message:
In lipsitz.test(fm1) :
n/5c < 6. Running this test when n/5c < 6 is not recommended.
Is there any solution to fix this? Thanks a lot.
I'm not sure if this is off-topic and should be on CrossValidated. It's partly a problem with the coding of the test and partly about the statistics of the test itself.
There are two problems. I've just spotted a bug in the code when using clm and will push a fix to CRAN (corrected code below).
There does however appear to be a more fundamental problem with the example data. Basically, the Lipsitz test requires fitting a new model with dummy variables of the groupings. When fitting the new model with this example, the model fails and so some of the coefficients are not calculated. If using polr, the new model gets the warning that it is rank-deficient; if using clm, the new model gets a message that two coefficients are not fitted due to singularities. I think this example data set is just unsuitable for this kind of analysis.
The corrected code is below and I have used a larger example dataset on which the test runs.
lipsitz.test <- function (model, g = NULL) {
oldmodel <- model
if (class(oldmodel) == "polr") {
yhat <- as.data.frame(fitted(oldmodel))
} else if (class(oldmodel) == "clm") {
predprob <- oldmodel$model[, 2:ncol(oldmodel$model)]
yhat <- predict(oldmodel, newdata = predprob, type = "prob")$fit
} else warning("Model is not of class polr or clm. Test may fail.")
formula <- formula(oldmodel$terms)
DNAME <- paste("formula: ", deparse(formula))
METHOD <- "Lipsitz goodness of fit test for ordinal response models"
obs <- oldmodel$model[1]
if (is.null(g)) {
g <- round(nrow(obs)/(5 * ncol(yhat)))
if (g < 6)
warning("n/5c < 6. Running this test when n/5c < 6 is not recommended.")
}
qq <- unique(quantile(1 - yhat[, 1], probs = seq(0, 1, 1/g)))
cutyhats <- cut(1 - yhat[, 1], breaks = qq, include.lowest = TRUE)
dfobs <- data.frame(obs, cutyhats)
dfobsmelt <- melt(dfobs, id.vars = 2)
observed <- cast(dfobsmelt, cutyhats ~ value, length)
if (g != nrow(observed)) {
warning(paste("Not possible to compute", g, "rows. There might be too few observations."))
}
oldmodel$model <- cbind(oldmodel$model, cutyhats = dfobs$cutyhats)
oldmodel$model$grp <- as.factor(vapply(oldmodel$model$cutyhats,
function(x) which(observed[, 1] == x), 1))
newmodel <- update(oldmodel, . ~ . + grp, data = oldmodel$model)
if (class(oldmodel) == "polr") {
LRstat <- oldmodel$deviance - newmodel$deviance
} else if (class(oldmodel) == "clm") {
LRstat <- abs(-2 * (newmodel$logLik - oldmodel$logLik))
}
PARAMETER <- g - 1
PVAL <- 1 - pchisq(LRstat, PARAMETER)
names(LRstat) <- "LR statistic"
names(PARAMETER) <- "df"
structure(list(statistic = LRstat, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, newmoddata = oldmodel$model,
predictedprobs = yhat), class = "htest")
}
library(foreign)
dt <- read.dta("http://www.ats.ucla.edu/stat/data/hsbdemo.dta")
fm3 <- clm(ses ~ female + read + write, data = dt)
lipsitz.test(fm3)
fm4 <- polr(ses ~ female + read + write, data = dt)
lipsitz.test(fm4)

pgmm from plm package gives error for summary

I am trying to use the pgmm function from the plm package for R. The regression runs and I can call up the results, however, asking for the summary gives the following error:
Error in t(y) %*% x : non-conformable arguments
I've imported the data from the World Bank using the WDI package:
library(plm) # load package
library(WDI) # Load package
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
Calling the results as follows works.
EQ
But the summary (below) gives the error message mentioned above.
summary(EQ)
I think the error occurs because summary.pgmm tries to do a second order Arelland-Bond test of serial correlation on your data, but your data only have two points (2008 and 2009) so it fails.
To fix this problem, you could patch the function so that it checks whether you only have two points in the data set and runs the test only if you have more than two points. I provide a patched function below:
summary.pgmm.patched <- function (object, robust = FALSE, time.dummies = FALSE, ...)
{
model <- plm:::describe(object, "model")
effect <- plm:::describe(object, "effect")
transformation <- plm:::describe(object, "transformation")
if (robust) {
vv <- vcovHC(object)
}
else {
vv <- vcov(object)
}
if (model == "onestep")
K <- length(object$coefficients)
else K <- length(object$coefficients[[2]])
Kt <- length(object$args$namest)
if (!time.dummies && effect == "twoways")
rowsel <- -c((K - Kt + 1):K)
else rowsel <- 1:K
std.err <- sqrt(diag(vv))
b <- coef(object)
z <- b/std.err
p <- 2 * pnorm(abs(z), lower.tail = FALSE)
CoefTable <- cbind(b, std.err, z, p)
colnames(CoefTable) <- c("Estimate", "Std. Error", "z-value",
"Pr(>|z|)")
object$CoefTable <- CoefTable[rowsel, , drop = FALSE]
object$sargan <- sargan(object)
object$m1 <- plm:::mtest(object, 1, vv)
# The problem line:
# object$m2 <- mtest(object, 2, vv)
if (length(object$residuals[[1]] ) > 2) object$m2 <- plm:::mtest(object, 2, vv)
object$wald.coef <- plm:::wald(object, "param", vv)
if (plm:::describe(object, "effect") == "twoways")
object$wald.td <- plm:::wald(object, "time", vv)
class(object) <- "summary.pgmm"
object
}
You might want to write to the author of the plm package and show him this post. The author will be able to write a less 'hacky' patch.
Using your own (slightly modified) example data, here is how you would use the function:
library(WDI) # Load package
library(plm)
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
names(PANEL) [c(4,5)] = c('gdp','fertility')
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
summary.pgmm.patched(EQ)

Resources