Standardized residuals in R package fGarch

I'm using the fGarch package in R to fit an ARMA-GARCH(1,1) model to a time series. I want to extract standardized residuals, meaning the residuals divided by the corresponding daily volatility estimate. I tried two things:
res <- residuals(m1, standardize=FALSE)
vol <- volatility(m1)
stand.res <- res/vol
and
stand.res <- residuals(m1, standardize=TRUE)
if I plot both results, they differ from each other. Why is that?
Thank you very much.

I'm having a similar problem; please consider:
rm(list=ls(all=TRUE))
library(fGarch)
set.seed(4)
x <- runif(6587, -0.10, 0.10)
gfit <- garchFit(formula = ~ garch(2,2), cond.dist = "std", data = x, include.shape=TRUE, trace=FALSE)
and
condVar <- gfit@h.t
resid <- x / sqrt(condVar)
tail(resid) # Standardized Residuals
#[1] -0.4201041 -0.8342208 1.5639541 1.0237848 -0.1779349 -0.7820030
#or
tail(x / gfit@sigma.t)
#[1] -0.4201041 -0.8342208 1.5639541 1.0237848 -0.1779349 -0.7820030
vs
tail(residuals(gfit, standardize = TRUE))
#[1] -0.4156200 -0.8297368 1.5684382 1.0282689 -0.1734509 -0.7775190
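The two sets of values differ by a constant shift, which suggests that residuals(gfit, standardize = TRUE) subtracts the fitted conditional mean before dividing by sigma.t, while x / gfit@sigma.t does not. A quick check of that hypothesis (assuming the @fitted slot holds the fitted mean values):
# Manually standardize after removing the fitted mean
manual <- (x - gfit@fitted) / gfit@sigma.t
tail(manual)
# If these match residuals(gfit, standardize = TRUE), the discrepancy was
# only the mean adjustment.
tail(residuals(gfit, standardize = TRUE))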

Related

Cointegration/GPH test on time series data

I am quite new to R and statistics. I want to test two time series for cointegration with the GPH test. For this I use the package LongMemoryTS and the function gph(): gph(X, m, l = 1). As output I get the value of the parameter d, but I don't know how to determine the t-value or how to generate it.
Here is a link to the function in the package: https://rdrr.io/cran/LongMemoryTS/man/gph.html
Here is my first try as a code snippet:
lr.reg.IP_AT_DE <- lm(IP_AT ~ IP_DE)
error.IP_AT_DE <- residuals(lr.reg.IP_AT_DE)
T <- length(error.IP_AT_DE)
d.IP_AT_DE <- gph(X = error.IP_AT_DE, m = T^0.4)
Thanks a lot in advance for the help!
For obtaining critical values, bootstrapping seems like a good approach; see http://repec.org/sce2004/up.4249.1077649371.pdf.
There is also the approach of using the theoretical error variance (see page 3 in the paper). I'm not entirely sure this is correct, but you would at least obtain a statistic with this code:
library(xts)
library(LongMemoryTS)
set.seed(123)
# generate some data
dat_ts <- xts(matrix(rnorm(200), ncol = 2), order.by = seq.Date(Sys.Date()-100, length.out = 100, by = 'days'))
reg1 <- lm(dat_ts[, 1]~dat_ts[, 2])
error_reg1 <- residuals(reg1)
nT <- length(error_reg1) # bad idea to name it T!
res1 <- gph(X=error_reg1, m=nT^0.4)
res1
t_stat <- (res1-0)/((pi^2)/6)^0.5
p_value <- pt(q = t_stat, df = 1, lower.tail = TRUE)
p_value
[1] 0.3831716
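For the bootstrap route, one option is a moving-block bootstrap of the residual series so the serial dependence is preserved. A minimal sketch with boot::tsboot, assuming (as the code above does) that gph() returns the d estimate as a single number, and treating R = 500 and the block length l = 20 as placeholder choices:
library(boot)
# Statistic recomputed on every resampled series
gph_stat <- function(ts_data) {
  n <- length(ts_data)
  gph(X = ts_data, m = n^0.4)
}
bt <- tsboot(as.numeric(error_reg1), statistic = gph_stat,
             R = 500, l = 20, sim = "fixed")
quantile(bt$t, c(0.025, 0.975)) # rough percentile interval for d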

How to use the replicate function in R to repeat a function

I have a problem when using replicate to repeat a function.
I used the bootstrap to fit a quadratic model with concentration as the predictor and Total_lignin as the response, and I want to report an estimate of the maximum with a corresponding standard error.
My idea was to create a function called bootFun that essentially does everything within one iteration of a for loop. bootFun takes in only the data set, the predictor, and the response to use (both variable names in quotes).
However, the SD is 0, which can't be right. I don't know where the mistake is. Could you please help me with it?
# Load the libraries
library(dplyr)
library(tidyverse)
# Read the .csv and only use M.giganteus and S.ravennae.
dat <- read_csv('concentration.csv') %>%
  filter(variety == 'M.giganteus' | variety == 'S.ravennae') %>%
  arrange(variety)
# Check the data
head(dat)
# sample size
n <- nrow(dat)
# A function to do one iteration
bootFun <- function(dat, pred, resp){
  # Draw the sample size from the dataset
  sample <- sample_n(dat, n, replace = TRUE)
  # A quadratic model fit
  formula <- paste0('resp', '~', 'pred', '+', 'I(pred^2)')
  fit <- lm(formula, data = sample)
  # Derive the max of the value of concentration
  max <- -fit$coefficients[2]/(2*fit$coefficients[3])
  return(max)
}
max <- bootFun(dat = dat, pred = 'concentration', resp = 'Total_lignin' )
# Iterated times
N <- 5000
# Use 'replicate' function to do a loop
maxs <- replicate(N, max)
# An estimate of the max of predictor and corresponding SE
mean(maxs)
sd(maxs)
The base package boot has a function boot() that can ease the job of calling the bootstrap function repeatedly. Its first argument must be the data set, its second argument is an indices argument that the user does not set, and other arguments can also be passed to it. In this case those other arguments are the predictor and the response names.
library(boot)
bootFun <- function(dat, indices, pred, resp){
  # Resample the rows selected by the bootstrap indices
  dat.sample <- dat[indices, ]
  # A quadratic model fit
  formula <- paste0(resp, '~', pred, '+', 'I(', pred, '^2)')
  formula <- as.formula(formula)
  fit <- lm(formula, data = dat.sample)
  # Derive the max of the value of concentration
  max <- -fit$coefficients[2]/(2*fit$coefficients[3])
  return(max)
}
N <- 5000
set.seed(1234) # Make the bootstrap results reproducible
results <- boot(dat, bootFun, R = N, pred = 'concentration', resp = 'Total_lignin')
results
#
#ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
#Call:
#boot(data = dat, statistic = bootFun, R = N, pred = "concentration",
# resp = "Total_lignin")
#
#
#Bootstrap Statistics :
# original bias std. error
#t1* -0.4629808 -0.0004433889 0.03014259
#
results$t0 # this is the statistic, not bootstrapped
#concentration
# -0.4629808
mean(results$t) # bootstrap value
#[1] -0.4633233
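The standard error the question asks for is the "std. error" column of the print output above, i.e. the standard deviation of the bootstrap replicates; boot.ci() gives a confidence interval from the same object:
sd(results$t)                   # bootstrap standard error of the estimated maximum
boot.ci(results, type = "perc") # percentile confidence interval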
Note that to fit a polynomial, the function poly() is much simpler than writing the polynomial terms out one by one.
formula <- paste0(resp, '~ poly(', pred, ',2, raw = TRUE)')
Check the distribution of the bootstrapped statistic.
op <- par(mfrow = c(1, 2))
hist(results$t)
qqnorm(results$t)
qqline(results$t)
par(op)
Test data
set.seed(2020) # Make the results reproducible
x <- cumsum(rnorm(100))
y <- x + x^2 + rnorm(100)
dat <- data.frame(concentration = x, Total_lignin = y)
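As for the original replicate approach: sd(maxs) is 0 because replicate(N, max) just re-evaluates the stored value max N times instead of rerunning the bootstrap. A sketch of the direct fix, which also builds the formula from the supplied column names rather than the literal strings 'pred' and 'resp':
bootFun <- function(dat, pred, resp){
  # Resample rows with replacement
  boot.sample <- dat[sample(nrow(dat), replace = TRUE), ]
  # Build the quadratic formula from the actual column names
  f <- as.formula(paste0(resp, ' ~ ', pred, ' + I(', pred, '^2)'))
  fit <- lm(f, data = boot.sample)
  # Location of the maximum of the fitted quadratic
  -fit$coefficients[2] / (2 * fit$coefficients[3])
}
set.seed(1234)
maxs <- replicate(5000, bootFun(dat, 'concentration', 'Total_lignin'))
mean(maxs)
sd(maxs)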

naiveBayes giving unexpected result when using nonzero laplace argument (package e1071)

I'm trying to use the naiveBayes() function from the e1071 package. When I add a non-zero laplace argument, my resulting probability estimates are not changing and I don't understand why.
Example:
library(e1071)
# Generate data
train.x <- data.frame(x1=c(1,1,0,0), x2=c(1,0,1,0))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=c(1), x2=c(1))
# without laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=0)
predict(classifier, test.x, type="raw") # returns (1, 0.00002507)
# with laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=1)
predict(classifier, test.x, type="raw") # returns (1, 0.00002507)
I would expect the probabilities to change in this case since all the training instances for the "dog" class have 0 for x1. To check this, here's the same thing using Python
Python example:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
train_x = pd.DataFrame({'x1': [1, 1, 0, 0], 'x2': [1, 0, 1, 0]})
train_y = np.array(["cat", "cat", "dog", "dog"])
test_x = pd.DataFrame({'x1':[1,], 'x2':[1,]})
# alpha near zero (i.e. laplace = 0)
classifier = BernoulliNB(alpha=.00000001)
classifier.fit(X=train_x, y=train_y)
classifier.predict_proba(X=test_x) # returns (1, 0)
# alpha = 1 (i.e. laplace = 1)
classifier = BernoulliNB(alpha=1)
classifier.fit(X=train_x, y=train_y)
classifier.predict_proba(X=test_x) # returns (.75, .25)
Why am I getting this unexpected result using e1071?
Laplace smoothing is only applied to categorical features, not numeric ones. You can see this in the source code:
## estimation-function
est <- function(var)
  if (is.numeric(var)) {
    cbind(tapply(var, y, mean, na.rm = TRUE),
          tapply(var, y, sd, na.rm = TRUE))
  } else {
    tab <- table(y, var)
    (tab + laplace) / (rowSums(tab) + laplace * nlevels(var))
  }
For numeric variables, Gaussian estimates are used instead. So convert your data to factors and you are good to go.
train.x <- data.frame(x1=c("1","1","0","0"), x2=c("1","0","1","0"))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=c("1"), x2=c("1"))
# without laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=0)
predict(classifier, test.x, type="raw") # returns ~100% for cat, ~0% for dog
# with laplace smoothing
classifier <- naiveBayes(x=train.x, y=train.y, laplace=1)
predict(classifier, test.x, type="raw") # returns 75% for cat, 25% for dog
Major facepalm on this one. The naiveBayes() method was interpreting x1 and x2 as numeric variables and thus trying to use Gaussian conditional probability distributions internally (I think). Encoding my variables as factors solved my problem.
train.x <- data.frame(x1=factor(c(1,1,0,0)), x2=factor(c(1,0,1,0)))
train.y <- factor(c("cat", "cat", "dog", "dog"))
test.x <- data.frame(x1=factor(c(1)), x2=factor(c(1)))
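To confirm that the smoothing actually takes effect, you can inspect the per-feature conditional probability tables stored on the fitted model (a quick check using the factor-encoded data above; the formula in the comment follows the est() snippet from the accepted answer):
classifier <- naiveBayes(x = train.x, y = train.y, laplace = 1)
classifier$tables$x1
# With laplace = 1 the "dog" row no longer contains a zero:
# P(x1 = "1" | dog) = (0 + 1) / (2 + 1 * 2) = 0.25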

Multiply coefficients with standard deviation

In R, the stargazer package offers the possibility to apply functions to the coefficients, standard errors, etc.:
library(foreign)
library(MASS)
library(stargazer)
dat <- read.dta("http://www.ats.ucla.edu/stat/stata/dae/nb_data.dta")
dat <- within(dat, {
  prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
  id <- factor(id)
})
m1 <- glm.nb(daysabs ~ math + prog, data = dat)
transform_coef <- function(x) (exp(x) - 1)
stargazer(m1, apply.coef=transform_coef)
How can I apply a function where the factor with which I multiply depends on the variable, like the standard deviation of that variable?
This may not be exactly what you hoped for, but you can transform the coefficients, and give stargazer a custom list of coefficients. For example, if you would like to report the coefficient times the standard deviation of each variable, the following extension of your example could work:
library(foreign)
library(stargazer)
library(MASS)
dat <- read.dta("http://www.ats.ucla.edu/stat/stata/dae/nb_data.dta")
dat <- within(dat, {
  prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
  id <- factor(id)
})
m1 <- glm.nb(daysabs ~ math + prog, data = dat)
# Store coefficients (and other coefficient stats)
s1 <- summary(m1)$coefficients
# Calculate standard deviations (using zero for the constant)
math.sd <- sd(dat$math)
acad.sd <- sd(as.numeric(dat$prog == "Academic"))
voc.sd <- sd(as.numeric(dat$prog == "Vocational"))
int.sd <- 0
# Append standard deviations to stored coefficients
StdDev <- c(int.sd, math.sd, acad.sd, voc.sd)
s1 <- cbind(s1, StdDev)
# Store custom list
new.coef <- s1[ , "Estimate"] * s1[ , "StdDev"]
# Output
stargazer(m1, coef = list(new.coef))
You may want to consider a couple of issues beyond your original question about outputting coefficients in stargazer. Should you report the intercept when multiplying by the standard deviation? Will your standard errors and inference be the same under this transformation?
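On that last point, if you want the reported standard errors on the same scale as the rescaled coefficients (whether that is appropriate is a separate question), one option is to multiply them by the same factors and pass them through stargazer's se argument:
new.se <- s1[ , "Std. Error"] * s1[ , "StdDev"]
stargazer(m1, coef = list(new.coef), se = list(new.se))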

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way with respect to the target variable. There are two approaches that I know of:
the continuous variable is binned such that its IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WOE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replacement WOE values that you can then pass into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
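An untested sketch with discretization, assuming its chiM() function follows the documented convention that the last column of the data frame is the class, alpha is the chi-square significance level, and the result has $cutp and $Disc.data components:
library(discretization)
toy <- data.frame(x = rnorm(200), target = rbinom(200, 1, 0.5))
chi_res <- chiM(toy, alpha = 0.05) # ChiMerge: merge adjacent bins via chi-square tests
chi_res$cutp            # cut points found for x
head(chi_res$Disc.data) # discretized data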
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics, because this results in an implicit hiding of the degrees of freedom expended while moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV = 1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
  if(is.numeric(mydata[,i])){
    val.quant <- unname(quantile(mydata[,i], probs = 0.75))
    mydata[,i] <- sapply(mydata[,i], function(x){ if(x > (1.5*val.quant + 1)) 1.5*val.quant + 1 else x })
  }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
  rf <- fit(Y~., data = mydata2, model = "randomForest", mtry = 10, ntree = 100)
  varImportance <- Importance(rf, mydata2, method = "sensg")
  Z <- order(varImportance$imp, decreasing = FALSE)
  IND <- Z[2]
  var_to_remove <- names(mydata2[IND])
  mydata2[IND] <- NULL
  print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
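# To attach the bins smbinning found back onto the data, smbinning.gen() can
# generate a new factor column (the column name "X37_bin" here is just an example):
inp <- smbinning.gen(df = inp, ivout = result, chrname = "X37_bin")
table(inp$X37_bin)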
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(target ~ ., data = train, family = binomial)
summary(fit1)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf, test))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("tomasgreif/riv")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
for(i in seq_along(folds)){
  smpl <- folds[[i]]
  g_train <- mydata2[-smpl, !names(mydata2) %in% c("Y")]
  g_test <- mydata2[smpl, !names(mydata2) %in% c("Y")]
  cost_train <- mydata2[-smpl, "Y"]
  cost_test <- mydata2[smpl, "Y"]
  rf <- randomForest(g_train, cost_train)
  logit.data <- cbind(cost_train, g_train)
  logit.fit <- glm(cost_train ~ ., data = logit.data, family = binomial)
  prediction <- data.frame(actual = cost_test, predicted = predict(rf, g_test))
}
