Check for conditional heteroskedasticity - r

Data:
dput(head(mydata))
structure(list(DATE = structure(c(-315619200, -312940800, -310435200,
-307756800, -305164800, -302486400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), RF = c(0.33, 0.29, 0.35, 0.19, 0.27, 0.24), RMRF = c(-6.99,
0.99, -1.46, -1.7, 3.08, 2.09), SMB = c(2.13, 0.71, -0.65, 0.32,
1.42, -0.24), UMD = c(-3.28, 3.59, 1.85, 2.6, 4.77, 1.03), HML = c(2.65,
-2.15, -2.69, -2.22, -3.83, -0.3), JANDUM = c(1, 0, 0, 0, 0,
0), R4 = c(-4.57, 1.5, -2.83, -1.98, 3.54, 2.15)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
My data contains:
R4 is the percentage return of a portfolio, RF is the risk-free rate, RMRF is the excess return of the market portfolio, SMB, UMD, and HML are three factors, and JANDUM is a January dummy variable.
The data are monthly from 1/1960 to 12/2003 (528 observations in total).
I wrote the following code to "regress the portfolio excess return (R4 - RF) on a constant and all other variables (RMRF, SMB, UMD, HML, and JANDUM)".
mydata$PER <- mydata$R4 - mydata$RF
mydata$JANDUM <- as.factor(mydata$JANDUM)
# Fit regression model
model <- lm(PER ~ DATE + RMRF + SMB + UMD + HML + JANDUM, data = mydata)
summary(model)
Then I want to check for the presence of autocorrelation and conditional heteroskedasticity.
My try:
# Tests to establish the presence or absence of heteroscedasticity
# Breusch-Pagan test
lmtest::bptest(model)
# Non-constant variance score test
car::ncvTest(model)
PER_BCMod <- caret::BoxCoxTrans(mydata$PER)
print(PER_BCMod)
# The model for creating the Box-Cox transformed variable is ready. Let's now apply it to mydata$PER and append it to a new data frame.
mydata <- cbind(mydata, PER_New = predict(PER_MCMod, mydata$PER))
head(mydata)
# The transformed data for our new regression model is ready. Let's build the model and check for heteroscedasticity.
model_bc <- lm(PER_New ~ DATE + RMRF + SMB + UMD + HML + JANDUM, data=mydata)
bptest(model_bc)
plot(model_bc)
Errors:
> print(PER_BCMod)
Box-Cox Transformation
528 data points used to estimate Lambda
Input data summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-30.0700 -2.7800 1.0500 0.6758 4.4325 23.5800
Lambda could not be estimated; no transformation is applied
> # The model for creating the Box-Cox transformed variable is ready. Let's now apply it to mydata$PER and append it to a new data frame.
> mydata <- cbind(mydata, PER_New = predict(PER_MCMod, mydata$PER))
Error in predict(PER_MCMod, mydata$PER) : object 'PER_MCMod' not found
I can't understand what the fault is here. Your suggestions would be welcome.
Is this process wrong? Should I follow another approach?
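For reference, a minimal sketch of commonly used tests for these two issues, assuming the model fitted above (the lag choices are illustrative, not part of the original post):
lmtest::dwtest(model)              # Durbin-Watson test for first-order autocorrelation
lmtest::bgtest(model, order = 12)  # Breusch-Godfrey test up to 12 lags
# Ljung-Box test on the squared residuals as a simple check for ARCH-type effects
Box.test(residuals(model)^2, lag = 12, type = "Ljung-Box")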

Related

Plot side-by-side forest plots using forest function in meta package

I'd like to be able to plot side by side forest plots using the forest() function in the meta package in R. I have successfully done this using the forest.default() function in the metafor package, but I prefer the forest() plots generated using the meta package. Here is what I have tried so far:
oldpar <- par(mfrow=c(1, 2))
oldpar
res <- metagen(TE=sens, seTE=sens.se, data=df, studlab=study)
forest(res, data=df, method.tau="REML", comb.random=TRUE,
       leftcols="studlab", rightcols=c("effect", "ci"))
res2 <- metagen(TE=sens2, seTE=sens.se2, data=df, studlab=study)
forest(res2, data=df, method.tau="REML", comb.random=TRUE,
       leftcols="studlab", rightcols=c("effect", "ci"))
I have also tried:
par(mfrow=c(1,2))
par(mar=c(5,4,1,1))
res <- metagen(TE=sens, seTE=sens.se, data=df, studlab=study)
forest(res, data=df, method.tau="REML", comb.random=TRUE,
       leftcols="studlab", rightcols=c("effect", "ci"))
par(mar=c(5,3,1,2))
res2 <- metagen(TE=sens2, seTE=sens.se2, data=df, studlab=study)
forest(res2, data=df, method.tau="REML", comb.random=TRUE,
       leftcols="studlab", rightcols=c("effect", "ci"))
Finally, I have tried to work with the "grid" and "lattice" packages to no avail. When I attempt to store the plots as objects, they come up as "NULL" in the global environment.
Both of those methods have worked on other types of plots, but inexplicably do not appear to work for the forest plots generated by the forest function in the meta package.
Please let me know if you have a solution to this.
Thank you!
Update - the structure of my dataset:
structure(list(study = 1:7, sens = c(0.88, 0.86, 0.75, 0.9, 0.91,
0.93, 0.98), sens.se = c(0.13, 0.08, 0.2, 0.06, 0.13, 0.15, 0.66
), sens2 = c(0.76, 0.68, 0.9, 0.82, 0.76, 0.85, 0.76), sens.se2 = c(0.14,
0.08, 0.2, 0.06, 0.14, 0.15, 0.66)), class = "data.frame", row.names =
c(NA, -7L))
Here is a reproducible example using the datasets available in meta. As stated in the comments, meta produces very nice plots, but they are drawn with the grid graphics system.
library(meta)
library(vcd)
library(gridGraphics)
library(gridExtra)
data(Olkin95)
meta.olkin95 <- metabin(event.e, n.e, event.c, n.c,
                        studlab = paste(author, year),
                        data = Olkin95, subset = c(41, 47, 51, 59),
                        method = "Inverse")
forest(meta.olkin95, comb.fixed = FALSE)
plot.olkin95 <- grid.grab()
data(Fleiss93cont)
meta.fleiss93 <- metacont(n.e, mean.e, sd.e, n.c, mean.c, sd.c,
                          studlab = paste(study, year),
                          data = Fleiss93cont,
                          sm = "SMD")
forest(meta.fleiss93, comb.fixed = FALSE)
plot.fleiss93 <- grid.grab()
grid.newpage()
grid.arrange(plot.olkin95, plot.fleiss93, ncol=1)
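Applied to the question's data (a sketch, assuming df as in the dput above and the libraries loaded earlier, plus grid for grid.grab()), the same pattern with ncol = 2 puts the two forest plots side by side:
library(grid)
res  <- metagen(TE = sens,  seTE = sens.se,  data = df, studlab = study, method.tau = "REML")
res2 <- metagen(TE = sens2, seTE = sens.se2, data = df, studlab = study, method.tau = "REML")
forest(res,  comb.random = TRUE, leftcols = "studlab", rightcols = c("effect", "ci"))
plot1 <- grid.grab()
forest(res2, comb.random = TRUE, leftcols = "studlab", rightcols = c("effect", "ci"))
plot2 <- grid.grab()
grid.newpage()
grid.arrange(plot1, plot2, ncol = 2)  # side by side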

Error in nonlinear least squares in R - Logistic and Gompertz curves

I'm working on a model for variable y, in which I intend to use time as an explanatory variable. I've chosen a Gompertz and a logistic curve as candidates, but when I try to estimate the coefficients (using both nls and nls2), I end up getting different errors (singularity or step factor reduced below 'minFactor'). I would really appreciate any help. Here is my code and a dput version of the info object.
I chose the initial values according to the criteria in http://www.metla.fi/silvafennica/full/sf33/sf334327.pdf
library(nls2)
> dput(info)
structure(list(y = c(0.308, 0.279, 0.156, 0.214, 0.224, 0.222,
0.19, 0.139, 0.111, 0.17, 0.155, 0.198, 0.811, 0.688, 0.543,
0.536, 0.587, 0.765, 0.667, 0.811, 0.587, 0.617, 0.586, 0.633,
2.231, 2.202, 1.396, 1.442, 1.704, 2.59, 2.304, 3.026, 2.7, 3.275,
3.349, 3.936, 9.212, 8.773, 6.431, 6.983, 7.169, 9.756, 10.951,
13.938, 14.378, 18.406, 24.079, 28.462, 51.461, 46.555, 39.116,
43.982, 41.722), t = 1:53), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -53L))
summary(gomp_nls <- nls2(y ~ alpha*exp(-beta*exp(-gamma*t)),
                         data = info,
                         start = list(alpha = 40, beta = 4.9, gamma = 0.02),
                         algorithm = "default"))
summary(logist_nls <- nls2(y ~ alpha/(1+beta*exp(-gamma*t)),
                           data = info,
                           start = list(alpha = 40, beta = 128, gamma = 0.02),
                           algorithm = "default"))
I'd appreciate any help
The "default" algorithm for nls2 is to use nls. You want to specify "brute-force" or one of the other algorithms for finding an initial value. The starting value should be a data frame of two rows such that it will fill in the hypercube so defined with potential starting values.
It will then evaluate the residual sum of squares at each of those starting values and return the starting values at which the formula gives the least sum of squares.
If you find that the result returned by nls2 is at the boundary of the region you defined then enlarge the region and try again. (You might not need this step if the starting value returned are good enough anyways.)
Finally run nls with the starting values you found.
library(nls2)
## 1
fo1 <- y ~ alpha*exp(-beta*exp(-gamma*t))
st1 <- data.frame(alpha = c(10, 100), beta = c(1, 100), gamma = c(0.01, 0.20))
fm1.0 <- nls2(fo1, data = info, start = st1, algorithm = "brute-force")
fm1 <- nls(fo1, data = info, start = coef(fm1.0))
## 2
fo2 <- y ~ alpha/(1+beta*exp(-gamma*t))
st2 <- data.frame(alpha = c(10, 1000), beta = c(1, 10000), gamma = c(0.01, 0.20))
fm2.0 <- nls2(fo2, data = info, start = st2, algorithm = "brute-force")
fm2 <- nls(fo2, data = info, start = coef(fm2.0))
# plot both fits
plot(y ~ t, info)
lines(fitted(fm1) ~ t, info, col = "blue")
lines(fitted(fm2) ~ t, info, col = "red")
Note
Note that, for the data shown, these two 2-parameter exponential models fit reasonably well, so if you are only interested in the range where y rises exponentially, they could be alternatives to consider. (The first one below is better because the coefficients are more similar to each other. The second one may have scaling problems.)
fm3 <- nls(y ~ a * exp(b/t), info, start = c(a = 1, b = 1))
fm4 <- nls(y ~ a * t^b, info, start = c(a = .001, b = 6))
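If it helps, one way to compare the four fits on the same plot (a sketch, assuming the objects created above):
AIC(fm1, fm2, fm3, fm4)  # information-criterion comparison of the four models
lines(fitted(fm3) ~ t, info, col = "darkgreen", lty = 2)
lines(fitted(fm4) ~ t, info, col = "purple", lty = 2)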

Solve best fit polynomial and plot drop-down lines

I'm using R 3.3.1 (64-bit) on Windows 10. I have an x-y dataset that I've fit with a 2nd order polynomial. I'd like to solve that best-fit polynomial for x at y=4, and plot drop-down lines from y=4 to the x-axis.
This will generate the data in a dataframe v1:
v1 <- structure(list(x = c(-5.2549, -3.4893, -3.5909, -2.5546, -3.7247,
-5.1733, -3.3451, -2.8993, -2.6835, -3.9495, -4.9649, -2.8438,
-4.6926, -3.4768, -3.1221, -4.8175, -4.5641, -3.549, -3.08, -2.4153,
-2.9882, -3.4045, -4.6394, -3.3404, -2.6728, -3.3517, -2.6098,
-3.7733, -4.051, -2.9385, -4.5024, -4.59, -4.5617, -4.0658, -2.4986,
-3.7559, -4.245, -4.8045, -4.6615, -4.0696, -4.6638, -4.6505,
-3.7978, -4.5649, -5.7669, -4.519, -3.8561, -3.779, -3.0549,
-3.1241, -2.1423, -3.2759, -4.224, -4.028, -3.3412, -2.8832,
-3.3866, -0.1852, -3.3763, -4.317, -5.3607, -3.3398, -1.9087,
-4.431, -3.7535, -3.2545, -0.806, -3.1419, -3.7269, -3.4853,
-4.3129, -2.8891, -3.0572, -5.3309, -2.5837, -4.1128, -4.6631,
-3.4695, -4.1045, -7.064, -5.1681, -6.4866, -2.7522, -4.6305,
-4.2957, -3.7552, -4.9482, -5.6452, -6.0302, -5.3244, -3.9819,
-3.8123, -5.3085, -5.6096, -6.4557), y = c(0.99, 0.56, 0.43,
2.31, 0.31, 0.59, 0.62, 1.65, 2.12, 0.1, 0.24, 1.68, 0.09, 0.59,
1.23, 0.4, 0.36, 0.49, 1.41, 3.29, 1.22, 0.56, 0.1, 0.67, 2.38,
0.43, 1.56, 0.07, 0.08, 1.53, -0.01, 0.12, 0.1, 0.04, 3.42, 0.23,
0, 0.34, 0.15, 0.03, 0.19, 0.17, 0.2, 0.09, 2.3, 0.07, 0.15,
0.18, 1.07, 1.21, 3.4, 0.8, -0.04, 0.02, 0.74, 1.59, 0.71, 10.64,
0.64, -0.01, 1.06, 0.81, 4.58, 0.01, 0.14, 0.59, 7.35, 0.63,
0.17, 0.38, -0.08, 1.1, 0.89, 0.94, 1.52, 0.01, 0.1, 0.38, 0.02,
7.76, 0.72, 4.1, 1.36, 0.13, -0.02, 0.13, 0.42, 1.49, 2.64, 1.01,
0.08, 0.22, 1.01, 1.53, 4.39)), .Names = c("x", "y"), class = "data.frame", row.names = c(NA,
-95L))
Here's the code to plot y vs x, plot the best fit polynomial, and draw a line at y=4.
> attach(v1)
> # simple x-y plot of the data
> plot(x,y, pch=16)
> # 2nd order polynomial fit
> fit2 <- lm(y~poly(x,2,raw=TRUE))
> summary(fit2)
> # generate range of numbers for plotting polynomial
> xx <- seq(-8,0, length=50)
> # overlay best fit polynomial
> lines(xx, predict(fit2, data.frame(x=xx)), col="blue")
> # add horizontal line at y=4
> abline(h=4, col="red")
>
It's obvious from the plot that y=4 at x of around -2 and -6.5, but I'd like to actually solve the regression polynomial for those values.
Ideally, I'd like lines that drop down from the red-blue line intersections to the x-axis (i.e. plot vertical ablines that terminate at the two y=4 solutions). If that's not possible, I'd be happy with good old vertical ablines that go all the way up the plot, so long as they are at the proper x solution values.
This graph represents parts that will be out-of-spec when y>4, so I want to use the drop-down lines to highlight the range of x values that will produce in-spec parts.
You can use the quadratic formula to calculate the values:
betas <- coef(fit2) # get coefficients
betas[1] <- betas[1] - 4 # adjust intercept to look for values where y = 4
# note degree increases, so betas[1] is c, etc.
betas
## (Intercept) poly(x, 2, raw = TRUE)1 poly(x, 2, raw = TRUE)2
## 8.7555833 6.0807302 0.7319848
solns <- c((-betas[2] + sqrt(betas[2]^2 - 4 * betas[3] * betas[1])) / (2 * betas[3]),
           (-betas[2] - sqrt(betas[2]^2 - 4 * betas[3] * betas[1])) / (2 * betas[3]))
solns
## poly(x, 2, raw = TRUE)1 poly(x, 2, raw = TRUE)1
## -1.853398 -6.453783
segments(solns, -1, solns, 4, col = 'green') # add segments to graph
Much simpler (if you can find it) is polyroot:
polyroot(betas)
## [1] -1.853398+0i -6.453783+0i
Since it returns a complex vector, you'll need to wrap it in as.numeric if you want to pass it to segments.
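For example (continuing from the code above):
solns2 <- as.numeric(polyroot(betas))  # warns that imaginary parts are discarded
# Re(polyroot(betas)) gives the same values without the warning
segments(solns2, -1, solns2, 4, col = "green")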
I absolutely understand that there is an analytical solution for this simple quadratic polynomial. The reason I show a numerical solution is that you ask this question in a regression setting. A numerical solution remains available in general, when you have a more complicated regression curve.
In the following I will use the uniroot function. If you are not familiar with it, read this short answer first: Uniroot solution in R.
This is the plot produced with your code. You are almost there. This is a root-finding problem, and you can solve it numerically with uniroot. Let's define a function:
f <- function (x) {
  ## subtract 4
  predict(fit2, newdata = data.frame(x = x)) - 4
}
From the figure, it is clear that there are two roots, one inside [-7, -6], the other inside [-3, -1]. We use uniroot to find both:
x1 <- uniroot(f, c(-7, -6))$root
#[1] -6.453769
x2 <- uniroot(f, c(-3, -1))$root
#[1] -1.853406
Now you can drop a vertical line from these points down to the x-axis:
y1 <- f(x1) + 4 ## add 4 back
y2 <- f(x2) + 4
abline(h = 0, col = 4) ## x-axis
segments(x1, 0, x1, y1, lty = 2)
segments(x2, 0, x2, y2, lty = 2)
You have a quadratic equation
0.73198 * x^2 + 6.08073 * x + 12.75558 = 4
OR
0.73198 * x^2 + 6.08073 * x + 8.75558 = 0
You can just use the quadratic formula to solve this analytically. R gives the two roots:
(-6.08073 + sqrt(6.08073^2 -4*0.73198 * 8.75558)) / (2 * 0.73198)
[1] -1.853392
(-6.08073 - sqrt(6.08073^2 -4*0.73198 * 8.75558)) / (2 * 0.73198)
[1] -6.453843
abline(v=c(-1.853392, -6.453843))
Here is one more solution, based on this
attach(v1)
fit2 = lm(y~poly(x,2,raw=TRUE))
xx = seq(-8,0, length=50)
vector1 = predict(fit2, data.frame(x=xx))
vector2= replicate(length(vector1),4)
# Find points where vector1 is above vector2.
above = vector1 > vector2
# Intersections occur where `above` switches between TRUE and FALSE.
intersect.points = which(diff(above)!=0)
# Find the slopes for each line segment.
vector1.slopes = vector1[intersect.points+1] - vector1[intersect.points]
vector2.slopes = vector2[intersect.points+1] - vector2[intersect.points]
# Find the intersection for each segment.
x.points = intersect.points + ((vector2[intersect.points] - vector1[intersect.points]) / (vector1.slopes-vector2.slopes))
y.points = vector1[intersect.points] + (vector1.slopes*(x.points-intersect.points))
# Scale x.points to the axis values of xx
x.points = xx[1] + ((x.points - 1)/(49))*(xx[50]-xx[1])
plot(xx, y = vector1, type= "l", col = "blue")
points(x,y,pch = 20)
lines(x = c(x.points[1],x.points[1]), y = c(0,y.points[1]), col='red')
lines(x = c(x.points[2],x.points[2]), y = c(0,y.points[2]), col='red')
Many solutions have already been proposed; here is another one.
Obviously, we want to find the x values that satisfy the polynomial (quadratic) equation a_0 + a_1*x + a_2*x^2 = 4, where a_0, a_1, a_2 are the coefficients of the fitted polynomial. We can rewrite it as a standard quadratic equation a*x^2 + b*x + c = 0 and find the roots with Sridhar's formula (the quadratic formula), using the coefficients from the polynomial regression, as follows:
a <- fit2$coefficients[3]
b <- fit2$coefficients[2]
c <- fit2$coefficients[1] - 4
as.numeric((-b + sqrt(b^2-4*a*c)) / (2*a))
#[1] -1.853398
as.numeric((-b - sqrt(b^2-4*a*c)) / (2*a))
#[1] -6.453783
We can use a numerical method such as Newton-Raphson to find the roots as well (there are faster numerical methods, but this serves our purpose and is quite fast too, taking ~16 ms on my machine). As we can see from the following code, the numerical and the analytical solutions agree.
a <- fit2$coefficients # fitted quadratic polynomial coefficients
f <- function(x) {
  as.numeric(a[1] + a[2]*x + a[3]*x^2 - 4)
}
df <- function(x) {
  as.numeric(a[2] + 2*a[3]*x)
}
Newton.Raphson <- function(x0) {
  eps <- 1e-6
  x <- x0
  while (TRUE) {
    x <- x0 - f(x0) / df(x0)
    if (abs(x - x0) < eps) {
      return(x0)
    }
    x0 <- x
  }
}
t1 <- Sys.time()
x1 <- Newton.Raphson(-10)
x2 <- Newton.Raphson(10)
x1
#[1] -6.453783
x2
#[1] -1.853398
print(paste('time taken to compute the roots:' ,Sys.time() - t1))
#[1] "time taken to compute the roots: 0.0160109996795654"
points(x1, 4, pch=19, col='green')
points(x2, 4, pch=19, col='green')
abline(v=x1, col='green')
abline(v=x2, col='green')

bounds for parameters in mle2() with optimizer=optimx

In mle2, I used "optimx" as the optimizer. I want to use lower and upper bounds for the parameters. How can I do this?
For example:
library("bbmle"); library("optimx")
y <- c(0.654, 0.613, 0.315, 0.449, 0.297, 0.402, 0.379,
0.423, 0.379, 0.3235, 0.269, 0.740, 0.418, 0.412,
0.494, 0.416, 0.338, 0.392, 0.484, 0.265)
gamma4 <- function(shape, scale) {
  -sum(dgamma(y, shape = shape, scale = scale, log = TRUE))
}
gm <- mean(y)
cv <- var(y)/mean(y)
m5 <- mle2(gamma4, start = list(shape = gm/cv, scale = cv),
           optimizer = "optimx")
m5
Or:
mle2(gengamma3, start = list(shape = ci, scale = bet, k = alp),
     optimizer = "optimx")
Thanks
You can try passing lower as an argument, as in the example below:
## use bounded optimization
## the lower bounds are really > 0, but we use >=0 to stress-test
## profiling; note lower must be named
(fit1 <- mle2(LL, method="L-BFGS-B", lower=c(ymax=0, xhalf=0)))
p1 <- profile(fit1)
Or in this one:
# try bounded optimization with nlminb and constrOptim
(fit1B <- mle2(LL, optimizer="nlminb", lower=c(lymax=1e-7, lhalf=1e-7)))
p1B <- profile(fit1B)
confint(p1B)
(fit1C <- mle2(LL, optimizer="constrOptim", ui = c(lymax=1,lhalf=1), ci=2,
method="Nelder-Mead"))
But for a full understanding, I advise looking here.
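For the gamma example in the question, a bounded fit might look like this (a sketch; the bound values and the choice of L-BFGS-B are illustrative, and lower/upper must be named):
m5b <- mle2(gamma4,
            start = list(shape = gm/cv, scale = cv),
            optimizer = "optimx",
            method = "L-BFGS-B",                 # a bounded method in optimx
            lower = c(shape = 1e-7, scale = 1e-7),
            upper = c(shape = 100, scale = 100))
m5b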

Anesrake algorithm doesn't work with zero as a weight

I tried using the anesrake package, but it won't accept a weight of zero, giving the error message:
Error in while (range(weightvec)[2] > cap + 1e-04) { :
missing value where TRUE/FALSE needed
Sample code:
ipfdata<- read.csv("dummydata.csv", header = T)
ipfdata$caseid <- 1:length(ipfdata$age)
sex <- c(0.30, 0.70)
age <- c(0.2, 0.1, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2)
ses <- c(0.20, 0.20, 0.0)
targets <- list(sex, age, ses)
names(targets) <- c("sex", "age", "ses")
outsave <- anesrake(targets, ipfdata, caseid = ipfdata$caseid,
                    weightvec = NULL, cap = 10, verbose = TRUE, maxit = 50,
                    choosemethod = "total", type = "nolim", pctlim = 0.0001,
                    nlim = 10, iterate = TRUE, force1 = TRUE)
(sample code modified from this question: https://stackoverflow.com/questions/19458306/ipf-raking-using-anesrake-in-r-error)
The package was never updated despite my contacting the author to address this issue. The only workaround is to remove, before raking, any rows that fall in the category whose target weight is zero.
In the given sample above, you would have to remove any rows with the third SES factor, and then change the SES vector to c(0.20, 0.20) instead of c(0.20, 0.20, 0.0).
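In code, the workaround might look roughly like this (a sketch; it assumes ses in dummydata.csv is coded 1-3 and that level 3 is the category with the zero target):
ipfdata2 <- ipfdata[ipfdata$ses != 3, ]   # drop rows in the zero-weight category
ipfdata2$caseid <- 1:nrow(ipfdata2)       # rebuild the case ids
ses <- c(0.20, 0.20)                      # drop the zero entry from the target vector
targets <- list(sex, age, ses)
names(targets) <- c("sex", "age", "ses")
outsave <- anesrake(targets, ipfdata2, caseid = ipfdata2$caseid,
                    cap = 10, verbose = TRUE, maxit = 50,
                    choosemethod = "total", type = "nolim",
                    pctlim = 0.0001, nlim = 10, iterate = TRUE, force1 = TRUE)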
