Error in confidence interval mice R package - r

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards

Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

Related

Creating new datasets for each iteration of a power analysis

I have the following code to estimate the power for my study which runs perfectly fine. The issue is that I am running n = 1000 iterations, but each iteration generates the exact same dataset. I think this is because the commands in the function that I created (powercrosssw) draw on the data definitions above that are fixed in value? How do I ensure that each dataset (named dx below) that is generated is different (i.e. the values for u_3, error, and y are different for each iteration) so that I am calculating the power appropriately?
library(simstudy)
library(nlme)
library(gendata)
library(data.table)
library(geepack)
set.seed(12345)
clusterDef <- defDataAdd(varname = "u_3", dist = "normal", formula = 0, variance = 25.77) #cluster-level random effect
patError <- defDataAdd(varname = "error", dist = "normal", formula = 0, variance = 38.35) #error term
#Generate cluster-level data
cohortsw <- genData(3, id = "cluster")
cohortsw <- addColumns(clusterDef, cohortsw)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "Ijt")
cohortstep
#Generate individual patient-level data
pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = 5, level1ID = "id")
pat
dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
dx <- addColumns(patError, dx)
setkey(dx, id, cluster, period)
#Define outcome y
outDef <- defDataAdd(varname = "y", formula = "17.87 + 5.0*Ijt - 5.42*I(period == 1) - 5.72*I(period == 2) - 7.03*I(period == 3) - 6.13*I(period == 4) - 9.13*I(period == 5) + u_3 + error", dist = "normal")
dx <- addColumns(outDef, dx)
#Fit GLMM model to simulated dataset
model1 <- lme(y ~ factor(period) + factor(Ijt), random = ~1|cluster, data = dx, method = "REML")
summary(model1)
#Power analysis
powercrosssw <- function(nclus = 3, clsize = 5) {
cohortsw <- genData(nclus, id = "cluster")
cohortsw <- addColumns(clusterDef, cohortsw)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "Ijt")
pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = clsize, level1ID = "id")
dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
dx <- addColumns(patError, dx)
setkey(dx, id, cluster, period)
return(dx)
}
bresult <- NULL
presult <- NULL
eresult <- NULL
intercept <- NULL
trt <- NULL
timecoeff1 <- NULL
timecoeff2 <- NULL
timecoeff3 <- NULL
timecoeff4 <- NULL
timecoeff5 <- NULL
ranclus <- NULL
error <- NULL
i=1
while (i < 1000) {
cohortsw <- powercrosssw()
#Fit multi-level model to simulated dataset
model1 <- tryCatch(lme(y ~ factor(period) + factor(Ijt), data = dx, random = ~1|cluster, method = "REML"),
warning = function(w) { "warning" }
)
if (! is.character(model1)) {
coeff <- coef(summary(model1))["factor(Ijt)1", "Value"]
pvalue <- coef(summary(model1))["factor(Ijt)1", "p-value"]
error <- coef(summary(model1))["factor(Ijt)1", "Std.Error"]
bresult <- c(bresult, coeff)
presult <- c(presult, pvalue)
eresult <- c(eresult, error)
i <- i + 1
}
}

nls boot error must have positive length

I am getting the error below with nlsBoot() any idea what is wrong?
Error in apply(tabboot, 1, quantile, c(0.5, 0.025, 0.975)) :
dim(X) must have a positive length
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
mymodel
library(nlstools)
nlsBoot(mymodel, niter = 999)
Try to define the formula before applying the nls function, like this:
formula <- as.formula(y ~ x^b)
mymodel <- nls(formula,start= list(b=1),data = d)
added
Well, I've modified the code and now it can handle one parameter fit.
# My suggestion is to erase all the environment first:
rm(list = ls())
# Then we start again:
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
Here is the function that you have to use:
nlsboot_onepar <- function (nls, niter = 999)
{
if (!inherits(nls, "nls"))
stop("Use only with 'nls' objects")
data2 <- eval(nls$data, sys.frame(0))
fitted1 <- fitted(nls)
resid1 <- resid(nls)
var1 <- all.vars(formula(nls)[[2]])
l1 <- lapply(1:niter, function(i) {
data2[, var1] <- fitted1 + sample(scale(resid1, scale = FALSE),
replace = TRUE)
nls2 <- try(update(nls, start = as.list(coef(nls)),
data = data2), silent = TRUE)
if (inherits(nls2, "nls"))
return(list(coef = coef(nls2), rse = summary(nls2)$sigma))
})
if (sum(sapply(l1, is.null)) > niter/2)
stop(paste("Procedure aborted: the fit only converged in",
round(sum(sapply(l1, is.null))/niter), "% during bootstrapping"))
tabboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$coef,simplify =
FALSE)
tabboot <- as.matrix(t(as.numeric(tabboot)))
rownames(tabboot) <- "b"
rseboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$rse)
recapboot <- t(apply(tabboot, 1, quantile, c(0.5, 0.025,
0.975)))
colnames(recapboot) <- c("Median", "2.5%", "97.5%")
estiboot <- t(apply(tabboot, 1, function(z) c(mean(z), sd(z))))
colnames(estiboot) <- c("Estimate", "Std. error")
serr <- sum(sapply(l1, is.null))
if (serr > 0)
warning(paste("The fit did not converge", serr, "times during
bootstrapping"))
listboot <- list(coefboot = t(tabboot), rse = rseboot, bootCI = recapboot,
estiboot = estiboot)
class(listboot) <- "nlsBoot"
return(listboot)
}
And then we use it:
result <- nlsboot_onepar(mymodel, niter = 999)
If you want to plot the parameter distribution, you can do this:
graphics.off()
plot(density(as.vector(result$coefboot)))
# or
hist(as.vector(result$coefboot))
I hope that helps you.

Performing t-Test Selection manually

I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

Running existing function with non-default option

The code pasted below from ResourceSelection::hoslem.test performs a Hosmer and Lemeshow goodness of fit test. While investigating why the output that does not agree exactly with that performed by another software (Stata), I found that the difference relates to use of default R argument for the quantile function (type=7). I would like to use this function with a different default for calculation of quantiles (type=6).
FWIW, the reference to the 9 possible methods used by R can be found at:
https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf
The Stata manual for pctile refers to a default method and an 'altdef' method. I found it difficult to map these two methods to corresponding R types.
However,
hoslem.test(yhat, y, type=6)
Produces:
> hl <- hoslem.test(y, yhat, type=6)
Error in hoslem.test(y, yhat, type = 6) : unused argument (type = 6)
Is there a way to run the function below with a non-default argument for the quantile function?
Ie. allows the following line adding ', type=6':
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type=6))
The function in question is:
> ResourceSelection::hoslem.test
function (x, y, g = 10)
{
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
yhat <- y
y <- x
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g)))
cutyhat <- cut(yhat, breaks = qq, include.lowest = TRUE)
observed <- xtabs(cbind(y0 = 1 - y, y1 = y) ~ cutyhat)
expected <- xtabs(cbind(yhat0 = 1 - yhat, yhat1 = yhat) ~
cutyhat)
chisq <- sum((observed - expected)^2/expected)
PVAL = 1 - pchisq(chisq, g - 2)
PARAMETER <- g - 2
names(chisq) <- "X-squared"
names(PARAMETER) <- "df"
structure(list(statistic = chisq, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, observed = observed,
expected = expected), class = "htest")
}
We can modify pieces of functions. Look at the body of the function
as.list(body(hoslem.test))
See that the element we want to modify is the 6th element in the body
[[1]]
`{`
[[2]]
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
[[3]]
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
[[4]]
yhat <- y
[[5]]
y <- x
[[6]]
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g)))
Modify the 6th element to what you want
body(hoslem.test)[[6]] = substitute(qq <- unique(quantile(yhat,
probs = seq(0, 1, 1/g), type = 6)))
The easiest way would be to reenter the function as your own:
myhoslem.test<-function(x, y, g = 10, mytype = 6){
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
yhat <- y
y <- x
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type = mytype))
cutyhat <- cut(yhat, breaks = qq, include.lowest = TRUE)
observed <- xtabs(cbind(y0 = 1 - y, y1 = y) ~ cutyhat)
expected <- xtabs(cbind(yhat0 = 1 - yhat, yhat1 = yhat) ~
cutyhat)
chisq <- sum((observed - expected)^2/expected)
PVAL = 1 - pchisq(chisq, g - 2)
PARAMETER <- g - 2
names(chisq) <- "X-squared"
names(PARAMETER) <- "df"
structure(list(statistic = chisq, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, observed = observed,
expected = expected), class = "htest")
}
The key change here is :
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type = mytype))
and allowing mytype as a argument to the function with default as 6
The two answers suggest a wrapper function to flexibly modify hoslem.test
myhoslem.test<-function(x, y, g = 10, mytype = 6){
body(hoslem.test)[[6]] = substitute(qq <- unique(quantile(yhat,
probs = seq(0, 1, 1/g), type = mytype)))
hoslem.test(x,y, g=10)
}

Error in R-script: error in abs (alpha) non-numeric argument to mathematical function

I am trying to reproduce some results from the book "Financial Risk Modelling and Portfolio Optimisation with R" and I get an error that I can't seem to get my head around.
I get the following error in the COPPosterior function:
error in abs(alpha) : non-numeric argument to mathematical function
Is anyone able to see why I get the error?
The error is from the following script:
library(urca)
library(vars)
library(fMultivar)
## Loading data set and converting to zoo
data(EuStockMarkets)
Assets <- as.zoo(EuStockMarkets)
## Aggregating as month-end series
AssetsM <- aggregate(Assets, as.yearmon, tail, 1)
head(AssetsM)
## Applying unit root tests for sub-sample
AssetsMsub <- window(AssetsM, start = start(AssetsM),
end = "Jun 1996")
## Levels
ADF <- lapply(AssetsMsub, ur.df, type = "drift",
selectlags = "AIC")
ERS <- lapply(AssetsMsub, ur.ers)
## Differences
DADF <- lapply(diff(AssetsMsub), ur.df, selectlags = "AIC")
DERS <- lapply(diff(AssetsMsub), ur.ers)
## VECM
VEC <- ca.jo(AssetsMsub, ecdet = "none", spec = "transitory")
summary(VEC)
## Index of time stamps in back test (extending window)
idx <- index(AssetsM)[-c(1:60)]
ANames <- colnames(AssetsM)
NAssets <- ncol(AssetsM)
## Function for return expectations
f1 <- function(x, ci, percent = TRUE){
data <- window(AssetsM, start = start(AssetsM), end = x)
Lobs <- t(tail(data, 1))
vec <- ca.jo(data, ecdet = "none", spec = "transitory")
m <- vec2var(vec, r = 1)
fcst <- predict(m, n.ahead = 1, ci = ci)
LU <- matrix(unlist(fcst$fcst),
ncol = 4, byrow = TRUE)[, c(2, 3)]
RE <- rep(0, NAssets)
PView <- LU[, 1] > Lobs
NView <- LU[, 2] < Lobs
RE[PView] <- (LU[PView, 1] / Lobs[PView, 1] - 1)
RE[NView] <- (LU[NView, 1] / Lobs[NView, 1] - 1)
names(RE) <- ANames
if(percent) RE <- RE * 100
return(RE)
}
ReturnEst <- lapply(idx, f1, ci = 0.5)
qv <- zoo(matrix(unlist(ReturnEst),
ncol = NAssets, byrow = TRUE), idx)
colnames(qv) <- ANames
tail(qv)
library(BLCOP)
library(fPortfolio)
## Computing returns and EW-benchmark returns
R <- (AssetsM / lag(AssetsM, k = -1) -1.0) * 100
## Prior distribution
## Fitting of skewed Student's t distribution
MSTfit <- mvFit(R, method = "st")
mu <- c(MSTfit#fit[["beta"]])
S <- MSTfit#fit[["Omega"]]
skew <- c(MSTfit#fit[["alpha"]])
df <- MSTfit#fit[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
Omega = S, alpha = skew, df = df)
## Pick matrix and view distributions for last forecast
RetEstCop <- ReturnEst[[27]]
RetEstCop
PCop <- matrix(0, ncol = NAssets, nrow = 3)
colnames(PCop) <- ANames
PCop[1, ANames[1]] <- 1
PCop[2, ANames[2]] <- 1
PCop[3, ANames[4]] <- 1
Sds <- apply(R, 2, sd)
RetViews <- list(distribution("norm", mean = RetEstCop[1],
sd = Sds[1]),
distribution("norm", mean = RetEstCop[2],
sd = Sds[2]),
distribution("norm", mean = RetEstCop[4],
sd = Sds[4])
)
CopViews <- COPViews(pick = PCop, viewDist = RetViews,
confidences = rep(0.5, 3),
assetNames = ANames)
## Simulation of posterior
NumSim <- 10000
CopPost <- COPPosterior(CopPrior, CopViews,
numSimulations = NumSim)
print(CopPrior)
print(CopViews)
slotNames(CopPost)
look at the structure of MSTfit:
str(MSTfit)
You can see that if you want the estimated alpha value, you need to access it via:
MSTfit#fit$estimated[['alpha']]
rather than
MSTfit#fit[['alpha']]

Resources