Performing t-Test Selection manually - r

I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

Related

Error in confidence interval mice R package

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

why random effect estiamator are not correct

I'm trying to simulate glmmLasso using a binomial data.
but random effect estiamator are not similar 5 that i given.
something wrong in my code?
if not, why random effect shown like that.
makedata <- function(I, J, p, sigmaB){
N <- I*J
# fixed effect generation
beta0 <- runif(1, 0, 1)
beta <- sort(runif(p, 0, 1))
# x generation
x <- matrix(runif(N*p, -1, 1), N, p)
# random effect generation
b0 <- rep(rnorm(I, 0, sigmaB), each=J)
# group
group <- as.factor(rep(1:I, each = J))
# y generation
k <- exp(-(beta0 + x %*% beta + b0))
y <- rbinom(n = length(k), size = 1, prob = (1/(1+k)))
#standardization
sx <- scale(x, center = TRUE, scale = TRUE)
simuldata <- data.frame(y = y, x = sx, group)
res <- list(simuldata=simuldata)
return(res)
}
# I : number of groups
I <- 20
# J : number of observation in group
J <- 10
# p : number of variables
p <- 20
# sigmaB : sd of random effect b0
sigmaB <- 5
set.seed(231233)
simdata <- makedata(I, J, p, sigmaB)
lam <- 10
xnam <- paste("x", 1:p, sep=".")
fmla <- as.formula(paste("y ~ ", paste(xnam, collapse= "+")))
glmm <- glmmLasso(fmla, rnd = list(group=~1), data = simdata, lambda = lam, control = list(scale = T, center = T))
summary(glmm)

Deep NN for multivariate regression

I implemented this simple NN but even when making it do all the interactions it fails to converge and the MSE remains very high
I tried to change the number of iterations and the learning rate but it doesn't work
rm(list=ls())
data <- read.csv("C:/Users/Mikele/Documents/Uni/IA AI & Machine Learning/R/11_23_2018/wine.csv",sep = ',',header = FALSE)
x <- data[,1:11]
y <- as.matrix(data[,12])
y_matrix <- matrix(rep(0,length(y)),nrow = length(y), ncol = 6)
k <-1
for (w in 1:length(y))
{
temp <- y[k] - 2
y_matrix[k,temp] <-1
k <- k + 1
}
hl <- c(40, 30, 20)
iter <- 1000
lr <- 0.1
## add in intercept
x_1 <- as.matrix(cbind(rep(1, nrow(x)),x))
## set error array
error <- rep(0, iter)
## set up weights
## the +1 is to add in the intercept/bias parameter
W1 <- matrix(runif(ncol(x_1)*hl[1], -1, 1), nrow = ncol(x_1))
W2 <- matrix(runif((hl[1]+1)*hl[2], -1, 1), nrow = hl[1]+1)
W3 <- matrix(runif((hl[2]+1)*hl[3], -1, 1), nrow = hl[2]+1)
W4 <- matrix(runif((hl[3]+1)*ncol(y), -1, 1), nrow = hl[3]+1)
for(k in 1:iter)
{
# calculate the hidden and output layers using X and hidden layer as inputs
# hidden layer 1 and 2 have a column of ones appended for the bias term
hidden1 <- cbind(matrix(1, nrow = nrow(x_1)), sigm(x_1 %*% W1))
hidden2 <- cbind(matrix(1, nrow = nrow(x_1)), sigm(hidden1 %*% W2))
hidden3 <- cbind(matrix(1, nrow = nrow(x_1)), sigm(hidden2 %*% W3))
y_hat <- sigm(hidden3 %*% W4)
# calculate the gradient and back prop the errors
# see theory above
y_hat_del <- (y-y_hat)*(d.sigm(y_hat))
hidden3_del <- y_hat_del %*% t(W4)*d.sigm(hidden3)
hidden2_del <- hidden3_del[,-1] %*% t(W3)*d.sigm(hidden2)
hidden1_del <- hidden2_del[,-1] %*% t(W2)*d.sigm(hidden1)
# update the weights
W4 <- W4 + lr*t(hidden3) %*% y_hat_del
W3 <- W3 + lr*t(hidden2) %*% hidden3_del[,-1]
W2 <- W2 + lr*t(hidden1) %*% hidden2_del[,-1]
W1 <- W1 + lr*t(x_1) %*% hidden1_del[,-1]
error[k] <- 1/nrow(y)*sum((y-y_hat)^2)
if((k %% (10^4+1)) == 0) cat("mse:", error[k], "\n")
}
# plot loss
xvals <- seq(1, iter, length = 100)
print(qplot(xvals, error[xvals], geom = "line", main = "MSE", xlab = "Iteration"))
no error message but I can't understand how to make a deep NN for
Multivariate Linear Regression
in addition I divided the ys into a 6-column matrix (the maximum and minimum of the initial dataset) now there is someone who can help me understand why not cover and in any case the final results are all concentrated on column 4?

reiterating a script using r

I have the following script
Posdef <- function (n, ev = runif(n, 0, 10))
{
Z <- matrix(ncol=n, rnorm(n^2))
decomp <- qr(Z)
Q <- qr.Q(decomp)
R <- qr.R(decomp)
d <- diag(R)
ph <- d / abs(d)
O <- Q %*% diag(ph)
Z <- t(O) %*% diag(ev) %*% O
return(Z)
}
Sigma <- Posdef(n = 11)
mu <- runif(11,0,10)
data <- as.data.frame(mvrnorm(n=1000, mu, Sigma))
data[data < 0] <- 0 #setting a floor#
data[data > 10] <- 10 #setting a ceiling#
names(data) = c('criteria_1', 'criteria_2', 'criteria_3', 'criteria_4', 'criteria_5',
'criteria_6', 'criteria_7', 'criteria_8', 'criteria_9', 'criteria_10',
'outcome')
data$outcome <- ifelse(data$outcome > 5, 1, 0)
data <- data[, sapply(data, is.numeric)]
maxValue <- as.numeric(apply (data, 2, max))
minValue <- as.numeric(apply (data, 2, min))
data_scaled <- as.data.frame(scale(data, center = minValue,
scale = maxValue-minValue))
ind <- sample (1:nrow(data_scaled), 600)
train <- data_scaled[ind,]
test <- data_scaled[-ind,]
model <- glm (formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
family = "binomial",
data = train)
summary (model)
predicted_model <- predict(model, test)
neural_model <- neuralnet(formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
hidden = c(2,2) ,
threshold = 0.01,
stepmax = 1e+07,
startweights = NULL,
rep = 1,
learningrate = NULL,
algorithm = "rprop+",
linear.output=FALSE,
data= train)
plot (neural_model)
results <- compute (neural_model, test[1:10])
results <- results$net.result*(max(data$outcome)-
min(data$outcome))+ min(data$outcome)
Values <- (test$outcome)*(max(data$outcome)-
min(data$outcome)) + min(data$outcome)
MSE_nueral_model <- sum((results - Values)^2)/nrow(test)
MSE_model <- sum((predicted_model - test$outcome)^2)/nrow(test)
print(MSE_model - MSE_nueral_model)
R1 <- (MSE_model - MSE_nueral_model)
The purpose of this script is to generate some arbitrary multivariate distribution and then compare two methods. In this case its a neural net and logistic regression. The end result is a difference in mean square error.
Now my issue with creating a loop has been with generating the 1000 observations.
I am able to create a loop without the data simulation portion of the script, putting that into the loop seems to make things go haywire. I tried creating a column vector filled with NA's but all I ended up getting was a single value returned rather than a vector of length n populated by the MSE reductions for each iteration of the loop.
Any help would be greatly appreciated.

Error in R-script: error in abs (alpha) non-numeric argument to mathematical function

I am trying to reproduce some results from the book "Financial Risk Modelling and Portfolio Optimisation with R" and I get an error that I can't seem to get my head around.
I get the following error in the COPPosterior function:
error in abs(alpha) : non-numeric argument to mathematical function
Is anyone able to see why I get the error?
The error is from the following script:
library(urca)
library(vars)
library(fMultivar)
## Loading data set and converting to zoo
data(EuStockMarkets)
Assets <- as.zoo(EuStockMarkets)
## Aggregating as month-end series
AssetsM <- aggregate(Assets, as.yearmon, tail, 1)
head(AssetsM)
## Applying unit root tests for sub-sample
AssetsMsub <- window(AssetsM, start = start(AssetsM),
end = "Jun 1996")
## Levels
ADF <- lapply(AssetsMsub, ur.df, type = "drift",
selectlags = "AIC")
ERS <- lapply(AssetsMsub, ur.ers)
## Differences
DADF <- lapply(diff(AssetsMsub), ur.df, selectlags = "AIC")
DERS <- lapply(diff(AssetsMsub), ur.ers)
## VECM
VEC <- ca.jo(AssetsMsub, ecdet = "none", spec = "transitory")
summary(VEC)
## Index of time stamps in back test (extending window)
idx <- index(AssetsM)[-c(1:60)]
ANames <- colnames(AssetsM)
NAssets <- ncol(AssetsM)
## Function for return expectations
f1 <- function(x, ci, percent = TRUE){
data <- window(AssetsM, start = start(AssetsM), end = x)
Lobs <- t(tail(data, 1))
vec <- ca.jo(data, ecdet = "none", spec = "transitory")
m <- vec2var(vec, r = 1)
fcst <- predict(m, n.ahead = 1, ci = ci)
LU <- matrix(unlist(fcst$fcst),
ncol = 4, byrow = TRUE)[, c(2, 3)]
RE <- rep(0, NAssets)
PView <- LU[, 1] > Lobs
NView <- LU[, 2] < Lobs
RE[PView] <- (LU[PView, 1] / Lobs[PView, 1] - 1)
RE[NView] <- (LU[NView, 1] / Lobs[NView, 1] - 1)
names(RE) <- ANames
if(percent) RE <- RE * 100
return(RE)
}
ReturnEst <- lapply(idx, f1, ci = 0.5)
qv <- zoo(matrix(unlist(ReturnEst),
ncol = NAssets, byrow = TRUE), idx)
colnames(qv) <- ANames
tail(qv)
library(BLCOP)
library(fPortfolio)
## Computing returns and EW-benchmark returns
R <- (AssetsM / lag(AssetsM, k = -1) -1.0) * 100
## Prior distribution
## Fitting of skewed Student's t distribution
MSTfit <- mvFit(R, method = "st")
mu <- c(MSTfit#fit[["beta"]])
S <- MSTfit#fit[["Omega"]]
skew <- c(MSTfit#fit[["alpha"]])
df <- MSTfit#fit[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
Omega = S, alpha = skew, df = df)
## Pick matrix and view distributions for last forecast
RetEstCop <- ReturnEst[[27]]
RetEstCop
PCop <- matrix(0, ncol = NAssets, nrow = 3)
colnames(PCop) <- ANames
PCop[1, ANames[1]] <- 1
PCop[2, ANames[2]] <- 1
PCop[3, ANames[4]] <- 1
Sds <- apply(R, 2, sd)
RetViews <- list(distribution("norm", mean = RetEstCop[1],
sd = Sds[1]),
distribution("norm", mean = RetEstCop[2],
sd = Sds[2]),
distribution("norm", mean = RetEstCop[4],
sd = Sds[4])
)
CopViews <- COPViews(pick = PCop, viewDist = RetViews,
confidences = rep(0.5, 3),
assetNames = ANames)
## Simulation of posterior
NumSim <- 10000
CopPost <- COPPosterior(CopPrior, CopViews,
numSimulations = NumSim)
print(CopPrior)
print(CopViews)
slotNames(CopPost)
look at the structure of MSTfit:
str(MSTfit)
You can see that if you want the estimated alpha value, you need to access it via:
MSTfit#fit$estimated[['alpha']]
rather than
MSTfit#fit[['alpha']]

Resources