Bootstrap with Stepwise Regression in R

I am trying to draw bootstrap samples and run stepwise regression on each one in R. I want to look at the distribution of the regression coefficients, but the "bhat" matrix I am filling is mostly NAs, with the exception of the first row (which is the same number across all columns). How can I fix this?
B <- 100  # number of bootstrap samples
n <- nrow(dat)
d <- ncol(dat) - 1
bhat <- matrix(NA, nrow = B, ncol = ncol(dat) - 1)
for (b in 1:B) {
  s <- sample(1:n, n, replace = TRUE)
  samp <- as.matrix(dat[s, c(1:15)])
  samp <- as.data.frame(samp)
  # stepwise regression
  null <- lm(dat$Y ~ 1, data = samp)
  full <- lm(dat$Y ~ ., data = samp)
  fit <- step(null, scope = formula(full), direction = "forward", k = log(n), trace = 0)
  bhat[b, ] <- coef(fit)
}

Every run of step() selects a different set of variables, so coef(fit) has a different length each time; it does not match the fixed dimensions of your matrix, and you cannot simply "slot" it in by position. Instead, name the columns of bhat after the coefficients of the full model and assign by name. (Two further fixes below: the models should use the Y belonging to the resampled data samp rather than dat$Y, and the resampling should keep all columns rather than dropping the last one.)
dat <- data.frame(Y = rnorm(100), matrix(rnorm(100 * 15), ncol = 15))
colnames(dat)[-1] <- paste0("Var", 1:15)
B <- 100  # number of bootstrap samples
n <- nrow(dat)
full_mdl <- colnames(model.matrix(Y ~ ., data = dat))  # coefficient names of the full model
bhat <- matrix(NA, nrow = B, ncol = length(full_mdl))
colnames(bhat) <- full_mdl
for (b in 1:B) {
  samp <- dat[sample(1:n, n, replace = TRUE), ]  # resample whole rows, response included
  null <- lm(Y ~ 1, data = samp)
  full <- lm(Y ~ ., data = samp)
  fit <- step(null, scope = formula(full), direction = "forward", k = log(n), trace = 0)
  bhat[b, names(coef(fit))] <- coef(fit)  # fill only the selected coefficients, by name
}
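With the columns named, an NA now simply means "not selected in that bootstrap sample", which makes the matrix easy to summarize. For example (base R only):
# Proportion of bootstrap samples in which each coefficient was selected
colMeans(!is.na(bhat))
# Bootstrap distribution of one coefficient, among samples where it was selected
hist(bhat[, "Var1"], main = "Bootstrap distribution of the Var1 coefficient")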

Related

Cannot make sense of the error while using optimParallel in R

I'm trying to run the function below, using optimParallel in R, on a certain data set. The code is as follows:
install.packages("optimParallel")
install.packages('parallel')
library(parallel)
library(optimParallel)
library(doParallel)
library(data.table)
library(Rlab)
library(HDInterval)
library(mvtnorm)
library(matrixStats)
library(dplyr)
library(cold)
## Bolus data:
data("bolus")
d1 <- bolus
d1$group <- ifelse(d1$group == "2mg",1,0)
colnames(d1) <- c("index",'group',"time","y")
d2 <- d1 %>% select(index, y, group, time)
colnames(d2) <- c('index','y','x1','x2') ### Final data
## Modification of the objective function:
## Another approach:
dpd_poi <- function(x, fixed = c(rep(FALSE, 5))) {
  params <- fixed
  dpd_1 <- function(p) {
    params[!fixed] <- p
    alpha <- params[1]
    beta_0 <- params[2]
    beta_1 <- params[3]
    beta_2 <- params[4]
    rho <- params[5]
    add_pi <- function(d) {
      k <- beta_0 + (d[3] * beta_1) + (d[4] * beta_2)
      k1 <- exp(k)  ## for Poisson regression
      d <- cbind(d, k1)
    }
    dat_split <- split(x, f = x$index)
    result <- lapply(dat_split, add_pi)
    result <- rbindlist(result)
    result <- as.data.frame(result)
    colnames(result) <- c("index", "y", "x1", "x2", "lamb")
    result_split <- split(result, f = result$index)
    expression <- function(d) {
      bin <- as.data.frame(combn(d$y, 2))
      pr <- as.data.frame(combn(d$lamb, 2))
      ## Evaluation of the probabilities:
      f_jk <- function(u, v) {
        dummy_func <- function(x, y) {
          ppois(x, lambda = y)
        }
        dummy_func_1 <- function(x, y) {
          ppois(x - 1, lambda = y)
        }
        k <- mapply(dummy_func, u, v)
        k_1 <- mapply(dummy_func_1, u, v)
        inv1 <- inverseCDF(as.matrix(k), pnorm)
        inv2 <- inverseCDF(as.matrix(k_1), pnorm)
        mean <- rep(0, 2)
        lower <- inv2
        upper <- inv1
        corr <- diag(2)
        corr[lower.tri(corr)] <- rho
        corr[upper.tri(corr)] <- rho
        prob <- pmvnorm(lower = lower, upper = upper, mean = mean, corr = corr)
        prob <- (1 + (1 / alpha)) * (prob^alpha)
        ## First expression: (changes for Poisson regression)
        lam <- as.vector(t(v))
        v1 <- rpois(1000, lambda = lam[1])
        v2 <- rpois(1000, lambda = lam[2])
        all_possib <- as.data.frame(rbind(v1, v2))
        new_func <- function(u) {
          k <- mapply(dummy_func, u, v)
          k_1 <- mapply(dummy_func_1, u, v)
          inv1_1 <- inverseCDF(as.matrix(k), pnorm)
          inv2_1 <- inverseCDF(as.matrix(k_1), pnorm)
          mean1 <- rep(0, 2)
          lower1 <- inv2_1
          upper1 <- inv1_1
          corr1 <- diag(2)
          corr1[lower.tri(corr1)] <- rho
          corr1[upper.tri(corr1)] <- rho
          prob1 <- pmvnorm(lower = lower1, upper = upper1, mean = mean1, corr = corr1)
          prob1 <- prob1^(alpha)
        }
        val <- apply(all_possib, 2, new_func)
        val_s <- mean(val)  ## approximation
        return(val_s - prob)
      }
      final_res <- mapply(f_jk, bin, pr)
      final_value <- sum(final_res)
    }
    u <- sapply(result_split, expression)
    return(sum(u))
  }
  ## the assignment above is the last expression evaluated, so dpd_poi()
  ## returns the inner function dpd_1
}
## run the objective function:
cl <- makeCluster(25)
setDefaultCluster(cl = cl)
clusterExport(cl, c("d2", "val"))  ## note: 'val' is only created two lines below, so this
                                   ## export only succeeds if a 'val' already exists in the session
clusterEvalQ(cl, c(library(data.table), library(Rlab), library(HDInterval),
                   library(mvtnorm), library(matrixStats), library(dplyr), library(cold)))
val <- dpd_poi(d2, c(0.5, FALSE, FALSE, FALSE, FALSE))
optimParallel(par = c(beta_0 = 1, beta_1 = 0.1, beta_2 = 1, rho = 0.2), fn = val,
              method = "L-BFGS-B", lower = c(-10, -10, -10, 0), upper = c(Inf, Inf, Inf, 1))
stopCluster(cl)
After running for some time, it returns the following error:
Error in checkForRemoteErrors(val) :
  9 nodes produced errors; first error: missing value where TRUE/FALSE needed
However, when I make a minor change to the objective function (drawing 2 random numbers from rpois instead of 1000) and run the same code using optim, it converges and gives me a proper result. Since this is a Monte Carlo approximation, it does not make sense to draw so few Poisson variates. I have to use optimParallel; otherwise it takes far too long to converge. I can also reproduce this with simulated data.
I'm unable to figure out where the issue truly lies. I truly appreciate any help in this regard.
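"missing value where TRUE/FALSE needed" is the message R gives when an if() condition evaluates to NA, so a plausible culprit is a non-finite value coming out of pmvnorm or inverseCDF on some worker. A minimal sketch for localizing it (the safe_val wrapper is hypothetical, not part of the original post):
## Sketch: wrap the returned objective (val) so non-finite values are
## reported and penalized instead of crashing a worker node.
safe_val <- function(p) {
  out <- tryCatch(val(p), error = function(e) NA_real_)
  if (!is.finite(out)) {
    message("non-finite objective at p = ", paste(round(p, 4), collapse = ", "))
    return(1e10)  # large penalty steers L-BFGS-B away from this region
  }
  out
}
## then call optimParallel(..., fn = safe_val) instead of fn = val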

Classification in high-dimensional data by extreme learning machine in R

When I simulate 50 high-dimensional data sets from a multivariate normal distribution and classify them with ELM, the AUC scores I get differ widely from loop to loop; in some loops the score drops below 50%. Where am I going wrong? I want the deviation in the scores to be low across loops. How can I achieve stable and high scores in each loop, and what should I change in the data generation? I look forward to your valuable contributions.
install.packages("MASS")
install.packages("stats")
install.packages("pROC")
install.packages("elmNNRcpp")
library(MASS)
library(stats)
library(pROC)
library(elmNNRcpp)
######################################################
# DATA SIMULATE FUNCTION
######################################################
# rm(list = ls())
generateData<- function(n,p) {
pr <- seq(0.80, 0.40, length.out = p)
pr[1] <- 1
covmat <- toeplitz(pr)
mu= rep(0,p)
X_ <- data.frame(mvrnorm(n, mu = mu, Sigma = covmat))
X <- unname(as.matrix(sample(X_)))
vCoef = rnorm(ncol(X))
vProb =exp(X%*%vCoef)/(1+exp(X%*%vCoef))
Y <- rbinom(nrow(X), 1, vProb)
mydata= data.frame(cbind(X,Y))
return(mydata)
}
######################################################
# SIMULATED DATA
######################################################
n <- 100
p <- 120
nsim <- 50
set.seed(123)
mydata <- list()
for (k in 1:nsim) {
  data <- generateData(n, p)
  # table(data[ncol(data)])
  X <- data[-ncol(data)]
  Y <- data[ncol(data)]
  mydata[[k]] <- data
}
######################################################
# ELM CLASSIFICATION FUNCTION
######################################################
fELMCLASS <- function(x, col_names) {
  trainIndex <- sample(1:nrow(x), size = 0.7 * nrow(x))
  trainSet <- x[trainIndex, ]
  testSet <- x[-trainIndex, ]
  xtrain <- as.matrix(trainSet[, 1:(length(trainSet) - 1)])
  ytrain <- as.matrix(trainSet[, length(trainSet)])
  xtest <- as.matrix(testSet[, 1:(length(testSet) - 1)])
  ytest <- as.matrix(testSet[, length(testSet)])
  model <- elm_train(xtrain, ytrain, nhid = 25, actfun = "relu")
  pred.class <- elm_predict(model, xtest, normalize = TRUE)
  roc.model <- roc(as.factor(ytest) ~ as.numeric(pred.class), direction = "auto")
  performance_metrics <- t(data.frame("AUC" = roc.model$auc))
  return(performance_metrics)
}
######################################################
# SAVE RESULTS
######################################################
datalist <- data.frame()
for (j in 1:nsim) {
  data_ELM <- as.data.frame(mydata[[j]])
  datalist <- rbind(datalist, fELMCLASS(data_ELM, "data"))
}
datalist
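To quantify the instability described above, the AUC values can be summarized across the 50 runs, for example (a sketch, assuming the first column of datalist holds the AUC):
auc <- as.numeric(datalist[, 1])  # one AUC per simulated data set
mean(auc)                         # average performance over the 50 loops
sd(auc)                           # loop-to-loop deviation -- the quantity to shrink
range(auc)                        # shows how far individual runs drop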

How to estimate SAR spatial model without row-normalizing the matrix?

I am trying to estimate a SAR spatial model without row-normalizing the weights matrix. For some reason, when I do not row-normalize, the command does not return the correct estimates. Am I missing something in the command options?
Here is an example of what I mean.
If I run the following code, simulating data with a row-normalized matrix, lagsarlm returns the correct estimates:
library(sna)         # rgraph()
library(spdep)       # mat2listw(), nb2listw(), nb2mat()
library(spatialreg)  # lagsarlm()
set.seed(20100817)
rho <- .5
B <- c(2, 5)
e <- as.matrix(rnorm(100, sd = 2))
X0 <- matrix(1, ncol = 1, nrow = 100)                      # create intercept
X1 <- matrix(runif(100, min = -10, max = 10), nrow = 100)  # generate covariate
Xbe <- X0 * B[1] + X1 * B[2] + e
I <- diag(100)
W <- rgraph(100, m = 1, tprob = 0.1, mode = "graph", diag = FALSE)  # assume I need to start with a matrix of relationships
spatialList <- mat2listw(W)
nb7rt <- spatialList$neighbours
listw <- nb2listw(nb7rt)
W <- nb2mat(nb7rt)
y <- solve(I - rho * W) %*% Xbe
model <- lagsarlm(y ~ X1, listw = listw)
summary(model)
However, if I try to do exactly the same but without row-normalizing, the results are incorrect:
set.seed(20100817)
rho <- .5
B <- c(2, 5)
e <- as.matrix(rnorm(100, sd = 2))
X0 <- matrix(1, ncol = 1, nrow = 100)                      # create intercept
X1 <- matrix(runif(100, min = -10, max = 10), nrow = 100)  # generate covariate
Xbe <- X0 * B[1] + X1 * B[2] + e
I <- diag(100)
W <- rgraph(100, m = 1, tprob = 0.1, mode = "graph", diag = FALSE)  # assume I need to start with a matrix of relationships
spatialList <- mat2listw(W, style = "B")
nb7rt <- spatialList$neighbours
listw <- nb2listw(nb7rt, style = "B")
W <- nb2mat(nb7rt, style = "B")
y <- solve(I - rho * W) %*% Xbe
model <- lagsarlm(y ~ X1, listw = listw)
summary(model)
The base for this code can be found here https://stat.ethz.ch/pipermail/r-sig-geo/2010-August/009023.html.
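One sanity check worth running in a setup like this (a sketch using spdep's listw2mat helper) is to confirm that the matrix used to generate y is exactly the matrix lagsarlm sees through listw; if they differ, the estimates will be off regardless of the style option:
W_est <- listw2mat(listw)                      # dense matrix rebuilt from the listw object
all.equal(W, W_est, check.attributes = FALSE)  # should be TRUE if DGP and estimation agree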

How to conduct a simulation study using stepAIC

I have to code a simulation study in R. I have explanatory variables X1,...,X15 ~ N(0,1) and Y ~ N(2 + 2*X1 + 0.8*X2 - 1.2*X15, 1), and I need to simulate n=100 values and repeat that iter=100 times. Then, for each linear model created, I have to find the best sub-model using stepAIC. I wrote the following code:
set.seed(1234)
sim <- function(sd) {
  n <- 100
  p <- 15
  X <- matrix(rnorm(n * p), n, p)
  mu <- 2 + 2*X[, 1] + 0.8*X[, 2] - 1.2*X[, 15]
  Y <- matrix(rnorm(100, mu, sd))
  sim <- data.frame(Y, X)
  r <- lm(Y ~ ., data = sim)
  library(MASS)
  r0 <- lm(Y ~ 1, data = sim)
  res <- stepAIC(r0, k = 2, direction = "forward", scope = list(lower = ~1, upper = r))
  return(res$coefficients)
}
sim(1)
oo1 <- lapply(1:100, sim)
As an inexperienced R user, I think I'm doing something wrong. The purpose of the study is to find out whether, among the 100 best sub-models (according to stepAIC), there are models that recover the real one (Y = 2 + 2*X1 + 0.8*X2 - 1.2*X15 + e). In case I'm doing the wrong things, could I get some help/hints on how to implement it correctly?
Here is a working version of your code:
library(MASS)
set.seed(1234)
sim <- function(sd, n, p) {
  X <- matrix(rnorm(n * p), n, p)
  mu <- 2 + 2*X[, 1] + 0.8*X[, 2] - 1.2*X[, p]
  Y <- rnorm(n, mean = mu, sd = rep(sd, n))
  df <- data.frame(Y, X)
  r <- lm(Y ~ ., data = df)
  r0 <- lm(Y ~ 1, data = df)
  res <- stepAIC(r0, k = 2, direction = "forward",
                 scope = list(lower = ~1, upper = r), trace = FALSE)
  return(res$coefficients)
}
n <- 100
p <- 15
sim(1, n, p)
# keep sd fixed at 1 on every repetition; lapply(1:100, sim, n, p) would pass
# the loop index 1:100 as the sd argument
oo1 <- lapply(1:100, function(i) sim(1, n, p))
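From there, the actual study question, i.e. how often stepAIC recovers the true sub-model, can be answered by post-processing oo1. A short sketch (data.frame gives the predictors their default names X1, ..., X15):
true_vars <- c("X1", "X2", "X15")
recovered <- sapply(oo1, function(cf) {
  setequal(setdiff(names(cf), "(Intercept)"), true_vars)
})
mean(recovered)  # proportion of the 100 runs selecting exactly the true model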

Multi-data likelihood function and mle2 function from bbmle package in R

I have written a custom likelihood function that fits a multi-data model integrating mark-recapture and telemetry data (sensu Royle et al. 2013, Methods in Ecology and Evolution). The likelihood function is designed to be flexible in terms of whether and how many covariates are specified for the different linear models in the different likelihood components, which is determined by values supplied as function arguments (i.e., the data matrices "detcovs" and "dencovs" in my code). The likelihood function works when I supply it directly to optimization functions (e.g., optim or nlm), but it does not play nice with the mle2 function in the bbmle package. My problem is that I continually run into the following error: "some named arguments in 'start' are not arguments to the specified log-likelihood function". This is my first attempt at writing custom likelihood functions, so I'm sure there are general coding conventions of which I'm unaware that would make such tasks more efficient and amenable to the mle2 function. Below are my likelihood function, the code creating the starting-value objects, and the code calling the mle2 function. Any advice on how to solve the error and general comments on writing cleaner functions are welcome. Many thanks in advance.
Edit: As requested, I have simplified the likelihood function and provided code to simulate reproducible data to which the model can be fit. Included in the simulation code are 2 custom functions and use of the raster function from the raster package. Hopefully, I have sufficiently simplified everything to enable others to troubleshoot. Again, many thanks for your help!
Jared
Likelihood function:
CSCR.RSF.intlik2.EXAMPLE <- function(alpha0, sigma, alphas=NULL, betas=NULL, n0,
                                     yscr=NULL, K=NULL, X=X, trapcovs=NULL,
                                     Gden=NULL, Gdet=NULL, ytel=NULL, stel=NULL,
                                     dencovs=NULL, detcovs=NULL){
  #
  # this version of the code handles a covariate on log(Density). This is starting value 5
  #
  # start = vector of starting values
  # yscr = nind x ntraps encounter matrix
  # K = number of occasions
  # X = trap locations
  # Gden = matrix with grid cell coordinates for density raster
  # Gdet = matrix with grid cell coordinates for RSF raster
  # dencovs = all covariate values for all nGden pixels in density raster
  # trapcovs = covariate value at trap locations
  # detcovs = all covariate values for all nGrsf pixels in RSF raster
  # ytel = nguys x nGdet matrix of telemetry fixes in each nGdet pixel
  # stel = home range center of telemetered individuals, IF you wish to estimate it. Not necessary
  # alphas = starting values for RSF/detfn coefficients excluding sigma and intercept
  # alpha0 = starting value for RSF/detfn intercept
  # sigma = starting value for RSF/detfn sigma
  # betas = starting values for density function coefficients
  # n0 = starting value for number of undetected individuals on log scale
  #
  n0 = exp(n0)
  nGden = nrow(Gden)
  D = e2dist(X, Gden)
  nGdet <- nrow(Gdet)
  alphas = alphas
  loglam = alpha0 - (1/(2*sigma*sigma))*D*D + as.vector(trapcovs %*% alphas) # ztrap recycled over nG
  psi = exp(as.vector(dencovs %*% betas))
  psi = psi/sum(psi)
  probcap = 1 - exp(-exp(loglam))
  # probcap = (exp(theta0)/(1+exp(theta0)))*exp(-theta1*D*D)
  Pm = matrix(NA, nrow=nrow(probcap), ncol=ncol(probcap))
  ymat = yscr
  ymat = rbind(yscr, rep(0, ncol(yscr)))
  lik.marg = rep(NA, nrow(ymat))
  for(i in 1:nrow(ymat)){
    Pm[1:length(Pm)] = (dbinom(rep(ymat[i,], nGden), rep(K, nGden), probcap[1:length(Pm)], log=TRUE))
    lik.cond = exp(colSums(Pm))
    lik.marg[i] = sum(lik.cond*psi)
  }
  nv = c(rep(1, length(lik.marg)-1), n0)
  part1 = lgamma(nrow(yscr)+n0+1) - lgamma(n0+1)
  part2 = sum(nv*log(lik.marg))
  out = -1*(part1 + part2)
  # note: 'a0' and 'D2' below are not defined anywhere in this function
  # (presumably alpha0 and a telemetry-pixel distance matrix are meant)
  lam = t(exp(a0 - (1/(2*sigma*sigma))*t(D2) + as.vector(detcovs %*% alphas))) # recycle zall over all ytel guys
  # lam is now nGdet x nG!
  denom = rowSums(lam)
  probs = lam/denom # each column is the probs for a guy at column [j]
  tel.loglik = -1*sum(ytel*log(probs))
  out = out + tel.loglik
  out
}
Data simulation code:
library(raster)
library(bbmle)
e2dist <- function(x, y){
  i <- sort(rep(1:nrow(y), nrow(x)))
  dvec <- sqrt((x[, 1] - y[i, 1])^2 + (x[, 2] - y[i, 2])^2)
  matrix(dvec, nrow = nrow(x), ncol = nrow(y), byrow = FALSE)
}
spcov <- function(R) {
  v <- sqrt(nrow(R))
  D <- as.matrix(dist(R))
  V <- exp(-D/2)
  cov1 <- t(chol(V)) %*% rnorm(nrow(R))
  Rd <- as.data.frame(R)
  colnames(Rd) <- c("x", "y")
  Rd$C <- as.numeric((cov1 - mean(cov1)) / sd(cov1))
  return(Rd)
}
set.seed(1234)
co <- seq(0.3, 0.7, length=5)
X <- cbind(rep(co, each=5),
           rep(co, times=5))
B <- 10
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
dencovs <- cbind(spcov(Z), spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(dencovs)[[2]][3:4] <- c("dencov1", "dencov2")
denr.list <- vector("list", 2)
for(i in 1:2){
  denr.list[[i]] <- raster(
    list(x=seq(0,1,length=10),
         y=seq(0,1,length=10),
         z=t(matrix(dencovs[,i+2],10,10,byrow=TRUE)))
  )
}
B <- 20
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
detcovs <- cbind(spcov(Z), spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(detcovs)[[2]][3:4] <- c("detcov1", "detcov2")
detr.list <- vector("list", 2)  # was detcov.raster.list, but the loop below fills detr.list
J <- nrow(X)                    # number of traps; needed before trapcovs is allocated
trapcovs <- matrix(0, J, 2)
for(i in 1:2){
  detr.list[[i]] <- raster(
    list(x=seq(0,1,length=20),
         y=seq(0,1,length=20),
         z=t(matrix(detcovs[,i+2],20,20,byrow=TRUE)))
  )
  trapcovs[,i] <- extract(detr.list[[i]], X)
}
alpha0 <- -3
sigma <- 0.15
alphas <- c(1,-1)
beta0 <- 3
betas <- c(-1,1)
pixelArea <- (dencovs$y[2] - dencovs$y[1])^2
mu <- exp(beta0 + as.matrix(dencovs[,3:4]) %*% betas) * pixelArea
EN <- sum(mu)
N <- rpois(1, EN)
pi <- mu/sum(mu)
s <- dencovs[sample(1:nrow(dencovs), size=N, replace=TRUE, prob=pi), 1:2]
K <- 10
yc <- d <- p <- matrix(NA, N, J)
D <- e2dist(s, X)
loglam <- t(alpha0 - t((1/(2*sigma*sigma))*D*D) + as.vector(trapcovs %*% alphas))
p <- 1 - exp(-exp(loglam))
for(i in 1:N) {
  for(j in 1:J) {
    yc[i,j] <- rbinom(1, K, p[i,j])
  }
}
detected <- apply(yc > 0, 1, any)
yscr <- yc[detected,]
ntel <- 5
nfixes <- 100
poss.tel <- which(s[,1] > 0.2 & s[,1] < 0.8 & s[,2] > 0.2 & s[,2] < 0.8)
stel.id <- sample(poss.tel, ntel)
stel <- s[stel.id,]
ytel <- matrix(NA, ntel, nrow(detcovs))
d <- e2dist(stel, detcovs[,1:2])
lam <- t(exp(1 - t((1/(2*sigma*sigma))*d*d) + as.vector(as.matrix(detcovs[,3:4]) %*% alphas)))
for(i in 1:ntel){
  ytel[i,] <- rmultinom(1, nfixes, lam[i,]/sum(lam[i,]))
}
Specify starting values and call mle2 function:
start1 <- list(alpha0=alpha0, sigma=sigma, alphas=alphas, betas=betas, n0=log(N - nrow(yscr)))
parnames(CSCR.RSF.intlik2.EXAMPLE) <- names(start)
out1 <- mle2(CSCR.RSF.intlik2.EXAMPLE, start=start1, method="SANN", optimizer="optim",
             data=list(yscr=yscr, K=K, X=X, trapcovs=trapcovs, Gden=dencovs[,1:2], Gdet=detcovs[,1:2],
                       ytel=ytel, stel=stel, dencovs=as.matrix(dencovs[,3:4]), detcovs=as.matrix(detcovs[,3:4]))
)
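A couple of details are visible in this snippet: parnames() is given names(start), while the object created above is start1, so the intended parameter names may never reach mle2; and alphas and betas are length-2 vectors, which cannot match scalar argument names one-to-one. The usual bbmle pattern for vector-valued parameters is to flatten them into a single named vector, wrap the likelihood so it takes one parameter argument, and call mle2 with vecpar = TRUE. A sketch under those assumptions (nll_vec and the alphas1/alphas2 names are illustrative, and the wrapper reads the data objects from the workspace instead of passing data=):
nll_vec <- function(pars) {
  ## illustrative wrapper: unpack the flat parameter vector and forward it
  CSCR.RSF.intlik2.EXAMPLE(alpha0 = pars["alpha0"], sigma = pars["sigma"],
                           alphas = pars[c("alphas1", "alphas2")],
                           betas  = pars[c("betas1", "betas2")],
                           n0     = pars["n0"],
                           yscr = yscr, K = K, X = X, trapcovs = trapcovs,
                           Gden = as.matrix(dencovs[, 1:2]), Gdet = as.matrix(detcovs[, 1:2]),
                           ytel = ytel, stel = stel,
                           dencovs = as.matrix(dencovs[, 3:4]), detcovs = as.matrix(detcovs[, 3:4]))
}
start_vec <- c(alpha0 = alpha0, sigma = sigma,
               alphas1 = alphas[1], alphas2 = alphas[2],
               betas1 = betas[1], betas2 = betas[2],
               n0 = log(N - nrow(yscr)))
parnames(nll_vec) <- names(start_vec)
out2 <- mle2(nll_vec, start = as.list(start_vec), vecpar = TRUE,
             method = "SANN", optimizer = "optim")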
