How to measure the robustness to outliers of some regression models in R

I am summing the variances of the model coefficients across replications and then returning the mean of those sums.
I simply want to check which regression method is more robust to outliers. I will examine many scenarios.
However, my code tells me that ordinary least squares is the best, which is not the expected result, since MM estimation and Huber estimation are known as robust regression methods.
Am I doing something wrong in my code?
#####################################
library(MASS)  ## rlm() comes from MASS

rmn <- function(n, mu) {
  ## generate n rows of independent normals with mean vector mu
  p <- length(mu)
  matrix(rnorm(n * p, mean = mu), ncol = p)
}
#####################################
RI <- function(y, x, a, mu, R = 30, t = 1000) {
  x <- as.matrix(x)
  dm <- dim(x)
  n <- dm[1]
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  ### Outliers in X ######
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)           # rows to contaminate
      z <- x
      z[id, ] <- rmn(length(id), mu)   # replace them with shifted observations
      b1[i] <- var(coef(lm(y ~ ., data = data.frame(z))))
      b2[i] <- var(coef(rlm(y ~ ., data = data.frame(z), maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(y ~ ., data = data.frame(z), psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  bias <- cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3)
  colMeans(bias)
}
#####################################
p <- 5
n <- 300
x <- matrix(rnorm(n * p), ncol = p)
y <- rnorm(n)
a <- 0.2
mu <- colMeans(x) + 10
#####################################
RI(y,x,a,mu)
#####################################
UPDATE
I changed how I measure robustness, based on the first answer provided.
I now measure robustness as the mean absolute difference between the coefficients when the data are uncontaminated and when they are contaminated. I introduce outliers first in y and then in x. I still have a problem.
############ R CODE ##############
rmn <- function(n, mu, seed = TRUE) {
 if (seed) set.seed(12345)
 p <- length(mu)
 matrix( rnorm(n * p, mean = mu), ncol = p)
}
##################################
out.cv <- function(y, x, a, mu, R = 500, seed = TRUE) {
 ## y: response variable
 ## x: independent variables
 ## a: percent of outliers
 ## mu: how far should the outliers be. A vector if outliers in x,
 ## or a single number if outliers in y
 ## R: how many times to repeat this process
 x <- as.matrix(x)
 dm <- dim(x)
 n <- dm[1] ; d <- dm[2] + 1
 b1 <- b2<- b3 <- numeric(R)
 be <- coef( lm(y ~., data = as.data.frame(x[,-1]) ) )
####################################
 ### Outliers in Y ######
 if ( length(mu) == 1 ) {
 for (i in 1:R) {
 if (seed) set.seed(12345)
 id <- sample(n, a * n)
 z <- y
 if (seed) set.seed(12345)
 z[id] <- rnorm(length(id), mu) ## mu has to be a single number here
 ## mean absolute difference between coefficients of clean data
 ## and coefficients with contaminated data
 b1[i] <- mean( abs( coef( lm(z ~., data = as.data.frame(x[,-1])) ) - be) )
 b2[i] <- mean( abs( coef( rlm(z ~ ., data = data.frame(x[,-1]), maxit = 2000, method = "MM") ) - be ) )
 b3[i] <- mean( abs( coef( rlm(z ~ ., data = data.frame(x[,-1]), psi = psi.huber,maxit = 300) ) - be ) )
 }
########################
##### Outliers in X #########
 } else {
 for (i in 1:R) {
 if (seed) set.seed(12345)
 id <- sample(n, a * n)
 z <- x
 z[id, ] <- rmn( length(id), mu, seed ) ## mu must be a vector
 b1[i] <- mean( abs( coef( lm(y ~., data = as.data.frame(z[,-1])) )- be) )
 b2[i] <- mean( abs( coef( rlm(y ~ ., data = data.frame(z[,-1]), maxit = 2000, method = "MM") ) - be ) )
 b3[i] <- mean( abs( coef( rlm(y ~ ., data = data.frame(z[,-1]), psi = psi.huber,maxit = 300) ) - be ) )
 }
 }
 bias1 <- mean(b1) ; bias2 <- mean(b2); bias3 <- mean(b3)
 bias <- c(bias1, bias2, bias3)
 names(bias) <- c("lm", "MM-rlm","Huber-rlm")
 bias
}
################################
p <- 5
n <- 200
##############################
# Independent X and Y ####
#set.seed(12345)
#x<- matrix( rnorm(n * p), ncol = p)
#y<-rnorm(n)
## Related X and Y ####
set.seed(12345)
x <- rmn(n, numeric(p))
ber <- rnorm(p)
m <- x %*% ber
y <- rnorm(n, m, 1)
############################
a <- 0.2   ## 20% outliers
mu <- 15   ## outliers in y
out.cv(y, x, a, mu)
###########################
mu <- colMeans(x) + 15   ## outliers in x
out.cv(y, x, a, mu)
###################

First of all, I do not see you generating a sample from a long-tailed distribution. Please use rt(n, 3), a Student's t with very small df, to get such a distribution, or play with other ones like the log-normal; definitely do not use rnorm. I also see that you use an injection procedure which seems overcomplicated.
Another thing is that the specification of MASS::rlm is not trivial.
In my opinion, start with quantreg::rq, which is quantile regression, and treat it as a robust benchmark method; a sketch is shown below.
Additionally, your sampling procedure does not look valid. You generate new observations at each iteration which are not known a priori; I would expect bootstrapping on the train or test set.
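For illustration, a minimal sketch of those two suggestions (long-tailed errors via rt() and quantreg::rq as a benchmark); the sample size, coefficients and degrees of freedom are made up for the example:
library(MASS)      # rlm()
library(quantreg)  # rq()
set.seed(1)
n <- 300; p <- 5
x <- matrix(rnorm(n * p), ncol = p)
beta <- rnorm(p)
## long-tailed errors (t with 3 df) instead of Gaussian ones
y <- drop(x %*% beta) + rt(n, df = 3)
dat <- data.frame(y = y, x)
coef(lm(y ~ ., data = dat))                  # OLS
coef(rlm(y ~ ., data = dat, method = "MM"))  # MM estimation
coef(rq(y ~ ., data = dat, tau = 0.5))       # median regression as a robust benchmark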

Related

Jackknife in R to obtain interval estimates

I have a question on how to use the jackknife with the bootstrap package. I want to obtain the interval estimate from the jackknife method.
I've tried running the code below, but I get no results for my parameter estimates.
rm(list=ls())
library(bootstrap)
library(maxLik)
set.seed(20)
lambda <- 0.02
beta <- 0.5
alpha <- 0.10
n <- 40
N <- 1000
lambda_hat <- NULL
beta_hat <- NULL
cp <- NULL
jack_lambda <- matrix(NA, nrow = N, ncol = 2)
jack_beta <- matrix(NA, nrow = N, ncol = 2)
### group all data frames generated by the for loop into a list of data frames
data_full <- list()
for(i in 1:N){
u <- runif(n)
c_i <- rexp(n, 0.0001)
t_i <- (log(1 - (1 / lambda) * log(1 - u))) ^ (1 / beta)
s_i <- 1 * (t_i < c_i)
t <- pmin(t_i, c_i)
data_full[[i]] <- data.frame(u, t_i, c_i, s_i, t)
}
### statistic function for jackknife()
estjack <- function(data, j) {
data <- data[j, ]
data0 <- data[which(data$s_i == 0), ] #right-censored observations
data1 <- data[which(data$s_i == 1), ] #uncensored (observed) events
data
LLF <- function(para) {
t1 <- data$t_i
lambda <- para[1]
beta <- para[2]
e <- s_i*log(lambda*t1^(beta-1)*beta*exp(t1^beta)*exp(lambda*(1-exp(t1^beta))))
r <- (1-s_i)*log(exp(lambda*(1-exp(t1^beta))))
f <- sum(e + r)
return(f)
}
mle <- maxLik(LLF, start = c(para = c(0.02, 0.5)))
lambda_hat[i] <- mle$estimate[1]
beta_hat[i] <- mle$estimate[2]
return(c(lambda_hat[i], beta_hat[i]))
}
jackknife_resample<-list()
for(i in 1:N) {
jackknife_resample[[i]]<-data_full[[i]][-i]
results <- jackknife(jackknife_resample, estjack,R=1000)
jack_lambda[i,]<-lambda_hat[i]+c(-1,1)*qt(alpha/2,n-1,lower.tail = FALSE)*results$jack.se
jack_beta[i,]<-beta_hat[i]+c(-1,1)*qt(alpha/2,n-1,lower.tail = FALSE)*results$jack.se
}
I couldn't get the parameter estimates from the MLE run and hence couldn't proceed to the next step. Can anyone help?
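As a point of reference (not a fix for the MLE code above), this is a minimal sketch of the calling convention that bootstrap::jackknife documents for multi-column data: pass the row indices as the data and let the statistic function subset the real data set. The statistic here is a simple illustrative mean, not the Weibull likelihood.
library(bootstrap)
set.seed(20)
xdata <- data.frame(t = rexp(40), s = rbinom(40, 1, 0.7))  # toy data
n <- nrow(xdata)
alpha <- 0.10
## theta receives a set of row indices plus the full data set
theta <- function(idx, xdata) {
  d <- xdata[idx, ]
  mean(d$t[d$s == 1])   # illustrative statistic; replace with your own estimator
}
res <- jackknife(1:n, theta, xdata)
est <- theta(1:n, xdata)
est + c(-1, 1) * qt(1 - alpha/2, n - 1) * res$jack.se   # 90% jackknife interval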

Optimising nested for loops in R

I tried to speed up the code below, but without any success.
I read about the Rfast package, but I also failed to implement it.
Is there any way to optimise the following code in R?
library(MASS)  # for rlm()

RI <- function(y, x, a, mu, R = 500, t = 500) {
  x <- as.matrix(x)
  dm <- dim(x)
  n <- dm[1]
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  ### Outliers in Y ######
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)
      z <- y
      z[id] <- rnorm(length(id), mu)
      b1[i] <- var(coef(lm(z ~ ., data = as.data.frame(x))))
      b2[i] <- var(coef(rlm(z ~ ., data = data.frame(x), maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(z ~ ., data = data.frame(x), psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  bias <- cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3)
  colMeans(bias)
}
#######################################
p <- 5
n <- 200
x<- matrix(rnorm(n * p), ncol = p)
y<-rnorm(n)
a=0.2
mu <-10
#######################################
RI(y,x,a,mu)
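One direction that usually helps with loops like this (a sketch, not an Rfast translation): build the design matrix once outside the loops and call the matrix interfaces lm.fit() and rlm's default method instead of the formula interface, which re-creates the model frame on every iteration. The rlm fits will still dominate the run time.
library(MASS)  # rlm()

RI_fast <- function(y, x, a, mu, R = 500, t = 500) {
  x <- as.matrix(x)
  n <- nrow(x)
  X <- cbind(Intercept = 1, x)   # design matrix built once and reused
  m <- round(a * n)
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, m)
      z <- y
      z[id] <- rnorm(m, mu)
      b1[i] <- var(lm.fit(X, z)$coefficients)
      b2[i] <- var(coef(rlm(X, z, method = "MM", maxit = 2000)))
      b3[i] <- var(coef(rlm(X, z, psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  colMeans(cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3))
}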

Calculate stderr, t-value, p-value and predicted values for linear regression

I'm fitting linear models with MatrixModels:::lm.fit.sparse and MatrixModels::glm4 (also sparse).
However, these functions return only the coefficients, residuals and fitted values.
What's the fastest and easiest way to get or calculate other values such as the standard errors, t-values, p-values and predicted values?
I use the data from the MatrixModels:::lm.fit.sparse example.
I built a custom function, summary_sparse, to perform a summary for this model.
All matrix operations are performed with the Matrix package.
Results are compared with the dense-model fit.
Note that lm.fit.sparse has to be evaluated with method = "chol" to get proper results.
Functions:
summary_sparse <- function(l, X) {
  XXinv <- Matrix::chol2inv(Matrix::chol(Matrix::crossprod(X)))
  se <- sqrt(Matrix::diag(XXinv * sum(l$residuals^2) / (nrow(X) - ncol(X))))
  ts <- l$coef / se
  pvals <- 2 * c(1 - pnorm(abs(ts)))
  list(coef = l$coef, se = se, t = ts, p = pvals)
}
predict_sparse <- function(X, coef) {
  X %*% coef
}
Application:
dd <- expand.grid(a = as.factor(1:3),
b = as.factor(1:4),
c = as.factor(1:2),
d= as.factor(1:8))
n <- nrow(dd <- dd[rep(seq_len(nrow(dd)), each = 10), ])
set.seed(17)
dM <- cbind(dd, x = round(rnorm(n), 1))
## randomly drop some
n <- nrow(dM <- dM[- sample(n, 50),])
dM <- within(dM, { A <- c(2,5,10)[a]
B <- c(-10,-1, 3:4)[b]
C <- c(-8,8)[c]
D <- c(10*(-5:-2), 20*c(0, 3:5))[d]
Y <- A + B + A*B + C + D + A*D + C*x + rnorm(n)/10
wts <- sample(1:10, n, replace=TRUE)
rm(A,B,C,D)
})
X <- Matrix::sparse.model.matrix( ~ (a+b+c+d)^2 + c*x, data = dM)
Xd <- as(X,"matrix")
fmDense <- lm(dM[,"Y"]~Xd-1)
ss <- summary(fmDense)
r1 <- MatrixModels:::lm.fit.sparse(X, y = dM[,"Y"], method = "chol")
f <- summary_sparse(r1, X)
all.equal(do.call(cbind, f), ss$coefficients, check.attributes = F)
#TRUE
all.equal(predict_sparse(X, r1$coef)@x, predict(fmDense), check.attributes = F, check.names = F)
#TRUE

How can I solve a dimension mismatch in a JAGS model?

I'm very new to Bayesian analysis and I'm trying to practice with an example of the classic capture-recapture models: Mh2.
This is my code
nind <- dim(venados)[1]
K <- 43
ntraps <- 13
M <- 150
nz <- M - nind
Yaug <- array(0, dim = c(M, ntraps, K))
Yaug[1:nind,,] <- venados
y <- apply(Yaug, c(1,3), sum)
y[y > 1] <- 1
# Bundle data
data1 <- list(y = y, nz = nz, nind = nind, K = K, sup = Buffer)
# Model JAGS
sink("Mh2_jags.txt")
cat("
model{
# Priors
p0 ~ dunif(0,1)
mup <- log(p0/(1-p0))
sigmap ~ dunif(0,10)
taup <- 1/(sigmap*sigmap)
psi ~ dunif(0,1)
# Likelihood
for (i in 1:(nind+nz)) {
z[i] ~ dbern(psi)
lp[i] ~ dnorm(mup,taup)
logit(p[i]) <- lp[i]
y[i] ~ dbin(mu[i],K)
} # i
N <- sum(z[1:(nind+nz)])
D <- N/sup*100
} # modelo
",fill = TRUE)
sink()
# Initial values
inits <- function(){list(z = as.numeric(y >= 1), psi = 0.6, p0 = runif(1), sigmap = runif(1, 0.7, 1.2), lp = rnorm(M, -0.2))}
params1 <- c("p0","sigmap","psi","N","D")
# MCMC
ni <- 10000; nt <- 1; nb <- 1000; nc <- 3
# JAGS and posteriors
fM2 <- jags(data1, inits, params1, "Mh2_jags.txt", n.chains = nc, n.thin = nt, n.iter = ni, n.burnin = nb)
I received this error message
Processing function input.......
Done.
Compiling model graph
Resolving undeclared variables
Deleting model
Error in jags.model(file = model.file, data = data, inits = inits, n.chains = n.chains, :
RUNTIME ERROR:
Compilation error on line 16.
Dimension mismatch in subset expression of y
I have read that some letters such as s and n have to be changed. However,
I do not know what to do. Please could you give me some advice?
Thank you very much.
The issue is that y is two-dimensional but the model assumes it is one-dimensional. If you are assuming that the secondary surveys are i.i.d. Bernoulli trials (and each session had K trials), then you would just need to take the row sums of the y matrix. Assuming this is the case, you just need to modify a couple of lines at the top of this script.
nind <- dim(venados)[1]
K <- 43
ntraps <- 13
M <- 150
nz <- M - nind
Yaug <- array(0, dim = c(M, ntraps, K))
Yaug[1:nind,,] <- venados
y <- apply(Yaug, c(1,3), sum)
y[y > 1] <- 1
# Take the rowSum
y_vector <- rowSums(y)
# Use y_vector instead of y
data1 <- list(y = y_vector, nz = nz, nind = nind, K = K, sup = Buffer)
Conversely, if you wanted to include covariates for the observational process (and those covariates vary by survey), you would use the matrix y and modify the model.
sink("Mh2_jags_Kloop.txt")
cat("
model{
# Priors
p0 ~ dunif(0,1)
mup <- log(p0/(1-p0))
sigmap ~ dunif(0,10)
taup <- 1/(sigmap*sigmap)
psi ~ dunif(0,1)
# Likelihood
for (i in 1:(nind+nz)) {
z[i] ~ dbern(psi)
lp[i] ~ dnorm(mup,taup)
logit(p[i]) <- lp[i]
# Loop over K surveys
for(j in 1:K){
y[i,j] ~ dbern(p[i]*z[i])
}
} # i
N <- sum(z[1:(nind+nz)])
D <- N/sup*100
} # modelo
",fill = TRUE)
sink()
Finally, you don't specify what mu is within the model. I think you want it to be p, but you also need to link the latent state model to the observation model (if z = 0 then that individual cannot be sampled). In this case you would interpret psi as the probability that each of the nind + nz individuals is at your site.
# Model JAGS
sink("Mh2_jags.txt")
cat("
model{
# Priors
p0 ~ dunif(0,1)
mup <- log(p0/(1-p0))
sigmap ~ dunif(0,10)
taup <- 1/(sigmap*sigmap)
psi ~ dunif(0,1)
# Likelihood
for (i in 1:(nind+nz)) {
z[i] ~ dbern(psi)
lp[i] ~ dnorm(mup,taup)
logit(p[i]) <- lp[i]
y[i] ~ dbin(p[i] * z[i],K)
} # i
N <- sum(z[1:(nind+nz)])
D <- N/sup*100
} # modelo
",fill = TRUE)
sink()

Objective function in optim evaluates to length 3 not 1

I am new to R and am trying to find the optimal values of 3 parameters via indirect inference from a simulated panel data set, but I am getting the error "objective function in optim evaluates to length 3 not 1". I checked past posts, but the one I found didn't address the problem I am facing.
The code works if I only try one parameter instead of 3. Here is the code:
#Generating data
modelp <- function(Y,alpha,N,T){
Yt <- Y[,2:T]
Ylag <- Y[,1:(T-1)]
Alpha <- alpha[,2:T]
yt <- matrix(t(Yt), (T-1)*N, 1)
ylag <- matrix(t(Ylag), (T-1)*N, 1)
alph <- matrix(t(Alpha), (T-1)*N, 1)
rho.ind <- rep(NA,N)
sigma_u <- rep(NA,N)
sigma_a <- rep(NA,N)
for(n in 1:N){
sigma_u[n] <- sigma(lm(yt~alph+ylag))
sigma_a[n] <- lm(yt~alph+ylag)$coef[2] #
(diag(vcov((lm(yt~alph+ylag)$coef),complete=TRUE)))[2] #
rho.ind[n] <- lm(yt~alph+ylag)$coef[3]
}
param <- matrix(NA,1,3)
param[1]<- mean(sum(rho.ind))
param[2]<- mean(sum(sigma_u))
param[3]<- mean(sum(sigma_a))
return(param)
}
## Function to estimate parameters
H.theta <- function(param.s){
set.seed(tmp.seed) #set seed
param.s.tmp <- matrix(0,1,3)
for(s in 1:H){
eps.s <- matrix(rnorm(N*T), N, T) #white noise erros
eps0.s <- matrix(rnorm(N*T), N, 1) #error for initial condition
alph.s <- matrix(rnorm(N*T),N,T)
Y.s <- matrix( 0, N, T)
ys.lag <- eps0.s
for(t in 1:T){ #Simulating the AR(1) process data
ys <- alph.s[,t]+param.s[1] * ys.lag + eps.s[,t] # [n,1:t]
Y.s[,t] <- ys
ys.lag <- ys
}
param.s.tmp <- param.s.tmp + modelp(Y.s, alph.s,N, T)
param.s[2] <- param.s.tmp[2]
param.s[3] <- mean(var(alph.s)) #param.s.tmp[3]
}
return( (param.data - param.s.tmp/H)^2 )
#return(param.s[1])
}
#Results for T = 10 & H = 10, N=100
nrep <-10
rho <-0.9
sigma_u <- 1
sigma_a <- 1.5
param <- matrix(NA,1,3)
param[1] <- rho
param[2] <- sigma_u
param[3] <- sigma_u
s.mu <- 0 # Mean
s.ep <- 0.5 #White Noise -initial conditions
Box <- cbind(rep(100,1),c(20),rep(c(5),1))
r.simu.box <- matrix(0,nrep,nrow(Box))
r.data.box <- matrix(0,nrep,nrow(Box))
for(k in 1:nrow(Box)){
N <- Box[k,1] #Number of individuals in panel
T <- Box[k,2] #Length of Panel
H <- Box[k,3] # Number of simulation paths
p.data <-matrix(NA,nrep,3)
p.simu <-matrix(NA,nrep,3)
est <- matrix(NA,1,3)
for(i in 1:nrep){
mu <- matrix(rnorm(N )*s.mu, N, 1)
eps <- matrix(rnorm(N*T)*s.ep, N, T)
eps0 <- matrix(rnorm(N*T)*s.ep, N, 1)
alph <- matrix(rnorm(N ), N, T)
Y <- matrix( 0, N, T)
y.lag <- (1-param[1])*mu + eps0
for(t in 1:T){
y <- alph[,t]+param[1]*y.lag +eps[,t]
Y[,t] <- y
y.lag <- y
}
param.data <- modelp(Y,alph,N,T) #Actual data
p.data[i,1:3] <- param.data
tmp.seed <- 3864+i+100*(k-1) #Simulated data
x0 <- c(0.5, 0,0)
est[i] <- optim(x0, H.theta,method = "BFGS", hessian = TRUE)$par
p.simu[i,1:3] <- est[i]
if(i%%10==0) print(c("Finished the (",i,")-th replication"))
}
}
mean(p.data[,1])- mean(p.simu[,1])
mean(p.data[,2])- mean(p.simu[,2])
sqrt(mean((p.data[1]-p.simu[1])^2))
I expect to get three values. Any help or suggestion will be greatly appreciated.
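For context, optim() minimises a single number, so an objective that returns the three squared differences triggers exactly this error. A minimal toy sketch (not the indirect-inference code above) of collapsing the vector into a scalar, e.g. its sum:
## pretend these three moments were estimated from the data
param.data <- c(0.9, 1.0, 1.5)
obj_vector <- function(p) (param.data - p)^2          # length 3: optim() refuses this
obj_scalar <- function(p) sum((param.data - p)^2)     # length 1: this works
# optim(c(0.5, 0, 0), obj_vector, method = "BFGS")    # error: objective ... length 3 not 1
optim(c(0.5, 0, 0), obj_scalar, method = "BFGS")$par  # recovers c(0.9, 1.0, 1.5)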
