Simulate data for regression with standardized Y - r

Based on this topic, I have created a function that returns a dataset with variables related to the outcome (y) by specific linear coefs.
simulate_data_regression <- function(sample=10, coefs=0, error=0){
  n_var <- length(coefs)
  X <- matrix(0, ncol=n_var, nrow=sample)
  beta <- as.matrix(coefs)
  for (i in 1:n_var){
    X[,i] <- scale(rnorm(sample, 0, 1))
  }
  y <- X %*% beta
  if(error != 0){
    y <- y + rnorm(sample, 0, error)
  }
  data <- data.frame(X=X)
  names(data) <- paste0("V", 1:n_var)
  data$y <- as.vector(y)
  return(data)
}
data <- simulate_data_regression(sample=50, coefs=c(0.1, 0.8), error=0)
summary(data)
sd(data$V1)
sd(data$y)
It works great. However, I would need to have a standardized y (mean 0 and SD 1). But when I try to scale it, the coefficients change:
data <- simulate_data_regression(sample=50, coefs=c(0.1, 0.8), error=0)
data$y <- as.vector(scale(data$y))
coef(lm(y ~ ., data=data))
Is it possible to do such a thing? Thank you very much!
Edit
In other words, I would like the specified coefs to be standardized coefs (expressed in units of the outcome's SD).
Scaling y a posteriori changes the coefs by 1/sd(y). However, I can't think of any way to change the betas before generating y, so that the betas return to their specified value after the scaling of y.
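One workable route, sketched below under the assumption that the predictors are standardized and roughly uncorrelated (as they are in the function above): Var(y) is approximately sum(coefs^2) + error^2, so if the residual SD is chosen as sqrt(1 - sum(coefs^2)), the generated y already has variance close to 1 and the final scale(y) only nudges the coefficients slightly. The function name simulate_standardized is illustrative, not from the original code.
# Sketch: choose the error SD so that y has (approximately) unit variance.
# Requires sum(coefs^2) <= 1; predictors are assumed roughly uncorrelated.
simulate_standardized <- function(sample = 10, coefs = 0) {
  stopifnot(sum(coefs^2) <= 1)
  error <- sqrt(1 - sum(coefs^2))
  data <- simulate_data_regression(sample = sample, coefs = coefs, error = error)
  data$y <- as.vector(scale(data$y))  # small final correction to exact mean 0 / SD 1
  data
}
set.seed(123)
data <- simulate_standardized(sample = 10000, coefs = c(0.1, 0.8))
sd(data$y)                    # exactly 1 by construction
coef(lm(y ~ ., data = data))  # close to 0.1 and 0.8 in large samples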
Edit 2: Failed attempt
I've tried running the function twice, first extracting sd(y) and scaling the coefficients with it, in the hope that those scaled coefficients will change to the specified ones once I scale y. But it doesn't work, which is expected, as sd(y) changes when I change the coefs :'(
Here's the failed attempt:
simulate_data_regression <- function(sample=10, coefs=0, error=0, standardized=TRUE){
  stuff <- .simulate_data_regression(sample=sample, coefs=coefs, error=error)
  if(standardized == TRUE){
    y_sd <- sd(stuff$data$y)
    data <- .simulate_data_regression(sample=sample, coefs=y_sd*coefs, error=error, X=stuff$X)$data
    data$y <- as.vector(scale(data$y))
  } else{
    data <- stuff$data
  }
  return(data)
}
.simulate_data_regression <- function(sample=10, coefs=0, error=0, X=NULL, y=NULL){
  n_var <- length(coefs)
  if(is.null(X)){
    X <- matrix(0, ncol=n_var, nrow=sample)
    for (i in 1:n_var){
      X[,i] <- scale(rnorm(sample, 0, 1))
    }
  }
  beta <- as.matrix(coefs)
  y <- X %*% beta
  if(error != 0){
    y <- y + rnorm(sample, 0, error)
  }
  data <- data.frame(X=X)
  names(data) <- paste0("V", 1:n_var)
  data$y <- as.vector(y)
  return(list(X=X, y=y, data=data))
}

If you scale y the inference will be the same; only the p-value of the intercept changes, not the p-values of the coefficients.
In this example I have set error = 1.
set.seed(1234) # Make the results reproducible
data <- simulate_data_regression(sample = 50, coefs = c(0.1, 0.8), error = 1)
data2 <- data
data2$y <- scale(data2$y)
fit <- lm(y ~ ., data)
fit2 <- lm(y ~ ., data2)
summary(fit)
summary(fit2)
As you can see, the p-values of the coefficients are exactly the same even though the coefficients themselves are different. That is expected: dividing y by its standard deviation divides the slope estimates and their standard errors by the same factor, sd(y), so the t-statistics and hence the p-values are unchanged.
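A quick check of that relation on the objects above (my own verification snippet, not part of the original answer):
# the slopes in the scaled fit are the original slopes divided by sd(y)
coef(fit2)[-1]
coef(fit)[-1] / sd(data$y)   # same values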
The version of your function below has an argument, `which`, that lets you specify which regressors to scale. Its default is all of them.
simulate_data_regression2 <- function(sample = 10, coefs = 0, error = 0, which = seq_along(coefs)){
  n_var <- length(coefs)
  X <- matrix(0, ncol=n_var, nrow=sample)
  beta <- as.matrix(coefs)
  for (i in 1:n_var){
    X[,i] <- rnorm(sample, 0, 1)
    if(i %in% which) X[, i] <- scale(X[, i])
  }
  y <- X %*% beta
  if(error != 0){
    y <- y + rnorm(sample, 0, error)
  }
  data <- data.frame(X=X)
  names(data) <- paste0("V", 1:n_var)
  data$y <- as.vector(y)
  data
}
Now test the function.
set.seed(1234) # Make the results reproducible
data <- simulate_data_regression2(sample=50, coefs=c(0.1, 0.8), error=1)
set.seed(1234) # Reproduce the data generation process
data2 <- simulate_data_regression2(sample=50, coefs=c(0.1, 0.8), error=1, which = 2)
fit <- lm(y ~ ., data)
fit2 <- lm(y ~ ., data2)
As you can see the coefficients of V2 are equal.
coef(fit)
#(Intercept) V1 V2
# 0.01997809 0.19851020 0.96310013
coef(fit2)
#(Intercept) V1 V2
# 0.07040538 0.21130549 0.96310013
The p-values of the V2 coefficient estimates are also equal:
summary(fit)
summary(fit2)

Related

How to measure the robustness to outliers of some regression models

I am adding the variances of model coefficients and then returning the means of the sums.
I simply want to check which regression method is more robust to outliers, and I will examine many scenarios.
However, my code tells me that ordinary least squares is the best, which is not the expected result, since MM estimation and Huber estimation are known as robust regression methods.
Am I doing something wrong with my code?
#####################################
library(MASS)   # for rlm

rmn <- function(n, mu) {
  p <- length(mu)
  matrix(rnorm(n * p, mean = mu), ncol = p)
}
#####################################
RI <- function(y, x, a, mu, R = 30, t = 1000){
  x <- as.matrix(x)
  dm <- dim(x)
  n <- dm[1]
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  ### Outliers in X ######
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)
      z <- x
      z[id, ] <- rmn(length(id), mu)
      b1[i] <- var(coef(lm(y ~ ., data = data.frame(z))))
      b2[i] <- var(coef(rlm(y ~ ., data = data.frame(z), maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(y ~ ., data = data.frame(z), psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  bias <- cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3)
  colMeans(bias)
}
#####################################
p <- 5
n <- 300
x <- matrix(rnorm(n * p), ncol = p)
y <- rnorm(n)
a <- 0.2
mu <- colMeans(x) + 10
#####################################
RI(y,x,a,mu)
#####################################
UPDATE
I changed the way I measure robustness, based on the first answer provided.
I now measure robustness by calculating the mean absolute difference between the coefficients from uncontaminated data and the coefficients from contaminated data. I introduce outliers first in y and then in x. I still have a problem.
############ R CODE ##############
rmn <- function(n, mu, seed = TRUE) {
  if (seed) set.seed(12345)
  p <- length(mu)
  matrix(rnorm(n * p, mean = mu), ncol = p)
}
##################################
out.cv <- function(y, x, a, mu, R = 500, seed = TRUE) {
  ## y: response variable
  ## x: independent variables
  ## a: percent of outliers
  ## mu: how far the outliers should be. A vector if the outliers are in x,
  ##     or a single number if the outliers are in y
  ## R: how many times to repeat this process
  x <- as.matrix(x)
  dm <- dim(x)
  n <- dm[1] ; d <- dm[2] + 1
  b1 <- b2 <- b3 <- numeric(R)
  be <- coef(lm(y ~ ., data = as.data.frame(x[, -1])))
  ####################################
  ### Outliers in Y ######
  if (length(mu) == 1) {
    for (i in 1:R) {
      if (seed) set.seed(12345)
      id <- sample(n, a * n)
      z <- y
      if (seed) set.seed(12345)
      z[id] <- rnorm(length(id), mu)  ## mu has to be a single number here
      ## mean absolute difference between coefficients of clean data
      ## and coefficients with contaminated data
      b1[i] <- mean(abs(coef(lm(z ~ ., data = as.data.frame(x[, -1]))) - be))
      b2[i] <- mean(abs(coef(rlm(z ~ ., data = data.frame(x[, -1]), maxit = 2000, method = "MM")) - be))
      b3[i] <- mean(abs(coef(rlm(z ~ ., data = data.frame(x[, -1]), psi = psi.huber, maxit = 300)) - be))
    }
  ########################
  ##### Outliers in X #########
  } else {
    for (i in 1:R) {
      if (seed) set.seed(12345)
      id <- sample(n, a * n)
      z <- x
      z[id, ] <- rmn(length(id), mu, seed)  ## mu must be a vector
      b1[i] <- mean(abs(coef(lm(y ~ ., data = as.data.frame(z[, -1]))) - be))
      b2[i] <- mean(abs(coef(rlm(y ~ ., data = data.frame(z[, -1]), maxit = 2000, method = "MM")) - be))
      b3[i] <- mean(abs(coef(rlm(y ~ ., data = data.frame(z[, -1]), psi = psi.huber, maxit = 300)) - be))
    }
  }
  bias1 <- mean(b1) ; bias2 <- mean(b2); bias3 <- mean(b3)
  bias <- c(bias1, bias2, bias3)
  names(bias) <- c("lm", "MM-rlm", "Huber-rlm")
  bias
}
################################
p <- 5
n <- 200
##############################
# Independent X and Y ####
#set.seed(12345)
#x<- matrix( rnorm(n * p), ncol = p)
#y<-rnorm(n)
## Related X and Y ####
set.seed(12345)
x <- rmn(n, numeric(p))
ber <- rnorm(p)
m <- x %*% ber
y <- rnorm(n, m, 1)
############################
a <- 0.2  # fraction of outliers (20%)
mu <- 15 ## outliers in y
out.cv(y, x, a, mu)
###########################
mu <- colMeans(x) + 15  ## outliers in x
out.cv(y, x, a, mu)
###################
First of all, I do not see that you generate a sample from a long-tailed distribution. Use rt(n, 3), a Student t with very few degrees of freedom, to get such a distribution, or play with others such as the log-normal; in any case do not use rnorm. The outlier-injection procedure you use also seems overcomplicated.
Another thing is that the specification of MASS::rlm is not trivial.
In my opinion, start with quantreg::rq, which is a quantile regression, and treat it as a robust benchmark method (a brief sketch follows below).
Additionally, your sampling procedure does not look valid: you are generating new observations at each iteration that are not known a priori. I would expect bootstrapping on a train or test set instead.
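A minimal illustration of the suggested benchmark, with names and data of my own choosing (not from the question) and a long-tailed t error as advised:
library(quantreg)
library(MASS)   # rlm, for comparison

set.seed(1)
n <- 200; p <- 5
x <- matrix(rnorm(n * p), ncol = p)
beta <- rnorm(p)
y <- drop(x %*% beta) + rt(n, df = 3)   # long-tailed errors

dat <- data.frame(y, x)
coef(lm(y ~ ., data = dat))                                # least squares
coef(rq(y ~ ., tau = 0.5, data = dat))                     # median (quantile) regression benchmark
coef(rlm(y ~ ., data = dat, method = "MM", maxit = 2000))  # MM estimation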

Skip Line if Error Occurs within Function in R

I am currently trying to solve a bug but believe the data I am working with may be too complex and cause errors that shouldn't normally occur. I've written a function, and was hoping to add a try or tryCatch statement to skip the error if it occurs. I currently have:
library(glmnet)
foo <- function(data, ols_ps = TRUE, index) {
  # index is the bootstrap sample index
  x <- data[index, -1]
  y <- data[index, 1]
  ridge <- cv.glmnet(x, y, alpha = 0)
  ## The intercept estimate should be dropped.
  weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
  # alpha=1, lasso
  alasso <- cv.glmnet(x, y, alpha = 1,
                      penalty.factor = 1 / abs(weights))
  # Select nonzero coefficients
  coef <- as.vector(coef(alasso, s = alasso$lambda.min,
                         exact = TRUE, x = x, y = y,
                         penalty.factor = 1 / abs(weights)))[-1]
  if (ols_ps == TRUE) {
    coef_nonzero <- coef != 0
    new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
                      error = function(e) NA)
    if (!any(is.na(new_x)) & ncol(new_x) > 0) {
      ls.obj <- lm(y ~ new_x)
      ls_coef <- (ls.obj$coefficients)[-1]
      coef[coef_nonzero] <- ls_coef
    } else {
      coef <- coef
    }
  } else {
    coef <- coef
  }
  return(coef)
}
which normally works on most datasets. I think the error may be coming from a complex dataset. Is it possible to skip the OLS step if I get the error below?
Error in x[, coef_nonzero, drop = FALSE] :
  (subscript) logical subscript too long
Here is a minimal working example per request.
library(boot)
set.seed(123)
matrix <- matrix(runif(1000), ncol = 10)
boot(matrix, foo, R = 50)
Thanks in advance.
Maybe like this?
foo <- function(data, index) {
  # index is the bootstrap sample index
  x <- data[index, -1]
  y <- data[index, 1]
  ridge <- cv.glmnet(x, y, alpha = 0)
  ## The intercept estimate should be dropped.
  weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
  # alpha=1, lasso
  alasso <- cv.glmnet(x, y, alpha = 1,
                      penalty.factor = 1 / abs(weights))
  # Select nonzero coefficients
  coef <- as.vector(coef(alasso, s = alasso$lambda.min,
                         exact = TRUE, x = x, y = y,
                         penalty.factor = 1 / abs(weights)))[-1]
  coef_nonzero <- coef != 0
  new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
                    error = function(e) NA)
  if (!any(is.na(new_x))) {
    ls.obj <- lm(y ~ new_x)
    ls_coef <- (ls.obj$coefficients)[-1]
    coef[coef_nonzero] <- ls_coef
  }
  return(coef)
}
The problem is that so far we have no case where it fails.
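If the goal is simply to skip a bootstrap replicate whenever any step errors, one option (a sketch only, not tested on the failing dataset; foo_safe is an illustrative name) is to wrap the whole statistic in tryCatch and return NAs of the appropriate length:
foo_safe <- function(data, index) {
  tryCatch(foo(data, index),
           error = function(e) rep(NA_real_, ncol(data) - 1))  # one NA per coefficient
}

# library(boot)
# boot(data, foo_safe, R = 50)   # failed replicates then appear as NA rows in $t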

reiterating a script using r

I have the following script
library(MASS)        # mvrnorm
library(neuralnet)   # neuralnet, compute

Posdef <- function (n, ev = runif(n, 0, 10))
{
  Z <- matrix(ncol = n, rnorm(n^2))
  decomp <- qr(Z)
  Q <- qr.Q(decomp)
  R <- qr.R(decomp)
  d <- diag(R)
  ph <- d / abs(d)
  O <- Q %*% diag(ph)
  Z <- t(O) %*% diag(ev) %*% O
  return(Z)
}
Sigma <- Posdef(n = 11)
mu <- runif(11, 0, 10)
data <- as.data.frame(mvrnorm(n = 1000, mu, Sigma))
data[data < 0] <- 0    # setting a floor
data[data > 10] <- 10  # setting a ceiling
names(data) <- c('criteria_1', 'criteria_2', 'criteria_3', 'criteria_4', 'criteria_5',
                 'criteria_6', 'criteria_7', 'criteria_8', 'criteria_9', 'criteria_10',
                 'outcome')
data$outcome <- ifelse(data$outcome > 5, 1, 0)
data <- data[, sapply(data, is.numeric)]
maxValue <- as.numeric(apply(data, 2, max))
minValue <- as.numeric(apply(data, 2, min))
data_scaled <- as.data.frame(scale(data, center = minValue,
                                   scale = maxValue - minValue))
ind <- sample(1:nrow(data_scaled), 600)
train <- data_scaled[ind, ]
test <- data_scaled[-ind, ]
model <- glm(formula =
               outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
               criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
             family = "binomial",
             data = train)
summary(model)
predicted_model <- predict(model, test)
neural_model <- neuralnet(formula =
                            outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
                            criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
                          hidden = c(2, 2),
                          threshold = 0.01,
                          stepmax = 1e+07,
                          startweights = NULL,
                          rep = 1,
                          learningrate = NULL,
                          algorithm = "rprop+",
                          linear.output = FALSE,
                          data = train)
plot(neural_model)
results <- compute(neural_model, test[1:10])
results <- results$net.result * (max(data$outcome) - min(data$outcome)) + min(data$outcome)
Values <- test$outcome * (max(data$outcome) - min(data$outcome)) + min(data$outcome)
MSE_neural_model <- sum((results - Values)^2) / nrow(test)
MSE_model <- sum((predicted_model - test$outcome)^2) / nrow(test)
print(MSE_model - MSE_neural_model)
R1 <- MSE_model - MSE_neural_model
The purpose of this script is to generate an arbitrary multivariate distribution and then compare two methods, in this case a neural net and logistic regression. The end result is a difference in mean square error.
Now my issue with creating a loop has been with generating the 1000 observations.
I am able to create a loop without the data simulation portion of the script; putting that into the loop seems to make things go haywire. I tried creating a column vector filled with NAs, but all I ended up with was a single value rather than a vector of length n populated with the MSE reduction from each iteration of the loop.
Any help would be greatly appreciated.
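A sketch of one way to structure the repetition (illustrative names only; the body of one_replication stands in for the full script above): wrap the whole simulation-and-comparison block in a function that returns a single number, preallocate the result vector, and fill it inside the loop.
one_replication <- function() {
  # paste the data generation, glm, and neuralnet fitting and MSE code here,
  # ending with the value to keep:
  # MSE_model - MSE_neural_model
  rnorm(1)  # placeholder so this skeleton runs as-is
}

n_rep <- 100
mse_diff <- numeric(n_rep)          # preallocated vector of length n_rep
for (k in seq_len(n_rep)) {
  mse_diff[k] <- one_replication()  # assign into the k-th slot instead of overwriting
}
summary(mse_diff)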

How to calculate lambda.1se from iterative cross validation?

I have the following code to choose a value of lambda based on the lowest resulting mean squared error (MSE) after iterated cross validation.
library(glmnet)
set.seed(3)
IV1 <- data.frame(IV1 = rnorm(100))
IV2 <- data.frame(IV2 = rnorm(100))
IV3 <- data.frame(IV3 = rnorm(100))
IV4 <- data.frame(IV4 = rnorm(100))
IV5 <- data.frame(IV5 = rnorm(100))
DV <- data.frame(DV = rnorm(100))
data <- data.frame(IV1,IV2,IV3,IV4,IV5,DV)
x <- model.matrix(DV~.-IV5 , data)[ , -1]
y <- data$DV
AB <- glmnet(x=x, y=y, alpha=1)
plot(AB,xvar="lambda")
lambdas <- NULL
for (i in 1:100){
  fit <- cv.glmnet(x, y)
  errors <- data.frame(fit$lambda, fit$cvm)
  lambdas <- rbind(lambdas, errors)
}
lambdas <- aggregate(lambdas[ , 2], list(lambdas$fit.lambda), mean)
bestindex <- which(lambdas[2] == min(lambdas[2]))
bestlambda <- lambdas[bestindex, 1]
How would I modify this to select lambda.1se (i.e., the largest λ at which the MSE is within one standard error of the minimal MSE)?
Edit:
How about this
library(plotrix)   # for std.error

lambdas <- NULL    # initialize
n.fits <- 100
r2 <- numeric(n.fits)
for (i in 1:n.fits) {
  fit <- cv.glmnet(x, y)
  errors <- data.frame(fit$lambda, fit$cvm)
  lambdas <- rbind(lambdas, errors)
  r2[i] <- max(1 - fit$cvm / var(y))
}
# take mean cvm for each lambda
lambdas <- aggregate(lambdas[, 2], list(lambdas$fit.lambda), mean)
lambdas <- as.data.frame(lambdas)
# find subset with mse within 1 se of mean
onese <- std.error(lambdas[, 2])
min <- min(lambdas[, 2])
low <- min - onese
high <- min + onese
lambdas <- subset(lambdas, x > low)
lambdas <- subset(lambdas, x < high)
# choose highest lambda among those
bestindex <- which(lambdas[1] == max(lambdas[1]))
bestlambda <- lambdas[bestindex, 1]
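One reading of the 1-SE rule applied to the pooled results, using the spread across repeats as the standard error (a sketch of my own, assuming the x and y objects defined above; the usual lambda.1se instead uses the fold-to-fold SE within a single cv.glmnet call):
lambdas <- NULL
n.fits <- 100
for (i in 1:n.fits) {
  fit <- cv.glmnet(x, y)
  lambdas <- rbind(lambdas, data.frame(lambda = fit$lambda, cvm = fit$cvm))
}
# mean and standard error of the CV error at each lambda, across repeats
agg <- aggregate(cvm ~ lambda, data = lambdas,
                 FUN = function(v) c(mean = mean(v), se = sd(v) / sqrt(length(v))))
agg <- do.call(data.frame, agg)                         # flatten to cvm.mean / cvm.se
cutoff <- min(agg$cvm.mean) + agg$cvm.se[which.min(agg$cvm.mean)]
lambda_1se <- max(agg$lambda[agg$cvm.mean <= cutoff])   # largest lambda within 1 SE of the minimum
lambda_1se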
If you decide to use cv.glmnet, the following might be what you are looking for. (p.s. I also cleaned up your simulation code a bit; note that I also didn't make use of the AB object from glmnet which is obviously not the same as cv.glmnet)
library(glmnet)
## Simulate data:
set.seed(3)
x <- data.frame(
  IV1 = rnorm(100),
  IV2 = rnorm(100),
  IV3 = rnorm(100),
  IV4 = rnorm(100),
  IV5 = rnorm(100)
)
x <- as.matrix(x)
y <- rnorm(100)  # target or response
## Iteratively fit models
lambdas <- NULL  # initialize
n.fits <- 100
for (i in 1:n.fits) {
  fit <- cv.glmnet(x, y, family = "gaussian")
  df <- data.frame(fit$lambda.1se, mean(fit$cvm))  # can use median for cvm also
  lambdas <- rbind(lambdas, df)
}
## Select best lambda:
bestindex <- which.min(lambdas[ , 2])  # the way you had it was way too complicated
bestlambda <- lambdas[bestindex, 1]
bestlambda
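A possible follow-up, assuming the objects above: refit the lasso path on the full data and extract the coefficients at the selected lambda.
final_fit <- glmnet(x, y, family = "gaussian", alpha = 1)
coef(final_fit, s = bestlambda)   # coefficients at the chosen lambda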

How to export all coefficients of a penalized regression model from package `penalized`? Need them for reporting rolling regression estimates

I have been able to run a regression with some coefficients constrained to positive territory, but I'm doing a lot of rolling regressions where I run into a problem. Here is my sample code:
library(penalized)
set.seed(1)
x1 <- rnorm(100)*10
x2 <- rnorm(100)*10
x3 <- rnorm(100)*10
y <- sin(x1) + cos(x2) - x3 + rnorm(100)
data <- data.frame(y, x1, x2, x3)
win <- 10
coefs <- matrix(NA, ncol=4, nrow=length(y))
for(i in 1:(length(y)-win)) {
  d <- data[(1+i):(win+i), ]
  p <- win+i
  # Linear Regression
  coefs[p,] <- as.vector(coef(penalized(y, ~ x1 + x2 + x3, ~1,
                                        lambda1=0, lambda2=0, positive = c(F, F, T), data=data)))
}
This is how I usually populate a matrix with coefficients from a rolling regression, and now I receive the error:
Error in coefs[p, ] <- as.vector(coef(penalized(y, ~x1 + x2 + x3, ~1, :
number of items to replace is not a multiple of replacement length
I assume this error is produced because the penalized regression does not always return the intercept plus 3 coefficients. Is there a way to get the penalized function to report zero coefficients as well, or another way to populate the matrix / data.frame?
Perhaps you are unaware of the `which` argument of `coef` for a "penfit" object. Have a look at:
getMethod(coef, "penfit")
#function (object, ...)
#{
# .local <- function (object, which = c("nonzero", "all", "penalized",
# "unpenalized"), standardize = FALSE)
# {
# coefficients(object, which, standardize)
# }
# .local(object, ...)
#}
#<environment: namespace:penalized>
We can set which = "all" to report all coefficients. The default is which = "nonzero", which is what causes the "number of items to replace is not a multiple of replacement length" error.
The following works:
library(penalized)
set.seed(1)
x1 <- rnorm(100)*10
x2 <- rnorm(100)*10
x3 <- rnorm(100)*10
y <- sin(x1) + cos(x2) - x3 + rnorm(100)
data <- data.frame(y, x1, x2, x3)
win <- 10
coefs <- matrix(NA, ncol=4, nrow=length(y))
for(i in 1:(length(y)-win)) {
  d <- data[(1+i):(win+i), ]
  p <- win + i
  pen <- penalized(y, ~ x1 + x2 + x3, ~1, lambda1 = 0, lambda2 = 0,
                   positive = c(F, F, T), data = data)
  beta <- coef(pen, which = "all")
  coefs[p,] <- unname(beta)
}
